From 2a52ca7c98960aafb0eca9ef96b2d0c932171357 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Tue, 18 Jun 2024 10:09:17 -1000
Subject: sched_ext: Add scx_simple and scx_example_qmap example schedulers

Add two simple example BPF schedulers - simple and qmap.

* simple: In terms of scheduling, it behaves identically to not having any
  operation implemented at all. The two operations it implements are only
  there to improve visibility and exit handling. On certain homogeneous
  configurations, this actually can perform pretty well.

* qmap: A fixed five-level priority scheduler to demonstrate queueing PIDs
  on BPF maps for scheduling. While not very practical, this is useful as
  a simple example and will be used to demonstrate different features.

v7: - Compat helpers stripped out in preparation for upstreaming as the
      upstreamed patchset will be the baseline. Utility macros that can be
      used to implement compat features are kept.

    - Explicitly disable map autoattach on struct_ops to avoid trying to
      attach twice while maintaining compatibility with older libbpf.

v6: - Common header files reorganized and cleaned up. Compat helpers are
      added to demonstrate how schedulers can maintain backward
      compatibility with older kernels while making use of newly added
      features.

    - simple_select_cpu() added to keep track of the number of local
      dispatches. This is needed because the default ops.select_cpu()
      implementation is updated to dispatch directly and won't call
      ops.enqueue().

    - Updated to reflect the sched_ext API changes. Switching all tasks is
      the default behavior now and scx_qmap supports partial switching when
      `-p` is specified.

    - tools/sched_ext/Kconfig dropped. This will be included in the doc
      instead.

v5: - Improve the Makefile. Build artifacts are now collected into a
      separate dir which can be changed. Install and help targets are added
      and clean actually cleans everything.

    - MEMBER_VPTR() improved to ease access to struct members.
      ARRAY_ELEM_PTR() and RESIZABLE_ARRAY() are added to support resizable
      arrays in .bss.

    - Add scx_common.h which provides common utilities to user code such as
      SCX_BUG[_ON]() and RESIZE_ARRAY().

    - Use SCX_BUG[_ON]() to simplify error handling.

v4: - Dropped _example prefix from scheduler names.

v3: - Rename scx_example_dummy to scx_example_simple and restructure a bit
      to ease later additions. Comment updates.

    - Added declarations for BPF inline iterators. In the future, hopefully,
      these will be consolidated into a generic BPF header so that they
      don't need to be replicated here.

v2: - Updated with the generic BPF cpumask helpers.
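As an illustration of the qmap dispatch weighting described above - 1 from
queue0, 2 from queue1, 4 from queue2 and so on - each FIFO simply gets a
power-of-two batch per round-robin pass. A minimal sketch, condensed from
qmap_dispatch() in the scheduler added below (the helper name is
illustrative; the scheduler open-codes this logic):

	struct cpu_ctx {
		u64 dsp_idx;	/* which of the five FIFOs is being dispatched */
		u64 dsp_cnt;	/* remaining dispatch budget for that FIFO */
	};

	/* Advance the per-CPU round-robin cursor; FIFO i gets a 2^i batch. */
	static void advance_dispatch_cursor(struct cpu_ctx *cpuc)
	{
		cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
		cpuc->dsp_cnt = 1 << cpuc->dsp_idx;	/* queue0: 1, queue1: 2, queue2: 4, ... */
	}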
Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- tools/Makefile | 10 +- tools/sched_ext/.gitignore | 2 + tools/sched_ext/Makefile | 246 ++++++++++++++++ tools/sched_ext/include/bpf-compat/gnu/stubs.h | 11 + tools/sched_ext/include/scx/common.bpf.h | 379 +++++++++++++++++++++++++ tools/sched_ext/include/scx/common.h | 75 +++++ tools/sched_ext/include/scx/compat.bpf.h | 28 ++ tools/sched_ext/include/scx/compat.h | 153 ++++++++++ tools/sched_ext/include/scx/user_exit_info.h | 64 +++++ tools/sched_ext/scx_qmap.bpf.c | 264 +++++++++++++++++ tools/sched_ext/scx_qmap.c | 99 +++++++ tools/sched_ext/scx_simple.bpf.c | 63 ++++ tools/sched_ext/scx_simple.c | 99 +++++++ 13 files changed, 1492 insertions(+), 1 deletion(-) create mode 100644 tools/sched_ext/.gitignore create mode 100644 tools/sched_ext/Makefile create mode 100644 tools/sched_ext/include/bpf-compat/gnu/stubs.h create mode 100644 tools/sched_ext/include/scx/common.bpf.h create mode 100644 tools/sched_ext/include/scx/common.h create mode 100644 tools/sched_ext/include/scx/compat.bpf.h create mode 100644 tools/sched_ext/include/scx/compat.h create mode 100644 tools/sched_ext/include/scx/user_exit_info.h create mode 100644 tools/sched_ext/scx_qmap.bpf.c create mode 100644 tools/sched_ext/scx_qmap.c create mode 100644 tools/sched_ext/scx_simple.bpf.c create mode 100644 tools/sched_ext/scx_simple.c diff --git a/tools/Makefile b/tools/Makefile index 276f5d0d53a4..278d24723b74 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -28,6 +28,7 @@ help: @echo ' pci - PCI tools' @echo ' perf - Linux performance measurement and analysis tool' @echo ' selftests - various kernel selftests' + @echo ' sched_ext - sched_ext example schedulers' @echo ' bootconfig - boot config tool' @echo ' spi - spi tools' @echo ' tmon - thermal monitoring and tuning tool' @@ -91,6 +92,9 @@ perf: FORCE $(Q)mkdir -p $(PERF_O) . $(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir= +sched_ext: FORCE + $(call descend,sched_ext) + selftests: FORCE $(call descend,testing/$@) @@ -184,6 +188,9 @@ perf_clean: $(Q)mkdir -p $(PERF_O) . $(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir= clean +sched_ext_clean: + $(call descend,sched_ext,clean) + selftests_clean: $(call descend,testing/$(@:_clean=),clean) @@ -213,6 +220,7 @@ clean: acpi_clean counter_clean cpupower_clean hv_clean firewire_clean \ mm_clean bpf_clean iio_clean x86_energy_perf_policy_clean tmon_clean \ freefall_clean build_clean libbpf_clean libsubcmd_clean \ gpio_clean objtool_clean leds_clean wmi_clean pci_clean firmware_clean debugging_clean \ - intel-speed-select_clean tracing_clean thermal_clean thermometer_clean thermal-engine_clean + intel-speed-select_clean tracing_clean thermal_clean thermometer_clean thermal-engine_clean \ + sched_ext_clean .PHONY: FORCE diff --git a/tools/sched_ext/.gitignore b/tools/sched_ext/.gitignore new file mode 100644 index 000000000000..d6264fe1c8cd --- /dev/null +++ b/tools/sched_ext/.gitignore @@ -0,0 +1,2 @@ +tools/ +build/ diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile new file mode 100644 index 000000000000..626782a21375 --- /dev/null +++ b/tools/sched_ext/Makefile @@ -0,0 +1,246 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+include ../build/Build.include +include ../scripts/Makefile.arch +include ../scripts/Makefile.include + +all: all_targets + +ifneq ($(LLVM),) +ifneq ($(filter %/,$(LLVM)),) +LLVM_PREFIX := $(LLVM) +else ifneq ($(filter -%,$(LLVM)),) +LLVM_SUFFIX := $(LLVM) +endif + +CLANG_TARGET_FLAGS_arm := arm-linux-gnueabi +CLANG_TARGET_FLAGS_arm64 := aarch64-linux-gnu +CLANG_TARGET_FLAGS_hexagon := hexagon-linux-musl +CLANG_TARGET_FLAGS_m68k := m68k-linux-gnu +CLANG_TARGET_FLAGS_mips := mipsel-linux-gnu +CLANG_TARGET_FLAGS_powerpc := powerpc64le-linux-gnu +CLANG_TARGET_FLAGS_riscv := riscv64-linux-gnu +CLANG_TARGET_FLAGS_s390 := s390x-linux-gnu +CLANG_TARGET_FLAGS_x86 := x86_64-linux-gnu +CLANG_TARGET_FLAGS := $(CLANG_TARGET_FLAGS_$(ARCH)) + +ifeq ($(CROSS_COMPILE),) +ifeq ($(CLANG_TARGET_FLAGS),) +$(error Specify CROSS_COMPILE or add '--target=' option to lib.mk) +else +CLANG_FLAGS += --target=$(CLANG_TARGET_FLAGS) +endif # CLANG_TARGET_FLAGS +else +CLANG_FLAGS += --target=$(notdir $(CROSS_COMPILE:%-=%)) +endif # CROSS_COMPILE + +CC := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) $(CLANG_FLAGS) -fintegrated-as +else +CC := $(CROSS_COMPILE)gcc +endif # LLVM + +CURDIR := $(abspath .) +TOOLSDIR := $(abspath ..) +LIBDIR := $(TOOLSDIR)/lib +BPFDIR := $(LIBDIR)/bpf +TOOLSINCDIR := $(TOOLSDIR)/include +BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool +APIDIR := $(TOOLSINCDIR)/uapi +GENDIR := $(abspath ../../include/generated) +GENHDR := $(GENDIR)/autoconf.h + +ifeq ($(O),) +OUTPUT_DIR := $(CURDIR)/build +else +OUTPUT_DIR := $(O)/build +endif # O +OBJ_DIR := $(OUTPUT_DIR)/obj +INCLUDE_DIR := $(OUTPUT_DIR)/include +BPFOBJ_DIR := $(OBJ_DIR)/libbpf +SCXOBJ_DIR := $(OBJ_DIR)/sched_ext +BINDIR := $(OUTPUT_DIR)/bin +BPFOBJ := $(BPFOBJ_DIR)/libbpf.a +ifneq ($(CROSS_COMPILE),) +HOST_BUILD_DIR := $(OBJ_DIR)/host +HOST_OUTPUT_DIR := host-tools +HOST_INCLUDE_DIR := $(HOST_OUTPUT_DIR)/include +else +HOST_BUILD_DIR := $(OBJ_DIR) +HOST_OUTPUT_DIR := $(OUTPUT_DIR) +HOST_INCLUDE_DIR := $(INCLUDE_DIR) +endif +HOST_BPFOBJ := $(HOST_BUILD_DIR)/libbpf/libbpf.a +RESOLVE_BTFIDS := $(HOST_BUILD_DIR)/resolve_btfids/resolve_btfids +DEFAULT_BPFTOOL := $(HOST_OUTPUT_DIR)/sbin/bpftool + +VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \ + $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \ + ../../vmlinux \ + /sys/kernel/btf/vmlinux \ + /boot/vmlinux-$(shell uname -r) +VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) +ifeq ($(VMLINUX_BTF),) +$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)") +endif + +BPFTOOL ?= $(DEFAULT_BPFTOOL) + +ifneq ($(wildcard $(GENHDR)),) + GENFLAGS := -DHAVE_GENHDR +endif + +CFLAGS += -g -O2 -rdynamic -pthread -Wall -Werror $(GENFLAGS) \ + -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ + -I$(TOOLSINCDIR) -I$(APIDIR) -I$(CURDIR)/include + +# Silence some warnings when compiled with clang +ifneq ($(LLVM),) +CFLAGS += -Wno-unused-command-line-argument +endif + +LDFLAGS = -lelf -lz -lpthread + +IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - </dev/null | grep 'define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__') + +# Get Clang's default includes on this system, as opposed to those seen by +# '-target bpf'. Use '-idirafter' so we don't interfere with include mechanics +# except where the build would have failed anyways. +define get_sys_includes +$(shell $(1) -v -E - </dev/null 2>&1 \ + | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ +$(shell $(1) -dM -E - </dev/null | grep '__riscv_xlen ' | awk '{printf("-D__riscv_xlen=%d -D__BITS_PER_LONG=%d", $$2, $$2)}') +endef + +BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) \ + $(if $(IS_LITTLE_ENDIAN),-mlittle-endian,-mbig-endian) \ + -I$(CURDIR)/include -I$(CURDIR)/include/bpf-compat \ + -I$(INCLUDE_DIR) -I$(APIDIR) -I../../include \ + $(call get_sys_includes,$(CLANG)) \ + -Wall -Wno-compare-distinct-pointer-types \ + -O2 -mcpu=v3 + +$(INCLUDE_DIR)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) | $(INCLUDE_DIR) +ifeq ($(VMLINUX_H),) + $(call msg,GEN,,$@) + $(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $@ +else + $(call msg,CP,,$@) + $(Q)cp "$(VMLINUX_H)" $@ +endif + +$(SCXOBJ_DIR)/%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h include/scx/*.h \ + | $(BPFOBJ) $(SCXOBJ_DIR) + $(call msg,CLNG-BPF,,$(notdir $@)) + $(Q)$(CLANG) $(BPF_CFLAGS) -target bpf -c $< -o $@ + +$(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BPFTOOL) + $(eval sched=$(notdir $@)) + $(call msg,GEN-SKEL,,$(sched)) + $(Q)$(BPFTOOL) gen object
$(<:.o=.linked1.o) $< + $(Q)$(BPFTOOL) gen object $(<:.o=.linked2.o) $(<:.o=.linked1.o) + $(Q)$(BPFTOOL) gen object $(<:.o=.linked3.o) $(<:.o=.linked2.o) + $(Q)diff $(<:.o=.linked2.o) $(<:.o=.linked3.o) + $(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $@ + $(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $(@:.skel.h=.subskel.h) + +SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR) + +c-sched-targets = scx_simple scx_qmap + +$(addprefix $(BINDIR)/,$(c-sched-targets)): \ + $(BINDIR)/%: \ + $(filter-out %.bpf.c,%.c) \ + $(INCLUDE_DIR)/%.bpf.skel.h \ + $(SCX_COMMON_DEPS) + $(eval sched=$(notdir $@)) + $(CC) $(CFLAGS) -c $(sched).c -o $(SCXOBJ_DIR)/$(sched).o + $(CC) -o $@ $(SCXOBJ_DIR)/$(sched).o $(HOST_BPFOBJ) $(LDFLAGS) + +$(c-sched-targets): %: $(BINDIR)/% + +install: all + $(Q)mkdir -p $(DESTDIR)/usr/local/bin/ + $(Q)cp $(BINDIR)/* $(DESTDIR)/usr/local/bin/ + +clean: + rm -rf $(OUTPUT_DIR) $(HOST_OUTPUT_DIR) + rm -f *.o *.bpf.o *.bpf.skel.h *.bpf.subskel.h + rm -f $(c-sched-targets) + +help: + @echo 'Building targets' + @echo '================' + @echo '' + @echo ' all - Compile all schedulers' + @echo '' + @echo 'Alternatively, you may compile individual schedulers:' + @echo '' + @printf ' %s\n' $(c-sched-targets) + @echo '' + @echo 'For any scheduler build target, you may specify an alternative' + @echo 'build output path with the O= environment variable. For example:' + @echo '' + @echo ' O=/tmp/sched_ext make all' + @echo '' + @echo 'will compile all schedulers, and emit the build artifacts to' + @echo '/tmp/sched_ext/build.' + @echo '' + @echo '' + @echo 'Installing targets' + @echo '==================' + @echo '' + @echo ' install - Compile and install all schedulers to /usr/local/bin.' + @echo ' You may specify the DESTDIR= environment variable' + @echo ' to indicate a prefix for /usr/local/bin. For example:' + @echo '' + @echo ' DESTDIR=/tmp/sched_ext make install' + @echo '' + @echo ' will build the schedulers in CWD/build, and' + @echo ' install the schedulers to /tmp/sched_ext/usr/local/bin.' + @echo '' + @echo '' + @echo 'Cleaning targets' + @echo '================' + @echo '' + @echo ' clean - Remove all generated files' + +all_targets: $(c-sched-targets) + +.PHONY: all all_targets $(c-sched-targets) clean help + +# delete failed targets +.DELETE_ON_ERROR: + +# keep intermediate (.bpf.skel.h, .bpf.o, etc) targets +.SECONDARY: diff --git a/tools/sched_ext/include/bpf-compat/gnu/stubs.h b/tools/sched_ext/include/bpf-compat/gnu/stubs.h new file mode 100644 index 000000000000..ad7d139ce907 --- /dev/null +++ b/tools/sched_ext/include/bpf-compat/gnu/stubs.h @@ -0,0 +1,11 @@ +/* + * Dummy gnu/stubs.h. clang can end up including /usr/include/gnu/stubs.h when + * compiling BPF files although its content doesn't play any role. The file in + * turn includes stubs-64.h or stubs-32.h depending on whether __x86_64__ is + * defined. When compiling a BPF source, __x86_64__ isn't set and thus + * stubs-32.h is selected. However, the file is not there if the system doesn't + * have the 32bit glibc devel package installed, leading to a build failure. + * + * The problem is worked around by making this file available in the include + * search paths before the system one when building BPF.
+ */ diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h new file mode 100644 index 000000000000..833fe1bdccf9 --- /dev/null +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -0,0 +1,379 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#ifndef __SCX_COMMON_BPF_H +#define __SCX_COMMON_BPF_H + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <asm-generic/errno.h> +#include "user_exit_info.h" + +#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ +#define PF_KTHREAD 0x00200000 /* I am a kernel thread */ +#define PF_EXITING 0x00000004 +#define CLOCK_MONOTONIC 1 + +/* + * Earlier versions of clang/pahole lost upper 32bits in 64bit enums which can + * lead to really confusing misbehaviors. Let's trigger a build failure. + */ +static inline void ___vmlinux_h_sanity_check___(void) +{ + _Static_assert(SCX_DSQ_FLAG_BUILTIN, + "bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole"); +} + +s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym; +s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym; +void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym; +u32 scx_bpf_dispatch_nr_slots(void) __ksym; +void scx_bpf_dispatch_cancel(void) __ksym; +bool scx_bpf_consume(u64 dsq_id) __ksym; +s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; +void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; +void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) __ksym __weak; +void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym; +u32 scx_bpf_nr_cpu_ids(void) __ksym __weak; +const struct cpumask *scx_bpf_get_possible_cpumask(void) __ksym __weak; +const struct cpumask *scx_bpf_get_online_cpumask(void) __ksym __weak; +void scx_bpf_put_cpumask(const struct cpumask *cpumask) __ksym __weak; +const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym; +const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym; +void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym; +bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym; +s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; +s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; +bool scx_bpf_task_running(const struct task_struct *p) __ksym; +s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; + +static inline __attribute__((format(printf, 1, 2))) +void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} + +/* + * Helper macro for initializing the fmt and variadic argument inputs to both + * bstr exit kfuncs. Callers to this function should use ___fmt and ___param to + * refer to the initialized list of inputs to the bstr kfunc. + */ +#define scx_bpf_bstr_preamble(fmt, args...) \ + static char ___fmt[] = fmt; \ + /* \ + * Note that __param[] must have at least one \ + * element to keep the verifier happy. \ + */ \ + unsigned long long ___param[___bpf_narg(args) ?: 1] = {}; \ + \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + ___bpf_fill(___param, args); \ + _Pragma("GCC diagnostic pop") \ + +/* + * scx_bpf_exit() wraps the scx_bpf_exit_bstr() kfunc with variadic arguments + * instead of an array of u64. Using this macro will cause the scheduler to + * exit cleanly with the specified exit code being passed to user space.
+ */ +#define scx_bpf_exit(code, fmt, args...) \ +({ \ + scx_bpf_bstr_preamble(fmt, args) \ + scx_bpf_exit_bstr(code, ___fmt, ___param, sizeof(___param)); \ + ___scx_bpf_bstr_format_checker(fmt, ##args); \ +}) + +/* + * scx_bpf_error() wraps the scx_bpf_error_bstr() kfunc with variadic arguments + * instead of an array of u64. Invoking this macro will cause the scheduler to + * exit in an erroneous state, with diagnostic information being passed to the + * user. + */ +#define scx_bpf_error(fmt, args...) \ +({ \ + scx_bpf_bstr_preamble(fmt, args) \ + scx_bpf_error_bstr(___fmt, ___param, sizeof(___param)); \ + ___scx_bpf_bstr_format_checker(fmt, ##args); \ +}) + +#define BPF_STRUCT_OPS(name, args...) \ +SEC("struct_ops/"#name) \ +BPF_PROG(name, ##args) + +#define BPF_STRUCT_OPS_SLEEPABLE(name, args...) \ +SEC("struct_ops.s/"#name) \ +BPF_PROG(name, ##args) + +/** + * RESIZABLE_ARRAY - Generates annotations for an array that may be resized + * @elfsec: the data section of the BPF program in which to place the array + * @arr: the name of the array + * + * libbpf has an API for setting map value sizes. Since data sections (i.e. + * bss, data, rodata) themselves are maps, a data section can be resized. If + * a data section has an array as its last element, the BTF info for that + * array will be adjusted so that the length of the array is extended to meet + * the new length of the data section. This macro annotates an array to have an + * element count of one with the assumption that this array can be resized + * within the userspace program. It also annotates the section specifier so + * this array exists in a custom sub data section which can be resized + * independently. + * + * See RESIZE_ARRAY() for the userspace convenience macro for resizing an + * array declared with RESIZABLE_ARRAY(). + */ +#define RESIZABLE_ARRAY(elfsec, arr) arr[1] SEC("."#elfsec"."#arr) + +/** + * MEMBER_VPTR - Obtain the verified pointer to a struct or array member + * @base: struct or array to index + * @member: dereferenced member (e.g. .field, [idx0][idx1], .field[idx0] ...) + * + * The verifier often gets confused by the instruction sequence the compiler + * generates for indexing struct fields or arrays. This macro forces the + * compiler to generate a code sequence which first calculates the byte offset, + * checks it against the struct or array size and adds that byte offset to + * generate the pointer to the member to help the verifier. + * + * Ideally, we want to abort if the calculated offset is out-of-bounds. However, + * BPF currently doesn't support abort, so evaluate to %NULL instead. The caller + * must check for %NULL and take appropriate action to appease the verifier. To + * avoid confusing the verifier, it's best to check for %NULL and dereference + * immediately. + * + * vptr = MEMBER_VPTR(my_array, [i][j]); + * if (!vptr) + * return error; + * *vptr = new_value; + * + * sizeof(@base) should encompass the memory area to be accessed and thus can't + * be a pointer to the area. Use `MEMBER_VPTR(*ptr, .member)` instead of + * `MEMBER_VPTR(ptr, ->member)`.
+ */ +#define MEMBER_VPTR(base, member) (typeof((base) member) *) \ +({ \ + u64 __base = (u64)&(base); \ + u64 __addr = (u64)&((base) member) - __base; \ + _Static_assert(sizeof(base) >= sizeof((base) member), \ + "@base is smaller than @member, is @base a pointer?"); \ + asm volatile ( \ + "if %0 <= %[max] goto +2\n" \ + "%0 = 0\n" \ + "goto +1\n" \ + "%0 += %1\n" \ + : "+r"(__addr) \ + : "r"(__base), \ + [max]"i"(sizeof(base) - sizeof((base) member))); \ + __addr; \ +}) + +/** + * ARRAY_ELEM_PTR - Obtain the verified pointer to an array element + * @arr: array to index into + * @i: array index + * @n: number of elements in array + * + * Similar to MEMBER_VPTR() but is intended for use with arrays where the + * element count needs to be explicit. + * It can be used in cases where a global array is defined with an initial + * size but is intended to be resized before loading the BPF program. + * Without this version of the macro, MEMBER_VPTR() will use the compile time + * size of the array to compute the max, which will result in rejection by + * the verifier. + */ +#define ARRAY_ELEM_PTR(arr, i, n) (typeof(arr[i]) *) \ +({ \ + u64 __base = (u64)arr; \ + u64 __addr = (u64)&(arr[i]) - __base; \ + asm volatile ( \ + "if %0 <= %[max] goto +2\n" \ + "%0 = 0\n" \ + "goto +1\n" \ + "%0 += %1\n" \ + : "+r"(__addr) \ + : "r"(__base), \ + [max]"r"(sizeof(arr[0]) * ((n) - 1))); \ + __addr; \ +}) + + +/* + * BPF declarations and helpers + */ + +/* list and rbtree */ +#define __contains(name, node) __attribute__((btf_decl_tag("contains:" #name ":" #node))) +#define private(name) SEC(".data." #name) __hidden __attribute__((aligned(8))) + +void *bpf_obj_new_impl(__u64 local_type_id, void *meta) __ksym; +void bpf_obj_drop_impl(void *kptr, void *meta) __ksym; + +#define bpf_obj_new(type) ((type *)bpf_obj_new_impl(bpf_core_type_id_local(type), NULL)) +#define bpf_obj_drop(kptr) bpf_obj_drop_impl(kptr, NULL) + +void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; +void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; +struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) __ksym; +struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) __ksym; +struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, + struct bpf_rb_node *node) __ksym; +int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node, + bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b), + void *meta, __u64 off) __ksym; +#define bpf_rbtree_add(head, node, less) bpf_rbtree_add_impl(head, node, less, NULL, 0) + +struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) __ksym; + +void *bpf_refcount_acquire_impl(void *kptr, void *meta) __ksym; +#define bpf_refcount_acquire(kptr) bpf_refcount_acquire_impl(kptr, NULL) + +/* task */ +struct task_struct *bpf_task_from_pid(s32 pid) __ksym; +struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym; +void bpf_task_release(struct task_struct *p) __ksym; + +/* cgroup */ +struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) __ksym; +void bpf_cgroup_release(struct cgroup *cgrp) __ksym; +struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym; + +/* css iteration */ +struct bpf_iter_css; +struct cgroup_subsys_state; +extern int bpf_iter_css_new(struct bpf_iter_css *it, + struct cgroup_subsys_state *start, + unsigned int flags) __weak __ksym; +extern struct cgroup_subsys_state * +bpf_iter_css_next(struct bpf_iter_css *it) __weak __ksym; +extern void
bpf_iter_css_destroy(struct bpf_iter_css *it) __weak __ksym; + +/* cpumask */ +struct bpf_cpumask *bpf_cpumask_create(void) __ksym; +struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) __ksym; +void bpf_cpumask_release(struct bpf_cpumask *cpumask) __ksym; +u32 bpf_cpumask_first(const struct cpumask *cpumask) __ksym; +u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) __ksym; +void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; +void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; +bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask) __ksym; +bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; +bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; +void bpf_cpumask_setall(struct bpf_cpumask *cpumask) __ksym; +void bpf_cpumask_clear(struct bpf_cpumask *cpumask) __ksym; +bool bpf_cpumask_and(struct bpf_cpumask *dst, const struct cpumask *src1, + const struct cpumask *src2) __ksym; +void bpf_cpumask_or(struct bpf_cpumask *dst, const struct cpumask *src1, + const struct cpumask *src2) __ksym; +void bpf_cpumask_xor(struct bpf_cpumask *dst, const struct cpumask *src1, + const struct cpumask *src2) __ksym; +bool bpf_cpumask_equal(const struct cpumask *src1, const struct cpumask *src2) __ksym; +bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *src2) __ksym; +bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2) __ksym; +bool bpf_cpumask_empty(const struct cpumask *cpumask) __ksym; +bool bpf_cpumask_full(const struct cpumask *cpumask) __ksym; +void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src) __ksym; +u32 bpf_cpumask_any_distribute(const struct cpumask *cpumask) __ksym; +u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, + const struct cpumask *src2) __ksym; + +/* rcu */ +void bpf_rcu_read_lock(void) __ksym; +void bpf_rcu_read_unlock(void) __ksym; + + +/* + * Other helpers + */ + +/* useful compiler attributes */ +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) +#define __maybe_unused __attribute__((__unused__)) + +/* + * READ/WRITE_ONCE() are from kernel (include/asm-generic/rwonce.h). They + * prevent compiler from caching, redoing or reordering reads or writes. 
+ */ +typedef __u8 __attribute__((__may_alias__)) __u8_alias_t; +typedef __u16 __attribute__((__may_alias__)) __u16_alias_t; +typedef __u32 __attribute__((__may_alias__)) __u32_alias_t; +typedef __u64 __attribute__((__may_alias__)) __u64_alias_t; + +static __always_inline void __read_once_size(const volatile void *p, void *res, int size) +{ + switch (size) { + case 1: *(__u8_alias_t *) res = *(volatile __u8_alias_t *) p; break; + case 2: *(__u16_alias_t *) res = *(volatile __u16_alias_t *) p; break; + case 4: *(__u32_alias_t *) res = *(volatile __u32_alias_t *) p; break; + case 8: *(__u64_alias_t *) res = *(volatile __u64_alias_t *) p; break; + default: + barrier(); + __builtin_memcpy((void *)res, (const void *)p, size); + barrier(); + } +} + +static __always_inline void __write_once_size(volatile void *p, void *res, int size) +{ + switch (size) { + case 1: *(volatile __u8_alias_t *) p = *(__u8_alias_t *) res; break; + case 2: *(volatile __u16_alias_t *) p = *(__u16_alias_t *) res; break; + case 4: *(volatile __u32_alias_t *) p = *(__u32_alias_t *) res; break; + case 8: *(volatile __u64_alias_t *) p = *(__u64_alias_t *) res; break; + default: + barrier(); + __builtin_memcpy((void *)p, (const void *)res, size); + barrier(); + } +} + +#define READ_ONCE(x) \ +({ \ + union { typeof(x) __val; char __c[1]; } __u = \ + { .__c = { 0 } }; \ + __read_once_size(&(x), __u.__c, sizeof(x)); \ + __u.__val; \ +}) + +#define WRITE_ONCE(x, val) \ +({ \ + union { typeof(x) __val; char __c[1]; } __u = \ + { .__val = (val) }; \ + __write_once_size(&(x), __u.__c, sizeof(x)); \ + __u.__val; \ +}) + +/* + * log2_u32 - Compute the base 2 logarithm of a 32-bit exponential value. + * @v: The value for which we're computing the base 2 logarithm. + */ +static inline u32 log2_u32(u32 v) +{ + u32 r; + u32 shift; + + r = (v > 0xFFFF) << 4; v >>= r; + shift = (v > 0xFF) << 3; v >>= shift; r |= shift; + shift = (v > 0xF) << 2; v >>= shift; r |= shift; + shift = (v > 0x3) << 1; v >>= shift; r |= shift; + r |= (v >> 1); + return r; +} + +/* + * log2_u64 - Compute the base 2 logarithm of a 64-bit exponential value. + * @v: The value for which we're computing the base 2 logarithm. + */ +static inline u32 log2_u64(u64 v) +{ + u32 hi = v >> 32; + if (hi) + return log2_u32(hi) + 32 + 1; + else + return log2_u32(v) + 1; +} + +#include "compat.bpf.h" + +#endif /* __SCX_COMMON_BPF_H */ diff --git a/tools/sched_ext/include/scx/common.h b/tools/sched_ext/include/scx/common.h new file mode 100644 index 000000000000..5b0f90152152 --- /dev/null +++ b/tools/sched_ext/include/scx/common.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 Tejun Heo + * Copyright (c) 2023 David Vernet + */ +#ifndef __SCHED_EXT_COMMON_H +#define __SCHED_EXT_COMMON_H + +#ifdef __KERNEL__ +#error "Should not be included by BPF programs" +#endif + +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <string.h> + +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; +typedef int8_t s8; +typedef int16_t s16; +typedef int32_t s32; +typedef int64_t s64; + +#define SCX_BUG(__fmt, ...) \ + do { \ + fprintf(stderr, "[SCX_BUG] %s:%d", __FILE__, __LINE__); \ + if (errno) \ + fprintf(stderr, " (%s)\n", strerror(errno)); \ + else \ + fprintf(stderr, "\n"); \ + fprintf(stderr, __fmt __VA_OPT__(,) __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + \ + exit(EXIT_FAILURE); \ + } while (0) + +#define SCX_BUG_ON(__cond, __fmt, ...)
\ + do { \ + if (__cond) \ + SCX_BUG((__fmt) __VA_OPT__(,) __VA_ARGS__); \ + } while (0) + +/** + * RESIZE_ARRAY - Convenience macro for resizing a BPF array + * @__skel: the skeleton containing the array + * @elfsec: the data section of the BPF program in which the array exists + * @arr: the name of the array + * @n: the desired array element count + * + * For BPF arrays declared with RESIZABLE_ARRAY(), this macro performs two + * operations. It resizes the map which corresponds to the custom data + * section that contains the target array. As a side effect, the BTF info for + * the array is adjusted so that the array length is sized to cover the new + * data section size. The second operation is reassigning the skeleton pointer + * for that custom data section so that it points to the newly memory mapped + * region. + */ +#define RESIZE_ARRAY(__skel, elfsec, arr, n) \ + do { \ + size_t __sz; \ + bpf_map__set_value_size((__skel)->maps.elfsec##_##arr, \ + sizeof((__skel)->elfsec##_##arr->arr[0]) * (n)); \ + (__skel)->elfsec##_##arr = \ + bpf_map__initial_value((__skel)->maps.elfsec##_##arr, &__sz); \ + } while (0) + +#include "user_exit_info.h" +#include "compat.h" + +#endif /* __SCHED_EXT_COMMON_H */ diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h new file mode 100644 index 000000000000..3d2fe1208900 --- /dev/null +++ b/tools/sched_ext/include/scx/compat.bpf.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 Tejun Heo + * Copyright (c) 2024 David Vernet + */ +#ifndef __SCX_COMPAT_BPF_H +#define __SCX_COMPAT_BPF_H + +#define __COMPAT_ENUM_OR_ZERO(__type, __ent) \ +({ \ + __type __ret = 0; \ + if (bpf_core_enum_value_exists(__type, __ent)) \ + __ret = __ent; \ + __ret; \ +}) + +/* + * Define sched_ext_ops. This may be expanded to define multiple variants for + * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH(). + */ +#define SCX_OPS_DEFINE(__name, ...) \ + SEC(".struct_ops.link") \ + struct sched_ext_ops __name = { \ + __VA_ARGS__, \ + }; + +#endif /* __SCX_COMPAT_BPF_H */ diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h new file mode 100644 index 000000000000..a7fdaf8a858e --- /dev/null +++ b/tools/sched_ext/include/scx/compat.h @@ -0,0 +1,153 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2024 Tejun Heo + * Copyright (c) 2024 David Vernet + */ +#ifndef __SCX_COMPAT_H +#define __SCX_COMPAT_H + +#include <bpf/btf.h> + +struct btf *__COMPAT_vmlinux_btf __attribute__((weak)); + +static inline void __COMPAT_load_vmlinux_btf(void) +{ + if (!__COMPAT_vmlinux_btf) { + __COMPAT_vmlinux_btf = btf__load_vmlinux_btf(); + SCX_BUG_ON(!__COMPAT_vmlinux_btf, "btf__load_vmlinux_btf()"); + } +} + +static inline bool __COMPAT_read_enum(const char *type, const char *name, u64 *v) +{ + const struct btf_type *t; + const char *n; + s32 tid; + int i; + + __COMPAT_load_vmlinux_btf(); + + tid = btf__find_by_name(__COMPAT_vmlinux_btf, type); + if (tid < 0) + return false; + + t = btf__type_by_id(__COMPAT_vmlinux_btf, tid); + SCX_BUG_ON(!t, "btf__type_by_id(%d)", tid); + + if (btf_is_enum(t)) { + struct btf_enum *e = btf_enum(t); + + for (i = 0; i < BTF_INFO_VLEN(t->info); i++) { + n = btf__name_by_offset(__COMPAT_vmlinux_btf, e[i].name_off); + SCX_BUG_ON(!n, "btf__name_by_offset()"); + if (!strcmp(n, name)) { + *v = e[i].val; + return true; + } + } + } else if (btf_is_enum64(t)) { + struct btf_enum64 *e = btf_enum64(t); + + for (i = 0; i < BTF_INFO_VLEN(t->info); i++) { + n = btf__name_by_offset(__COMPAT_vmlinux_btf, e[i].name_off); + SCX_BUG_ON(!n, "btf__name_by_offset()"); + if (!strcmp(n, name)) { + *v = btf_enum64_value(&e[i]); + return true; + } + } + } + + return false; +} + +#define __COMPAT_ENUM_OR_ZERO(__type, __ent) \ +({ \ + u64 __val = 0; \ + __COMPAT_read_enum(__type, __ent, &__val); \ + __val; \ +}) + +static inline bool __COMPAT_has_ksym(const char *ksym) +{ + __COMPAT_load_vmlinux_btf(); + return btf__find_by_name(__COMPAT_vmlinux_btf, ksym) >= 0; +} + +static inline bool __COMPAT_struct_has_field(const char *type, const char *field) +{ + const struct btf_type *t; + const struct btf_member *m; + const char *n; + s32 tid; + int i; + + __COMPAT_load_vmlinux_btf(); + tid = btf__find_by_name_kind(__COMPAT_vmlinux_btf, type, BTF_KIND_STRUCT); + if (tid < 0) + return false; + + t = btf__type_by_id(__COMPAT_vmlinux_btf, tid); + SCX_BUG_ON(!t, "btf__type_by_id(%d)", tid); + + m = btf_members(t); + + for (i = 0; i < BTF_INFO_VLEN(t->info); i++) { + n = btf__name_by_offset(__COMPAT_vmlinux_btf, m[i].name_off); + SCX_BUG_ON(!n, "btf__name_by_offset()"); + if (!strcmp(n, field)) + return true; + } + + return false; +} + +#define SCX_OPS_SWITCH_PARTIAL \ + __COMPAT_ENUM_OR_ZERO("scx_ops_flags", "SCX_OPS_SWITCH_PARTIAL") + +/* + * struct sched_ext_ops can change over time. If compat.bpf.h::SCX_OPS_DEFINE() + * is used to define ops and compat.h::SCX_OPS_LOAD/ATTACH() are used to load + * and attach it, backward compatibility is automatically maintained where + * reasonable. + */ +#define SCX_OPS_OPEN(__ops_name, __scx_name) ({ \ + struct __scx_name *__skel; \ + \ + __skel = __scx_name##__open(); \ + SCX_BUG_ON(!__skel, "Could not open " #__scx_name); \ + __skel; \ +}) + +#define SCX_OPS_LOAD(__skel, __ops_name, __scx_name) ({ \ + SCX_BUG_ON(__scx_name##__load((__skel)), "Failed to load skel"); \ +}) + +/* + * New versions of bpftool now emit additional link placeholders for BPF maps, + * and set up the BPF skeleton in such a way that libbpf will auto-attach BPF + * maps, assuming libbpf is recent enough (v1.5+). Old libbpf will do nothing + * with those links and won't attempt to auto-attach maps. + * + * To maintain compatibility with older libbpf while avoiding trying to attach + * twice, disable the autoattach feature on newer libbpf.
+ */ +#if LIBBPF_MAJOR_VERSION > 1 || \ + (LIBBPF_MAJOR_VERSION == 1 && LIBBPF_MINOR_VERSION >= 5) +#define __SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name) \ + bpf_map__set_autoattach((__skel)->maps.__ops_name, false) +#else +#define __SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name) do {} while (0) +#endif + +#define SCX_OPS_ATTACH(__skel, __ops_name, __scx_name) ({ \ + struct bpf_link *__link; \ + __SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name); \ + SCX_BUG_ON(__scx_name##__attach((__skel)), "Failed to attach skel"); \ + __link = bpf_map__attach_struct_ops((__skel)->maps.__ops_name); \ + SCX_BUG_ON(!__link, "Failed to attach struct_ops"); \ + __link; \ +}) + +#endif /* __SCX_COMPAT_H */ diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h new file mode 100644 index 000000000000..8c3b7fac4d05 --- /dev/null +++ b/tools/sched_ext/include/scx/user_exit_info.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Define struct user_exit_info which is shared between BPF and userspace parts + * to communicate exit status and other information. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#ifndef __USER_EXIT_INFO_H +#define __USER_EXIT_INFO_H + +enum uei_sizes { + UEI_REASON_LEN = 128, + UEI_MSG_LEN = 1024, +}; + +struct user_exit_info { + int kind; + s64 exit_code; + char reason[UEI_REASON_LEN]; + char msg[UEI_MSG_LEN]; +}; + +#ifdef __bpf__ + +#include "vmlinux.h" +#include <bpf/bpf_core_read.h> + +#define UEI_DEFINE(__name) \ + struct user_exit_info __name SEC(".data") + +#define UEI_RECORD(__uei_name, __ei) ({ \ + bpf_probe_read_kernel_str(__uei_name.reason, \ + sizeof(__uei_name.reason), (__ei)->reason); \ + bpf_probe_read_kernel_str(__uei_name.msg, \ + sizeof(__uei_name.msg), (__ei)->msg); \ + if (bpf_core_field_exists((__ei)->exit_code)) \ + __uei_name.exit_code = (__ei)->exit_code; \ + /* use __sync to force memory barrier */ \ + __sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind, \ + (__ei)->kind); \ +}) + +#else /* !__bpf__ */ + +#include <stdio.h> +#include <stdint.h> + +#define UEI_EXITED(__skel, __uei_name) ({ \ + /* use __sync to force memory barrier */ \ + __sync_val_compare_and_swap(&(__skel)->data->__uei_name.kind, -1, -1); \ +}) + +#define UEI_REPORT(__skel, __uei_name) ({ \ + struct user_exit_info *__uei = &(__skel)->data->__uei_name; \ + fprintf(stderr, "EXIT: %s", __uei->reason); \ + if (__uei->msg[0] != '\0') \ + fprintf(stderr, " (%s)", __uei->msg); \ + fputs("\n", stderr); \ +}) + +#endif /* __bpf__ */ +#endif /* __USER_EXIT_INFO_H */ diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c new file mode 100644 index 000000000000..976a2693da71 --- /dev/null +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -0,0 +1,264 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A simple five-level FIFO queue scheduler. + * + * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets + * assigned to one depending on its compound weight. Each CPU round robins + * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from + * queue0, 2 from queue1, 4 from queue2 and so on. + * + * This scheduler demonstrates: + * + * - BPF-side queueing using PIDs. + * - Sleepable per-task storage allocation using ops.init_task(). + * + * This scheduler is primarily for demonstration and testing of sched_ext + * features and unlikely to be useful for actual workloads. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include <scx/common.bpf.h> + +enum consts { + ONE_SEC_IN_NS = 1000000000, + SHARED_DSQ = 0, +}; + +char _license[] SEC("license") = "GPL"; + +const volatile u64 slice_ns = SCX_SLICE_DFL; +const volatile u32 dsp_batch; + +u32 test_error_cnt; + +UEI_DEFINE(uei); + +struct qmap { + __uint(type, BPF_MAP_TYPE_QUEUE); + __uint(max_entries, 4096); + __type(value, u32); +} queue0 SEC(".maps"), + queue1 SEC(".maps"), + queue2 SEC(".maps"), + queue3 SEC(".maps"), + queue4 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, 5); + __type(key, int); + __array(values, struct qmap); +} queue_arr SEC(".maps") = { + .values = { + [0] = &queue0, + [1] = &queue1, + [2] = &queue2, + [3] = &queue3, + [4] = &queue4, + }, +}; + +/* Per-task scheduling context */ +struct task_ctx { + bool force_local; /* Dispatch directly to local_dsq */ +}; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct task_ctx); +} task_ctx_stor SEC(".maps"); + +struct cpu_ctx { + u64 dsp_idx; /* dispatch index */ + u64 dsp_cnt; /* remaining count */ +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, struct cpu_ctx); +} cpu_ctx_stor SEC(".maps"); + +/* Statistics */ +u64 nr_enqueued, nr_dispatched, nr_dequeued; + +s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + struct task_ctx *tctx; + s32 cpu; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("task_ctx lookup failed"); + return -ESRCH; + } + + if (p->nr_cpus_allowed == 1 || + scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { + tctx->force_local = true; + return prev_cpu; + } + + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) + return cpu; + + return prev_cpu; +} + +static int weight_to_idx(u32 weight) +{ + /* Coarsely map the compound weight to a FIFO. */ + if (weight <= 25) + return 0; + else if (weight <= 50) + return 1; + else if (weight < 200) + return 2; + else if (weight < 400) + return 3; + else + return 4; +} + +void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) +{ + struct task_ctx *tctx; + u32 pid = p->pid; + int idx = weight_to_idx(p->scx.weight); + void *ring; + + if (test_error_cnt && !--test_error_cnt) + scx_bpf_error("test triggering error"); + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("task_ctx lookup failed"); + return; + } + + /* Is select_cpu() telling us to enqueue locally? */ + if (tctx->force_local) { + tctx->force_local = false; + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags); + return; + } + + ring = bpf_map_lookup_elem(&queue_arr, &idx); + if (!ring) { + scx_bpf_error("failed to find ring %d", idx); + return; + } + + /* Queue on the selected FIFO. If the FIFO overflows, punt to the shared DSQ. */ + if (bpf_map_push_elem(ring, &pid, 0)) { + scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, enq_flags); + return; + } + + __sync_fetch_and_add(&nr_enqueued, 1); +} + +/* + * The BPF queue map doesn't support removal and sched_ext can handle spurious + * dispatches. qmap_dequeue() is only used to collect statistics.
+ */ +void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags) +{ + __sync_fetch_and_add(&nr_dequeued, 1); +} + +void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) +{ + struct task_struct *p; + struct cpu_ctx *cpuc; + u32 zero = 0, batch = dsp_batch ?: 1; + void *fifo; + s32 i, pid; + + if (scx_bpf_consume(SHARED_DSQ)) + return; + + if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) { + scx_bpf_error("failed to look up cpu_ctx"); + return; + } + + for (i = 0; i < 5; i++) { + /* Advance the dispatch cursor and pick the fifo. */ + if (!cpuc->dsp_cnt) { + cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5; + cpuc->dsp_cnt = 1 << cpuc->dsp_idx; + } + + fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx); + if (!fifo) { + scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx); + return; + } + + /* Dispatch or advance. */ + bpf_repeat(BPF_MAX_LOOPS) { + if (bpf_map_pop_elem(fifo, &pid)) + break; + + p = bpf_task_from_pid(pid); + if (!p) + continue; + + __sync_fetch_and_add(&nr_dispatched, 1); + scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0); + bpf_task_release(p); + batch--; + cpuc->dsp_cnt--; + if (!batch || !scx_bpf_dispatch_nr_slots()) { + scx_bpf_consume(SHARED_DSQ); + return; + } + if (!cpuc->dsp_cnt) + break; + } + + cpuc->dsp_cnt = 0; + } +} + +s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + /* + * @p is new. Let's ensure that its task_ctx is available. We can sleep + * in this function and the following will automatically use GFP_KERNEL. + */ + if (bpf_task_storage_get(&task_ctx_stor, p, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE)) + return 0; + else + return -ENOMEM; +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init) +{ + return scx_bpf_create_dsq(SHARED_DSQ, -1); +} + +void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SCX_OPS_DEFINE(qmap_ops, + .select_cpu = (void *)qmap_select_cpu, + .enqueue = (void *)qmap_enqueue, + .dequeue = (void *)qmap_dequeue, + .dispatch = (void *)qmap_dispatch, + .init_task = (void *)qmap_init_task, + .init = (void *)qmap_init, + .exit = (void *)qmap_exit, + .name = "qmap"); diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c new file mode 100644 index 000000000000..7c84ade7ecfb --- /dev/null +++ b/tools/sched_ext/scx_qmap.c @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <inttypes.h> +#include <signal.h> +#include <libgen.h> +#include <bpf/bpf.h> +#include <scx/common.h> +#include "scx_qmap.bpf.skel.h" + +const char help_fmt[] = +"A simple five-level FIFO queue sched_ext scheduler.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Usage: %s [-s SLICE_US] [-e COUNT] [-b COUNT] [-p] [-v]\n" +"\n" +" -s SLICE_US Override slice duration\n" +" -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" +" -b COUNT Dispatch up to COUNT tasks together\n" +" -p Switch only tasks on SCHED_EXT policy instead of all\n" +" -v Print libbpf debug messages\n" +" -h Display this help and exit\n"; + +static bool verbose; +static volatile int exit_req; + +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + if (level == LIBBPF_DEBUG && !verbose) + return 0; + return vfprintf(stderr, format, args); +} + +static void sigint_handler(int dummy) +{ + exit_req = 1; +} + +int main(int argc, char **argv) +{ + struct scx_qmap *skel; + struct bpf_link *link; + int opt; + + libbpf_set_print(libbpf_print_fn); + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); + + skel = SCX_OPS_OPEN(qmap_ops, scx_qmap); + + while ((opt = getopt(argc, argv, "s:e:b:pvh")) != -1) { + switch (opt) { + case 's': + skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; + break; + case 'e': + skel->bss->test_error_cnt = strtoul(optarg, NULL, 0); + break; + case 'b': + skel->rodata->dsp_batch = strtoul(optarg, NULL, 0); + break; + case 'p': + skel->struct_ops.qmap_ops->flags |= SCX_OPS_SWITCH_PARTIAL; + break; + case 'v': + verbose = true; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + SCX_OPS_LOAD(skel, qmap_ops, scx_qmap); + link = SCX_OPS_ATTACH(skel, qmap_ops, scx_qmap); + + while (!exit_req && !UEI_EXITED(skel, uei)) { + long nr_enqueued = skel->bss->nr_enqueued; + long nr_dispatched = skel->bss->nr_dispatched; + + printf("stats : enq=%lu dsp=%lu delta=%ld deq=%"PRIu64"\n", + nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched, + skel->bss->nr_dequeued); + fflush(stdout); + sleep(1); + } + + bpf_link__destroy(link); + UEI_REPORT(skel, uei); + scx_qmap__destroy(skel); + return 0; +} diff --git a/tools/sched_ext/scx_simple.bpf.c b/tools/sched_ext/scx_simple.bpf.c new file mode 100644 index 000000000000..6bb13a3c801b --- /dev/null +++ b/tools/sched_ext/scx_simple.bpf.c @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A simple scheduler. + * + * A simple global FIFO scheduler. It also demonstrates the following niceties. + * + * - Statistics tracking how many tasks are queued to local and global dsq's. + * - Termination notification for userspace. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +UEI_DEFINE(uei); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u64)); + __uint(max_entries, 2); /* [local, global] */ +} stats SEC(".maps"); + +static void stat_inc(u32 idx) +{ + u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx); + if (cnt_p) + (*cnt_p)++; +} + +s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) +{ + bool is_idle = false; + s32 cpu; + + cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle); + if (is_idle) { + stat_inc(0); /* count local queueing */ + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); + } + + return cpu; +} + +void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) +{ + stat_inc(1); /* count global queueing */ + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); +} + +void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SCX_OPS_DEFINE(simple_ops, + .select_cpu = (void *)simple_select_cpu, + .enqueue = (void *)simple_enqueue, + .exit = (void *)simple_exit, + .name = "simple"); diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c new file mode 100644 index 000000000000..789ac62fea8e --- /dev/null +++ b/tools/sched_ext/scx_simple.c @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include <stdio.h> +#include <unistd.h> +#include <signal.h> +#include <libgen.h> +#include <bpf/bpf.h> +#include <scx/common.h> +#include "scx_simple.bpf.skel.h" + +const char help_fmt[] = +"A simple sched_ext scheduler.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Usage: %s [-v]\n" +"\n" +" -v Print libbpf debug messages\n" +" -h Display this help and exit\n"; + +static bool verbose; +static volatile int exit_req; + +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + if (level == LIBBPF_DEBUG && !verbose) + return 0; + return vfprintf(stderr, format, args); +} + +static void sigint_handler(int simple) +{ + exit_req = 1; +} + +static void read_stats(struct scx_simple *skel, __u64 *stats) +{ + int nr_cpus = libbpf_num_possible_cpus(); + __u64 cnts[2][nr_cpus]; + __u32 idx; + + memset(stats, 0, sizeof(stats[0]) * 2); + + for (idx = 0; idx < 2; idx++) { + int ret, cpu; + + ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), + &idx, cnts[idx]); + if (ret < 0) + continue; + for (cpu = 0; cpu < nr_cpus; cpu++) + stats[idx] += cnts[idx][cpu]; + } +} + +int main(int argc, char **argv) +{ + struct scx_simple *skel; + struct bpf_link *link; + __u32 opt; + + libbpf_set_print(libbpf_print_fn); + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); + + skel = SCX_OPS_OPEN(simple_ops, scx_simple); + + while ((opt = getopt(argc, argv, "vh")) != -1) { + switch (opt) { + case 'v': + verbose = true; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + SCX_OPS_LOAD(skel, simple_ops, scx_simple); + link = SCX_OPS_ATTACH(skel, simple_ops, scx_simple); + + while (!exit_req && !UEI_EXITED(skel, uei)) { + __u64 stats[2]; + + read_stats(skel, stats); + printf("local=%llu global=%llu\n", stats[0], stats[1]); + fflush(stdout); + sleep(1); + } + + bpf_link__destroy(link); + UEI_REPORT(skel, uei); + scx_simple__destroy(skel); + return 0; +}

From
8a010b81b3a50b033fc3cddc613517abda586cbe Mon Sep 17 00:00:00 2001
From: David Vernet
Date: Tue, 18 Jun 2024 10:09:18 -1000
Subject: sched_ext: Implement runnable task stall watchdog

The most common and critical way that a BPF scheduler can misbehave is by
failing to run runnable tasks for too long. This patch implements a
watchdog.

* All tasks record when they become runnable.

* A watchdog work periodically scans all runnable tasks. If any task has
  stayed runnable for too long, the BPF scheduler is aborted.

* scheduler_tick() monitors whether the watchdog itself is stuck. If so,
  the BPF scheduler is aborted.

Because the watchdog only scans the tasks which are currently runnable,
and does so very infrequently, the overhead should be negligible.

scx_qmap is updated so that it can be told to stall user and/or kernel
tasks.

A detected task stall looks like the following:

   sched_ext: BPF scheduler "qmap" errored, disabling
   sched_ext: runnable task stall (dbus-daemon[953] failed to run for 6.478s)
      scx_check_timeout_workfn+0x10e/0x1b0
      process_one_work+0x287/0x560
      worker_thread+0x234/0x420
      kthread+0xe9/0x100
      ret_from_fork+0x1f/0x30

A detected watchdog stall:

   sched_ext: BPF scheduler "qmap" errored, disabling
   sched_ext: runnable task stall (watchdog failed to check in for 5.001s)
      scheduler_tick+0x2eb/0x340
      update_process_times+0x7a/0x90
      tick_sched_timer+0xd8/0x130
      __hrtimer_run_queues+0x178/0x3b0
      hrtimer_interrupt+0xfc/0x390
      __sysvec_apic_timer_interrupt+0xb7/0x2b0
      sysvec_apic_timer_interrupt+0x90/0xb0
      asm_sysvec_apic_timer_interrupt+0x1b/0x20
      default_idle+0x14/0x20
      arch_cpu_idle+0xf/0x20
      default_idle_call+0x50/0x90
      do_idle+0xe8/0x240
      cpu_startup_entry+0x1d/0x20
      kernel_init+0x0/0x190
      start_kernel+0x0/0x392
      start_kernel+0x324/0x392
      x86_64_start_reservations+0x2a/0x2c
      x86_64_start_kernel+0x104/0x109
      secondary_startup_64_no_verify+0xce/0xdb

Note that this patch exposes scx_ops_error[_type]() in kernel/sched/ext.h
to inline scx_notify_sched_tick().

v4: - While disabling, cancel_delayed_work_sync(&scx_watchdog_work) was
      being called before forward progress was guaranteed and thus could
      lead to system lockup. Relocated.

    - While enabling, it was comparing msecs against jiffies without
      conversion leading to spurious load failures on lower HZ kernels.
      Fixed.

    - runnable list management is now used by core bypass logic and moved
      to the patch implementing sched_ext core.

v3: - bpf_scx_init_member() was incorrectly comparing ops->timeout_ms
      against SCX_WATCHDOG_MAX_TIMEOUT which is in jiffies without
      conversion leading to spurious load failures in lower HZ kernels.
      Fixed.

v2: - Julia Lawall noticed that the watchdog code was mixing msecs and
      jiffies. Fix by using jiffies for everything.
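The core staleness test the watchdog applies to each runnable task,
condensed into a single helper for illustration (the helper name is
hypothetical; the patch below open-codes this check in
check_rq_for_timeouts()):

	/* A task has stalled if it has stayed runnable longer than the timeout. */
	static bool scx_task_stalled(const struct task_struct *p, unsigned long timeout)
	{
		/* p->scx.runnable_at is stamped when the task becomes runnable */
		return time_after(jiffies, p->scx.runnable_at + timeout);
	}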
Signed-off-by: David Vernet Reviewed-by: Tejun Heo Signed-off-by: Tejun Heo Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden Cc: Julia Lawall --- include/linux/sched/ext.h | 1 + init/init_task.c | 1 + kernel/sched/core.c | 1 + kernel/sched/ext.c | 130 +++++++++++++++++++++++++++++++++++++++-- kernel/sched/ext.h | 2 + tools/sched_ext/scx_qmap.bpf.c | 12 ++++ tools/sched_ext/scx_qmap.c | 12 +++- 7 files changed, 153 insertions(+), 6 deletions(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index c1530a7992cc..96031252436f 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -122,6 +122,7 @@ struct sched_ext_entity { atomic_long_t ops_state; struct list_head runnable_node; /* rq->scx.runnable_list */ + unsigned long runnable_at; u64 ddsp_dsq_id; u64 ddsp_enq_flags; diff --git a/init/init_task.c b/init/init_task.c index c6804396fe12..8a44c932d10f 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -106,6 +106,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { .sticky_cpu = -1, .holding_cpu = -1, .runnable_node = LIST_HEAD_INIT(init_task.scx.runnable_node), + .runnable_at = INITIAL_JIFFIES, .ddsp_dsq_id = SCX_DSQ_INVALID, .slice = SCX_SLICE_DFL, }, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6042ce3bfee0..f4365becdc13 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5516,6 +5516,7 @@ void sched_tick(void) calc_global_load_tick(rq); sched_core_tick(rq); task_tick_mm_cid(rq, curr); + scx_tick(rq); rq_unlock(rq, &rf); diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 1f5d80df263a..3dc515b3351f 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -8,6 +8,7 @@ enum scx_consts { SCX_DSP_DFL_MAX_BATCH = 32, + SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, SCX_EXIT_BT_LEN = 64, SCX_EXIT_MSG_LEN = 1024, @@ -24,6 +25,7 @@ enum scx_exit_kind { SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ + SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */ }; /* @@ -319,6 +321,15 @@ struct sched_ext_ops { */ u64 flags; + /** + * timeout_ms - The maximum amount of time, in milliseconds, that a + * runnable task should be able to wait before being scheduled. The + * maximum timeout may not exceed the default timeout of 30 seconds. + * + * Defaults to the maximum allowed timeout value of 30 seconds. + */ + u32 timeout_ms; + /** + * name - BPF scheduler's name + * @@ -472,6 +483,23 @@ struct static_key_false scx_has_op[SCX_OPI_END] = static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE); static struct scx_exit_info *scx_exit_info; +/* + * The maximum amount of time in jiffies that a task may be runnable without + * being scheduled on a CPU. If this timeout is exceeded, it will trigger + * scx_ops_error(). + */ +static unsigned long scx_watchdog_timeout; + +/* + * The last time the delayed work was run. This delayed work relies on + * ksoftirqd being able to run to service timer interrupts, so it's possible + * that this work itself could get wedged. To account for this, we check that + * it's not stalled in the timer tick, and trigger an error if it is.
+ */ +static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; + +static struct delayed_work scx_watchdog_work; + /* idle tracking */ #ifdef CONFIG_SMP #ifdef CONFIG_CPUMASK_OFFSTACK @@ -1170,6 +1198,11 @@ static void set_task_runnable(struct rq *rq, struct task_struct *p) { lockdep_assert_rq_held(rq); + if (p->scx.flags & SCX_TASK_RESET_RUNNABLE_AT) { + p->scx.runnable_at = jiffies; + p->scx.flags &= ~SCX_TASK_RESET_RUNNABLE_AT; + } + /* * list_add_tail() must be used. scx_ops_bypass() depends on tasks being * appended to the runnable_list. @@ -1177,9 +1210,11 @@ static void set_task_runnable(struct rq *rq, struct task_struct *p) list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list); } -static void clr_task_runnable(struct task_struct *p) +static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at) { list_del_init(&p->scx.runnable_node); + if (reset_runnable_at) + p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; } static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags) @@ -1217,7 +1252,8 @@ static void ops_dequeue(struct task_struct *p, u64 deq_flags) { unsigned long opss; - clr_task_runnable(p); + /* dequeue is always temporary, don't reset runnable_at */ + clr_task_runnable(p, false); /* acquire ensures that we see the preceding updates on QUEUED */ opss = atomic_long_read_acquire(&p->scx.ops_state); @@ -1826,7 +1862,7 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) p->se.exec_start = rq_clock_task(rq); - clr_task_runnable(p); + clr_task_runnable(p, true); } static void put_prev_task_scx(struct rq *rq, struct task_struct *p) @@ -2176,9 +2212,71 @@ static void reset_idle_masks(void) {} #endif /* CONFIG_SMP */ -static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) +static bool check_rq_for_timeouts(struct rq *rq) +{ + struct task_struct *p; + struct rq_flags rf; + bool timed_out = false; + + rq_lock_irqsave(rq, &rf); + list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) { + unsigned long last_runnable = p->scx.runnable_at; + + if (unlikely(time_after(jiffies, + last_runnable + scx_watchdog_timeout))) { + u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); + + scx_ops_error_kind(SCX_EXIT_ERROR_STALL, + "%s[%d] failed to run for %u.%03us", + p->comm, p->pid, + dur_ms / 1000, dur_ms % 1000); + timed_out = true; + break; + } + } + rq_unlock_irqrestore(rq, &rf); + + return timed_out; +} + +static void scx_watchdog_workfn(struct work_struct *work) +{ + int cpu; + + WRITE_ONCE(scx_watchdog_timestamp, jiffies); + + for_each_online_cpu(cpu) { + if (unlikely(check_rq_for_timeouts(cpu_rq(cpu)))) + break; + + cond_resched(); + } + queue_delayed_work(system_unbound_wq, to_delayed_work(work), + scx_watchdog_timeout / 2); +} + +void scx_tick(struct rq *rq) { + unsigned long last_check; + + if (!scx_enabled()) + return; + + last_check = READ_ONCE(scx_watchdog_timestamp); + if (unlikely(time_after(jiffies, + last_check + READ_ONCE(scx_watchdog_timeout)))) { + u32 dur_ms = jiffies_to_msecs(jiffies - last_check); + + scx_ops_error_kind(SCX_EXIT_ERROR_STALL, + "watchdog failed to check in for %u.%03us", + dur_ms / 1000, dur_ms % 1000); + } + update_other_load_avgs(rq); +} + +static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) +{ update_curr_scx(rq); /* @@ -2248,6 +2346,7 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool scx_set_task_state(p, SCX_TASK_INIT); + p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; return 0; } @@
-2326,6 +2425,7 @@ void init_scx_entity(struct sched_ext_entity *scx) scx->sticky_cpu = -1; scx->holding_cpu = -1; INIT_LIST_HEAD(&scx->runnable_node); + scx->runnable_at = jiffies; scx->ddsp_dsq_id = SCX_DSQ_INVALID; scx->slice = SCX_SLICE_DFL; } @@ -2783,6 +2883,8 @@ static const char *scx_exit_reason(enum scx_exit_kind kind) return "runtime error"; case SCX_EXIT_ERROR_BPF: return "scx_bpf_error"; + case SCX_EXIT_ERROR_STALL: + return "runnable task stall"; default: return ""; } @@ -2904,6 +3006,8 @@ static void scx_ops_disable_workfn(struct kthread_work *work) if (scx_ops.exit) SCX_CALL_OP(SCX_KF_UNLOCKED, exit, ei); + cancel_delayed_work_sync(&scx_watchdog_work); + /* * Delete the kobject from the hierarchy eagerly in addition to just * dropping a reference. Otherwise, if the object is deleted @@ -3026,6 +3130,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) { struct scx_task_iter sti; struct task_struct *p; + unsigned long timeout; int i, ret; mutex_lock(&scx_ops_enable_mutex); @@ -3103,6 +3208,16 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) goto err_disable; } + if (ops->timeout_ms) + timeout = msecs_to_jiffies(ops->timeout_ms); + else + timeout = SCX_WATCHDOG_MAX_TIMEOUT; + + WRITE_ONCE(scx_watchdog_timeout, timeout); + WRITE_ONCE(scx_watchdog_timestamp, jiffies); + queue_delayed_work(system_unbound_wq, &scx_watchdog_work, + scx_watchdog_timeout / 2); + /* * Lock out forks before opening the floodgate so that they don't wander * into the operations prematurely. @@ -3413,6 +3528,12 @@ static int bpf_scx_init_member(const struct btf_type *t, if (ret == 0) return -EINVAL; return 1; + case offsetof(struct sched_ext_ops, timeout_ms): + if (msecs_to_jiffies(*(u32 *)(udata + moff)) > + SCX_WATCHDOG_MAX_TIMEOUT) + return -E2BIG; + ops->timeout_ms = *(u32 *)(udata + moff); + return 1; } return 0; @@ -3569,6 +3690,7 @@ void __init init_sched_ext_class(void) } register_sysrq_key('S', &sysrq_sched_ext_reset_op); + INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn); } diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 9c5a2d928281..56fcdb0b2c05 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -29,6 +29,7 @@ static inline bool task_on_scx(const struct task_struct *p) return scx_enabled() && p->sched_class == &ext_sched_class; } +void scx_tick(struct rq *rq); void init_scx_entity(struct sched_ext_entity *scx); void scx_pre_fork(struct task_struct *p); int scx_fork(struct task_struct *p); @@ -66,6 +67,7 @@ static inline const struct sched_class *next_active_class(const struct sched_cla #define scx_enabled() false #define scx_switched_all() false +static inline void scx_tick(struct rq *rq) {} static inline void scx_pre_fork(struct task_struct *p) {} static inline int scx_fork(struct task_struct *p) { return 0; } static inline void scx_post_fork(struct task_struct *p) {} diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 976a2693da71..8beae08dfdc7 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -29,6 +29,8 @@ enum consts { char _license[] SEC("license") = "GPL"; const volatile u64 slice_ns = SCX_SLICE_DFL; +const volatile u32 stall_user_nth; +const volatile u32 stall_kernel_nth; const volatile u32 dsp_batch; u32 test_error_cnt; @@ -129,11 +131,20 @@ static int weight_to_idx(u32 weight) void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) { + static u32 user_cnt, kernel_cnt; struct task_ctx *tctx; u32 pid = p->pid; int idx = 
weight_to_idx(p->scx.weight); void *ring; + if (p->flags & PF_KTHREAD) { + if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth)) + return; + } else { + if (stall_user_nth && !(++user_cnt % stall_user_nth)) + return; + } + if (test_error_cnt && !--test_error_cnt) scx_bpf_error("test triggering error"); @@ -261,4 +272,5 @@ SCX_OPS_DEFINE(qmap_ops, .init_task = (void *)qmap_init_task, .init = (void *)qmap_init, .exit = (void *)qmap_exit, + .timeout_ms = 5000U, .name = "qmap"); diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index 7c84ade7ecfb..6e9e9726cd62 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -19,10 +19,12 @@ const char help_fmt[] = "\n" "See the top-level comment in .bpf.c for more details.\n" "\n" -"Usage: %s [-s SLICE_US] [-e COUNT] [-b COUNT] [-p] [-v]\n" +"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-b COUNT] [-p] [-v]\n" "\n" " -s SLICE_US Override slice duration\n" " -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" +" -t COUNT Stall every COUNT'th user thread\n" +" -T COUNT Stall every COUNT'th kernel thread\n" " -b COUNT Dispatch upto COUNT tasks together\n" " -p Switch only tasks on SCHED_EXT policy intead of all\n" " -v Print libbpf debug messages\n" " -h Display this help and exit\n"; @@ -55,7 +57,7 @@ int main(int argc, char **argv) skel = SCX_OPS_OPEN(qmap_ops, scx_qmap); - while ((opt = getopt(argc, argv, "s:e:b:pvh")) != -1) { + while ((opt = getopt(argc, argv, "s:e:t:T:b:pvh")) != -1) { switch (opt) { case 's': skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; @@ -63,6 +65,12 @@ case 'e': skel->bss->test_error_cnt = strtoul(optarg, NULL, 0); break; + case 't': + skel->rodata->stall_user_nth = strtoul(optarg, NULL, 0); + break; + case 'T': + skel->rodata->stall_kernel_nth = strtoul(optarg, NULL, 0); + break; case 'b': skel->rodata->dsp_batch = strtoul(optarg, NULL, 0); break; -- cgit v1.2.3 From 7bb6f0810ecfb73a9d7a2ca56fb001e0201a6758 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jun 2024 10:09:18 -1000 Subject: sched_ext: Allow BPF schedulers to disallow specific tasks from joining SCHED_EXT BPF schedulers might not want to schedule certain tasks - e.g. kernel threads. This patch adds p->scx.disallow which can be set by BPF schedulers in such cases. The field can be changed anytime and setting it in ops.init_task() guarantees that the task can never be scheduled by sched_ext. scx_qmap is updated with the -d option to disallow a specific PID: # echo $$ 1092 # grep -E '(policy)|(ext\.enabled)' /proc/self/sched policy : 0 ext.enabled : 0 # ./set-scx 1092 # grep -E '(policy)|(ext\.enabled)' /proc/self/sched policy : 7 ext.enabled : 0 Run "scx_qmap -p -d 1092" in another terminal. # cat /sys/kernel/sched_ext/nr_rejected 1 # grep -E '(policy)|(ext\.enabled)' /proc/self/sched policy : 0 ext.enabled : 0 # ./set-scx 1092 setparam failed for 1092 (Permission denied) - v4: Refreshed on top of tip:sched/core. - v3: Update description to reflect /sys/kernel/sched_ext interface change. - v2: Use atomic_long_t instead of atomic64_t for scx_kick_cpus_pnt_seqs to accommodate 32bit archs.
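For illustration, the scheduler-side hook can be as small as the following sketch (hypothetical - the actual scx_qmap change below keys off a configurable tgid instead of a task flag):

	s32 BPF_STRUCT_OPS(sketch_init_task, struct task_struct *p,
			   struct scx_init_task_args *args)
	{
		/* e.g. never let kernel threads join SCX */
		if (p->flags & PF_KTHREAD)
			p->scx.disallow = true;
		return 0;
	}

A task rejected this way is reverted to SCHED_NORMAL and accounted in nr_rejected.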
Signed-off-by: Tejun Heo Suggested-by: Barret Rhoden Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- include/linux/sched/ext.h | 12 ++++++++++ kernel/sched/ext.c | 50 ++++++++++++++++++++++++++++++++++++++++++ kernel/sched/ext.h | 2 ++ kernel/sched/syscalls.c | 4 ++++ tools/sched_ext/scx_qmap.bpf.c | 4 ++++ tools/sched_ext/scx_qmap.c | 11 ++++++++-- 6 files changed, 81 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 96031252436f..ea7c501ac819 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -137,6 +137,18 @@ struct sched_ext_entity { */ u64 slice; + /* + * If set, reject future sched_setscheduler(2) calls updating the policy + * to %SCHED_EXT with -%EACCES. + * + * If set from ops.init_task() and the task's policy is already + * %SCHED_EXT, which can happen while the BPF scheduler is being loaded + * or by inheriting the parent's policy during fork, the task's policy is + * rejected and forcefully reverted to %SCHED_NORMAL. The number of + * such events is reported through /sys/kernel/sched_ext::nr_rejected. + */ + bool disallow; /* reject switching into SCX */ + /* cold fields */ /* must be the last field, see init_scx_entity() */ struct list_head tasks_node; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 3dc515b3351f..8ff30b80e862 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -483,6 +483,8 @@ struct static_key_false scx_has_op[SCX_OPI_END] = static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE); static struct scx_exit_info *scx_exit_info; +static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); + /* * The maximum amount of time in jiffies that a task may be runnable without * being scheduled on a CPU. If this timeout is exceeded, it will trigger @@ -2332,6 +2334,8 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool { int ret; + p->scx.disallow = false; + if (SCX_HAS_OP(init_task)) { struct scx_init_task_args args = { .fork = fork, @@ -2346,6 +2350,27 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool scx_set_task_state(p, SCX_TASK_INIT); + if (p->scx.disallow) { + struct rq *rq; + struct rq_flags rf; + + rq = task_rq_lock(p, &rf); + + /* + * We're either in fork or load path and @p->policy will be + * applied right after. Reverting @p->policy here and rejecting + * %SCHED_EXT transitions from scx_check_setscheduler() + * guarantees that if ops.init_task() sets @p->disallow, @p can + * never be in SCX.
+ */ + if (p->policy == SCHED_EXT) { + p->policy = SCHED_NORMAL; + atomic_long_inc(&scx_nr_rejected); + } + + task_rq_unlock(rq, p, &rf); + } + p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; return 0; } @@ -2549,6 +2574,18 @@ static void switched_from_scx(struct rq *rq, struct task_struct *p) static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {} static void switched_to_scx(struct rq *rq, struct task_struct *p) {} +int scx_check_setscheduler(struct task_struct *p, int policy) +{ + lockdep_assert_rq_held(task_rq(p)); + + /* if disallow, reject transitioning into SCX */ + if (scx_enabled() && READ_ONCE(p->scx.disallow) && + p->policy != policy && policy == SCHED_EXT) + return -EACCES; + + return 0; +} + /* * Omitted operations: * @@ -2703,9 +2740,17 @@ static ssize_t scx_attr_switch_all_show(struct kobject *kobj, } SCX_ATTR(switch_all); +static ssize_t scx_attr_nr_rejected_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_nr_rejected)); +} +SCX_ATTR(nr_rejected); + static struct attribute *scx_global_attrs[] = { &scx_attr_state.attr, &scx_attr_switch_all.attr, + &scx_attr_nr_rejected.attr, NULL, }; @@ -3178,6 +3223,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) atomic_set(&scx_exit_kind, SCX_EXIT_NONE); scx_warned_zero_slice = false; + atomic_long_set(&scx_nr_rejected, 0); + /* * Keep CPUs stable during enable so that the BPF scheduler can track * online CPUs by watching ->on/offline_cpu() after ->init(). @@ -3476,6 +3523,9 @@ static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, if (off >= offsetof(struct task_struct, scx.slice) && off + size <= offsetofend(struct task_struct, scx.slice)) return SCALAR_VALUE; + if (off >= offsetof(struct task_struct, scx.disallow) && + off + size <= offsetofend(struct task_struct, scx.disallow)) + return SCALAR_VALUE; } return -EACCES; diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 56fcdb0b2c05..33a9f7fe5832 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -35,6 +35,7 @@ void scx_pre_fork(struct task_struct *p); int scx_fork(struct task_struct *p); void scx_post_fork(struct task_struct *p); void scx_cancel_fork(struct task_struct *p); +int scx_check_setscheduler(struct task_struct *p, int policy); bool task_should_scx(struct task_struct *p); void init_sched_ext_class(void); @@ -72,6 +73,7 @@ static inline void scx_pre_fork(struct task_struct *p) {} static inline int scx_fork(struct task_struct *p) { return 0; } static inline void scx_post_fork(struct task_struct *p) {} static inline void scx_cancel_fork(struct task_struct *p) {} +static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; } static inline bool task_on_scx(const struct task_struct *p) { return false; } static inline void init_sched_ext_class(void) {} diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index 18d44d180db1..4fa59c9f69ac 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -714,6 +714,10 @@ recheck: goto unlock; } + retval = scx_check_setscheduler(p, policy); + if (retval) + goto unlock; + /* * If not changing anything there's no need to proceed further, * but store a possible modification of reset_on_fork. 
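From userspace, a rejected transition surfaces as a plain sched_setscheduler(2) failure, matching the "Permission denied" in the example above. A minimal sketch (policy 7 is what the example shows for SCHED_EXT; it is not exposed as a UAPI constant at this point):

	#include <errno.h>
	#include <sched.h>
	#include <stdio.h>
	#include <sys/types.h>

	static int try_switch_to_scx(pid_t pid)
	{
		struct sched_param param = { .sched_priority = 0 };

		/* policy 7 == SCHED_EXT per the demo above */
		if (sched_setscheduler(pid, 7, &param) < 0) {
			if (errno == EACCES)
				fprintf(stderr, "%d: disallowed from SCHED_EXT\n", pid);
			return -1;
		}
		return 0;
	}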
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 8beae08dfdc7..5ff217c4bfa0 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -32,6 +32,7 @@ const volatile u64 slice_ns = SCX_SLICE_DFL; const volatile u32 stall_user_nth; const volatile u32 stall_kernel_nth; const volatile u32 dsp_batch; +const volatile s32 disallow_tgid; u32 test_error_cnt; @@ -243,6 +244,9 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p, struct scx_init_task_args *args) { + if (p->tgid == disallow_tgid) + p->scx.disallow = true; + /* * @p is new. Let's ensure that its task_ctx is available. We can sleep * in this function and the following will automatically use GFP_KERNEL. diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index 6e9e9726cd62..a2614994cfaa 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -19,13 +19,15 @@ const char help_fmt[] = "\n" "See the top-level comment in .bpf.c for more details.\n" "\n" -"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-b COUNT] [-p] [-v]\n" +"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-b COUNT]\n" +" [-d PID] [-p] [-v]\n" "\n" " -s SLICE_US Override slice duration\n" " -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" " -t COUNT Stall every COUNT'th user thread\n" " -T COUNT Stall every COUNT'th kernel thread\n" " -b COUNT Dispatch upto COUNT tasks together\n" +" -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" " -p Switch only tasks on SCHED_EXT policy intead of all\n" " -v Print libbpf debug messages\n" " -h Display this help and exit\n"; @@ -57,7 +59,7 @@ int main(int argc, char **argv) skel = SCX_OPS_OPEN(qmap_ops, scx_qmap); - while ((opt = getopt(argc, argv, "s:e:t:T:b:pvh")) != -1) { + while ((opt = getopt(argc, argv, "s:e:t:T:b:d:pvh")) != -1) { switch (opt) { case 's': skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; @@ -74,6 +76,11 @@ int main(int argc, char **argv) case 'b': skel->rodata->dsp_batch = strtoul(optarg, NULL, 0); break; + case 'd': + skel->rodata->disallow_tgid = strtol(optarg, NULL, 0); + if (skel->rodata->disallow_tgid < 0) + skel->rodata->disallow_tgid = getpid(); + break; case 'p': skel->struct_ops.qmap_ops->flags |= SCX_OPS_SWITCH_PARTIAL; break; -- cgit v1.2.3 From 07814a9439a3b03d79a1001614b5bc1cab69bcec Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jun 2024 10:09:18 -1000 Subject: sched_ext: Print debug dump after an error exit If a BPF scheduler triggers an error, the scheduler is aborted and the system is reverted to the built-in scheduler. In the process, a lot of information which may be useful for figuring out what happened can be lost. This patch adds debug dump which captures information which may be useful for debugging including runqueue and runnable thread states at the time of failure. 
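The BPF scheduler can contribute its own state to the dump through the new ops.dump*() callbacks and the scx_bpf_dump() kfunc introduced below. As a hypothetical sketch (the scx_qmap changes in this patch add real implementations):

	void BPF_STRUCT_OPS(sketch_dump, struct scx_dump_ctx *dctx)
	{
		/* emit one scheduler-defined line into the dump body */
		scx_bpf_dump("SKETCH: exit kind=%d at %llu ns",
			     dctx->kind, dctx->at_ns);
	}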
The following shows a debug dump after triggering the watchdog: root@test ~# os/work/tools/sched_ext/build/bin/scx_qmap -t 100 stats : enq=1 dsp=0 delta=1 deq=0 stats : enq=90 dsp=90 delta=0 deq=0 stats : enq=156 dsp=156 delta=0 deq=0 stats : enq=218 dsp=218 delta=0 deq=0 stats : enq=255 dsp=255 delta=0 deq=0 stats : enq=271 dsp=271 delta=0 deq=0 stats : enq=284 dsp=284 delta=0 deq=0 stats : enq=293 dsp=293 delta=0 deq=0 DEBUG DUMP ================================================================================ kworker/u32:12[320] triggered exit kind 1026: runnable task stall (stress[1530] failed to run for 6.841s) Backtrace: scx_watchdog_workfn+0x136/0x1c0 process_scheduled_works+0x2b5/0x600 worker_thread+0x269/0x360 kthread+0xeb/0x110 ret_from_fork+0x36/0x40 ret_from_fork_asm+0x1a/0x30 QMAP FIFO[0]: QMAP FIFO[1]: QMAP FIFO[2]: 1436 QMAP FIFO[3]: QMAP FIFO[4]: CPU states ---------- CPU 0 : nr_run=1 ops_qseq=244 curr=swapper/0[0] class=idle_sched_class QMAP: dsp_idx=1 dsp_cnt=0 R stress[1530] -6841ms scx_state/flags=3/0x1 ops_state/qseq=2/20 sticky/holding_cpu=-1/-1 dsq_id=(n/a) cpus=ff QMAP: force_local=0 asm_sysvec_apic_timer_interrupt+0x16/0x20 CPU 2 : nr_run=2 ops_qseq=142 curr=swapper/2[0] class=idle_sched_class QMAP: dsp_idx=1 dsp_cnt=0 R sshd[1703] -5905ms scx_state/flags=3/0x9 ops_state/qseq=2/88 sticky/holding_cpu=-1/-1 dsq_id=(n/a) cpus=ff QMAP: force_local=1 __x64_sys_ppoll+0xf6/0x120 do_syscall_64+0x7b/0x150 entry_SYSCALL_64_after_hwframe+0x76/0x7e R fish[1539] -4141ms scx_state/flags=3/0x9 ops_state/qseq=2/124 sticky/holding_cpu=-1/-1 dsq_id=(n/a) cpus=ff QMAP: force_local=1 futex_wait+0x60/0xe0 do_futex+0x109/0x180 __x64_sys_futex+0x117/0x190 do_syscall_64+0x7b/0x150 entry_SYSCALL_64_after_hwframe+0x76/0x7e CPU 3 : nr_run=2 ops_qseq=162 curr=kworker/u32:12[320] class=ext_sched_class QMAP: dsp_idx=1 dsp_cnt=0 *R kworker/u32:12[320] +0ms scx_state/flags=3/0xd ops_state/qseq=0/0 sticky/holding_cpu=-1/-1 dsq_id=(n/a) cpus=ff QMAP: force_local=0 scx_dump_state+0x613/0x6f0 scx_ops_error_irq_workfn+0x1f/0x40 irq_work_run_list+0x82/0xd0 irq_work_run+0x14/0x30 __sysvec_irq_work+0x40/0x140 sysvec_irq_work+0x60/0x70 asm_sysvec_irq_work+0x16/0x20 scx_watchdog_workfn+0x15f/0x1c0 process_scheduled_works+0x2b5/0x600 worker_thread+0x269/0x360 kthread+0xeb/0x110 ret_from_fork+0x36/0x40 ret_from_fork_asm+0x1a/0x30 R kworker/3:2[1436] +0ms scx_state/flags=3/0x9 ops_state/qseq=2/160 sticky/holding_cpu=-1/-1 dsq_id=(n/a) cpus=08 QMAP: force_local=0 kthread+0xeb/0x110 ret_from_fork+0x36/0x40 ret_from_fork_asm+0x1a/0x30 CPU 7 : nr_run=0 ops_qseq=76 curr=swapper/7[0] class=idle_sched_class ================================================================================ EXIT: runnable task stall (stress[1530] failed to run for 6.841s) It shows that CPU 3 was running the watchdog when it triggered the error condition and the scx_qmap thread has been queued on CPU 0 for over 5 seconds but failed to run. It also prints out scx_qmap specific information - e.g. which tasks are queued on each FIFO and so on using the dump_*() ops. This dump has proved pretty useful for developing and debugging BPF schedulers. Debug dump is generated automatically when the BPF scheduler exits due to an error. The debug buffer used in such cases is determined by sched_ext_ops.exit_dump_len and defaults to 32k. If the debug dump overruns the available buffer, the output is truncated and marked accordingly. Debug dump output can also be read through the sched_ext_dump tracepoint. 
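For example, the dump lines can be followed live through tracefs while a scheduler is loaded:

  # echo 1 > /sys/kernel/tracing/events/sched_ext/sched_ext_dump/enable
  # cat /sys/kernel/tracing/trace_pipe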
When read through the tracepoint, there is no length limit. SysRq-D can be used to trigger debug dump at any time while a BPF scheduler is loaded. This is non-destructive - the scheduler keeps running afterwards. The output can be read through the sched_ext_dump tracepoint. v2: - The size of exit debug dump buffer can now be customized using sched_ext_ops.exit_dump_len. - sched_ext_ops.dump*() added to enable dumping of BPF scheduler specific information. - Tracepoint output and SysRq-D triggering added. Signed-off-by: Tejun Heo Reviewed-by: David Vernet --- include/trace/events/sched_ext.h | 32 ++ kernel/sched/ext.c | 421 ++++++++++++++++++++++++++- tools/sched_ext/include/scx/common.bpf.h | 12 + tools/sched_ext/include/scx/compat.h | 9 +- tools/sched_ext/include/scx/user_exit_info.h | 19 ++ tools/sched_ext/scx_qmap.bpf.c | 54 ++++ tools/sched_ext/scx_qmap.c | 14 +- tools/sched_ext/scx_simple.c | 2 +- 8 files changed, 555 insertions(+), 8 deletions(-) create mode 100644 include/trace/events/sched_ext.h (limited to 'tools') diff --git a/include/trace/events/sched_ext.h b/include/trace/events/sched_ext.h new file mode 100644 index 000000000000..fe19da7315a9 --- /dev/null +++ b/include/trace/events/sched_ext.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM sched_ext + +#if !defined(_TRACE_SCHED_EXT_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_SCHED_EXT_H + +#include <linux/tracepoint.h> + +TRACE_EVENT(sched_ext_dump, + + TP_PROTO(const char *line), + + TP_ARGS(line), + + TP_STRUCT__entry( + __string(line, line) + ), + + TP_fast_assign( + __assign_str(line); + ), + + TP_printk("%s", + __get_str(line) + ) +); + +#endif /* _TRACE_SCHED_EXT_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 6f4de29d7372..66bb9cf075f0 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -12,6 +12,7 @@ enum scx_consts { SCX_EXIT_BT_LEN = 64, SCX_EXIT_MSG_LEN = 1024, + SCX_EXIT_DUMP_DFL_LEN = 32768, }; enum scx_exit_kind { @@ -48,6 +49,9 @@ struct scx_exit_info { /* informational message */ char *msg; + + /* debug dump */ + char *dump; }; /* sched_ext_ops.flags */ @@ -105,6 +109,17 @@ struct scx_exit_task_args { bool cancelled; }; +/* + * Informational context provided to dump operations. + */ +struct scx_dump_ctx { + enum scx_exit_kind kind; + s64 exit_code; + const char *reason; + u64 at_ns; + u64 at_jiffies; +}; + /** * struct sched_ext_ops - Operation table for BPF scheduler implementation * @@ -296,6 +311,36 @@ struct sched_ext_ops { */ void (*disable)(struct task_struct *p); + /** + * dump - Dump BPF scheduler state on error + * @ctx: debug dump context + * + * Use scx_bpf_dump() to generate BPF scheduler specific debug dump. + */ + void (*dump)(struct scx_dump_ctx *ctx); + + /** + * dump_cpu - Dump BPF scheduler state for a CPU on error + * @ctx: debug dump context + * @cpu: CPU to generate debug dump for + * @idle: @cpu is currently idle without any runnable tasks + * + * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for + * @cpu. If @idle is %true and this operation doesn't produce any + * output, @cpu is skipped for dump. + */ + void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle); + + /** + * dump_task - Dump BPF scheduler state for a runnable task on error + * @ctx: debug dump context + * @p: runnable task to generate debug dump for + * + * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for + * @p.
+ */ + void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p); + /* * All online ops must come before ops.init(). */ @@ -330,6 +375,12 @@ struct sched_ext_ops { */ u32 timeout_ms; + /** + * exit_dump_len - scx_exit_info.dump buffer length. If 0, the default + * value of 32768 is used. + */ + u32 exit_dump_len; + /** * name - BPF scheduler's name * @@ -567,10 +618,27 @@ struct scx_bstr_buf { static DEFINE_RAW_SPINLOCK(scx_exit_bstr_buf_lock); static struct scx_bstr_buf scx_exit_bstr_buf; +/* ops debug dump */ +struct scx_dump_data { + s32 cpu; + bool first; + s32 cursor; + struct seq_buf *s; + const char *prefix; + struct scx_bstr_buf buf; +}; + +struct scx_dump_data scx_dump_data = { + .cpu = -1, +}; + /* /sys/kernel/sched_ext interface */ static struct kset *scx_kset; static struct kobject *scx_root_kobj; +#define CREATE_TRACE_POINTS +#include + static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind, s64 exit_code, const char *fmt, ...); @@ -2897,12 +2965,13 @@ static void scx_ops_bypass(bool bypass) static void free_exit_info(struct scx_exit_info *ei) { + kfree(ei->dump); kfree(ei->msg); kfree(ei->bt); kfree(ei); } -static struct scx_exit_info *alloc_exit_info(void) +static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len) { struct scx_exit_info *ei; @@ -2912,8 +2981,9 @@ static struct scx_exit_info *alloc_exit_info(void) ei->bt = kcalloc(sizeof(ei->bt[0]), SCX_EXIT_BT_LEN, GFP_KERNEL); ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL); + ei->dump = kzalloc(exit_dump_len, GFP_KERNEL); - if (!ei->bt || !ei->msg) { + if (!ei->bt || !ei->msg || !ei->dump) { free_exit_info(ei); return NULL; } @@ -3125,8 +3195,274 @@ static void scx_ops_disable(enum scx_exit_kind kind) schedule_scx_ops_disable_work(); } +static void dump_newline(struct seq_buf *s) +{ + trace_sched_ext_dump(""); + + /* @s may be zero sized and seq_buf triggers WARN if so */ + if (s->size) + seq_buf_putc(s, '\n'); +} + +static __printf(2, 3) void dump_line(struct seq_buf *s, const char *fmt, ...) +{ + va_list args; + +#ifdef CONFIG_TRACEPOINTS + if (trace_sched_ext_dump_enabled()) { + /* protected by scx_dump_state()::dump_lock */ + static char line_buf[SCX_EXIT_MSG_LEN]; + + va_start(args, fmt); + vscnprintf(line_buf, sizeof(line_buf), fmt, args); + va_end(args); + + trace_sched_ext_dump(line_buf); + } +#endif + /* @s may be zero sized and seq_buf triggers WARN if so */ + if (s->size) { + va_start(args, fmt); + seq_buf_vprintf(s, fmt, args); + va_end(args); + + seq_buf_putc(s, '\n'); + } +} + +static void dump_stack_trace(struct seq_buf *s, const char *prefix, + const unsigned long *bt, unsigned int len) +{ + unsigned int i; + + for (i = 0; i < len; i++) + dump_line(s, "%s%pS", prefix, (void *)bt[i]); +} + +static void ops_dump_init(struct seq_buf *s, const char *prefix) +{ + struct scx_dump_data *dd = &scx_dump_data; + + lockdep_assert_irqs_disabled(); + + dd->cpu = smp_processor_id(); /* allow scx_bpf_dump() */ + dd->first = true; + dd->cursor = 0; + dd->s = s; + dd->prefix = prefix; +} + +static void ops_dump_flush(void) +{ + struct scx_dump_data *dd = &scx_dump_data; + char *line = dd->buf.line; + + if (!dd->cursor) + return; + + /* + * There's something to flush and this is the first line. Insert a blank + * line to distinguish ops dump. + */ + if (dd->first) { + dump_newline(dd->s); + dd->first = false; + } + + /* + * There may be multiple lines in $line. Scan and emit each line + * separately. 
+ */ + while (true) { + char *end = line; + char c; + + while (*end != '\n' && *end != '\0') + end++; + + /* + * If $line overflowed, it may not have newline at the end. + * Always emit with a newline. + */ + c = *end; + *end = '\0'; + dump_line(dd->s, "%s%s", dd->prefix, line); + if (c == '\0') + break; + + /* move to the next line */ + end++; + if (*end == '\0') + break; + line = end; + } + + dd->cursor = 0; +} + +static void ops_dump_exit(void) +{ + ops_dump_flush(); + scx_dump_data.cpu = -1; +} + +static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, + struct task_struct *p, char marker) +{ + static unsigned long bt[SCX_EXIT_BT_LEN]; + char dsq_id_buf[19] = "(n/a)"; + unsigned long ops_state = atomic_long_read(&p->scx.ops_state); + unsigned int bt_len; + + if (p->scx.dsq) + scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx", + (unsigned long long)p->scx.dsq->id); + + dump_newline(s); + dump_line(s, " %c%c %s[%d] %+ldms", + marker, task_state_to_char(p), p->comm, p->pid, + jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies)); + dump_line(s, " scx_state/flags=%u/0x%x ops_state/qseq=%lu/%lu", + scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK, + ops_state & SCX_OPSS_STATE_MASK, + ops_state >> SCX_OPSS_QSEQ_SHIFT); + dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s", + p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf); + dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr)); + + if (SCX_HAS_OP(dump_task)) { + ops_dump_init(s, " "); + SCX_CALL_OP(SCX_KF_REST, dump_task, dctx, p); + ops_dump_exit(); + } + + bt_len = stack_trace_save_tsk(p, bt, SCX_EXIT_BT_LEN, 1); + if (bt_len) { + dump_newline(s); + dump_stack_trace(s, " ", bt, bt_len); + } +} + +static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) +{ + static DEFINE_SPINLOCK(dump_lock); + static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n"; + struct scx_dump_ctx dctx = { + .kind = ei->kind, + .exit_code = ei->exit_code, + .reason = ei->reason, + .at_ns = ktime_get_ns(), + .at_jiffies = jiffies, + }; + struct seq_buf s; + unsigned long flags; + char *buf; + int cpu; + + spin_lock_irqsave(&dump_lock, flags); + + seq_buf_init(&s, ei->dump, dump_len); + + if (ei->kind == SCX_EXIT_NONE) { + dump_line(&s, "Debug dump triggered by %s", ei->reason); + } else { + dump_line(&s, "%s[%d] triggered exit kind %d:", + current->comm, current->pid, ei->kind); + dump_line(&s, " %s (%s)", ei->reason, ei->msg); + dump_newline(&s); + dump_line(&s, "Backtrace:"); + dump_stack_trace(&s, " ", ei->bt, ei->bt_len); + } + + if (SCX_HAS_OP(dump)) { + ops_dump_init(&s, ""); + SCX_CALL_OP(SCX_KF_UNLOCKED, dump, &dctx); + ops_dump_exit(); + } + + dump_newline(&s); + dump_line(&s, "CPU states"); + dump_line(&s, "----------"); + + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + struct task_struct *p; + struct seq_buf ns; + size_t avail, used; + bool idle; + + rq_lock(rq, &rf); + + idle = list_empty(&rq->scx.runnable_list) && + rq->curr->sched_class == &idle_sched_class; + + if (idle && !SCX_HAS_OP(dump_cpu)) + goto next; + + /* + * We don't yet know whether ops.dump_cpu() will produce output + * and we may want to skip the default CPU dump if it doesn't. + * Use a nested seq_buf to generate the standard dump so that we + * can decide whether to commit later. 
+ */ + avail = seq_buf_get_buf(&s, &buf); + seq_buf_init(&ns, buf, avail); + + dump_newline(&ns); + dump_line(&ns, "CPU %-4d: nr_run=%u ops_qseq=%lu", + cpu, rq->scx.nr_running, rq->scx.ops_qseq); + dump_line(&ns, " curr=%s[%d] class=%ps", + rq->curr->comm, rq->curr->pid, + rq->curr->sched_class); + + used = seq_buf_used(&ns); + if (SCX_HAS_OP(dump_cpu)) { + ops_dump_init(&ns, " "); + SCX_CALL_OP(SCX_KF_REST, dump_cpu, &dctx, cpu, idle); + ops_dump_exit(); + } + + /* + * If idle && nothing generated by ops.dump_cpu(), there's + * nothing interesting. Skip. + */ + if (idle && used == seq_buf_used(&ns)) + goto next; + + /* + * $s may already have overflowed when $ns was created. If so, + * calling commit on it will trigger BUG. + */ + if (avail) { + seq_buf_commit(&s, seq_buf_used(&ns)); + if (seq_buf_has_overflowed(&ns)) + seq_buf_set_overflow(&s); + } + + if (rq->curr->sched_class == &ext_sched_class) + scx_dump_task(&s, &dctx, rq->curr, '*'); + + list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) + scx_dump_task(&s, &dctx, p, ' '); + next: + rq_unlock(rq, &rf); + } + + if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker)) + memcpy(ei->dump + dump_len - sizeof(trunc_marker), + trunc_marker, sizeof(trunc_marker)); + + spin_unlock_irqrestore(&dump_lock, flags); +} + static void scx_ops_error_irq_workfn(struct irq_work *irq_work) { + struct scx_exit_info *ei = scx_exit_info; + + if (ei->kind >= SCX_EXIT_ERROR) + scx_dump_state(ei, scx_ops.exit_dump_len); + schedule_scx_ops_disable_work(); } @@ -3152,6 +3488,13 @@ static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind, vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args); va_end(args); + /* + * Set ei->kind and ->reason for scx_dump_state(). They'll be set again + * in scx_ops_disable_workfn(). 
+ */ + ei->kind = kind; + ei->reason = scx_exit_reason(ei->kind); + irq_work_queue(&scx_ops_error_irq_work); } @@ -3213,7 +3556,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (ret < 0) goto err; - scx_exit_info = alloc_exit_info(); + scx_exit_info = alloc_exit_info(ops->exit_dump_len); if (!scx_exit_info) { ret = -ENOMEM; goto err_del; @@ -3592,6 +3935,10 @@ static int bpf_scx_init_member(const struct btf_type *t, return -E2BIG; ops->timeout_ms = *(u32 *)(udata + moff); return 1; + case offsetof(struct sched_ext_ops, exit_dump_len): + ops->exit_dump_len = + *(u32 *)(udata + moff) ?: SCX_EXIT_DUMP_DFL_LEN; + return 1; } return 0; @@ -3723,6 +4070,21 @@ static const struct sysrq_key_op sysrq_sched_ext_reset_op = { .enable_mask = SYSRQ_ENABLE_RTNICE, }; +static void sysrq_handle_sched_ext_dump(u8 key) +{ + struct scx_exit_info ei = { .kind = SCX_EXIT_NONE, .reason = "SysRq-D" }; + + if (scx_enabled()) + scx_dump_state(&ei, 0); +} + +static const struct sysrq_key_op sysrq_sched_ext_dump_op = { + .handler = sysrq_handle_sched_ext_dump, + .help_msg = "dump-sched-ext(D)", + .action_msg = "Trigger sched_ext debug dump", + .enable_mask = SYSRQ_ENABLE_RTNICE, +}; + /** * print_scx_info - print out sched_ext scheduler state * @log_lvl: the log level to use when printing @@ -3793,6 +4155,7 @@ void __init init_sched_ext_class(void) } register_sysrq_key('S', &sysrq_sched_ext_reset_op); + register_sysrq_key('D', &sysrq_sched_ext_dump_op); INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn); } @@ -4218,6 +4581,57 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); } +/** + * scx_bpf_dump - Generate extra debug dump specific to the BPF scheduler + * @fmt: format string + * @data: format string parameters packaged using ___bpf_fill() macro + * @data__sz: @data len, must end in '__sz' for the verifier + * + * To be called through scx_bpf_dump() helper from ops.dump(), dump_cpu() and + * dump_task() to generate extra debug dump specific to the BPF scheduler. + * + * The extra dump may be multiple lines. A single line may be split over + * multiple calls. The last line is automatically terminated. + */ +__bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, + u32 data__sz) +{ + struct scx_dump_data *dd = &scx_dump_data; + struct scx_bstr_buf *buf = &dd->buf; + s32 ret; + + if (raw_smp_processor_id() != dd->cpu) { + scx_ops_error("scx_bpf_dump() must only be called from ops.dump() and friends"); + return; + } + + /* append the formatted string to the line buf */ + ret = __bstr_format(buf->data, buf->line + dd->cursor, + sizeof(buf->line) - dd->cursor, fmt, data, data__sz); + if (ret < 0) { + dump_line(dd->s, "%s[!] (\"%s\", %p, %u) failed to format (%d)", + dd->prefix, fmt, data, data__sz, ret); + return; + } + + dd->cursor += ret; + dd->cursor = min_t(s32, dd->cursor, sizeof(buf->line)); + + if (!dd->cursor) + return; + + /* + * If the line buf overflowed or ends in a newline, flush it into the + * dump. This is to allow the caller to generate a single line over + * multiple calls. As ops_dump_flush() can also handle multiple lines in + * the line buf, the only case which can lead to an unexpected + * truncation is when the caller keeps generating newlines in the middle + * instead of the end consecutively. Don't do that. 
+ */ + if (dd->cursor >= sizeof(buf->line) || buf->line[dd->cursor - 1] == '\n') + ops_dump_flush(); +} + /** * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs * @@ -4426,6 +4840,7 @@ BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids) BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE) diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 833fe1bdccf9..3ea5cdf58bc7 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -38,6 +38,7 @@ s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) __ksym __weak; void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym; +void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym __weak; u32 scx_bpf_nr_cpu_ids(void) __ksym __weak; const struct cpumask *scx_bpf_get_possible_cpumask(void) __ksym __weak; const struct cpumask *scx_bpf_get_online_cpumask(void) __ksym __weak; @@ -97,6 +98,17 @@ void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} ___scx_bpf_bstr_format_checker(fmt, ##args); \ }) +/* + * scx_bpf_dump() wraps the scx_bpf_dump_bstr() kfunc with variadic arguments + * instead of an array of u64. To be used from ops.dump() and friends. + */ +#define scx_bpf_dump(fmt, args...) \ +({ \ + scx_bpf_bstr_preamble(fmt, args) \ + scx_bpf_dump_bstr(___fmt, ___param, sizeof(___param)); \ + ___scx_bpf_bstr_format_checker(fmt, ##args); \ +}) + #define BPF_STRUCT_OPS(name, args...) \ SEC("struct_ops/"#name) \ BPF_PROG(name, ##args) diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h index a7fdaf8a858e..c58024c980c8 100644 --- a/tools/sched_ext/include/scx/compat.h +++ b/tools/sched_ext/include/scx/compat.h @@ -111,16 +111,23 @@ static inline bool __COMPAT_struct_has_field(const char *type, const char *field * is used to define ops and compat.h::SCX_OPS_LOAD/ATTACH() are used to load * and attach it, backward compatibility is automatically maintained where * reasonable. + * + * ec7e3b0463e1 ("implement-ops") in https://github.com/sched-ext/sched_ext is + * the current minimum required kernel version. 
*/ #define SCX_OPS_OPEN(__ops_name, __scx_name) ({ \ struct __scx_name *__skel; \ \ + SCX_BUG_ON(!__COMPAT_struct_has_field("sched_ext_ops", "dump"), \ + "sched_ext_ops.dump() missing, kernel too old?"); \ + \ __skel = __scx_name##__open(); \ SCX_BUG_ON(!__skel, "Could not open " #__scx_name); \ __skel; \ }) -#define SCX_OPS_LOAD(__skel, __ops_name, __scx_name) ({ \ +#define SCX_OPS_LOAD(__skel, __ops_name, __scx_name, __uei_name) ({ \ + UEI_SET_SIZE(__skel, __ops_name, __uei_name); \ SCX_BUG_ON(__scx_name##__load((__skel)), "Failed to load skel"); \ }) diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h index 8c3b7fac4d05..c2ef85c645e1 100644 --- a/tools/sched_ext/include/scx/user_exit_info.h +++ b/tools/sched_ext/include/scx/user_exit_info.h @@ -13,6 +13,7 @@ enum uei_sizes { UEI_REASON_LEN = 128, UEI_MSG_LEN = 1024, + UEI_DUMP_DFL_LEN = 32768, }; struct user_exit_info { @@ -28,6 +29,8 @@ struct user_exit_info { #include #define UEI_DEFINE(__name) \ + char RESIZABLE_ARRAY(data, __name##_dump); \ + const volatile u32 __name##_dump_len; \ struct user_exit_info __name SEC(".data") #define UEI_RECORD(__uei_name, __ei) ({ \ @@ -35,6 +38,8 @@ struct user_exit_info { sizeof(__uei_name.reason), (__ei)->reason); \ bpf_probe_read_kernel_str(__uei_name.msg, \ sizeof(__uei_name.msg), (__ei)->msg); \ + bpf_probe_read_kernel_str(__uei_name##_dump, \ + __uei_name##_dump_len, (__ei)->dump); \ if (bpf_core_field_exists((__ei)->exit_code)) \ __uei_name.exit_code = (__ei)->exit_code; \ /* use __sync to force memory barrier */ \ @@ -47,6 +52,13 @@ struct user_exit_info { #include #include +/* no need to call the following explicitly if SCX_OPS_LOAD() is used */ +#define UEI_SET_SIZE(__skel, __ops_name, __uei_name) ({ \ + u32 __len = (__skel)->struct_ops.__ops_name->exit_dump_len ?: UEI_DUMP_DFL_LEN; \ + (__skel)->rodata->__uei_name##_dump_len = __len; \ + RESIZE_ARRAY((__skel), data, __uei_name##_dump, __len); \ +}) + #define UEI_EXITED(__skel, __uei_name) ({ \ /* use __sync to force memory barrier */ \ __sync_val_compare_and_swap(&(__skel)->data->__uei_name.kind, -1, -1); \ @@ -54,6 +66,13 @@ struct user_exit_info { #define UEI_REPORT(__skel, __uei_name) ({ \ struct user_exit_info *__uei = &(__skel)->data->__uei_name; \ + char *__uei_dump = (__skel)->data_##__uei_name##_dump->__uei_name##_dump; \ + if (__uei_dump[0] != '\0') { \ + fputs("\nDEBUG DUMP\n", stderr); \ + fputs("================================================================================\n\n", stderr); \ + fputs(__uei_dump, stderr); \ + fputs("\n================================================================================\n\n", stderr); \ + } \ fprintf(stderr, "EXIT: %s", __uei->reason); \ if (__uei->msg[0] != '\0') \ fprintf(stderr, " (%s)", __uei->msg); \ diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 5ff217c4bfa0..5b3da28bf042 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -33,6 +33,7 @@ const volatile u32 stall_user_nth; const volatile u32 stall_kernel_nth; const volatile u32 dsp_batch; const volatile s32 disallow_tgid; +const volatile bool suppress_dump; u32 test_error_cnt; @@ -258,6 +259,56 @@ s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p, return -ENOMEM; } +void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx) +{ + s32 i, pid; + + if (suppress_dump) + return; + + bpf_for(i, 0, 5) { + void *fifo; + + if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i))) + return; + + scx_bpf_dump("QMAP 
FIFO[%d]:", i); + bpf_repeat(4096) { + if (bpf_map_pop_elem(fifo, &pid)) + break; + scx_bpf_dump(" %d", pid); + } + scx_bpf_dump("\n"); + } +} + +void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle) +{ + u32 zero = 0; + struct cpu_ctx *cpuc; + + if (suppress_dump || idle) + return; + if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu))) + return; + + scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu", + cpuc->dsp_idx, cpuc->dsp_cnt); +} + +void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p) +{ + struct task_ctx *taskc; + + if (suppress_dump) + return; + if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) + return; + + scx_bpf_dump("QMAP: force_local=%d", + taskc->force_local); +} + s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init) { return scx_bpf_create_dsq(SHARED_DSQ, -1); @@ -274,6 +325,9 @@ SCX_OPS_DEFINE(qmap_ops, .dequeue = (void *)qmap_dequeue, .dispatch = (void *)qmap_dispatch, .init_task = (void *)qmap_init_task, + .dump = (void *)qmap_dump, + .dump_cpu = (void *)qmap_dump_cpu, + .dump_task = (void *)qmap_dump_task, .init = (void *)qmap_init, .exit = (void *)qmap_exit, .timeout_ms = 5000U, diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index a2614994cfaa..a1123a17581b 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -20,7 +20,7 @@ const char help_fmt[] = "See the top-level comment in .bpf.c for more details.\n" "\n" "Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-b COUNT]\n" -" [-d PID] [-p] [-v]\n" +" [-d PID] [-D LEN] [-p] [-v]\n" "\n" " -s SLICE_US Override slice duration\n" " -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" @@ -28,6 +28,8 @@ const char help_fmt[] = " -T COUNT Stall every COUNT'th kernel thread\n" " -b COUNT Dispatch upto COUNT tasks together\n" " -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" +" -D LEN Set scx_exit_info.dump buffer length\n" +" -S Suppress qmap-specific debug dump\n" " -p Switch only tasks on SCHED_EXT policy intead of all\n" " -v Print libbpf debug messages\n" " -h Display this help and exit\n"; @@ -59,7 +61,7 @@ int main(int argc, char **argv) skel = SCX_OPS_OPEN(qmap_ops, scx_qmap); - while ((opt = getopt(argc, argv, "s:e:t:T:b:d:pvh")) != -1) { + while ((opt = getopt(argc, argv, "s:e:t:T:b:d:D:Spvh")) != -1) { switch (opt) { case 's': skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; @@ -81,6 +83,12 @@ int main(int argc, char **argv) if (skel->rodata->disallow_tgid < 0) skel->rodata->disallow_tgid = getpid(); break; + case 'D': + skel->struct_ops.qmap_ops->exit_dump_len = strtoul(optarg, NULL, 0); + break; + case 'S': + skel->rodata->suppress_dump = true; + break; case 'p': skel->struct_ops.qmap_ops->flags |= SCX_OPS_SWITCH_PARTIAL; break; @@ -93,7 +101,7 @@ int main(int argc, char **argv) } } - SCX_OPS_LOAD(skel, qmap_ops, scx_qmap); + SCX_OPS_LOAD(skel, qmap_ops, scx_qmap, uei); link = SCX_OPS_ATTACH(skel, qmap_ops, scx_qmap); while (!exit_req && !UEI_EXITED(skel, uei)) { diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c index 789ac62fea8e..7f500d1d56ac 100644 --- a/tools/sched_ext/scx_simple.c +++ b/tools/sched_ext/scx_simple.c @@ -80,7 +80,7 @@ int main(int argc, char **argv) } } - SCX_OPS_LOAD(skel, simple_ops, scx_simple); + SCX_OPS_LOAD(skel, simple_ops, scx_simple, uei); link = SCX_OPS_ATTACH(skel, simple_ops, scx_simple); while (!exit_req && !UEI_EXITED(skel, uei)) { -- cgit v1.2.3 From 1c3ae1cb2f2cf245d529fe363ce420ec8028a547 Mon 
Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jun 2024 10:09:18 -1000 Subject: tools/sched_ext: Add scx_show_state.py There are states which are interesting but don't quite fit the interface exposed under /sys/kernel/sched_ext. Add tools/scx_show_state.py to show them. Signed-off-by: Tejun Heo Reviewed-by: David Vernet --- tools/sched_ext/scx_show_state.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 tools/sched_ext/scx_show_state.py (limited to 'tools') diff --git a/tools/sched_ext/scx_show_state.py b/tools/sched_ext/scx_show_state.py new file mode 100644 index 000000000000..d457d2a74e1e --- /dev/null +++ b/tools/sched_ext/scx_show_state.py @@ -0,0 +1,39 @@ +#!/usr/bin/env drgn +# +# Copyright (C) 2024 Tejun Heo +# Copyright (C) 2024 Meta Platforms, Inc. and affiliates. + +desc = """ +This is a drgn script to show the current sched_ext state. +For more info on drgn, visit https://github.com/osandov/drgn. +""" + +import drgn +import sys + +def err(s): + print(s, file=sys.stderr, flush=True) + sys.exit(1) + +def read_int(name): + return int(prog[name].value_()) + +def read_atomic(name): + return prog[name].counter.value_() + +def read_static_key(name): + return prog[name].key.enabled.counter.value_() + +def ops_state_str(state): + return prog['scx_ops_enable_state_str'][state].string_().decode() + +ops = prog['scx_ops'] +enable_state = read_atomic("scx_ops_enable_state_var") + +print(f'ops : {ops.name.string_().decode()}') +print(f'enabled : {read_static_key("__scx_ops_enabled")}') +print(f'switching_all : {read_int("scx_switching_all")}') +print(f'switched_all : {read_static_key("__scx_switched_all")}') +print(f'enable_state : {ops_state_str(enable_state)} ({enable_state})') +print(f'bypass_depth : {read_atomic("scx_ops_bypass_depth")}') +print(f'nr_rejected : {read_atomic("scx_nr_rejected")}') -- cgit v1.2.3 From 81aae789181b5850d77dfdf74d4b85c63f0705e9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jun 2024 10:09:19 -1000 Subject: sched_ext: Implement scx_bpf_kick_cpu() and task preemption support It's often useful to wake up and/or trigger reschedule on other CPUs. This patch adds scx_bpf_kick_cpu() kfunc helper that BPF scheduler can call to kick the target CPU into the scheduling path. As a sched_ext task relinquishes its CPU only after its slice is depleted, this patch also adds SCX_KICK_PREEMPT and SCX_ENQ_PREEMPT which clears the slice of the target CPU's current task to guarantee that sched_ext's scheduling path runs on the CPU. If SCX_KICK_IDLE is specified, the target CPU is kicked iff the CPU is idle to guarantee that the target CPU will go through at least one full sched_ext scheduling cycle after the kicking. This can be used to wake up idle CPUs without incurring unnecessary overhead if it isn't currently idle. As a demonstration of how backward compatibility can be supported using BPF CO-RE, tools/sched_ext/include/scx/compat.bpf.h is added. It provides __COMPAT_scx_bpf_kick_cpu_IDLE() which uses SCX_KICK_IDLE if available or becomes a regular kicking otherwise. This allows schedulers to use the new SCX_KICK_IDLE while maintaining support for older kernels. The plan is to temporarily use compat helpers to ease API updates and drop them after a few kernel releases. v5: - SCX_KICK_IDLE added. Note that this also adds a compat mechanism for schedulers so that they can support kernels without SCX_KICK_IDLE. This is useful as a demonstration of how new feature flags can be added in a backward compatible way. 
- kick_cpus_irq_workfn() reimplemented so that it touches the pending cpumasks only as necessary to reduce kicking overhead on machines with a lot of CPUs. - tools/sched_ext/include/scx/compat.bpf.h added. v4: - Move example scheduler to its own patch. v3: - Make scx_example_central switch all tasks by default. - Convert to BPF inline iterators. v2: - Julia Lawall reported that scx_example_central can overflow the dispatch buffer and malfunction. As scheduling for other CPUs can't be handled by the automatic retry mechanism, fix by implementing an explicit overflow and retry handling. - Updated to use generic BPF cpumask helpers. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- include/linux/sched/ext.h | 4 + kernel/sched/ext.c | 225 +++++++++++++++++++++++++++++-- kernel/sched/sched.h | 10 ++ tools/sched_ext/include/scx/common.bpf.h | 1 + 4 files changed, 227 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 85fb5dc725ef..3b2809b980ac 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -134,6 +134,10 @@ struct sched_ext_entity { * scx_bpf_dispatch() but can also be modified directly by the BPF * scheduler. Automatically decreased by SCX as the task executes. On * depletion, a scheduling event is triggered. + * + * This value is cleared to zero if the task is preempted by + * %SCX_KICK_PREEMPT and shouldn't be used to determine how long the + * task ran. Use p->se.sum_exec_runtime instead. */ u64 slice; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 66bb9cf075f0..213793d086d7 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -412,6 +412,14 @@ enum scx_enq_flags { /* high 32bits are SCX specific */ + /* + * Set the following to trigger preemption when calling + * scx_bpf_dispatch() with a local dsq as the target. The slice of the + * current task is cleared to zero and the CPU is kicked into the + * scheduling path. Implies %SCX_ENQ_HEAD. + */ + SCX_ENQ_PREEMPT = 1LLU << 32, + /* * The task being enqueued is the only task available for the cpu. By * default, ext core keeps executing such tasks but when @@ -441,6 +449,24 @@ enum scx_pick_idle_cpu_flags { SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ }; +enum scx_kick_flags { + /* + * Kick the target CPU if idle. Guarantees that the target CPU goes + * through at least one full scheduling cycle before going idle. If the + * target CPU can be determined to be currently not idle and going to go + * through a scheduling cycle before going idle, noop. + */ + SCX_KICK_IDLE = 1LLU << 0, + + /* + * Preempt the current task and execute the dispatch path. If the + * current task of the target CPU is an SCX task, its ->scx.slice is + * cleared to zero before the scheduling path is invoked so that the + * task expires and the dispatch path is invoked. 
+ */ + SCX_KICK_PREEMPT = 1LLU << 1, +}; + enum scx_ops_enable_state { SCX_OPS_PREPPING, SCX_OPS_ENABLING, @@ -1019,7 +1045,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, } } - if (enq_flags & SCX_ENQ_HEAD) + if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) list_add(&p->scx.dsq_node, &dsq->list); else list_add_tail(&p->scx.dsq_node, &dsq->list); @@ -1045,8 +1071,16 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, if (is_local) { struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); + bool preempt = false; + + if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr && + rq->curr->sched_class == &ext_sched_class) { + rq->curr->scx.slice = 0; + preempt = true; + } - if (sched_class_above(&ext_sched_class, rq->curr->sched_class)) + if (preempt || sched_class_above(&ext_sched_class, + rq->curr->sched_class)) resched_curr(rq); } else { raw_spin_unlock(&dsq->lock); @@ -1872,8 +1906,10 @@ static int balance_scx(struct rq *rq, struct task_struct *prev, { struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); bool prev_on_scx = prev->sched_class == &ext_sched_class; + bool has_tasks = false; lockdep_assert_rq_held(rq); + rq->scx.flags |= SCX_RQ_BALANCING; if (prev_on_scx) { WARN_ON_ONCE(prev->scx.flags & SCX_TASK_BAL_KEEP); @@ -1890,19 +1926,19 @@ static int balance_scx(struct rq *rq, struct task_struct *prev, if ((prev->scx.flags & SCX_TASK_QUEUED) && prev->scx.slice && !scx_ops_bypassing()) { prev->scx.flags |= SCX_TASK_BAL_KEEP; - return 1; + goto has_tasks; } } /* if there already are tasks to run, nothing to do */ if (rq->scx.local_dsq.nr) - return 1; + goto has_tasks; if (consume_dispatch_q(rq, rf, &scx_dsq_global)) - return 1; + goto has_tasks; if (!SCX_HAS_OP(dispatch) || scx_ops_bypassing() || !scx_rq_online(rq)) - return 0; + goto out; dspc->rq = rq; dspc->rf = rf; @@ -1923,12 +1959,18 @@ static int balance_scx(struct rq *rq, struct task_struct *prev, flush_dispatch_buf(rq, rf); if (rq->scx.local_dsq.nr) - return 1; + goto has_tasks; if (consume_dispatch_q(rq, rf, &scx_dsq_global)) - return 1; + goto has_tasks; } while (dspc->nr_tasks); - return 0; + goto out; + +has_tasks: + has_tasks = true; +out: + rq->scx.flags &= ~SCX_RQ_BALANCING; + return has_tasks; } static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) @@ -2666,7 +2708,8 @@ int scx_check_setscheduler(struct task_struct *p, int policy) * Omitted operations: * * - wakeup_preempt: NOOP as it isn't useful in the wakeup path because the task - * isn't tied to the CPU at that point. + * isn't tied to the CPU at that point. Preemption is implemented by resetting + * the victim task's slice to 0 and triggering reschedule on the target CPU. * * - migrate_task_rq: Unnecessary as task to cpu mapping is transient. * @@ -2902,6 +2945,9 @@ bool task_should_scx(struct task_struct *p) * of the queue. * * d. pick_next_task() suppresses zero slice warning. + * + * e. scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM + * operations. 
*/ static void scx_ops_bypass(bool bypass) { @@ -3410,11 +3456,21 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) seq_buf_init(&ns, buf, avail); dump_newline(&ns); - dump_line(&ns, "CPU %-4d: nr_run=%u ops_qseq=%lu", - cpu, rq->scx.nr_running, rq->scx.ops_qseq); + dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x ops_qseq=%lu", + cpu, rq->scx.nr_running, rq->scx.flags, + rq->scx.ops_qseq); dump_line(&ns, " curr=%s[%d] class=%ps", rq->curr->comm, rq->curr->pid, rq->curr->sched_class); + if (!cpumask_empty(rq->scx.cpus_to_kick)) + dump_line(&ns, " cpus_to_kick : %*pb", + cpumask_pr_args(rq->scx.cpus_to_kick)); + if (!cpumask_empty(rq->scx.cpus_to_kick_if_idle)) + dump_line(&ns, " idle_to_kick : %*pb", + cpumask_pr_args(rq->scx.cpus_to_kick_if_idle)); + if (!cpumask_empty(rq->scx.cpus_to_preempt)) + dump_line(&ns, " cpus_to_preempt: %*pb", + cpumask_pr_args(rq->scx.cpus_to_preempt)); used = seq_buf_used(&ns); if (SCX_HAS_OP(dump_cpu)) { @@ -4085,6 +4141,82 @@ static const struct sysrq_key_op sysrq_sched_ext_dump_op = { .enable_mask = SYSRQ_ENABLE_RTNICE, }; +static bool can_skip_idle_kick(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + + /* + * We can skip idle kicking if @rq is going to go through at least one + * full SCX scheduling cycle before going idle. Just checking whether + * curr is not idle is insufficient because we could be racing + * balance_one() trying to pull the next task from a remote rq, which + * may fail, and @rq may become idle afterwards. + * + * The race window is small and we don't and can't guarantee that @rq is + * only kicked while idle anyway. Skip only when sure. + */ + return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_BALANCING); +} + +static void kick_one_cpu(s32 cpu, struct rq *this_rq) +{ + struct rq *rq = cpu_rq(cpu); + struct scx_rq *this_scx = &this_rq->scx; + unsigned long flags; + + raw_spin_rq_lock_irqsave(rq, flags); + + /* + * During CPU hotplug, a CPU may depend on kicking itself to make + * forward progress. Allow kicking self regardless of online state. + */ + if (cpu_online(cpu) || cpu == cpu_of(this_rq)) { + if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) { + if (rq->curr->sched_class == &ext_sched_class) + rq->curr->scx.slice = 0; + cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); + } + + resched_curr(rq); + } else { + cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); + } + + raw_spin_rq_unlock_irqrestore(rq, flags); +} + +static void kick_one_cpu_if_idle(s32 cpu, struct rq *this_rq) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + raw_spin_rq_lock_irqsave(rq, flags); + + if (!can_skip_idle_kick(rq) && + (cpu_online(cpu) || cpu == cpu_of(this_rq))) + resched_curr(rq); + + raw_spin_rq_unlock_irqrestore(rq, flags); +} + +static void kick_cpus_irq_workfn(struct irq_work *irq_work) +{ + struct rq *this_rq = this_rq(); + struct scx_rq *this_scx = &this_rq->scx; + s32 cpu; + + for_each_cpu(cpu, this_scx->cpus_to_kick) { + kick_one_cpu(cpu, this_rq); + cpumask_clear_cpu(cpu, this_scx->cpus_to_kick); + cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); + } + + for_each_cpu(cpu, this_scx->cpus_to_kick_if_idle) { + kick_one_cpu_if_idle(cpu, this_rq); + cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); + } +} + /** * print_scx_info - print out sched_ext scheduler state * @log_lvl: the log level to use when printing @@ -4139,7 +4271,7 @@ void __init init_sched_ext_class(void) * definitions so that BPF scheduler implementations can use them * through the generated vmlinux.h. 
*/ - WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP); + WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT); BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); init_dsq(&scx_dsq_global, SCX_DSQ_GLOBAL); @@ -4152,6 +4284,11 @@ void __init init_sched_ext_class(void) init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); INIT_LIST_HEAD(&rq->scx.runnable_list); + + BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL)); + init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); } register_sysrq_key('S', &sysrq_sched_ext_reset_op); @@ -4438,6 +4575,67 @@ static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { __bpf_kfunc_start_defs(); +/** + * scx_bpf_kick_cpu - Trigger reschedule on a CPU + * @cpu: cpu to kick + * @flags: %SCX_KICK_* flags + * + * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or + * trigger rescheduling on a busy CPU. This can be called from any online + * scx_ops operation and the actual kicking is performed asynchronously through + * an irq work. + */ +__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) +{ + struct rq *this_rq; + unsigned long irq_flags; + + if (!ops_cpu_valid(cpu, NULL)) + return; + + /* + * While bypassing for PM ops, IRQ handling may not be online which can + * lead to irq_work_queue() malfunction such as infinite busy wait for + * IRQ status update. Suppress kicking. + */ + if (scx_ops_bypassing()) + return; + + local_irq_save(irq_flags); + + this_rq = this_rq(); + + /* + * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting + * rq locks. We can probably be smarter and avoid bouncing if called + * from ops which don't hold a rq lock. 
+ */ + if (flags & SCX_KICK_IDLE) { + struct rq *target_rq = cpu_rq(cpu); + + if (unlikely(flags & SCX_KICK_PREEMPT)) + scx_ops_error("PREEMPT cannot be used with SCX_KICK_IDLE"); + + if (raw_spin_rq_trylock(target_rq)) { + if (can_skip_idle_kick(target_rq)) { + raw_spin_rq_unlock(target_rq); + goto out; + } + raw_spin_rq_unlock(target_rq); + } + cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick_if_idle); + } else { + cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick); + + if (flags & SCX_KICK_PREEMPT) + cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt); + } + + irq_work_queue(&this_rq->scx.kick_cpus_irq_work); +out: + local_irq_restore(irq_flags); +} + /** * scx_bpf_dsq_nr_queued - Return the number of queued tasks * @dsq_id: id of the DSQ @@ -4836,6 +5034,7 @@ __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p) __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_any) +BTF_ID_FLAGS(func, scx_bpf_kick_cpu) BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2960e153c3a7..d9054eb4ba82 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -724,12 +724,22 @@ struct cfs_rq { }; #ifdef CONFIG_SCHED_CLASS_EXT +/* scx_rq->flags, protected by the rq lock */ +enum scx_rq_flags { + SCX_RQ_BALANCING = 1 << 1, +}; + struct scx_rq { struct scx_dispatch_q local_dsq; struct list_head runnable_list; /* runnable tasks on this rq */ unsigned long ops_qseq; u64 extra_enq_flags; /* see move_task_to_local_dsq() */ u32 nr_running; + u32 flags; + cpumask_var_t cpus_to_kick; + cpumask_var_t cpus_to_kick_if_idle; + cpumask_var_t cpus_to_preempt; + struct irq_work kick_cpus_irq_work; }; #endif /* CONFIG_SCHED_CLASS_EXT */ diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 3ea5cdf58bc7..421118bc56ff 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -34,6 +34,7 @@ void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flag u32 scx_bpf_dispatch_nr_slots(void) __ksym; void scx_bpf_dispatch_cancel(void) __ksym; bool scx_bpf_consume(u64 dsq_id) __ksym; +void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) __ksym __weak; -- cgit v1.2.3 From 037df2a314a0f2b1c6778602909dc57e5f1790f5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jun 2024 10:09:19 -1000 Subject: sched_ext: Add a central scheduler which makes all scheduling decisions on one CPU This patch adds a new example scheduler, scx_central, which demonstrates central scheduling where one CPU is responsible for making all scheduling decisions in the system using scx_bpf_kick_cpu(). The central CPU makes scheduling decisions for all CPUs in the system, queues tasks on the appropriate local dsq's and preempts the worker CPUs. The worker CPUs in turn preempt the central CPU when it needs tasks to run. Currently, every CPU depends on its own tick to expire the current task. A follow-up patch implementing tickless support for sched_ext will allow the worker CPUs to go full tickless so that they can run completely undisturbed. v3: - Kumar fixed a bug where the dispatch path could overflow the dispatch buffer if too many are dispatched to the fallback DSQ. 
- Use the new SCX_KICK_IDLE to wake up non-central CPUs. - Dropped '-p' option. v2: - Use RESIZABLE_ARRAY() instead of fixed MAX_CPUS and use SCX_BUG[_ON]() to simplify error handling. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden Cc: Kumar Kartikeya Dwivedi Cc: Julia Lawall --- tools/sched_ext/Makefile | 2 +- tools/sched_ext/scx_central.bpf.c | 214 ++++++++++++++++++++++++++++++++++++++ tools/sched_ext/scx_central.c | 105 +++++++++++++++++++ 3 files changed, 320 insertions(+), 1 deletion(-) create mode 100644 tools/sched_ext/scx_central.bpf.c create mode 100644 tools/sched_ext/scx_central.c (limited to 'tools') diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index 626782a21375..bf7e108f5ae1 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -176,7 +176,7 @@ $(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BP SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR) -c-sched-targets = scx_simple scx_qmap +c-sched-targets = scx_simple scx_qmap scx_central $(addprefix $(BINDIR)/,$(c-sched-targets)): \ $(BINDIR)/%: \ diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c new file mode 100644 index 000000000000..428b2262faa3 --- /dev/null +++ b/tools/sched_ext/scx_central.bpf.c @@ -0,0 +1,214 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A central FIFO sched_ext scheduler which demonstrates the following: + * + * a. Making all scheduling decisions from one CPU: + * + * The central CPU is the only one making scheduling decisions. All other + * CPUs kick the central CPU when they run out of tasks to run. + * + * There is one global BPF queue and the central CPU schedules all CPUs by + * dispatching from the global queue to each CPU's local dsq from dispatch(). + * This isn't the most straightforward design; e.g. it'd be easier to bounce + * through per-CPU BPF queues. The current design is chosen to maximally + * utilize and verify various SCX mechanisms such as LOCAL_ON dispatching. + * + * b. Preemption + * + * SCX_KICK_PREEMPT is used to trigger scheduling and CPUs to move to the + * next tasks. + * + * This scheduler is designed to maximize usage of various SCX mechanisms. A + * more practical implementation would likely put the scheduling loop outside + * the central CPU's dispatch() path and add some form of priority mechanism. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include + +char _license[] SEC("license") = "GPL"; + +enum { + FALLBACK_DSQ_ID = 0, +}; + +const volatile s32 central_cpu; +const volatile u32 nr_cpu_ids = 1; /* !0 for veristat, set during init */ +const volatile u64 slice_ns = SCX_SLICE_DFL; + +u64 nr_total, nr_locals, nr_queued, nr_lost_pids; +u64 nr_dispatches, nr_mismatches, nr_retries; +u64 nr_overflows; + +UEI_DEFINE(uei); + +struct { + __uint(type, BPF_MAP_TYPE_QUEUE); + __uint(max_entries, 4096); + __type(value, s32); +} central_q SEC(".maps"); + +/* can't use percpu map due to bad lookups */ +bool RESIZABLE_ARRAY(data, cpu_gimme_task); + +s32 BPF_STRUCT_OPS(central_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + /* + * Steer wakeups to the central CPU as much as possible to avoid + * disturbing other CPUs. It's safe to blindly return the central cpu as + * select_cpu() is a hint and if @p can't be on it, the kernel will + * automatically pick a fallback CPU.
+ */ + return central_cpu; +} + +void BPF_STRUCT_OPS(central_enqueue, struct task_struct *p, u64 enq_flags) +{ + s32 pid = p->pid; + + __sync_fetch_and_add(&nr_total, 1); + + if (bpf_map_push_elem(&central_q, &pid, 0)) { + __sync_fetch_and_add(&nr_overflows, 1); + scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_DFL, enq_flags); + return; + } + + __sync_fetch_and_add(&nr_queued, 1); + + if (!scx_bpf_task_running(p)) + scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); +} + +static bool dispatch_to_cpu(s32 cpu) +{ + struct task_struct *p; + s32 pid; + + bpf_repeat(BPF_MAX_LOOPS) { + if (bpf_map_pop_elem(&central_q, &pid)) + break; + + __sync_fetch_and_sub(&nr_queued, 1); + + p = bpf_task_from_pid(pid); + if (!p) { + __sync_fetch_and_add(&nr_lost_pids, 1); + continue; + } + + /* + * If we can't run the task at the top, do the dumb thing and + * bounce it to the fallback dsq. + */ + if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { + __sync_fetch_and_add(&nr_mismatches, 1); + scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_DFL, 0); + bpf_task_release(p); + /* + * We might run out of dispatch buffer slots if we continue dispatching + * to the fallback DSQ, without dispatching to the local DSQ of the + * target CPU. In such a case, break the loop now as the next dispatch + * operation will fail. + */ + if (!scx_bpf_dispatch_nr_slots()) + break; + continue; + } + + /* dispatch to local and mark that @cpu doesn't need more */ + scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, 0); + + if (cpu != central_cpu) + scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE); + + bpf_task_release(p); + return true; + } + + return false; +} + +void BPF_STRUCT_OPS(central_dispatch, s32 cpu, struct task_struct *prev) +{ + if (cpu == central_cpu) { + /* dispatch for all other CPUs first */ + __sync_fetch_and_add(&nr_dispatches, 1); + + bpf_for(cpu, 0, nr_cpu_ids) { + bool *gimme; + + if (!scx_bpf_dispatch_nr_slots()) + break; + + /* central's gimme is never set */ + gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids); + if (!gimme || !*gimme) + continue; + + if (dispatch_to_cpu(cpu)) + *gimme = false; + } + + /* + * Retry if we ran out of dispatch buffer slots as we might have + * skipped some CPUs and also need to dispatch for self. The ext + * core automatically retries if the local dsq is empty but we + * can't rely on that as we're dispatching for other CPUs too. + * Kick self explicitly to retry. + */ + if (!scx_bpf_dispatch_nr_slots()) { + __sync_fetch_and_add(&nr_retries, 1); + scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); + return; + } + + /* look for a task to run on the central CPU */ + if (scx_bpf_consume(FALLBACK_DSQ_ID)) + return; + dispatch_to_cpu(central_cpu); + } else { + bool *gimme; + + if (scx_bpf_consume(FALLBACK_DSQ_ID)) + return; + + gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids); + if (gimme) + *gimme = true; + + /* + * Force dispatch on the scheduling CPU so that it finds a task + * to run for us. + */ + scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); + } +} + +int BPF_STRUCT_OPS_SLEEPABLE(central_init) +{ + return scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1); +} + +void BPF_STRUCT_OPS(central_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SCX_OPS_DEFINE(central_ops, + /* + * We are offloading all scheduling decisions to the central CPU + * and thus being the last task on a given CPU doesn't mean + * anything special. Enqueue the last tasks like any other tasks.
+ */ + .flags = SCX_OPS_ENQ_LAST, + + .select_cpu = (void *)central_select_cpu, + .enqueue = (void *)central_enqueue, + .dispatch = (void *)central_dispatch, + .init = (void *)central_init, + .exit = (void *)central_exit, + .name = "central"); diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c new file mode 100644 index 000000000000..5f09fc666a63 --- /dev/null +++ b/tools/sched_ext/scx_central.c @@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include "scx_central.bpf.skel.h" + +const char help_fmt[] = +"A central FIFO sched_ext scheduler.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Usage: %s [-s SLICE_US] [-c CPU]\n" +"\n" +" -s SLICE_US Override slice duration\n" +" -c CPU Override the central CPU (default: 0)\n" +" -v Print libbpf debug messages\n" +" -h Display this help and exit\n"; + +static bool verbose; +static volatile int exit_req; + +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + if (level == LIBBPF_DEBUG && !verbose) + return 0; + return vfprintf(stderr, format, args); +} + +static void sigint_handler(int dummy) +{ + exit_req = 1; +} + +int main(int argc, char **argv) +{ + struct scx_central *skel; + struct bpf_link *link; + __u64 seq = 0; + __s32 opt; + + libbpf_set_print(libbpf_print_fn); + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); + + skel = SCX_OPS_OPEN(central_ops, scx_central); + + skel->rodata->central_cpu = 0; + skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); + + while ((opt = getopt(argc, argv, "s:c:pvh")) != -1) { + switch (opt) { + case 's': + skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; + break; + case 'c': + skel->rodata->central_cpu = strtoul(optarg, NULL, 0); + break; + case 'v': + verbose = true; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + /* Resize arrays so their element count is equal to cpu count. */ + RESIZE_ARRAY(skel, data, cpu_gimme_task, skel->rodata->nr_cpu_ids); + + SCX_OPS_LOAD(skel, central_ops, scx_central, uei); + link = SCX_OPS_ATTACH(skel, central_ops, scx_central); + + while (!exit_req && !UEI_EXITED(skel, uei)) { + printf("[SEQ %llu]\n", seq++); + printf("total :%10" PRIu64 " local:%10" PRIu64 " queued:%10" PRIu64 " lost:%10" PRIu64 "\n", + skel->bss->nr_total, + skel->bss->nr_locals, + skel->bss->nr_queued, + skel->bss->nr_lost_pids); + printf(" dispatch:%10" PRIu64 " mismatch:%10" PRIu64 " retry:%10" PRIu64 "\n", + skel->bss->nr_dispatches, + skel->bss->nr_mismatches, + skel->bss->nr_retries); + printf("overflow:%10" PRIu64 "\n", + skel->bss->nr_overflows); + fflush(stdout); + sleep(1); + } + + bpf_link__destroy(link); + UEI_REPORT(skel, uei); + scx_central__destroy(skel); + return 0; +} -- cgit v1.2.3 From 0922f54fdd15aedb93730eb8cfa0c069cbad4e08 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jun 2024 10:09:19 -1000 Subject: sched_ext: Make watchdog handle ops.dispatch() looping stall The dispatch path retries if the local DSQ is still empty after ops.dispatch() either dispatched or consumed a task. This is both out of necessity and for convenience. 
It has to retry because the dispatch path might lose the tasks to dequeue when the rq lock is released to migrate tasks across CPUs, and the retry mechanism makes ops.dispatch() implementations easier as they only need to make some forward progress each iteration. However, this makes it possible for ops.dispatch() to stall CPUs by repeatedly dispatching ineligible tasks. If all CPUs are stalled that way, the watchdog or sysrq handler can't run and the system can't be saved. Let's address the issue by breaking out of the dispatch loop after 32 iterations. It is unlikely but not impossible for ops.dispatch() to legitimately go over the iteration limit. We want to come back to the dispatch path in such cases as not doing so risks stalling the CPU by idling with runnable tasks pending. As the previous task is still current in balance_scx(), resched_curr() doesn't do anything - it will just get cleared. Let's instead use scx_bpf_kick_cpu() which will trigger reschedule after switching to the next task which will likely be the idle task. Signed-off-by: Tejun Heo Reviewed-by: David Vernet --- kernel/sched/ext.c | 17 +++++++++++++++ tools/sched_ext/scx_qmap.bpf.c | 15 +++++++++++++++ tools/sched_ext/scx_qmap.c | 8 ++++++-- 3 files changed, 38 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 213793d086d7..89bcca84d6b5 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -8,6 +8,7 @@ enum scx_consts { SCX_DSP_DFL_MAX_BATCH = 32, + SCX_DSP_MAX_LOOPS = 32, SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, SCX_EXIT_BT_LEN = 64, @@ -665,6 +666,7 @@ static struct kobject *scx_root_kobj; #define CREATE_TRACE_POINTS #include +static void scx_bpf_kick_cpu(s32 cpu, u64 flags); static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind, s64 exit_code, const char *fmt, ...); @@ -1906,6 +1908,7 @@ static int balance_scx(struct rq *rq, struct task_struct *prev, { struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); bool prev_on_scx = prev->sched_class == &ext_sched_class; + int nr_loops = SCX_DSP_MAX_LOOPS; bool has_tasks = false; lockdep_assert_rq_held(rq); @@ -1962,6 +1965,20 @@ static int balance_scx(struct rq *rq, struct task_struct *prev, goto has_tasks; if (consume_dispatch_q(rq, rf, &scx_dsq_global)) goto has_tasks; + + /* + * ops.dispatch() can trap us in this loop by repeatedly + * dispatching ineligible tasks. Break out once in a while to + * allow the watchdog to run. As IRQ can't be enabled in + * balance(), we want to complete this scheduling cycle and then + * start a new one. IOW, we want to call resched_curr() on the + * next, most likely idle, task, not the current one. Use + * scx_bpf_kick_cpu() for deferred kicking.
+ */ + if (unlikely(!--nr_loops)) { + scx_bpf_kick_cpu(cpu_of(rq), 0); + break; + } } while (dspc->nr_tasks); goto out; diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 5b3da28bf042..879fc9c788e5 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -31,6 +31,7 @@ char _license[] SEC("license") = "GPL"; const volatile u64 slice_ns = SCX_SLICE_DFL; const volatile u32 stall_user_nth; const volatile u32 stall_kernel_nth; +const volatile u32 dsp_inf_loop_after; const volatile u32 dsp_batch; const volatile s32 disallow_tgid; const volatile bool suppress_dump; @@ -198,6 +199,20 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) if (scx_bpf_consume(SHARED_DSQ)) return; + if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) { + /* + * PID 2 should be kthreadd which should mostly be idle and off + * the scheduler. Let's keep dispatching it to force the kernel + * to call this function over and over again. + */ + p = bpf_task_from_pid(2); + if (p) { + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, 0); + bpf_task_release(p); + return; + } + } + if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) { scx_bpf_error("failed to look up cpu_ctx"); return; diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index a1123a17581b..594147a710a8 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -19,13 +19,14 @@ const char help_fmt[] = "\n" "See the top-level comment in .bpf.c for more details.\n" "\n" -"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-b COUNT]\n" +"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n" " [-d PID] [-D LEN] [-p] [-v]\n" "\n" " -s SLICE_US Override slice duration\n" " -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" " -t COUNT Stall every COUNT'th user thread\n" " -T COUNT Stall every COUNT'th kernel thread\n" +" -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n" " -b COUNT Dispatch up to COUNT tasks together\n" " -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" " -D LEN Set scx_exit_info.dump buffer length\n" @@ -61,7 +62,7 @@ int main(int argc, char **argv) skel = SCX_OPS_OPEN(qmap_ops, scx_qmap); - while ((opt = getopt(argc, argv, "s:e:t:T:b:d:D:Spvh")) != -1) { + while ((opt = getopt(argc, argv, "s:e:t:T:l:b:d:D:Spvh")) != -1) { switch (opt) { case 's': skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; @@ -75,6 +76,9 @@ int main(int argc, char **argv) case 'T': skel->rodata->stall_kernel_nth = strtoul(optarg, NULL, 0); break; + case 'l': + skel->rodata->dsp_inf_loop_after = strtoul(optarg, NULL, 0); + break; case 'b': skel->rodata->dsp_batch = strtoul(optarg, NULL, 0); break; -- cgit v1.2.3 From 22a920209ab6aa4f8ec960ed81041643fddeaec6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jun 2024 10:09:19 -1000 Subject: sched_ext: Implement tickless support Allow BPF schedulers to indicate tickless operation by setting p->scx.slice to SCX_SLICE_INF. A CPU whose current task has infinite slice goes into tickless operation. scx_central is updated to use tickless operations for all tasks and instead use a BPF timer to expire slices. This also uses the SCX_ENQ_PREEMPT and task state tracking added by the previous patches. Currently, there is no way to pin the timer on the central CPU, so it may end up on one of the worker CPUs; however, outside of that, the worker CPUs can go tickless both while running sched_ext tasks and idling.
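For illustration, here is a minimal, hedged sketch (not part of this patch) of what opting into tickless operation looks like from a BPF scheduler's enqueue path; SHARED_DSQ is a hypothetical DSQ assumed to have been created with scx_bpf_create_dsq() during init:

void BPF_STRUCT_OPS(tickless_enqueue, struct task_struct *p, u64 enq_flags)
{
	/*
	 * SCX_SLICE_INF: the slice never expires via the tick, so something
	 * else, e.g. a BPF timer as in scx_central, must eventually call
	 * scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT) to push the CPU through
	 * the next scheduling cycle.
	 */
	scx_bpf_dispatch(p, SHARED_DSQ, SCX_SLICE_INF, enq_flags);
}

In other words, slice exhaustion normally comes from the tick; with the tick stopped, preemption must come from an explicit kick.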
With schbench running, scx_central shows: root@test ~# grep ^LOC /proc/interrupts; sleep 10; grep ^LOC /proc/interrupts LOC: 142024 656 664 449 Local timer interrupts LOC: 161663 663 665 449 Local timer interrupts Without it: root@test ~ [SIGINT]# grep ^LOC /proc/interrupts; sleep 10; grep ^LOC /proc/interrupts LOC: 188778 3142 3793 3993 Local timer interrupts LOC: 198993 5314 6323 6438 Local timer interrupts While scx_central itself is too barebone to be useful as a production scheduler, a more featureful central scheduler can be built using the same approach. Google's experience shows that such an approach can have significant benefits for certain applications such as VM hosting. v4: Allow operation even if BPF_F_TIMER_CPU_PIN is not available. v3: Pin the central scheduler's timer on the central_cpu using BPF_F_TIMER_CPU_PIN. v2: Convert to BPF inline iterators. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- include/linux/sched/ext.h | 1 + kernel/sched/core.c | 11 ++- kernel/sched/ext.c | 52 ++++++++++++- kernel/sched/ext.h | 2 + kernel/sched/sched.h | 1 + tools/sched_ext/scx_central.bpf.c | 159 ++++++++++++++++++++++++++++++++++++-- tools/sched_ext/scx_central.c | 29 ++++++- 7 files changed, 242 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 3b2809b980ac..6f1a4977e9f8 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -16,6 +16,7 @@ enum scx_public_consts { SCX_OPS_NAME_LEN = 128, SCX_SLICE_DFL = 20 * 1000000, /* 20ms */ + SCX_SLICE_INF = U64_MAX, /* infinite, implies nohz */ }; /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1a3144c80af8..d5eff4036be7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1256,11 +1256,14 @@ bool sched_can_stop_tick(struct rq *rq) return true; /* - * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left; - * if there's more than one we need the tick for involuntary - * preemption. + * If there are no DL,RR/FIFO tasks, there must only be CFS or SCX tasks + * left. For CFS, if there's more than one we need the tick for + * involuntary preemption. For SCX, ask. */ - if (rq->nr_running > 1) + if (!scx_switched_all() && rq->nr_running > 1) + return false; + + if (scx_enabled() && !scx_can_stop_tick(rq)) return false; /* diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 2e652f7b8f54..ce32fc6b05cd 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1086,7 +1086,8 @@ static void update_curr_scx(struct rq *rq) account_group_exec_runtime(curr, delta_exec); cgroup_account_cputime(curr, delta_exec); - curr->scx.slice -= min(curr->scx.slice, delta_exec); + if (curr->scx.slice != SCX_SLICE_INF) + curr->scx.slice -= min(curr->scx.slice, delta_exec); } static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta) @@ -2093,6 +2094,28 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) SCX_CALL_OP(SCX_KF_REST, running, p); clr_task_runnable(p, true); + + /* + * @p is getting newly scheduled or got kicked after someone updated its + * slice. Refresh whether tick can be stopped. See scx_can_stop_tick(). 
+ */ + if ((p->scx.slice == SCX_SLICE_INF) != + (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) { + if (p->scx.slice == SCX_SLICE_INF) + rq->scx.flags |= SCX_RQ_CAN_STOP_TICK; + else + rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK; + + sched_update_tick_dependency(rq); + + /* + * For now, let's refresh the load_avgs just when transitioning + * in and out of nohz. In the future, we might want to add a + * mechanism which calls the following periodically on + * tick-stopped CPUs. + */ + update_other_load_avgs(rq); + } } static void put_prev_task_scx(struct rq *rq, struct task_struct *p) @@ -2818,6 +2841,26 @@ int scx_check_setscheduler(struct task_struct *p, int policy) return 0; } +#ifdef CONFIG_NO_HZ_FULL +bool scx_can_stop_tick(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + if (scx_ops_bypassing()) + return false; + + if (p->sched_class != &ext_sched_class) + return true; + + /* + * @rq can dispatch from different DSQs, so we can't tell whether it + * needs the tick or not by looking at nr_running. Allow stopping ticks + * iff the BPF scheduler indicated so. See set_next_task_scx(). + */ + return rq->scx.flags & SCX_RQ_CAN_STOP_TICK; +} +#endif + /* * Omitted operations: * @@ -3120,6 +3163,9 @@ static void scx_ops_bypass(bool bypass) } rq_unlock_irqrestore(rq, &rf); + + /* kick to restore ticks */ + resched_cpu(cpu); } } @@ -4576,7 +4622,9 @@ __bpf_kfunc_start_defs(); * BPF locks (in the future when BPF introduces more flexible locking). * * @p is allowed to run for @slice. The scheduling path is triggered on slice - * exhaustion. If zero, the current residual slice is maintained. + * exhaustion. If zero, the current residual slice is maintained. If + * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with + * scx_bpf_kick_cpu() to trigger scheduling. */ __bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 33a9f7fe5832..6ed946f72489 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -35,6 +35,7 @@ void scx_pre_fork(struct task_struct *p); int scx_fork(struct task_struct *p); void scx_post_fork(struct task_struct *p); void scx_cancel_fork(struct task_struct *p); +bool scx_can_stop_tick(struct rq *rq); int scx_check_setscheduler(struct task_struct *p, int policy); bool task_should_scx(struct task_struct *p); void init_sched_ext_class(void); @@ -73,6 +74,7 @@ static inline void scx_pre_fork(struct task_struct *p) {} static inline int scx_fork(struct task_struct *p) { return 0; } static inline void scx_post_fork(struct task_struct *p) {} static inline void scx_cancel_fork(struct task_struct *p) {} +static inline bool scx_can_stop_tick(struct rq *rq) { return true; } static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; } static inline bool task_on_scx(const struct task_struct *p) { return false; } static inline void init_sched_ext_class(void) {} diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d9054eb4ba82..b3c578cb43cd 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -727,6 +727,7 @@ struct cfs_rq { /* scx_rq->flags, protected by the rq lock */ enum scx_rq_flags { SCX_RQ_BALANCING = 1 << 1, + SCX_RQ_CAN_STOP_TICK = 1 << 2, }; struct scx_rq { diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c index 428b2262faa3..1d8fd570eaa7 100644 --- a/tools/sched_ext/scx_central.bpf.c +++ b/tools/sched_ext/scx_central.bpf.c @@ -13,7 +13,26 @@ * through per-CPU BPF queues. 
The current design is chosen to maximally * utilize and verify various SCX mechanisms such as LOCAL_ON dispatching. * - * b. Preemption + * b. Tickless operation + * + * All tasks are dispatched with the infinite slice which allows stopping the + * ticks on CONFIG_NO_HZ_FULL kernels running with the proper nohz_full + * parameter. The tickless operation can be observed through + * /proc/interrupts. + * + * Periodic switching is enforced by a periodic timer checking all CPUs and + * preempting them as necessary. Unfortunately, BPF timer currently doesn't + * have a way to pin to a specific CPU, so the periodic timer isn't pinned to + * the central CPU. + * + * c. Preemption + * + * Kthreads are unconditionally queued to the head of a matching local dsq + * and dispatched with SCX_ENQ_PREEMPT. This ensures that a kthread is always + * prioritized over user threads, which is required for ensuring forward + * progress as e.g. the periodic timer may run on a ksoftirqd and if the + * ksoftirqd gets starved by a user thread, there may not be anything else to + * vacate that user thread. * * SCX_KICK_PREEMPT is used to trigger scheduling and CPUs to move to the * next tasks. @@ -32,14 +51,17 @@ char _license[] SEC("license") = "GPL"; enum { FALLBACK_DSQ_ID = 0, + MS_TO_NS = 1000LLU * 1000, + TIMER_INTERVAL_NS = 1 * MS_TO_NS, }; const volatile s32 central_cpu; const volatile u32 nr_cpu_ids = 1; /* !0 for veristat, set during init */ const volatile u64 slice_ns = SCX_SLICE_DFL; +bool timer_pinned = true; u64 nr_total, nr_locals, nr_queued, nr_lost_pids; -u64 nr_dispatches, nr_mismatches, nr_retries; +u64 nr_timers, nr_dispatches, nr_mismatches, nr_retries; u64 nr_overflows; UEI_DEFINE(uei); @@ -52,6 +74,23 @@ struct { /* can't use percpu map due to bad lookups */ bool RESIZABLE_ARRAY(data, cpu_gimme_task); +u64 RESIZABLE_ARRAY(data, cpu_started_at); + +struct central_timer { + struct bpf_timer timer; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, struct central_timer); +} central_timer SEC(".maps"); + +static bool vtime_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} s32 BPF_STRUCT_OPS(central_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) @@ -71,9 +110,22 @@ void BPF_STRUCT_OPS(central_enqueue, struct task_struct *p, u64 enq_flags) __sync_fetch_and_add(&nr_total, 1); + /* + * Push per-cpu kthreads at the head of local dsq's and preempt the + * corresponding CPU. This ensures that e.g. ksoftirqd isn't blocked + * behind other threads which is necessary for forward progress + * guarantee as we depend on the BPF timer which may run from ksoftirqd.
+ */ + if ((p->flags & PF_KTHREAD) && p->nr_cpus_allowed == 1) { + __sync_fetch_and_add(&nr_locals, 1); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_INF, + enq_flags | SCX_ENQ_PREEMPT); + return; + } + if (bpf_map_push_elem(&central_q, &pid, 0)) { __sync_fetch_and_add(&nr_overflows, 1); - scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_DFL, enq_flags); + scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, enq_flags); return; } @@ -106,7 +158,7 @@ static bool dispatch_to_cpu(s32 cpu) */ if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { __sync_fetch_and_add(&nr_mismatches, 1); - scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_DFL, 0); + scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, 0); bpf_task_release(p); /* * We might run out of dispatch buffer slots if we continue dispatching @@ -120,7 +172,7 @@ static bool dispatch_to_cpu(s32 cpu) } /* dispatch to local and mark that @cpu doesn't need more */ - scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, 0); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_INF, 0); if (cpu != central_cpu) scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE); @@ -188,9 +240,102 @@ void BPF_STRUCT_OPS(central_dispatch, s32 cpu, struct task_struct *prev) } } +void BPF_STRUCT_OPS(central_running, struct task_struct *p) +{ + s32 cpu = scx_bpf_task_cpu(p); + u64 *started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids); + if (started_at) + *started_at = bpf_ktime_get_ns() ?: 1; /* 0 indicates idle */ +} + +void BPF_STRUCT_OPS(central_stopping, struct task_struct *p, bool runnable) +{ + s32 cpu = scx_bpf_task_cpu(p); + u64 *started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids); + if (started_at) + *started_at = 0; +} + +static int central_timerfn(void *map, int *key, struct bpf_timer *timer) +{ + u64 now = bpf_ktime_get_ns(); + u64 nr_to_kick = nr_queued; + s32 i, curr_cpu; + + curr_cpu = bpf_get_smp_processor_id(); + if (timer_pinned && (curr_cpu != central_cpu)) { + scx_bpf_error("Central timer ran on CPU %d, not central CPU %d", + curr_cpu, central_cpu); + return 0; + } + + bpf_for(i, 0, nr_cpu_ids) { + s32 cpu = (nr_timers + i) % nr_cpu_ids; + u64 *started_at; + + if (cpu == central_cpu) + continue; + + /* kick iff the current one exhausted its slice */ + started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids); + if (started_at && *started_at && + vtime_before(now, *started_at + slice_ns)) + continue; + + /* and there's something pending */ + if (scx_bpf_dsq_nr_queued(FALLBACK_DSQ_ID) || + scx_bpf_dsq_nr_queued(SCX_DSQ_LOCAL_ON | cpu)) + ; + else if (nr_to_kick) + nr_to_kick--; + else + continue; + + scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT); + } + + bpf_timer_start(timer, TIMER_INTERVAL_NS, BPF_F_TIMER_CPU_PIN); + __sync_fetch_and_add(&nr_timers, 1); + return 0; +} + int BPF_STRUCT_OPS_SLEEPABLE(central_init) { - return scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1); + u32 key = 0; + struct bpf_timer *timer; + int ret; + + ret = scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1); + if (ret) + return ret; + + timer = bpf_map_lookup_elem(&central_timer, &key); + if (!timer) + return -ESRCH; + + if (bpf_get_smp_processor_id() != central_cpu) { + scx_bpf_error("init from non-central CPU"); + return -EINVAL; + } + + bpf_timer_init(timer, &central_timer, CLOCK_MONOTONIC); + bpf_timer_set_callback(timer, central_timerfn); + + ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, BPF_F_TIMER_CPU_PIN); + /* + * BPF_F_TIMER_CPU_PIN is pretty new (>=6.7). If we're running in a + * kernel which doesn't have it, bpf_timer_start() will return -EINVAL. + * Retry without the PIN.
This would be the perfect use case for + * bpf_core_enum_value_exists() but the enum type doesn't have a name + * and can't be used with bpf_core_enum_value_exists(). Oh well... + */ + if (ret == -EINVAL) { + timer_pinned = false; + ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, 0); + } + if (ret) + scx_bpf_error("bpf_timer_start failed (%d)", ret); + return ret; } void BPF_STRUCT_OPS(central_exit, struct scx_exit_info *ei) @@ -209,6 +354,8 @@ SCX_OPS_DEFINE(central_ops, .select_cpu = (void *)central_select_cpu, .enqueue = (void *)central_enqueue, .dispatch = (void *)central_dispatch, + .running = (void *)central_running, + .stopping = (void *)central_stopping, .init = (void *)central_init, .exit = (void *)central_exit, .name = "central"); diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c index 5f09fc666a63..fb3f50886552 100644 --- a/tools/sched_ext/scx_central.c +++ b/tools/sched_ext/scx_central.c @@ -48,6 +48,7 @@ int main(int argc, char **argv) struct bpf_link *link; __u64 seq = 0; __s32 opt; + cpu_set_t *cpuset; libbpf_set_print(libbpf_print_fn); signal(SIGINT, sigint_handler); @@ -77,10 +78,35 @@ int main(int argc, char **argv) /* Resize arrays so their element count is equal to cpu count. */ RESIZE_ARRAY(skel, data, cpu_gimme_task, skel->rodata->nr_cpu_ids); + RESIZE_ARRAY(skel, data, cpu_started_at, skel->rodata->nr_cpu_ids); SCX_OPS_LOAD(skel, central_ops, scx_central, uei); + + /* + * Affinitize the loading thread to the central CPU, as: + * - That's where the BPF timer is first invoked in the BPF program. + * - We probably don't want this user space component to take up a core + * from a task that would benefit from avoiding preemption on one of + * the tickless cores. + * + * Until BPF supports pinning the timer, it's not guaranteed that it + * will always be invoked on the central CPU. In practice, this + * suffices for the majority of the time. + */ + cpuset = CPU_ALLOC(skel->rodata->nr_cpu_ids); + SCX_BUG_ON(!cpuset, "Failed to allocate cpuset"); + CPU_ZERO_S(CPU_ALLOC_SIZE(skel->rodata->nr_cpu_ids), cpuset); + CPU_SET_S(skel->rodata->central_cpu, CPU_ALLOC_SIZE(skel->rodata->nr_cpu_ids), cpuset); + SCX_BUG_ON(sched_setaffinity(0, CPU_ALLOC_SIZE(skel->rodata->nr_cpu_ids), cpuset), + "Failed to affinitize to central CPU %d (max %d)", + skel->rodata->central_cpu, skel->rodata->nr_cpu_ids - 1); + CPU_FREE(cpuset); + link = SCX_OPS_ATTACH(skel, central_ops, scx_central); + if (!skel->data->timer_pinned) + printf("WARNING : BPF_F_TIMER_CPU_PIN not available, timer not pinned to central\n"); + while (!exit_req && !UEI_EXITED(skel, uei)) { printf("[SEQ %llu]\n", seq++); printf("total :%10" PRIu64 " local:%10" PRIu64 " queued:%10" PRIu64 " lost:%10" PRIu64 "\n", @@ -88,7 +114,8 @@ int main(int argc, char **argv) skel->bss->nr_locals, skel->bss->nr_queued, skel->bss->nr_lost_pids); - printf(" dispatch:%10" PRIu64 " mismatch:%10" PRIu64 " retry:%10" PRIu64 "\n", + printf("timer :%10" PRIu64 " dispatch:%10" PRIu64 " mismatch:%10" PRIu64 " retry:%10" PRIu64 "\n", + skel->bss->nr_timers, skel->bss->nr_dispatches, skel->bss->nr_mismatches, skel->bss->nr_retries); -- cgit v1.2.3 From 245254f7081dbe1c8da54675d0e4ddbe74cee61b Mon Sep 17 00:00:00 2001 From: David Vernet Date: Tue, 18 Jun 2024 10:09:20 -1000 Subject: sched_ext: Implement sched_ext_ops.cpu_acquire/release() Scheduler classes are strictly ordered and when a higher priority class has tasks to run, the lower priority ones lose access to the CPU. Being able to monitor and act on these events is necessary for use cases including strict core-scheduling and latency management.
This patch adds two operations, ops.cpu_acquire() and ops.cpu_release(). The former is invoked when a CPU becomes available to the BPF scheduler and the latter when the CPU is taken away. This patch also implements scx_bpf_reenqueue_local() which can be called from .cpu_release() to trigger requeueing of all tasks in the local dsq of the CPU so that the tasks can be reassigned to other available CPUs. scx_pair is updated to use .cpu_acquire/release() along with %SCX_KICK_WAIT to make the pair scheduling guarantee strict even when a CPU is preempted by a higher priority scheduler class. scx_qmap is updated to use .cpu_acquire/release() to empty the local dsq of a preempted CPU. A similar approach can be adopted by BPF schedulers that want to have tight control over latency. v4: Use the new SCX_KICK_IDLE to wake up a CPU after re-enqueueing. v3: Drop the const qualifier from scx_cpu_release_args.task. BPF enforces access control through the verifier, so the qualifier isn't actually operative and only gets in the way when interacting with various helpers. v2: Add p->scx.kf_mask annotation to allow calling scx_bpf_reenqueue_local() from ops.cpu_release() nested inside ops.init() and other sleepable operations. Signed-off-by: David Vernet Reviewed-by: Tejun Heo Signed-off-by: Tejun Heo Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- include/linux/sched/ext.h | 4 +- kernel/sched/ext.c | 198 ++++++++++++++++++++++++++++++- kernel/sched/ext.h | 2 + kernel/sched/sched.h | 1 + tools/sched_ext/include/scx/common.bpf.h | 1 + tools/sched_ext/scx_qmap.bpf.c | 37 +++++- tools/sched_ext/scx_qmap.c | 4 +- 7 files changed, 240 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 74341dbc6a19..21c627337e01 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -98,13 +98,15 @@ enum scx_kf_mask { SCX_KF_UNLOCKED = 0, /* not sleepable, not rq locked */ /* all non-sleepables may be nested inside SLEEPABLE */ SCX_KF_SLEEPABLE = 1 << 0, /* sleepable init operations */ + /* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */ + SCX_KF_CPU_RELEASE = 1 << 1, /* ops.cpu_release() */ /* ops.dequeue (in REST) may be nested inside DISPATCH */ SCX_KF_DISPATCH = 1 << 2, /* ops.dispatch() */ SCX_KF_ENQUEUE = 1 << 3, /* ops.enqueue() and ops.select_cpu() */ SCX_KF_SELECT_CPU = 1 << 4, /* ops.select_cpu() */ SCX_KF_REST = 1 << 5, /* other rq-locked operations */ - __SCX_KF_RQ_LOCKED = SCX_KF_DISPATCH | + __SCX_KF_RQ_LOCKED = SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH | SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, __SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, }; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 1ca3067b4e0a..686dab6ab592 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -110,6 +110,32 @@ struct scx_exit_task_args { bool cancelled; }; +enum scx_cpu_preempt_reason { + /* next task is being scheduled by &sched_class_rt */ + SCX_CPU_PREEMPT_RT, + /* next task is being scheduled by &sched_class_dl */ + SCX_CPU_PREEMPT_DL, + /* next task is being scheduled by &sched_class_stop */ + SCX_CPU_PREEMPT_STOP, + /* unknown reason for SCX being preempted */ + SCX_CPU_PREEMPT_UNKNOWN, +}; + +/* + * Argument container for ops->cpu_acquire(). Currently empty, but may be + * expanded in the future.
+ */ +struct scx_cpu_acquire_args {}; + +/* argument container for ops->cpu_release() */ +struct scx_cpu_release_args { + /* the reason the CPU was preempted */ + enum scx_cpu_preempt_reason reason; + + /* the task that's going to be scheduled on the CPU */ + struct task_struct *task; +}; + /* * Informational context provided to dump operations. */ @@ -335,6 +361,28 @@ struct sched_ext_ops { */ void (*update_idle)(s32 cpu, bool idle); + /** + * cpu_acquire - A CPU is becoming available to the BPF scheduler + * @cpu: The CPU being acquired by the BPF scheduler. + * @args: Acquire arguments, see the struct definition. + * + * A CPU that was previously released from the BPF scheduler is now once + * again under its control. + */ + void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args); + + /** + * cpu_release - A CPU is taken away from the BPF scheduler + * @cpu: The CPU being released by the BPF scheduler. + * @args: Release arguments, see the struct definition. + * + * The specified CPU is no longer under the control of the BPF + * scheduler. This could be because it was preempted by a higher + * priority sched_class, though there may be other reasons as well. The + * caller should consult @args->reason to determine the cause. + */ + void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); + /** * init_task - Initialize a task to run in a BPF scheduler + * @p: task to initialize for BPF scheduling @@ -487,6 +535,17 @@ enum scx_enq_flags { */ SCX_ENQ_PREEMPT = 1LLU << 32, + /* + * The task being enqueued was previously enqueued on the current CPU's + * %SCX_DSQ_LOCAL, but was removed from it in a call to the + * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was + * invoked in a ->cpu_release() callback, and the task is again + * dispatched back to %SCX_DSQ_LOCAL by this current ->enqueue(), the + * task will not be scheduled on the CPU until at least the next invocation + * of the ->cpu_acquire() callback. + */ + SCX_ENQ_REENQ = 1LLU << 40, + /* + * The task being enqueued is the only task available for the cpu. By + * default, ext core keeps executing such tasks but when @@ -625,6 +684,7 @@ static bool scx_warned_zero_slice; static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last); static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting); +DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); struct static_key_false scx_has_op[SCX_OPI_END] = @@ -887,6 +947,12 @@ static __always_inline bool scx_kf_allowed(u32 mask) * inside ops.dispatch(). We don't need to check the SCX_KF_SLEEPABLE * boundary thanks to the above in_interrupt() check. */ + if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE && + (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) { + scx_ops_error("cpu_release kfunc called from a nested operation"); + return false; + } + if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH && + (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) { scx_ops_error("dispatch kfunc called from a nested operation"); @@ -2070,6 +2136,19 @@ static int balance_scx(struct rq *rq, struct task_struct *prev, lockdep_assert_rq_held(rq); rq->scx.flags |= SCX_RQ_BALANCING; + if (static_branch_unlikely(&scx_ops_cpu_preempt) && + unlikely(rq->scx.cpu_released)) { + /* + * If the previous sched_class for the current CPU was not SCX, + * notify the BPF scheduler that it again has control of the + * core. This callback complements ->cpu_release(), which is + * emitted in scx_next_task_picked().
+ */ + if (SCX_HAS_OP(cpu_acquire)) + SCX_CALL_OP(0, cpu_acquire, cpu_of(rq), NULL); + rq->scx.cpu_released = false; + } + if (prev_on_scx) { WARN_ON_ONCE(prev->scx.flags & SCX_TASK_BAL_KEEP); update_curr_scx(rq); @@ -2077,7 +2156,9 @@ static int balance_scx(struct rq *rq, struct task_struct *prev, /* * If @prev is runnable & has slice left, it has priority and * fetching more just increases latency for the fetched tasks. - * Tell put_prev_task_scx() to put @prev on local_dsq. + * Tell put_prev_task_scx() to put @prev on local_dsq. If the + * BPF scheduler wants to handle this explicitly, it should + * implement ->cpu_release(). * * See scx_ops_disable_workfn() for the explanation on the * bypassing test. @@ -2297,6 +2378,20 @@ static struct task_struct *pick_next_task_scx(struct rq *rq) return p; } +static enum scx_cpu_preempt_reason +preempt_reason_from_class(const struct sched_class *class) +{ +#ifdef CONFIG_SMP + if (class == &stop_sched_class) + return SCX_CPU_PREEMPT_STOP; +#endif + if (class == &dl_sched_class) + return SCX_CPU_PREEMPT_DL; + if (class == &rt_sched_class) + return SCX_CPU_PREEMPT_RT; + return SCX_CPU_PREEMPT_UNKNOWN; +} + void scx_next_task_picked(struct rq *rq, struct task_struct *p, const struct sched_class *active) { @@ -2312,6 +2407,40 @@ void scx_next_task_picked(struct rq *rq, struct task_struct *p, */ smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1); #endif + if (!static_branch_unlikely(&scx_ops_cpu_preempt)) + return; + + /* + * The callback is conceptually meant to convey that the CPU is no + * longer under the control of SCX. Therefore, don't invoke the + * callback if the CPU is staying on SCX, or going idle (in which + * case the SCX scheduler has actively decided not to schedule any + * tasks on the CPU). + */ + if (likely(active >= &ext_sched_class)) + return; + + /* + * At this point we know that SCX was preempted by a higher priority + * sched_class, so invoke the ->cpu_release() callback if we have not + * done so already. We only send the callback once between SCX being + * preempted, and it regaining control of the CPU. + * + * ->cpu_release() complements ->cpu_acquire(), which is emitted the + * next time that balance_scx() is invoked.
+ */ + if (!rq->scx.cpu_released) { + if (SCX_HAS_OP(cpu_release)) { + struct scx_cpu_release_args args = { + .reason = preempt_reason_from_class(active), + .task = p, + }; + + SCX_CALL_OP(SCX_KF_CPU_RELEASE, + cpu_release, cpu_of(rq), &args); + } + rq->scx.cpu_released = true; + } } #ifdef CONFIG_SMP @@ -3398,6 +3527,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) static_branch_disable_cpuslocked(&scx_has_op[i]); static_branch_disable_cpuslocked(&scx_ops_enq_last); static_branch_disable_cpuslocked(&scx_ops_enq_exiting); + static_branch_disable_cpuslocked(&scx_ops_cpu_preempt); static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); synchronize_rcu(); @@ -3699,9 +3829,10 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) seq_buf_init(&ns, buf, avail); dump_newline(&ns); - dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x ops_qseq=%lu pnt_seq=%lu", + dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu pnt_seq=%lu", cpu, rq->scx.nr_running, rq->scx.flags, - rq->scx.ops_qseq, rq->scx.pnt_seq); + rq->scx.cpu_released, rq->scx.ops_qseq, + rq->scx.pnt_seq); dump_line(&ns, " curr=%s[%d] class=%ps", rq->curr->comm, rq->curr->pid, rq->curr->sched_class); @@ -3942,6 +4073,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (ops->flags & SCX_OPS_ENQ_EXITING) static_branch_enable_cpuslocked(&scx_ops_enq_exiting); + if (scx_ops.cpu_acquire || scx_ops.cpu_release) + static_branch_enable_cpuslocked(&scx_ops_cpu_preempt); if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { reset_idle_masks(); @@ -4318,6 +4451,8 @@ static bool yield_stub(struct task_struct *from, struct task_struct *to) { retur static void set_weight_stub(struct task_struct *p, u32 weight) {} static void set_cpumask_stub(struct task_struct *p, const struct cpumask *mask) {} static void update_idle_stub(s32 cpu, bool idle) {} +static void cpu_acquire_stub(s32 cpu, struct scx_cpu_acquire_args *args) {} +static void cpu_release_stub(s32 cpu, struct scx_cpu_release_args *args) {} static s32 init_task_stub(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; } static void exit_task_stub(struct task_struct *p, struct scx_exit_task_args *args) {} static void enable_stub(struct task_struct *p) {} @@ -4338,6 +4473,8 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = { .set_weight = set_weight_stub, .set_cpumask = set_cpumask_stub, .update_idle = update_idle_stub, + .cpu_acquire = cpu_acquire_stub, + .cpu_release = cpu_release_stub, .init_task = init_task_stub, .exit_task = exit_task_stub, .enable = enable_stub, @@ -4870,6 +5007,59 @@ static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { __bpf_kfunc_start_defs(); +/** + * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ + * + * Iterate over all of the tasks currently enqueued on the local DSQ of the + * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of + * processed tasks. Can only be called from ops.cpu_release(). + */ +__bpf_kfunc u32 scx_bpf_reenqueue_local(void) +{ + u32 nr_enqueued, i; + struct rq *rq; + + if (!scx_kf_allowed(SCX_KF_CPU_RELEASE)) + return 0; + + rq = cpu_rq(smp_processor_id()); + lockdep_assert_rq_held(rq); + + /* + * Get the number of tasks on the local DSQ before iterating over it to + * pull off tasks. The enqueue callback below can signal that it wants + * the task to stay on the local DSQ, and we want to prevent the BPF + * scheduler from causing us to loop indefinitely. 
+ */ + nr_enqueued = rq->scx.local_dsq.nr; + for (i = 0; i < nr_enqueued; i++) { + struct task_struct *p; + + p = first_local_task(rq); + WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != + SCX_OPSS_NONE); + WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); + WARN_ON_ONCE(p->scx.holding_cpu != -1); + dispatch_dequeue(rq, p); + do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); + } + + return nr_enqueued; +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(scx_kfunc_ids_cpu_release) +BTF_ID_FLAGS(func, scx_bpf_reenqueue_local) +BTF_KFUNCS_END(scx_kfunc_ids_cpu_release) + +static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_cpu_release, +}; + +__bpf_kfunc_start_defs(); + /** * scx_bpf_kick_cpu - Trigger reschedule on a CPU * @cpu: cpu to kick @@ -5379,6 +5569,8 @@ static int __init scx_init(void) &scx_kfunc_set_enqueue_dispatch)) || (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_dispatch)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_cpu_release)) || (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_any)) || (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 0aeb1fda1794..4ebd1c2478f1 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -24,6 +24,8 @@ DECLARE_STATIC_KEY_FALSE(__scx_switched_all); #define scx_enabled() static_branch_unlikely(&__scx_ops_enabled) #define scx_switched_all() static_branch_unlikely(&__scx_switched_all) +DECLARE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); + static inline bool task_on_scx(const struct task_struct *p) { return scx_enabled() && p->sched_class == &ext_sched_class; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 734206e13897..147d18cf01ce 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -737,6 +737,7 @@ struct scx_rq { u64 extra_enq_flags; /* see move_task_to_local_dsq() */ u32 nr_running; u32 flags; + bool cpu_released; cpumask_var_t cpus_to_kick; cpumask_var_t cpus_to_kick_if_idle; cpumask_var_t cpus_to_preempt; diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 421118bc56ff..8686f84497db 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -34,6 +34,7 @@ void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flag u32 scx_bpf_dispatch_nr_slots(void) __ksym; void scx_bpf_dispatch_cancel(void) __ksym; bool scx_bpf_consume(u64 dsq_id) __ksym; +u32 scx_bpf_reenqueue_local(void) __ksym; void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 879fc9c788e5..4a87377558c8 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -11,6 +11,8 @@ * * - BPF-side queueing using PIDs. * - Sleepable per-task storage allocation using ops.prep_enable(). + * - Using ops.cpu_release() to handle a higher priority scheduling class taking + * the CPU away. * * This scheduler is primarily for demonstration and testing of sched_ext * features and unlikely to be useful for actual workloads. 
@@ -90,7 +92,7 @@ struct { } cpu_ctx_stor SEC(".maps"); /* Statistics */ -u64 nr_enqueued, nr_dispatched, nr_dequeued; +u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued; s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) @@ -164,6 +166,22 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) return; } + /* + * If the task was re-enqueued due to the CPU being preempted by a + * higher priority scheduling class, just re-enqueue the task directly + * on the global DSQ. As we want another CPU to pick it up, find and + * kick an idle CPU. + */ + if (enq_flags & SCX_ENQ_REENQ) { + s32 cpu; + + scx_bpf_dispatch(p, SHARED_DSQ, 0, enq_flags); + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) + scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE); + return; + } + ring = bpf_map_lookup_elem(&queue_arr, &idx); + if (!ring) { scx_bpf_error("failed to find ring %d", idx); @@ -257,6 +275,22 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) } } +void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args) +{ + u32 cnt; + + /* + * Called when @cpu is taken by a higher priority scheduling class. This + * makes @cpu no longer available for executing sched_ext tasks. As we + * don't want the tasks in @cpu's local dsq to sit there until @cpu + * becomes available again, re-enqueue them into the global dsq. See + * %SCX_ENQ_REENQ handling in qmap_enqueue(). + */ + cnt = scx_bpf_reenqueue_local(); + if (cnt) + __sync_fetch_and_add(&nr_reenqueued, cnt); +} + s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p, struct scx_init_task_args *args) { @@ -339,6 +373,7 @@ SCX_OPS_DEFINE(qmap_ops, .enqueue = (void *)qmap_enqueue, .dequeue = (void *)qmap_dequeue, .dispatch = (void *)qmap_dispatch, + .cpu_release = (void *)qmap_cpu_release, .init_task = (void *)qmap_init_task, .dump = (void *)qmap_dump, .dump_cpu = (void *)qmap_dump_cpu, diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index 594147a710a8..2a97421afe9a 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -112,9 +112,9 @@ int main(int argc, char **argv) long nr_enqueued = skel->bss->nr_enqueued; long nr_dispatched = skel->bss->nr_dispatched; - printf("stats : enq=%lu dsp=%lu delta=%ld deq=%"PRIu64"\n", + printf("stats : enq=%lu dsp=%lu delta=%ld reenq=%"PRIu64" deq=%"PRIu64"\n", nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched, - skel->bss->nr_dequeued); + skel->bss->nr_reenqueued, skel->bss->nr_dequeued); fflush(stdout); sleep(1); } -- cgit v1.2.3 From 60c27fb59f6cffa73fc8c60e3a22323c78044576 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jun 2024 10:09:20 -1000 Subject: sched_ext: Implement sched_ext_ops.cpu_online/offline() Add ops.cpu_online/offline() which are invoked when CPUs come online and offline respectively. As the enqueue path already automatically bypasses tasks to the local dsq on a deactivated CPU, BPF schedulers are guaranteed to see tasks only on CPUs which are between online() and offline(). If the BPF scheduler doesn't implement ops.cpu_online/offline(), the scheduler is automatically exited with SCX_ECODE_RESTART | SCX_ECODE_RSN_HOTPLUG. Userspace can implement CPU hotplug support trivially by simply reinitializing and reloading the scheduler. scx_qmap is updated to print out online CPUs on hotplug events. Other schedulers are updated to restart based on ecode.
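As a hedged sketch of that userspace restart loop (not the literal patch; UEI_REPORT() returning the recorded exit code and the UEI_ECODE_RESTART() helper are assumed to be among the user_exit_info.h additions in this patch), reusing the variables from scx_qmap.c's main():

	__u64 ecode;

restart:
	skel = SCX_OPS_OPEN(qmap_ops, scx_qmap);
	/* ... parse options and SCX_OPS_LOAD(skel, qmap_ops, scx_qmap, uei) ... */
	link = SCX_OPS_ATTACH(skel, qmap_ops, scx_qmap);

	while (!exit_req && !UEI_EXITED(skel, uei))
		sleep(1);

	bpf_link__destroy(link);
	ecode = UEI_REPORT(skel, uei);	/* assumed to return the exit code */
	scx_qmap__destroy(skel);

	/* assumed helper testing SCX_ECODE_ACT_RESTART in the exit code */
	if (UEI_ECODE_RESTART(ecode))
		goto restart;
	return 0;

The point of the design is that the scheduler itself stays hotplug-oblivious; the kernel exits it with a machine-readable action code and userspace simply reloads.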
v3: - The previous implementation added @reason to sched_class.rq_on/offline() to distinguish between CPU hotplug events and topology updates. This was buggy and fragile as the methods are skipped if the current state equals the target state. Instead, add scx_rq_[de]activate() which are directly called from sched_cpu_de/activate(). This also allows ops.cpu_on/offline() to sleep which can be useful. - ops.dispatch() could be called on a CPU that the BPF scheduler was told to be offline. The dispatch path is updated to bypass in such cases. v2: - To accommodate lock ordering change between scx_cgroup_rwsem and cpus_read_lock(), CPU hotplug operations are put into their own SCX_OPI block and enabled earlier during scx_ops_enable() so that cpus_read_lock() can be dropped before acquiring scx_cgroup_rwsem. - Auto exit with ECODE added. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- kernel/sched/core.c | 4 + kernel/sched/ext.c | 156 +++++++++++++++++++++++++-- kernel/sched/ext.h | 4 + kernel/sched/sched.h | 6 ++ tools/sched_ext/include/scx/compat.h | 26 +++++ tools/sched_ext/include/scx/user_exit_info.h | 28 +++++ tools/sched_ext/scx_central.c | 9 +- tools/sched_ext/scx_qmap.bpf.c | 57 ++++++++++ tools/sched_ext/scx_qmap.c | 4 + tools/sched_ext/scx_simple.c | 8 +- 10 files changed, 290 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0e6ff33f34e4..c798c847d57e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7984,6 +7984,8 @@ int sched_cpu_activate(unsigned int cpu) cpuset_cpu_active(); } + scx_rq_activate(rq); + /* * Put the rq online, if not already. This happens: * @@ -8044,6 +8046,8 @@ int sched_cpu_deactivate(unsigned int cpu) } rq_unlock_irqrestore(rq, &rf); + scx_rq_deactivate(rq); + #ifdef CONFIG_SCHED_SMT /* * When going down, decrement the number of cores with SMT present. diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 686dab6ab592..7c2f2a542b32 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -30,6 +30,29 @@ enum scx_exit_kind { SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */ }; +/* + * An exit code can be specified when exiting with scx_bpf_exit() or + * scx_ops_exit(), corresponding to exit_kind UNREG_BPF and UNREG_KERN + * respectively. The codes are 64bit of the format: + * + * Bits: [63 .. 48 47 .. 32 31 .. 0] + * [ SYS ACT ] [ SYS RSN ] [ USR ] + * + * SYS ACT: System-defined exit actions + * SYS RSN: System-defined exit reasons + * USR : User-defined exit codes and reasons + * + * Using the above, users may communicate intention and context by ORing system + * actions and/or system reasons with a user-defined exit code. + */ +enum scx_exit_code { + /* Reasons */ + SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, + + /* Actions */ + SCX_ECODE_ACT_RESTART = 1LLU << 48, +}; + /* * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is * being disabled. @@ -457,7 +480,29 @@ struct sched_ext_ops { void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p); /* - * All online ops must come before ops.init(). + * All online ops must come before ops.cpu_online(). + */ + + /** + * cpu_online - A CPU became online + * @cpu: CPU which just came up + * + * @cpu just came online. @cpu will not call ops.enqueue() or + * ops.dispatch(), nor run tasks associated with other CPUs beforehand. 
+ */ + void (*cpu_online)(s32 cpu); + + /** + * cpu_offline - A CPU is going offline + * @cpu: CPU which is going offline + * + * @cpu is going offline. @cpu will not call ops.enqueue() or + * ops.dispatch(), nor run tasks associated with other CPUs afterwards. + */ + void (*cpu_offline)(s32 cpu); + + /* + * All CPU hotplug ops must come before ops.init(). */ /** @@ -496,6 +541,15 @@ struct sched_ext_ops { */ u32 exit_dump_len; + /** + * hotplug_seq - A sequence number that may be set by the scheduler to + * detect when a hotplug event has occurred during the loading process. + * If 0, no detection occurs. Otherwise, the scheduler will fail to + * load if the sequence number does not match @scx_hotplug_seq on the + * enable path. + */ + u64 hotplug_seq; + /** * name - BPF scheduler's name * @@ -509,7 +563,9 @@ struct sched_ext_ops { enum scx_opi { SCX_OPI_BEGIN = 0, SCX_OPI_NORMAL_BEGIN = 0, - SCX_OPI_NORMAL_END = SCX_OP_IDX(init), + SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online), + SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online), + SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init), SCX_OPI_END = SCX_OP_IDX(init), }; @@ -694,6 +750,7 @@ static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE); static struct scx_exit_info *scx_exit_info; static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); +static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0); /* * The maximum amount of time in jiffies that a task may be runnable without @@ -1419,11 +1476,7 @@ static void direct_dispatch(struct task_struct *p, u64 enq_flags) static bool scx_rq_online(struct rq *rq) { -#ifdef CONFIG_SMP - return likely(rq->online); -#else - return true; -#endif + return likely(rq->scx.flags & SCX_RQ_ONLINE); } static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, @@ -1438,6 +1491,11 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, if (sticky_cpu == cpu_of(rq)) goto local_norefill; + /* + * If !scx_rq_online(), we already told the BPF scheduler that the CPU + * is offline and are just running the hotplug path. Don't bother the + * BPF scheduler. + */ if (!scx_rq_online(rq)) goto local; @@ -2673,6 +2731,42 @@ void __scx_update_idle(struct rq *rq, bool idle) #endif } +static void handle_hotplug(struct rq *rq, bool online) +{ + int cpu = cpu_of(rq); + + atomic_long_inc(&scx_hotplug_seq); + + if (online && SCX_HAS_OP(cpu_online)) + SCX_CALL_OP(SCX_KF_SLEEPABLE, cpu_online, cpu); + else if (!online && SCX_HAS_OP(cpu_offline)) + SCX_CALL_OP(SCX_KF_SLEEPABLE, cpu_offline, cpu); + else + scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, + "cpu %d going %s, exiting scheduler", cpu, + online ? 
"online" : "offline"); +} + +void scx_rq_activate(struct rq *rq) +{ + handle_hotplug(rq, true); +} + +void scx_rq_deactivate(struct rq *rq) +{ + handle_hotplug(rq, false); +} + +static void rq_online_scx(struct rq *rq) +{ + rq->scx.flags |= SCX_RQ_ONLINE; +} + +static void rq_offline_scx(struct rq *rq) +{ + rq->scx.flags &= ~SCX_RQ_ONLINE; +} + #else /* CONFIG_SMP */ static bool test_and_clear_cpu_idle(int cpu) { return false; } @@ -3104,6 +3198,9 @@ DEFINE_SCHED_CLASS(ext) = { .balance = balance_scx, .select_task_rq = select_task_rq_scx, .set_cpus_allowed = set_cpus_allowed_scx, + + .rq_online = rq_online_scx, + .rq_offline = rq_offline_scx, #endif .task_tick = task_tick_scx, @@ -3235,10 +3332,18 @@ static ssize_t scx_attr_nr_rejected_show(struct kobject *kobj, } SCX_ATTR(nr_rejected); +static ssize_t scx_attr_hotplug_seq_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_hotplug_seq)); +} +SCX_ATTR(hotplug_seq); + static struct attribute *scx_global_attrs[] = { &scx_attr_state.attr, &scx_attr_switch_all.attr, &scx_attr_nr_rejected.attr, + &scx_attr_hotplug_seq.attr, NULL, }; @@ -3941,6 +4046,25 @@ static struct kthread_worker *scx_create_rt_helper(const char *name) return helper; } +static void check_hotplug_seq(const struct sched_ext_ops *ops) +{ + unsigned long long global_hotplug_seq; + + /* + * If a hotplug event has occurred between when a scheduler was + * initialized, and when we were able to attach, exit and notify user + * space about it. + */ + if (ops->hotplug_seq) { + global_hotplug_seq = atomic_long_read(&scx_hotplug_seq); + if (ops->hotplug_seq != global_hotplug_seq) { + scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, + "expected hotplug seq %llu did not match actual %llu", + ops->hotplug_seq, global_hotplug_seq); + } + } +} + static int validate_ops(const struct sched_ext_ops *ops) { /* @@ -4023,6 +4147,10 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) } } + for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++) + if (((void (**)(void))ops)[i]) + static_branch_enable_cpuslocked(&scx_has_op[i]); + cpus_read_unlock(); ret = validate_ops(ops); @@ -4064,6 +4192,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) percpu_down_write(&scx_fork_rwsem); cpus_read_lock(); + check_hotplug_seq(ops); + for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) if (((void (**)(void))ops)[i]) static_branch_enable_cpuslocked(&scx_has_op[i]); @@ -4374,6 +4504,9 @@ static int bpf_scx_init_member(const struct btf_type *t, ops->exit_dump_len = *(u32 *)(udata + moff) ?: SCX_EXIT_DUMP_DFL_LEN; return 1; + case offsetof(struct sched_ext_ops, hotplug_seq): + ops->hotplug_seq = *(u64 *)(udata + moff); + return 1; } return 0; @@ -4387,6 +4520,8 @@ static int bpf_scx_check_member(const struct btf_type *t, switch (moff) { case offsetof(struct sched_ext_ops, init_task): + case offsetof(struct sched_ext_ops, cpu_online): + case offsetof(struct sched_ext_ops, cpu_offline): case offsetof(struct sched_ext_ops, init): case offsetof(struct sched_ext_ops, exit): break; @@ -4457,6 +4592,8 @@ static s32 init_task_stub(struct task_struct *p, struct scx_init_task_args *args static void exit_task_stub(struct task_struct *p, struct scx_exit_task_args *args) {} static void enable_stub(struct task_struct *p) {} static void disable_stub(struct task_struct *p) {} +static void cpu_online_stub(s32 cpu) {} +static void cpu_offline_stub(s32 cpu) {} static s32 
init_stub(void) { return -EINVAL; } static void exit_stub(struct scx_exit_info *info) {} @@ -4479,6 +4616,8 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = { .exit_task = exit_task_stub, .enable = enable_stub, .disable = disable_stub, + .cpu_online = cpu_online_stub, + .cpu_offline = cpu_offline_stub, .init = init_stub, .exit = exit_stub, }; @@ -4719,6 +4858,9 @@ void __init init_sched_ext_class(void) BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL)); init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); + + if (cpu_online(cpu)) + cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE; } register_sysrq_key('S', &sysrq_sched_ext_reset_op); diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 4ebd1c2478f1..037f9acdf443 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -40,6 +40,8 @@ int scx_fork(struct task_struct *p); void scx_post_fork(struct task_struct *p); void scx_cancel_fork(struct task_struct *p); bool scx_can_stop_tick(struct rq *rq); +void scx_rq_activate(struct rq *rq); +void scx_rq_deactivate(struct rq *rq); int scx_check_setscheduler(struct task_struct *p, int policy); bool task_should_scx(struct task_struct *p); void init_sched_ext_class(void); @@ -81,6 +83,8 @@ static inline int scx_fork(struct task_struct *p) { return 0; } static inline void scx_post_fork(struct task_struct *p) {} static inline void scx_cancel_fork(struct task_struct *p) {} static inline bool scx_can_stop_tick(struct rq *rq) { return true; } +static inline void scx_rq_activate(struct rq *rq) {} +static inline void scx_rq_deactivate(struct rq *rq) {} static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; } static inline bool task_on_scx(const struct task_struct *p) { return false; } static inline void init_sched_ext_class(void) {} diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 147d18cf01ce..c0d6e42c99cc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -726,6 +726,12 @@ struct cfs_rq { #ifdef CONFIG_SCHED_CLASS_EXT /* scx_rq->flags, protected by the rq lock */ enum scx_rq_flags { + /* + * A hotplugged CPU starts scheduling before rq_online_scx(). Track + * ops.cpu_on/offline() state so that ops.enqueue/dispatch() are called + * only while the BPF scheduler considers the CPU to be online. + */ + SCX_RQ_ONLINE = 1 << 0, SCX_RQ_BALANCING = 1 << 1, SCX_RQ_CAN_STOP_TICK = 1 << 2, }; diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h index c58024c980c8..cc56ff9aa252 100644 --- a/tools/sched_ext/include/scx/compat.h +++ b/tools/sched_ext/include/scx/compat.h @@ -8,6 +8,9 @@ #define __SCX_COMPAT_H #include +#include +#include +#include struct btf *__COMPAT_vmlinux_btf __attribute__((weak)); @@ -106,6 +109,28 @@ static inline bool __COMPAT_struct_has_field(const char *type, const char *field #define SCX_OPS_SWITCH_PARTIAL \ __COMPAT_ENUM_OR_ZERO("scx_ops_flags", "SCX_OPS_SWITCH_PARTIAL") +static inline long scx_hotplug_seq(void) +{ + int fd; + char buf[32]; + ssize_t len; + long val; + + fd = open("/sys/kernel/sched_ext/hotplug_seq", O_RDONLY); + if (fd < 0) + return -ENOENT; + + len = read(fd, buf, sizeof(buf) - 1); + SCX_BUG_ON(len <= 0, "read failed (%ld)", len); + buf[len] = 0; + close(fd); + + val = strtoul(buf, NULL, 10); + SCX_BUG_ON(val < 0, "invalid num hotplug events: %lu", val); + + return val; +} + /* * struct sched_ext_ops can change over time. 
If compat.bpf.h::SCX_OPS_DEFINE() * is used to define ops and compat.h::SCX_OPS_LOAD/ATTACH() are used to load @@ -123,6 +148,7 @@ static inline bool __COMPAT_struct_has_field(const char *type, const char *field \ __skel = __scx_name##__open(); \ SCX_BUG_ON(!__skel, "Could not open " #__scx_name); \ + __skel->struct_ops.__ops_name->hotplug_seq = scx_hotplug_seq(); \ __skel; \ }) diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h index c2ef85c645e1..891693ee604e 100644 --- a/tools/sched_ext/include/scx/user_exit_info.h +++ b/tools/sched_ext/include/scx/user_exit_info.h @@ -77,7 +77,35 @@ struct user_exit_info { if (__uei->msg[0] != '\0') \ fprintf(stderr, " (%s)", __uei->msg); \ fputs("\n", stderr); \ + __uei->exit_code; \ }) +/* + * We can't import vmlinux.h while compiling user C code. Let's duplicate + * scx_exit_code definition. + */ +enum scx_exit_code { + /* Reasons */ + SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, + + /* Actions */ + SCX_ECODE_ACT_RESTART = 1LLU << 48, +}; + +enum uei_ecode_mask { + UEI_ECODE_USER_MASK = ((1LLU << 32) - 1), + UEI_ECODE_SYS_RSN_MASK = ((1LLU << 16) - 1) << 32, + UEI_ECODE_SYS_ACT_MASK = ((1LLU << 16) - 1) << 48, +}; + +/* + * These macros interpret the ecode returned from UEI_REPORT(). + */ +#define UEI_ECODE_USER(__ecode) ((__ecode) & UEI_ECODE_USER_MASK) +#define UEI_ECODE_SYS_RSN(__ecode) ((__ecode) & UEI_ECODE_SYS_RSN_MASK) +#define UEI_ECODE_SYS_ACT(__ecode) ((__ecode) & UEI_ECODE_SYS_ACT_MASK) + +#define UEI_ECODE_RESTART(__ecode) (UEI_ECODE_SYS_ACT((__ecode)) == SCX_ECODE_ACT_RESTART) + #endif /* __bpf__ */ #endif /* __USER_EXIT_INFO_H */ diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c index fb3f50886552..21deea320bd7 100644 --- a/tools/sched_ext/scx_central.c +++ b/tools/sched_ext/scx_central.c @@ -46,14 +46,14 @@ int main(int argc, char **argv) { struct scx_central *skel; struct bpf_link *link; - __u64 seq = 0; + __u64 seq = 0, ecode; __s32 opt; cpu_set_t *cpuset; libbpf_set_print(libbpf_print_fn); signal(SIGINT, sigint_handler); signal(SIGTERM, sigint_handler); - +restart: skel = SCX_OPS_OPEN(central_ops, scx_central); skel->rodata->central_cpu = 0; @@ -126,7 +126,10 @@ int main(int argc, char **argv) } bpf_link__destroy(link); - UEI_REPORT(skel, uei); + ecode = UEI_REPORT(skel, uei); scx_central__destroy(skel); + + if (UEI_ECODE_RESTART(ecode)) + goto restart; return 0; } diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 4a87377558c8..619078355bf5 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -358,8 +358,63 @@ void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struc taskc->force_local); } +/* + * Print out the online and possible CPU map using bpf_printk() as a + * demonstration of using the cpumask kfuncs and ops.cpu_on/offline(). 
+ */ +static void print_cpus(void) +{ + const struct cpumask *possible, *online; + s32 cpu; + char buf[128] = "", *p; + int idx; + + possible = scx_bpf_get_possible_cpumask(); + online = scx_bpf_get_online_cpumask(); + + idx = 0; + bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) { + if (!(p = MEMBER_VPTR(buf, [idx++]))) + break; + if (bpf_cpumask_test_cpu(cpu, online)) + *p++ = 'O'; + else if (bpf_cpumask_test_cpu(cpu, possible)) + *p++ = 'X'; + else + *p++ = ' '; + + if ((cpu & 7) == 7) { + if (!(p = MEMBER_VPTR(buf, [idx++]))) + break; + *p++ = '|'; + } + } + buf[sizeof(buf) - 1] = '\0'; + + scx_bpf_put_cpumask(online); + scx_bpf_put_cpumask(possible); + + bpf_printk("CPUS: |%s", buf); +} + +void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu) +{ + bpf_printk("CPU %d coming online", cpu); + /* @cpu is already online at this point */ + print_cpus(); +} + +void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu) +{ + bpf_printk("CPU %d going offline", cpu); + /* @cpu is still online at this point */ + print_cpus(); +} + s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init) { + print_cpus(); + return scx_bpf_create_dsq(SHARED_DSQ, -1); } @@ -378,6 +433,8 @@ SCX_OPS_DEFINE(qmap_ops, .dump = (void *)qmap_dump, .dump_cpu = (void *)qmap_dump_cpu, .dump_task = (void *)qmap_dump_task, + .cpu_online = (void *)qmap_cpu_online, + .cpu_offline = (void *)qmap_cpu_offline, .init = (void *)qmap_init, .exit = (void *)qmap_exit, .timeout_ms = 5000U, diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index 2a97421afe9a..920fb54f9c77 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -122,5 +122,9 @@ int main(int argc, char **argv) bpf_link__destroy(link); UEI_REPORT(skel, uei); scx_qmap__destroy(skel); + /* + * scx_qmap implements ops.cpu_on/offline() and doesn't need to restart + * on CPU hotplug events. + */ return 0; } diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c index 7f500d1d56ac..bead482e1383 100644 --- a/tools/sched_ext/scx_simple.c +++ b/tools/sched_ext/scx_simple.c @@ -62,11 +62,12 @@ int main(int argc, char **argv) struct scx_simple *skel; struct bpf_link *link; __u32 opt; + __u64 ecode; libbpf_set_print(libbpf_print_fn); signal(SIGINT, sigint_handler); signal(SIGTERM, sigint_handler); - +restart: skel = SCX_OPS_OPEN(simple_ops, scx_simple); while ((opt = getopt(argc, argv, "vh")) != -1) { @@ -93,7 +94,10 @@ int main(int argc, char **argv) } bpf_link__destroy(link); - UEI_REPORT(skel, uei); + ecode = UEI_REPORT(skel, uei); scx_simple__destroy(skel); + + if (UEI_ECODE_RESTART(ecode)) + goto restart; return 0; } -- cgit v1.2.3 From 7b0888b7cc1924b74ce660e02f6211df8dd46e7b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jun 2024 10:09:20 -1000 Subject: sched_ext: Implement core-sched support The core-sched support is composed of the following parts: - task_struct->scx.core_sched_at is added. This is a timestamp which can be used to order tasks. Depending on whether the BPF scheduler implements custom ordering, it tracks either global FIFO ordering of all tasks or local-DSQ ordering within the dispatched tasks on a CPU. - prio_less() is updated to call scx_prio_less() when comparing SCX tasks. scx_prio_less() calls ops.core_sched_before() if available or uses the core_sched_at timestamp. For global FIFO ordering, the BPF scheduler doesn't need to do anything. Otherwise, it should implement ops.core_sched_before() which reflects the ordering. 
- When core-sched is enabled, balance_scx() balances all SMT siblings so that they all have tasks dispatched if necessary before pick_task_scx() is called. pick_task_scx() picks between the current task and the first dispatched task on the local DSQ based on availability and the core_sched_at timestamps. Note that FIFO ordering is expected among the already dispatched tasks whether running or on the local DSQ, so this path always compares core_sched_at instead of calling into ops.core_sched_before(). qmap_core_sched_before() is added to scx_qmap. It scales the distances from the heads of the queues to compare the tasks across different priority queues and seems to behave as expected. v3: Fixed build error when !CONFIG_SCHED_SMT reported by Andrea Righi. v2: Sched core added the const qualifiers to prio_less task arguments. Explicitly drop them for ops.core_sched_before() task arguments. BPF enforces access control through the verifier, so the qualifier isn't actually operative and only gets in the way when interacting with various helpers. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Reviewed-by: Josh Don Cc: Andrea Righi --- include/linux/sched/ext.h | 3 + kernel/Kconfig.preempt | 2 +- kernel/sched/core.c | 10 +- kernel/sched/ext.c | 250 +++++++++++++++++++++++++++++++++++++++-- kernel/sched/ext.h | 5 + tools/sched_ext/scx_qmap.bpf.c | 91 ++++++++++++++- tools/sched_ext/scx_qmap.c | 5 +- 7 files changed, 346 insertions(+), 20 deletions(-) (limited to 'tools') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 21c627337e01..3db7b32b2d1d 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -129,6 +129,9 @@ struct sched_ext_entity { struct list_head runnable_node; /* rq->scx.runnable_list */ unsigned long runnable_at; +#ifdef CONFIG_SCHED_CORE + u64 core_sched_at; /* see scx_prio_less() */ +#endif u64 ddsp_dsq_id; u64 ddsp_enq_flags; diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 39ecfc2b5a1c..7dde5e424ac3 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -135,7 +135,7 @@ config SCHED_CORE config SCHED_CLASS_EXT bool "Extensible Scheduling Class" - depends on BPF_SYSCALL && BPF_JIT && !SCHED_CORE + depends on BPF_SYSCALL && BPF_JIT help This option enables a new scheduler class sched_ext (SCX), which allows scheduling policies to be implemented as BPF programs to diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c798c847d57e..5eec6639773b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -169,7 +169,10 @@ static inline int __task_prio(const struct task_struct *p) if (p->sched_class == &idle_sched_class) return MAX_RT_PRIO + NICE_WIDTH; /* 140 */ - return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */ + if (task_on_scx(p)) + return MAX_RT_PRIO + MAX_NICE + 1; /* 120, squash ext */ + + return MAX_RT_PRIO + MAX_NICE; /* 119, squash fair */ } /* @@ -198,6 +201,11 @@ static inline bool prio_less(const struct task_struct *a, if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */ return cfs_prio_less(a, b, in_fi); +#ifdef CONFIG_SCHED_CLASS_EXT + if (pa == MAX_RT_PRIO + MAX_NICE + 1) /* ext */ + return scx_prio_less(a, b, in_fi); +#endif + return false; } diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 26616cd0c5df..1feb690be9d8 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -344,6 +344,24 @@ struct sched_ext_ops { */ bool (*yield)(struct task_struct *from, struct task_struct *to); + /** + * core_sched_before - Task ordering for core-sched + * @a: task A + * @b: task B + * + * 
Used by core-sched to determine the ordering between two tasks. See + * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on + * core-sched. + * + * Both @a and @b are runnable and may or may not currently be queued on + * the BPF scheduler. Should return %true if @a should run before @b. + * %false if there's no required ordering or @b should run before @a. + * + * If not specified, the default is ordering them according to when they + * became runnable. + */ + bool (*core_sched_before)(struct task_struct *a, struct task_struct *b); + /** * set_weight - Set task weight * @p: task to set weight for @@ -625,6 +643,14 @@ enum scx_enq_flags { enum scx_deq_flags { /* expose select DEQUEUE_* flags as enums */ SCX_DEQ_SLEEP = DEQUEUE_SLEEP, + + /* high 32bits are SCX specific */ + + /* + * The generic core-sched layer decided to execute the task even though + * it hasn't been dispatched yet. Dequeue from the BPF side. + */ + SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32, }; enum scx_pick_idle_cpu_flags { @@ -1260,6 +1286,49 @@ static int ops_sanitize_err(const char *ops_name, s32 err) return -EPROTO; } +/** + * touch_core_sched - Update timestamp used for core-sched task ordering + * @rq: rq to read clock from, must be locked + * @p: task to update the timestamp for + * + * Update @p->scx.core_sched_at timestamp. This is used by scx_prio_less() to + * implement global or local-DSQ FIFO ordering for core-sched. Should be called + * when a task becomes runnable and its turn on the CPU ends (e.g. slice + * exhaustion). + */ +static void touch_core_sched(struct rq *rq, struct task_struct *p) +{ +#ifdef CONFIG_SCHED_CORE + /* + * It's okay to update the timestamp spuriously. Use + * sched_core_disabled() which is cheaper than enabled(). + */ + if (!sched_core_disabled()) + p->scx.core_sched_at = rq_clock_task(rq); +#endif +} + +/** + * touch_core_sched_dispatch - Update core-sched timestamp on dispatch + * @rq: rq to read clock from, must be locked + * @p: task being dispatched + * + * If the BPF scheduler implements custom core-sched ordering via + * ops.core_sched_before(), @p->scx.core_sched_at is used to implement FIFO + * ordering within each local DSQ. This function is called from dispatch paths + * and updates @p->scx.core_sched_at if custom core-sched ordering is in effect. + */ +static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p) +{ + lockdep_assert_rq_held(rq); + assert_clock_updated(rq); + +#ifdef CONFIG_SCHED_CORE + if (SCX_HAS_OP(core_sched_before)) + touch_core_sched(rq, p); +#endif +} + static void update_curr_scx(struct rq *rq) { struct task_struct *curr = rq->curr; @@ -1275,8 +1344,11 @@ static void update_curr_scx(struct rq *rq) account_group_exec_runtime(curr, delta_exec); cgroup_account_cputime(curr, delta_exec); - if (curr->scx.slice != SCX_SLICE_INF) + if (curr->scx.slice != SCX_SLICE_INF) { curr->scx.slice -= min(curr->scx.slice, delta_exec); + if (!curr->scx.slice) + touch_core_sched(rq, curr); + } } static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta) @@ -1469,6 +1541,8 @@ static void direct_dispatch(struct task_struct *p, u64 enq_flags) { struct scx_dispatch_q *dsq; + touch_core_sched_dispatch(task_rq(p), p); + enq_flags |= (p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); dsq = find_dsq_for_dispatch(task_rq(p), p->scx.ddsp_dsq_id, p); dispatch_enqueue(dsq, p, enq_flags); @@ -1550,12 +1624,19 @@ direct: return; local: + /* + * For task-ordering, slice refill must be treated as implying the end + * of the current slice. 
Otherwise, the longer @p stays on the CPU, the + * higher priority it becomes from scx_prio_less()'s POV. + */ + touch_core_sched(rq, p); p->scx.slice = SCX_SLICE_DFL; local_norefill: dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags); return; global: + touch_core_sched(rq, p); /* see the comment in local: */ p->scx.slice = SCX_SLICE_DFL; dispatch_enqueue(&scx_dsq_global, p, enq_flags); } @@ -1619,6 +1700,9 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags if (SCX_HAS_OP(runnable)) SCX_CALL_OP_TASK(SCX_KF_REST, runnable, p, enq_flags); + if (enq_flags & SCX_ENQ_WAKEUP) + touch_core_sched(rq, p); + do_enqueue_task(rq, p, enq_flags, sticky_cpu); } @@ -2106,6 +2190,7 @@ static void finish_dispatch(struct rq *rq, struct rq_flags *rf, struct scx_dispatch_q *dsq; unsigned long opss; + touch_core_sched_dispatch(rq, p); retry: /* * No need for _acquire here. @p is accessed only after a successful @@ -2183,8 +2268,8 @@ static void flush_dispatch_buf(struct rq *rq, struct rq_flags *rf) dspc->cursor = 0; } -static int balance_scx(struct rq *rq, struct task_struct *prev, - struct rq_flags *rf) +static int balance_one(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf, bool local) { struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); bool prev_on_scx = prev->sched_class == &ext_sched_class; @@ -2208,7 +2293,7 @@ static int balance_scx(struct rq *rq, struct task_struct *prev, } if (prev_on_scx) { - WARN_ON_ONCE(prev->scx.flags & SCX_TASK_BAL_KEEP); + WARN_ON_ONCE(local && (prev->scx.flags & SCX_TASK_BAL_KEEP)); update_curr_scx(rq); /* @@ -2220,10 +2305,16 @@ static int balance_scx(struct rq *rq, struct task_struct *prev, * * See scx_ops_disable_workfn() for the explanation on the * bypassing test. + * + * When balancing a remote CPU for core-sched, there won't be a + * following put_prev_task_scx() call and we don't own + * %SCX_TASK_BAL_KEEP. Instead, pick_task_scx() will test the + * same conditions later and pick @rq->curr accordingly. */ if ((prev->scx.flags & SCX_TASK_QUEUED) && prev->scx.slice && !scx_ops_bypassing()) { - prev->scx.flags |= SCX_TASK_BAL_KEEP; + if (local) + prev->scx.flags |= SCX_TASK_BAL_KEEP; goto has_tasks; } } @@ -2285,10 +2376,56 @@ out: return has_tasks; } +static int balance_scx(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf) +{ + int ret; + + ret = balance_one(rq, prev, rf, true); + +#ifdef CONFIG_SCHED_SMT + /* + * When core-sched is enabled, this ops.balance() call will be followed + * by put_prev_scx() and pick_task_scx() on this CPU and pick_task_scx() + * on the SMT siblings. Balance the siblings too. + */ + if (sched_core_enabled(rq)) { + const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq)); + int scpu; + + for_each_cpu_andnot(scpu, smt_mask, cpumask_of(cpu_of(rq))) { + struct rq *srq = cpu_rq(scpu); + struct rq_flags srf; + struct task_struct *sprev = srq->curr; + + /* + * While core-scheduling, rq lock is shared among + * siblings but the debug annotations and rq clock + * aren't. Do pinning dance to transfer the ownership. 
+ */ + WARN_ON_ONCE(__rq_lockp(rq) != __rq_lockp(srq)); + rq_unpin_lock(rq, rf); + rq_pin_lock(srq, &srf); + + update_rq_clock(srq); + balance_one(srq, sprev, &srf, false); + + rq_unpin_lock(srq, &srf); + rq_repin_lock(rq, rf); + } + } +#endif + return ret; +} + static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) { if (p->scx.flags & SCX_TASK_QUEUED) { - WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); + /* + * Core-sched might decide to execute @p before it is + * dispatched. Call ops_dequeue() to notify the BPF scheduler. + */ + ops_dequeue(p, SCX_DEQ_CORE_SCHED_EXEC); dispatch_dequeue(rq, p); } @@ -2379,7 +2516,8 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p) /* * If @p has slice left and balance_scx() didn't tag it for * keeping, @p is getting preempted by a higher priority - * scheduler class. Leave it at the head of the local DSQ. + * scheduler class or core-sched forcing a different task. Leave + * it at the head of the local DSQ. */ if (p->scx.slice && !scx_ops_bypassing()) { dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); @@ -2436,6 +2574,84 @@ static struct task_struct *pick_next_task_scx(struct rq *rq) return p; } +#ifdef CONFIG_SCHED_CORE +/** + * scx_prio_less - Task ordering for core-sched + * @a: task A + * @b: task B + * + * Core-sched is implemented as an additional scheduling layer on top of the + * usual sched_class'es and needs to find out the expected task ordering. For + * SCX, core-sched calls this function to interrogate the task ordering. + * + * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used + * to implement the default task ordering. The older the timestamp, the higher + * priority the task - the global FIFO ordering matching the default scheduling + * behavior. + * + * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to + * implement FIFO ordering within each local DSQ. See pick_task_scx(). + */ +bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, + bool in_fi) +{ + /* + * The const qualifiers are dropped from task_struct pointers when + * calling ops.core_sched_before(). Accesses are controlled by the + * verifier. + */ + if (SCX_HAS_OP(core_sched_before) && !scx_ops_bypassing()) + return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before, + (struct task_struct *)a, + (struct task_struct *)b); + else + return time_after64(a->scx.core_sched_at, b->scx.core_sched_at); +} + +/** + * pick_task_scx - Pick a candidate task for core-sched + * @rq: rq to pick the candidate task from + * + * Core-sched calls this function on each SMT sibling to determine the next + * tasks to run on the SMT siblings. balance_one() has been called on all + * siblings and put_prev_task_scx() has been called only for the current CPU. + * + * As put_prev_task_scx() hasn't been called on remote CPUs, we can't just look + * at the first task in the local dsq. @rq->curr has to be considered explicitly + * to mimic %SCX_TASK_BAL_KEEP. + */ +static struct task_struct *pick_task_scx(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + struct task_struct *first = first_local_task(rq); + + if (curr->scx.flags & SCX_TASK_QUEUED) { + /* is curr the only runnable task? */ + if (!first) + return curr; + + /* + * Does curr trump first? We can always go by core_sched_at for + * this comparison as it represents global FIFO ordering when + * the default core-sched ordering is used and local-DSQ FIFO + * ordering otherwise. 
+ * + * We can have a task with an earlier timestamp on the DSQ. For + * example, when a current task is preempted by a sibling + * picking a different cookie, the task would be requeued at the + * head of the local DSQ with an earlier timestamp than the + * core-sched picked next task. Besides, the BPF scheduler may + * dispatch any tasks to the local DSQ anytime. + */ + if (curr->scx.slice && time_before64(curr->scx.core_sched_at, + first->scx.core_sched_at)) + return curr; + } + + return first; /* this may be %NULL */ +} +#endif /* CONFIG_SCHED_CORE */ + static enum scx_cpu_preempt_reason preempt_reason_from_class(const struct sched_class *class) { @@ -2843,13 +3059,15 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) update_curr_scx(rq); /* - * While bypassing, always resched as we can't trust the slice - * management. + * While disabling, always resched and refresh core-sched timestamp as + * we can't trust the slice management or ops.core_sched_before(). */ - if (scx_ops_bypassing()) + if (scx_ops_bypassing()) { curr->scx.slice = 0; - else if (SCX_HAS_OP(tick)) + touch_core_sched(rq, curr); + } else if (SCX_HAS_OP(tick)) { SCX_CALL_OP(SCX_KF_REST, tick, curr); + } if (!curr->scx.slice) resched_curr(rq); @@ -3203,6 +3421,10 @@ DEFINE_SCHED_CLASS(ext) = { .rq_offline = rq_offline_scx, #endif +#ifdef CONFIG_SCHED_CORE + .pick_task = pick_task_scx, +#endif + .task_tick = task_tick_scx, .switching_to = switching_to_scx, @@ -3416,12 +3638,14 @@ bool task_should_scx(struct task_struct *p) * * c. balance_scx() never sets %SCX_TASK_BAL_KEEP as the slice value can't be * trusted. Whenever a tick triggers, the running task is rotated to the tail - * of the queue. + * of the queue with core_sched_at touched. * * d. pick_next_task() suppresses zero slice warning. * * e. scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM * operations. + * + * f. scx_prio_less() reverts to the default core_sched_at order. */ static void scx_ops_bypass(bool bypass) { @@ -4583,6 +4807,7 @@ static void running_stub(struct task_struct *p) {} static void stopping_stub(struct task_struct *p, bool runnable) {} static void quiescent_stub(struct task_struct *p, u64 deq_flags) {} static bool yield_stub(struct task_struct *from, struct task_struct *to) { return false; } +static bool core_sched_before_stub(struct task_struct *a, struct task_struct *b) { return false; } static void set_weight_stub(struct task_struct *p, u32 weight) {} static void set_cpumask_stub(struct task_struct *p, const struct cpumask *mask) {} static void update_idle_stub(s32 cpu, bool idle) {} @@ -4607,6 +4832,7 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = { .stopping = stopping_stub, .quiescent = quiescent_stub, .yield = yield_stub, + .core_sched_before = core_sched_before_stub, .set_weight = set_weight_stub, .set_cpumask = set_cpumask_stub, .update_idle = update_idle_stub, diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 037f9acdf443..6555878c5da3 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -70,6 +70,11 @@ static inline const struct sched_class *next_active_class(const struct sched_cla for_active_class_range(class, (prev_class) > &ext_sched_class ? 
&ext_sched_class : (prev_class), (end_class)) +#ifdef CONFIG_SCHED_CORE +bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, + bool in_fi); +#endif + #else /* CONFIG_SCHED_CLASS_EXT */ #define scx_enabled() false diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 619078355bf5..c75c70d6a8eb 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -13,6 +13,7 @@ * - Sleepable per-task storage allocation using ops.prep_enable(). * - Using ops.cpu_release() to handle a higher priority scheduling class taking * the CPU away. + * - Core-sched support. * * This scheduler is primarily for demonstration and testing of sched_ext * features and unlikely to be useful for actual workloads. @@ -67,9 +68,21 @@ struct { }, }; +/* + * Per-queue sequence numbers to implement core-sched ordering. + * + * Tail seq is assigned to each queued task and incremented. Head seq tracks the + * sequence number of the latest dispatched task. The distance between a + * task's seq and the associated queue's head seq is called the queue distance + * and used when comparing two tasks for ordering. See qmap_core_sched_before(). + */ +static u64 core_sched_head_seqs[5]; +static u64 core_sched_tail_seqs[5]; + /* Per-task scheduling context */ struct task_ctx { bool force_local; /* Dispatch directly to local_dsq */ + u64 core_sched_seq; }; struct { @@ -93,6 +106,7 @@ struct { /* Statistics */ u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued; +u64 nr_core_sched_execed; s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) @@ -159,8 +173,18 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) return; } - /* Is select_cpu() is telling us to enqueue locally? */ - if (tctx->force_local) { + /* + * All enqueued tasks must have their core_sched_seq updated for correct + * core-sched ordering, which is why %SCX_OPS_ENQ_LAST is specified in + * qmap_ops.flags. + */ + tctx->core_sched_seq = core_sched_tail_seqs[idx]++; + + /* + * If qmap_select_cpu() is telling us to or this is the last runnable + * task on the CPU, enqueue locally. + */ + if (tctx->force_local || (enq_flags & SCX_ENQ_LAST)) { tctx->force_local = false; scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags); return; @@ -204,6 +228,19 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags) { __sync_fetch_and_add(&nr_dequeued, 1); + if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC) + __sync_fetch_and_add(&nr_core_sched_execed, 1); +} + +static void update_core_sched_head_seq(struct task_struct *p) +{ + struct task_ctx *tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + int idx = weight_to_idx(p->scx.weight); + + if (tctx) + core_sched_head_seqs[idx] = tctx->core_sched_seq; + else + scx_bpf_error("task_ctx lookup failed"); } void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) @@ -258,6 +295,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) if (!p) continue; + update_core_sched_head_seq(p); __sync_fetch_and_add(&nr_dispatched, 1); scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0); bpf_task_release(p); @@ -275,6 +313,49 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) } } +/* + * The distance from the head of the queue scaled by the weight of the queue. + * The lower the number, the older the task and the higher the priority. 
+ */ +static s64 task_qdist(struct task_struct *p) +{ + int idx = weight_to_idx(p->scx.weight); + struct task_ctx *tctx; + s64 qdist; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("task_ctx lookup failed"); + return 0; + } + + qdist = tctx->core_sched_seq - core_sched_head_seqs[idx]; + + /* + * As queue index increments, the priority doubles. The queue w/ index 3 + * is dispatched twice more frequently than 2. Reflect the difference by + * scaling qdists accordingly. Note that the shift amount needs to be + * flipped depending on the sign to avoid flipping priority direction. + */ + if (qdist >= 0) + return qdist << (4 - idx); + else + return qdist << idx; +} + +/* + * This is called to determine the task ordering when core-sched is picking + * tasks to execute on SMT siblings and should encode about the same ordering as + * the regular scheduling path. Use the priority-scaled distances from the head + * of the queues to compare the two tasks which should be consistent with the + * dispatch path behavior. + */ +bool BPF_STRUCT_OPS(qmap_core_sched_before, + struct task_struct *a, struct task_struct *b) +{ + return task_qdist(a) > task_qdist(b); +} + void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args) { u32 cnt; @@ -354,8 +435,8 @@ void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struc if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) return; - scx_bpf_dump("QMAP: force_local=%d", - taskc->force_local); + scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu", + taskc->force_local, taskc->core_sched_seq); } /* @@ -428,6 +509,7 @@ SCX_OPS_DEFINE(qmap_ops, .enqueue = (void *)qmap_enqueue, .dequeue = (void *)qmap_dequeue, .dispatch = (void *)qmap_dispatch, + .core_sched_before = (void *)qmap_core_sched_before, .cpu_release = (void *)qmap_cpu_release, .init_task = (void *)qmap_init_task, .dump = (void *)qmap_dump, @@ -437,5 +519,6 @@ SCX_OPS_DEFINE(qmap_ops, .cpu_offline = (void *)qmap_cpu_offline, .init = (void *)qmap_init, .exit = (void *)qmap_exit, + .flags = SCX_OPS_ENQ_LAST, .timeout_ms = 5000U, .name = "qmap"); diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index 920fb54f9c77..bc36ec4f88a7 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -112,9 +112,10 @@ int main(int argc, char **argv) long nr_enqueued = skel->bss->nr_enqueued; long nr_dispatched = skel->bss->nr_dispatched; - printf("stats : enq=%lu dsp=%lu delta=%ld reenq=%"PRIu64" deq=%"PRIu64"\n", + printf("stats : enq=%lu dsp=%lu delta=%ld reenq=%"PRIu64" deq=%"PRIu64" core=%"PRIu64"\n", nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched, - skel->bss->nr_reenqueued, skel->bss->nr_dequeued); + skel->bss->nr_reenqueued, skel->bss->nr_dequeued, + skel->bss->nr_core_sched_execed); fflush(stdout); sleep(1); } -- cgit v1.2.3 From 06e51be3d5e7a07aea5c9012773df8d5de01db6c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jun 2024 10:09:21 -1000 Subject: sched_ext: Add vtime-ordered priority queue to dispatch_q's Currently, a dsq is always a FIFO. A task which is dispatched earlier gets consumed or executed earlier. While this is sufficient when dsq's are used for simple staging areas for tasks which are ready to execute, it'd make dsq's a lot more useful if they can implement custom ordering. This patch adds a vtime-ordered priority queue to dsq's. 
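As a quick illustration of the resulting programming model (the scx_simple update below is the complete version), an ops.enqueue() implementation can get weighted vtime scheduling in a few lines with the new helper. In this sketch, MY_DSQ is assumed to be a custom DSQ created with scx_bpf_create_dsq() in ops.init(), and vtime_now a global that ops.running() advances:

	void BPF_STRUCT_OPS(foo_enqueue, struct task_struct *p, u64 enq_flags)
	{
		u64 vtime = p->scx.dsq_vtime;

		/* limit the budget an idling task can accumulate to one full slice */
		if ((s64)(vtime - (vtime_now - SCX_SLICE_DFL)) < 0)
			vtime = vtime_now - SCX_SLICE_DFL;

		scx_bpf_dispatch_vtime(p, MY_DSQ, SCX_SLICE_DFL, vtime, enq_flags);
	}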
When the BPF scheduler dispatches a task with the new scx_bpf_dispatch_vtime() helper, it can specify the vtime that the task should be inserted at and the task is inserted into the priority queue in the dsq which is ordered according to time_before64() comparison of the vtime values. A DSQ can either be a FIFO or priority queue and automatically switches between the two depending on whether scx_bpf_dispatch() or scx_bpf_dispatch_vtime() is used. Using the wrong variant while the DSQ already has the other type queued is not allowed and triggers an ops error. Built-in DSQs must always be FIFOs. This makes it very easy and efficient for BPF schedulers to implement proper vtime based scheduling within each dsq, at a negligible cost in terms of code complexity and overhead. scx_simple and scx_example_flatcg are updated to default to weighted vtime scheduling (the latter within each cgroup). FIFO scheduling can be selected with the -f option. v4: - As allowing mixing priority queue and FIFO on the same DSQ sometimes led to unexpected starvations, DSQs now error out if both modes are used at the same time and the built-in DSQs are no longer allowed to be priority queues. - Explicit type struct scx_dsq_node added to contain fields needed to be linked on DSQs. This will be used to implement stateful iterator. - Tasks are now always linked on dsq->list whether the DSQ is in FIFO or PRIQ mode. This confines PRIQ related complexities to the enqueue and dequeue paths. Other paths only need to look at dsq->list. This will also ease implementing BPF iterator. - Print p->scx.dsq_flags in debug dump. v3: - SCX_TASK_DSQ_ON_PRIQ flag is moved from p->scx.flags into its own p->scx.dsq_flags. The flag is protected with the dsq lock unlike other flags in p->scx.flags. This led to flag corruption in some cases. - Add comments explaining the interaction between using consumption of p->scx.slice to determine vtime progress and yielding. v2: - p->scx.dsq_vtime was not initialized on load or across cgroup migrations leading to some tasks being stalled for an extended period of time depending on how saturated the machine is. Fixed. Signed-off-by: Tejun Heo Reviewed-by: David Vernet --- include/linux/sched/ext.h | 29 +++++- init/init_task.c | 2 +- kernel/sched/ext.c | 156 +++++++++++++++++++++++++---- tools/sched_ext/include/scx/common.bpf.h | 1 + tools/sched_ext/scx_simple.bpf.c | 97 ++++++++++++++++++- tools/sched_ext/scx_simple.c | 8 +- 6 files changed, 267 insertions(+), 26 deletions(-) (limited to 'tools') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 3db7b32b2d1d..9cee193dab19 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -49,12 +49,15 @@ enum scx_dsq_id_flags { }; /* - * Dispatch queue (dsq) is a simple FIFO which is used to buffer between the - * scheduler core and the BPF scheduler. See the documentation for more details. + * A dispatch queue (DSQ) can be either a FIFO or p->scx.dsq_vtime ordered + * queue. A built-in DSQ is always a FIFO. The built-in local DSQs are used to + * buffer between the scheduler core and the BPF scheduler. See the + * documentation for more details. 
*/ struct scx_dispatch_q { raw_spinlock_t lock; struct list_head list; /* tasks in dispatch order */ + struct rb_root priq; /* used to order by p->scx.dsq_vtime */ u32 nr; u64 id; struct rhash_head hash_node; @@ -86,6 +89,11 @@ enum scx_task_state { SCX_TASK_NR_STATES, }; +/* scx_entity.dsq_flags */ +enum scx_ent_dsq_flags { + SCX_TASK_DSQ_ON_PRIQ = 1 << 0, /* task is queued on the priority queue of a dsq */ +}; + /* * Mask bits for scx_entity.kf_mask. Not all kfuncs can be called from * everywhere and the following bits track which kfunc sets are currently @@ -111,13 +119,19 @@ enum scx_kf_mask { __SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, }; +struct scx_dsq_node { + struct list_head list; /* dispatch order */ + struct rb_node priq; /* p->scx.dsq_vtime order */ + u32 flags; /* SCX_TASK_DSQ_* flags */ +}; + /* * The following is embedded in task_struct and contains all fields necessary * for a task to be scheduled by SCX. */ struct sched_ext_entity { struct scx_dispatch_q *dsq; - struct list_head dsq_node; + struct scx_dsq_node dsq_node; /* protected by dsq lock */ u32 flags; /* protected by rq lock */ u32 weight; s32 sticky_cpu; @@ -149,6 +163,15 @@ struct sched_ext_entity { */ u64 slice; + /* + * Used to order tasks when dispatching to the vtime-ordered priority + * queue of a dsq. This is usually set through scx_bpf_dispatch_vtime() + * but can also be modified directly by the BPF scheduler. Modifying it + * while a task is queued on a dsq may mangle the ordering and is not + * recommended. + */ + u64 dsq_vtime; + /* * If set, reject future sched_setscheduler(2) calls updating the policy * to %SCHED_EXT with -%EACCES. diff --git a/init/init_task.c b/init/init_task.c index 8a44c932d10f..5726b3a0eea9 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -102,7 +102,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { #endif #ifdef CONFIG_SCHED_CLASS_EXT .scx = { - .dsq_node = LIST_HEAD_INIT(init_task.scx.dsq_node), + .dsq_node.list = LIST_HEAD_INIT(init_task.scx.dsq_node.list), .sticky_cpu = -1, .holding_cpu = -1, .runnable_node = LIST_HEAD_INIT(init_task.scx.runnable_node), diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 1feb690be9d8..f186c576e7d9 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -638,6 +638,7 @@ enum scx_enq_flags { __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56, SCX_ENQ_CLEAR_OPSS = 1LLU << 56, + SCX_ENQ_DSQ_PRIQ = 1LLU << 57, }; enum scx_deq_flags { @@ -1351,6 +1352,17 @@ static void update_curr_scx(struct rq *rq) } } +static bool scx_dsq_priq_less(struct rb_node *node_a, + const struct rb_node *node_b) +{ + const struct task_struct *a = + container_of(node_a, struct task_struct, scx.dsq_node.priq); + const struct task_struct *b = + container_of(node_b, struct task_struct, scx.dsq_node.priq); + + return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime); +} + static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta) { /* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */ @@ -1362,7 +1374,9 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, { bool is_local = dsq->id == SCX_DSQ_LOCAL; - WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_node)); + WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_node.list)); + WARN_ON_ONCE((p->scx.dsq_node.flags & SCX_TASK_DSQ_ON_PRIQ) || + !RB_EMPTY_NODE(&p->scx.dsq_node.priq)); if (!is_local) { raw_spin_lock(&dsq->lock); @@ -1375,10 +1389,59 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, } } 
- if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) - list_add(&p->scx.dsq_node, &dsq->list); - else - list_add_tail(&p->scx.dsq_node, &dsq->list); + if (unlikely((dsq->id & SCX_DSQ_FLAG_BUILTIN) && + (enq_flags & SCX_ENQ_DSQ_PRIQ))) { + /* + * SCX_DSQ_LOCAL and SCX_DSQ_GLOBAL DSQs always consume from + * their FIFO queues. To avoid confusion and accidentally + * starving vtime-dispatched tasks by FIFO-dispatched tasks, we + * disallow any internal DSQ from doing vtime ordering of + * tasks. + */ + scx_ops_error("cannot use vtime ordering for built-in DSQs"); + enq_flags &= ~SCX_ENQ_DSQ_PRIQ; + } + + if (enq_flags & SCX_ENQ_DSQ_PRIQ) { + struct rb_node *rbp; + + /* + * A PRIQ DSQ shouldn't be using FIFO enqueueing. As tasks are + * linked to both the rbtree and list on PRIQs, this can only be + * tested easily when adding the first task. + */ + if (unlikely(RB_EMPTY_ROOT(&dsq->priq) && + !list_empty(&dsq->list))) + scx_ops_error("DSQ ID 0x%016llx already had FIFO-enqueued tasks", + dsq->id); + + p->scx.dsq_node.flags |= SCX_TASK_DSQ_ON_PRIQ; + rb_add(&p->scx.dsq_node.priq, &dsq->priq, scx_dsq_priq_less); + + /* + * Find the previous task and insert after it on the list so + * that @dsq->list is vtime ordered. + */ + rbp = rb_prev(&p->scx.dsq_node.priq); + if (rbp) { + struct task_struct *prev = + container_of(rbp, struct task_struct, + scx.dsq_node.priq); + list_add(&p->scx.dsq_node.list, &prev->scx.dsq_node.list); + } else { + list_add(&p->scx.dsq_node.list, &dsq->list); + } + } else { + /* a FIFO DSQ shouldn't be using PRIQ enqueuing */ + if (unlikely(!RB_EMPTY_ROOT(&dsq->priq))) + scx_ops_error("DSQ ID 0x%016llx already had PRIQ-enqueued tasks", + dsq->id); + + if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) + list_add(&p->scx.dsq_node.list, &dsq->list); + else + list_add_tail(&p->scx.dsq_node.list, &dsq->list); + } dsq_mod_nr(dsq, 1); p->scx.dsq = dsq; @@ -1417,13 +1480,30 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, } } +static void task_unlink_from_dsq(struct task_struct *p, + struct scx_dispatch_q *dsq) +{ + if (p->scx.dsq_node.flags & SCX_TASK_DSQ_ON_PRIQ) { + rb_erase(&p->scx.dsq_node.priq, &dsq->priq); + RB_CLEAR_NODE(&p->scx.dsq_node.priq); + p->scx.dsq_node.flags &= ~SCX_TASK_DSQ_ON_PRIQ; + } + + list_del_init(&p->scx.dsq_node.list); +} + +static bool task_linked_on_dsq(struct task_struct *p) +{ + return !list_empty(&p->scx.dsq_node.list); +} + static void dispatch_dequeue(struct rq *rq, struct task_struct *p) { struct scx_dispatch_q *dsq = p->scx.dsq; bool is_local = dsq == &rq->scx.local_dsq; if (!dsq) { - WARN_ON_ONCE(!list_empty(&p->scx.dsq_node)); + WARN_ON_ONCE(task_linked_on_dsq(p)); /* * When dispatching directly from the BPF scheduler to a local * DSQ, the task isn't associated with any DSQ but @@ -1444,8 +1524,8 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p) */ if (p->scx.holding_cpu < 0) { /* @p must still be on @dsq, dequeue */ - WARN_ON_ONCE(list_empty(&p->scx.dsq_node)); - list_del_init(&p->scx.dsq_node); + WARN_ON_ONCE(!task_linked_on_dsq(p)); + task_unlink_from_dsq(p, dsq); dsq_mod_nr(dsq, -1); } else { /* @@ -1454,7 +1534,7 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p) * holding_cpu which tells dispatch_to_local_dsq() that it lost * the race. 
*/ - WARN_ON_ONCE(!list_empty(&p->scx.dsq_node)); + WARN_ON_ONCE(task_linked_on_dsq(p)); p->scx.holding_cpu = -1; } p->scx.dsq = NULL; @@ -1949,7 +2029,8 @@ static void consume_local_task(struct rq *rq, struct scx_dispatch_q *dsq, /* @dsq is locked and @p is on this rq */ WARN_ON_ONCE(p->scx.holding_cpu >= 0); - list_move_tail(&p->scx.dsq_node, &rq->scx.local_dsq.list); + task_unlink_from_dsq(p, dsq); + list_add_tail(&p->scx.dsq_node.list, &rq->scx.local_dsq.list); dsq_mod_nr(dsq, -1); dsq_mod_nr(&rq->scx.local_dsq, 1); p->scx.dsq = &rq->scx.local_dsq; @@ -1992,7 +2073,7 @@ static bool consume_remote_task(struct rq *rq, struct rq_flags *rf, * move_task_to_local_dsq(). */ WARN_ON_ONCE(p->scx.holding_cpu >= 0); - list_del_init(&p->scx.dsq_node); + task_unlink_from_dsq(p, dsq); dsq_mod_nr(dsq, -1); p->scx.holding_cpu = raw_smp_processor_id(); raw_spin_unlock(&dsq->lock); @@ -2024,7 +2105,7 @@ retry: raw_spin_lock(&dsq->lock); - list_for_each_entry(p, &dsq->list, scx.dsq_node) { + list_for_each_entry(p, &dsq->list, scx.dsq_node.list) { struct rq *task_rq = task_rq(p); if (rq == task_rq) { @@ -2543,7 +2624,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p) static struct task_struct *first_local_task(struct rq *rq) { return list_first_entry_or_null(&rq->scx.local_dsq.list, - struct task_struct, scx.dsq_node); + struct task_struct, scx.dsq_node.list); } static struct task_struct *pick_next_task_scx(struct rq *rq) @@ -3225,7 +3306,8 @@ void init_scx_entity(struct sched_ext_entity *scx) */ memset(scx, 0, offsetof(struct sched_ext_entity, tasks_node)); - INIT_LIST_HEAD(&scx->dsq_node); + INIT_LIST_HEAD(&scx->dsq_node.list); + RB_CLEAR_NODE(&scx->dsq_node.priq); scx->sticky_cpu = -1; scx->holding_cpu = -1; INIT_LIST_HEAD(&scx->runnable_node); @@ -4070,12 +4152,13 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, dump_line(s, " %c%c %s[%d] %+ldms", marker, task_state_to_char(p), p->comm, p->pid, jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies)); - dump_line(s, " scx_state/flags=%u/0x%x ops_state/qseq=%lu/%lu", + dump_line(s, " scx_state/flags=%u/0x%x dsq_flags=0x%x ops_state/qseq=%lu/%lu", scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK, - ops_state & SCX_OPSS_STATE_MASK, + p->scx.dsq_node.flags, ops_state & SCX_OPSS_STATE_MASK, ops_state >> SCX_OPSS_QSEQ_SHIFT); - dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s", - p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf); + dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s dsq_vtime=%llu", + p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf, + p->scx.dsq_vtime); dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr)); if (SCX_HAS_OP(dump_task)) { @@ -4663,6 +4746,9 @@ static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, if (off >= offsetof(struct task_struct, scx.slice) && off + size <= offsetofend(struct task_struct, scx.slice)) return SCALAR_VALUE; + if (off >= offsetof(struct task_struct, scx.dsq_vtime) && + off + size <= offsetofend(struct task_struct, scx.dsq_vtime)) + return SCALAR_VALUE; if (off >= offsetof(struct task_struct, scx.disallow) && off + size <= offsetofend(struct task_struct, scx.disallow)) return SCALAR_VALUE; @@ -5298,10 +5384,44 @@ __bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, scx_dispatch_commit(p, dsq_id, enq_flags); } +/** + * scx_bpf_dispatch_vtime - Dispatch a task into the vtime priority queue of a DSQ + * @p: task_struct to dispatch + * @dsq_id: DSQ to dispatch to + * @slice: duration @p can run for in nsecs + 
* @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ + * @enq_flags: SCX_ENQ_* + * + * Dispatch @p into the vtime priority queue of the DSQ identified by @dsq_id. + * Tasks queued into the priority queue are ordered by @vtime and always + * consumed after the tasks in the FIFO queue. All other aspects are identical + * to scx_bpf_dispatch(). + * + * @vtime ordering is according to time_before64() which considers wrapping. A + * numerically larger vtime may indicate an earlier position in the ordering and + * vice-versa. + */ +__bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, + u64 slice, u64 vtime, u64 enq_flags) +{ + if (!scx_dispatch_preamble(p, enq_flags)) + return; + + if (slice) + p->scx.slice = slice; + else + p->scx.slice = p->scx.slice ?: 1; + + p->scx.dsq_vtime = vtime; + + scx_dispatch_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch) BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch) static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 8686f84497db..3fa87084cf17 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -31,6 +31,7 @@ static inline void ___vmlinux_h_sanity_check___(void) s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym; s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym; void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym; +void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym; u32 scx_bpf_dispatch_nr_slots(void) __ksym; void scx_bpf_dispatch_cancel(void) __ksym; bool scx_bpf_consume(u64 dsq_id) __ksym; diff --git a/tools/sched_ext/scx_simple.bpf.c b/tools/sched_ext/scx_simple.bpf.c index 6bb13a3c801b..ed7e8d535fc5 100644 --- a/tools/sched_ext/scx_simple.bpf.c +++ b/tools/sched_ext/scx_simple.bpf.c @@ -2,11 +2,20 @@ /* * A simple scheduler. * - * A simple global FIFO scheduler. It also demonstrates the following niceties. + * By default, it operates as a simple global weighted vtime scheduler and can + * be switched to FIFO scheduling. It also demonstrates the following niceties. * * - Statistics tracking how many tasks are queued to local and global dsq's. * - Termination notification for userspace. * + * While very simple, this scheduler should work reasonably well on CPUs with a + * uniform L3 cache topology. While preemption is not implemented, the fact that + * the scheduling queue is shared across all CPUs means that whatever is at the + * front of the queue is likely to be executed fairly quickly given enough + * number of CPUs. The FIFO scheduling mode may be beneficial to some workloads + * but comes with the usual problems with FIFO scheduling where saturating + * threads can easily drown out interactive ones. + * * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. * Copyright (c) 2022 Tejun Heo * Copyright (c) 2022 David Vernet @@ -15,8 +24,20 @@ char _license[] SEC("license") = "GPL"; +const volatile bool fifo_sched; + +static u64 vtime_now; UEI_DEFINE(uei); +/* + * Built-in DSQs such as SCX_DSQ_GLOBAL cannot be used as priority queues + * (meaning, cannot be dispatched to with scx_bpf_dispatch_vtime()). 
We + * therefore create a separate DSQ with ID 0 that we dispatch to and consume + * from. If scx_simple only supported global FIFO scheduling, then we could + * just use SCX_DSQ_GLOBAL. + */ +#define SHARED_DSQ 0 + struct { __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); __uint(key_size, sizeof(u32)); @@ -31,6 +52,11 @@ static void stat_inc(u32 idx) (*cnt_p)++; } +static inline bool vtime_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) { bool is_idle = false; @@ -48,7 +74,69 @@ s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, s32 prev_cpu, u64 w void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) { stat_inc(1); /* count global queueing */ - scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + + if (fifo_sched) { + scx_bpf_dispatch(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags); + } else { + u64 vtime = p->scx.dsq_vtime; + + /* + * Limit the amount of budget that an idling task can accumulate + * to one slice. + */ + if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL)) + vtime = vtime_now - SCX_SLICE_DFL; + + scx_bpf_dispatch_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, vtime, + enq_flags); + } +} + +void BPF_STRUCT_OPS(simple_dispatch, s32 cpu, struct task_struct *prev) +{ + scx_bpf_consume(SHARED_DSQ); +} + +void BPF_STRUCT_OPS(simple_running, struct task_struct *p) +{ + if (fifo_sched) + return; + + /* + * Global vtime always progresses forward as tasks start executing. The + * test and update can be performed concurrently from multiple CPUs and + * thus racy. Any error should be contained and temporary. Let's just + * live with it. + */ + if (vtime_before(vtime_now, p->scx.dsq_vtime)) + vtime_now = p->scx.dsq_vtime; +} + +void BPF_STRUCT_OPS(simple_stopping, struct task_struct *p, bool runnable) +{ + if (fifo_sched) + return; + + /* + * Scale the execution time by the inverse of the weight and charge. + * + * Note that the default yield implementation yields by setting + * @p->scx.slice to zero and the following would treat the yielding task + * as if it has consumed all its slice. If this penalizes yielding tasks + * too much, determine the execution time by taking explicit timestamps + * instead of depending on @p->scx.slice. 
+ */ + p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; +} + +void BPF_STRUCT_OPS(simple_enable, struct task_struct *p) +{ + p->scx.dsq_vtime = vtime_now; +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init) +{ + return scx_bpf_create_dsq(SHARED_DSQ, -1); } void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) @@ -59,5 +147,10 @@ void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) SCX_OPS_DEFINE(simple_ops, .select_cpu = (void *)simple_select_cpu, .enqueue = (void *)simple_enqueue, + .dispatch = (void *)simple_dispatch, + .running = (void *)simple_running, + .stopping = (void *)simple_stopping, + .enable = (void *)simple_enable, + .init = (void *)simple_init, .exit = (void *)simple_exit, .name = "simple"); diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c index bead482e1383..76d83199545c 100644 --- a/tools/sched_ext/scx_simple.c +++ b/tools/sched_ext/scx_simple.c @@ -17,8 +17,9 @@ const char help_fmt[] = "\n" "See the top-level comment in .bpf.c for more details.\n" "\n" -"Usage: %s [-v]\n" +"Usage: %s [-f] [-v]\n" "\n" +" -f Use FIFO scheduling instead of weighted vtime scheduling\n" " -v Print libbpf debug messages\n" " -h Display this help and exit\n"; @@ -70,8 +71,11 @@ int main(int argc, char **argv) restart: skel = SCX_OPS_OPEN(simple_ops, scx_simple); - while ((opt = getopt(argc, argv, "vh")) != -1) { + while ((opt = getopt(argc, argv, "fvh")) != -1) { switch (opt) { + case 'f': + skel->rodata->fifo_sched = true; + break; case 'v': verbose = true; break; -- cgit v1.2.3 From fa48e8d2c7b58d242c1db3a09c14f4274e055087 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jun 2024 10:09:21 -1000 Subject: sched_ext: Documentation: scheduler: Document extensible scheduler class Add Documentation/scheduler/sched-ext.rst which gives a high-level overview and pointers to the examples. v6: - Add paragraph explaining debug dump. v5: - Updated to reflect /sys/kernel interface change. Kconfig options added. v4: - README improved, reformatted in markdown and renamed to README.md. v3: - Added tools/sched_ext/README. - Dropped _example prefix from scheduler names. v2: - Apply minor edits suggested by Bagas. Caveats section dropped as all of them are addressed. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden Cc: Bagas Sanjaya --- Documentation/scheduler/index.rst | 1 + Documentation/scheduler/sched-ext.rst | 314 ++++++++++++++++++++++++++++++++++ include/linux/sched/ext.h | 2 + kernel/Kconfig.preempt | 1 + kernel/sched/ext.c | 2 + kernel/sched/ext.h | 2 + tools/sched_ext/README.md | 258 ++++++++++++++++++++++++++++ 7 files changed, 580 insertions(+) create mode 100644 Documentation/scheduler/sched-ext.rst create mode 100644 tools/sched_ext/README.md (limited to 'tools') diff --git a/Documentation/scheduler/index.rst b/Documentation/scheduler/index.rst index 43bd8a145b7a..0611dc3dda8e 100644 --- a/Documentation/scheduler/index.rst +++ b/Documentation/scheduler/index.rst @@ -20,6 +20,7 @@ Scheduler sched-nice-design sched-rt-group sched-stats + sched-ext sched-debug text_files diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst new file mode 100644 index 000000000000..497eeaa5ecbe --- /dev/null +++ b/Documentation/scheduler/sched-ext.rst @@ -0,0 +1,314 @@ +========================== +Extensible Scheduler Class +========================== + +sched_ext is a scheduler class whose behavior can be defined by a set of BPF +programs - the BPF scheduler. 
+
+* sched_ext exports a full scheduling interface so that any scheduling
+  algorithm can be implemented on top.
+
+* The BPF scheduler can group CPUs however it sees fit and schedule them
+  together, as tasks aren't tied to specific CPUs at the time of wakeup.
+
+* The BPF scheduler can be turned on and off dynamically anytime.
+
+* System integrity is maintained no matter what the BPF scheduler does.
+  The default scheduling behavior is restored anytime an error is detected,
+  a runnable task stalls, or the SysRq key sequence :kbd:`SysRq-S` is
+  invoked.
+
+* When the BPF scheduler triggers an error, debug information is dumped to
+  aid debugging. The debug dump is passed to and printed out by the
+  scheduler binary. The debug dump can also be accessed through the
+  `sched_ext_dump` tracepoint. The SysRq key sequence :kbd:`SysRq-D`
+  triggers a debug dump. A dump triggered this way doesn't terminate the
+  BPF scheduler and can only be read through the tracepoint.
+
+Switching to and from sched_ext
+===============================
+
+``CONFIG_SCHED_CLASS_EXT`` is the config option to enable sched_ext and
+``tools/sched_ext`` contains the example schedulers. The following config
+options should be enabled to use sched_ext:
+
+.. code-block:: none
+
+   CONFIG_BPF=y
+   CONFIG_SCHED_CLASS_EXT=y
+   CONFIG_BPF_SYSCALL=y
+   CONFIG_BPF_JIT=y
+   CONFIG_DEBUG_INFO_BTF=y
+   CONFIG_BPF_JIT_ALWAYS_ON=y
+   CONFIG_BPF_JIT_DEFAULT_ON=y
+   CONFIG_PAHOLE_HAS_SPLIT_BTF=y
+   CONFIG_PAHOLE_HAS_BTF_TAG=y
+
+sched_ext is used only when the BPF scheduler is loaded and running.
+
+If a task explicitly sets its scheduling policy to ``SCHED_EXT``, it will be
+treated as ``SCHED_NORMAL`` and scheduled by CFS until the BPF scheduler is
+loaded. On load, such tasks will be switched to and scheduled by sched_ext.
+
+The BPF scheduler can choose to schedule all normal and lower class tasks by
+calling ``scx_bpf_switch_all()`` from its ``init()`` operation. In this
+case, all ``SCHED_NORMAL``, ``SCHED_BATCH``, ``SCHED_IDLE`` and
+``SCHED_EXT`` tasks are scheduled by sched_ext. In the example schedulers,
+this mode can be selected with the ``-a`` option.
+
+Terminating the sched_ext scheduler program, triggering :kbd:`SysRq-S`, or
+detection of any internal error including stalled runnable tasks aborts the
+BPF scheduler and reverts all tasks back to CFS.
+
+.. code-block:: none
+
+   # make -j16 -C tools/sched_ext
+   # tools/sched_ext/scx_simple
+   local=0 global=3
+   local=5 global=24
+   local=9 global=44
+   local=13 global=56
+   local=17 global=72
+   ^CEXIT: BPF scheduler unregistered
+
+The current status of the BPF scheduler can be determined as follows:
+
+.. code-block:: none
+
+   # cat /sys/kernel/sched_ext/state
+   enabled
+   # cat /sys/kernel/sched_ext/root/ops
+   simple
+
+``tools/sched_ext/scx_show_state.py`` is a drgn script which shows more
+detailed information:
+
+.. code-block:: none
+
+   # tools/sched_ext/scx_show_state.py
+   ops : simple
+   enabled : 1
+   switching_all : 1
+   switched_all : 1
+   enable_state : enabled (2)
+   bypass_depth : 0
+   nr_rejected : 0
+
+If ``CONFIG_SCHED_DEBUG`` is set, whether a given task is on sched_ext can
+be determined as follows:
+
+.. code-block:: none
+
+   # grep ext /proc/self/sched
+   ext.enabled : 1
+
+The Basics
+==========
+
+Userspace can implement an arbitrary BPF scheduler by loading a set of BPF
+programs that implement ``struct sched_ext_ops``. The only mandatory field
+is ``ops.name`` which must be a valid BPF object name. All operations are
+optional.
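+
+For instance, since every callback is optional, a minimal ops table (a sketch;
+the name ``minimal_ops`` is illustrative, cf. the ``minimal.bpf.c`` selftest
+added later in this series) defines nothing but the name:
+
+.. code-block:: c
+
+   SEC(".struct_ops")
+   struct sched_ext_ops minimal_ops = {
+           .name = "minimal",
+   };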
The following modified excerpt is from
+``tools/sched_ext/scx_simple.bpf.c`` showing a minimal global FIFO scheduler.
+
+.. code-block:: c
+
+   /*
+    * Decide which CPU a task should be migrated to before being
+    * enqueued (either at wakeup, fork time, or exec time). If an
+    * idle core is found by the default ops.select_cpu() implementation,
+    * then dispatch the task directly to SCX_DSQ_LOCAL and skip the
+    * ops.enqueue() callback.
+    *
+    * Note that this implementation has exactly the same behavior as the
+    * default ops.select_cpu implementation. The behavior of the scheduler
+    * would be exactly the same if the implementation just didn't define
+    * the simple_select_cpu() struct_ops prog.
+    */
+   s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p,
+                      s32 prev_cpu, u64 wake_flags)
+   {
+           s32 cpu;
+           /* Need to initialize or the BPF verifier will reject the program */
+           bool direct = false;
+
+           cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &direct);
+
+           if (direct)
+                   scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
+
+           return cpu;
+   }
+
+   /*
+    * Do a direct dispatch of a task to the global DSQ. This ops.enqueue()
+    * callback will only be invoked if we failed to find a core to dispatch
+    * to in ops.select_cpu() above.
+    *
+    * Note that this implementation has exactly the same behavior as the
+    * default ops.enqueue implementation, which just dispatches the task
+    * to SCX_DSQ_GLOBAL. The behavior of the scheduler would be exactly the
+    * same if the implementation just didn't define the simple_enqueue
+    * struct_ops prog.
+    */
+   void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags)
+   {
+           scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+   }
+
+   s32 BPF_STRUCT_OPS(simple_init)
+   {
+           /*
+            * All SCHED_OTHER, SCHED_IDLE, and SCHED_BATCH tasks should
+            * use sched_ext.
+            */
+           scx_bpf_switch_all();
+           return 0;
+   }
+
+   void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei)
+   {
+           exit_type = ei->type;
+   }
+
+   SEC(".struct_ops")
+   struct sched_ext_ops simple_ops = {
+           .select_cpu = (void *)simple_select_cpu,
+           .enqueue = (void *)simple_enqueue,
+           .init = (void *)simple_init,
+           .exit = (void *)simple_exit,
+           .name = "simple",
+   };
+
+Dispatch Queues
+---------------
+
+To match the impedance between the scheduler core and the BPF scheduler,
+sched_ext uses DSQs (dispatch queues) which can operate as both a FIFO and a
+priority queue. By default, there is one global FIFO (``SCX_DSQ_GLOBAL``),
+and one local DSQ per CPU (``SCX_DSQ_LOCAL``). The BPF scheduler can manage
+an arbitrary number of DSQs using ``scx_bpf_create_dsq()`` and
+``scx_bpf_destroy_dsq()``.
+
+A CPU always executes a task from its local DSQ. A task is "dispatched" to a
+DSQ. A non-local DSQ is "consumed" to transfer a task to the consuming CPU's
+local DSQ.
+
+When a CPU is looking for the next task to run, if the local DSQ is not
+empty, the first task is picked. Otherwise, the CPU tries to consume the
+global DSQ. If that doesn't yield a runnable task either, ``ops.dispatch()``
+is invoked.
+
+Scheduling Cycle
+----------------
+
+The following briefly shows how a waking task is scheduled and executed.
+
+1. When a task is waking up, ``ops.select_cpu()`` is the first operation
+   invoked. This serves two purposes. First, it provides an optimization
+   hint for CPU selection. Second, it wakes up the selected CPU if it is
+   idle.
+
+   The CPU selected by ``ops.select_cpu()`` is an optimization hint and not
+   binding. The actual decision is made at the last step of scheduling.
+   However, there is a small performance gain if the CPU that
+   ``ops.select_cpu()`` returns matches the CPU the task eventually runs on.
+
+   A side-effect of selecting a CPU is waking it up from idle. While a BPF
+   scheduler can wake up any CPU using the ``scx_bpf_kick_cpu()`` helper,
+   using ``ops.select_cpu()`` judiciously can be simpler and more efficient.
+
+   A task can be immediately dispatched to a DSQ from ``ops.select_cpu()`` by
+   calling ``scx_bpf_dispatch()``. If the task is dispatched to
+   ``SCX_DSQ_LOCAL`` from ``ops.select_cpu()``, it will be dispatched to the
+   local DSQ of whichever CPU is returned from ``ops.select_cpu()``.
+   Additionally, dispatching directly from ``ops.select_cpu()`` will cause the
+   ``ops.enqueue()`` callback to be skipped.
+
+   Note that the scheduler core will ignore an invalid CPU selection, for
+   example, if it's outside the allowed cpumask of the task.
+
+2. Once the target CPU is selected, ``ops.enqueue()`` is invoked (unless the
+   task was dispatched directly from ``ops.select_cpu()``). ``ops.enqueue()``
+   can make one of the following decisions:
+
+   * Immediately dispatch the task to either the global or local DSQ by
+     calling ``scx_bpf_dispatch()`` with ``SCX_DSQ_GLOBAL`` or
+     ``SCX_DSQ_LOCAL``, respectively.
+
+   * Immediately dispatch the task to a custom DSQ by calling
+     ``scx_bpf_dispatch()`` with a DSQ ID which is smaller than 2^63.
+
+   * Queue the task on the BPF side.
+
+3. When a CPU is ready to schedule, it first looks at its local DSQ. If
+   empty, it then looks at the global DSQ. If there still isn't a task to
+   run, ``ops.dispatch()`` is invoked which can use the following two
+   functions to populate the local DSQ.
+
+   * ``scx_bpf_dispatch()`` dispatches a task to a DSQ. Any target DSQ can
+     be used - ``SCX_DSQ_LOCAL``, ``SCX_DSQ_LOCAL_ON | cpu``,
+     ``SCX_DSQ_GLOBAL`` or a custom DSQ. While ``scx_bpf_dispatch()``
+     currently can't be called with BPF locks held, this is being worked on
+     and will be supported. ``scx_bpf_dispatch()`` schedules the dispatch
+     rather than performing it immediately. There can be up to
+     ``ops.dispatch_max_batch`` pending tasks.
+
+   * ``scx_bpf_consume()`` transfers a task from the specified non-local DSQ
+     to the dispatching DSQ. This function cannot be called with any BPF
+     locks held. ``scx_bpf_consume()`` flushes the pending dispatched tasks
+     before trying to consume the specified DSQ.
+
+4. After ``ops.dispatch()`` returns, if there are tasks in the local DSQ,
+   the CPU runs the first one. If empty, the following steps are taken:
+
+   * Try to consume the global DSQ. If successful, run the task.
+
+   * If ``ops.dispatch()`` has dispatched any tasks, retry #3.
+
+   * If the previous task is an SCX task and still runnable, keep executing
+     it (see ``SCX_OPS_ENQ_LAST``).
+
+   * Go idle.
+
+Note that the BPF scheduler can always choose to dispatch tasks immediately
+in ``ops.enqueue()`` as illustrated in the above simple example. If only the
+built-in DSQs are used, there is no need to implement ``ops.dispatch()`` as
+a task is never queued on the BPF scheduler and both the local and global
+DSQs are consumed automatically.
+
+``scx_bpf_dispatch()`` queues the task on the FIFO of the target DSQ. Use
+``scx_bpf_dispatch_vtime()`` for the priority queue. Internal DSQs such as
+``SCX_DSQ_LOCAL`` and ``SCX_DSQ_GLOBAL`` do not support priority-queue
+dispatching, and must be dispatched to with ``scx_bpf_dispatch()``. See the
+function documentation and usage in ``tools/sched_ext/scx_simple.bpf.c`` for
+more information.
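+
+As a rough sketch of the custom-DSQ flow described above (``MY_DSQ`` and the
+``sketch_*`` names are illustrative, not part of the example schedulers), a
+scheduler can create a DSQ in ``ops.init()``, dispatch to its priority queue
+in ``ops.enqueue()`` and consume it in ``ops.dispatch()``:
+
+.. code-block:: c
+
+   #define MY_DSQ 0 /* arbitrary custom DSQ ID for this sketch */
+
+   s32 BPF_STRUCT_OPS_SLEEPABLE(sketch_init)
+   {
+           /* custom DSQs must be created before use; -1 == any NUMA node */
+           return scx_bpf_create_dsq(MY_DSQ, -1);
+   }
+
+   void BPF_STRUCT_OPS(sketch_enqueue, struct task_struct *p, u64 enq_flags)
+   {
+           /* queue on MY_DSQ's priority queue, ordered by p->scx.dsq_vtime */
+           scx_bpf_dispatch_vtime(p, MY_DSQ, SCX_SLICE_DFL, p->scx.dsq_vtime,
+                                  enq_flags);
+   }
+
+   void BPF_STRUCT_OPS(sketch_dispatch, s32 cpu, struct task_struct *prev)
+   {
+           /* transfer the first task on MY_DSQ to this CPU's local DSQ */
+           scx_bpf_consume(MY_DSQ);
+   }
+
+This is essentially what ``scx_simple`` does in its weighted vtime mode.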
+
+Where to Look
+=============
+
+* ``include/linux/sched/ext.h`` defines the core data structures, ops table
+  and constants.
+
+* ``kernel/sched/ext.c`` contains sched_ext core implementation and helpers.
+  The functions prefixed with ``scx_bpf_`` can be called from the BPF
+  scheduler.
+
+* ``tools/sched_ext/`` hosts example BPF scheduler implementations.
+
+  * ``scx_simple[.bpf].c``: Minimal global FIFO scheduler example using a
+    custom DSQ.
+
+  * ``scx_qmap[.bpf].c``: A multi-level FIFO scheduler supporting five
+    levels of priority implemented with ``BPF_MAP_TYPE_QUEUE``.
+
+ABI Instability
+===============
+
+The APIs provided by sched_ext to BPF scheduler programs have no stability
+guarantees. This includes the ops table callbacks and constants defined in
+``include/linux/sched/ext.h``, as well as the ``scx_bpf_`` kfuncs defined in
+``kernel/sched/ext.c``.
+
+While we will attempt to provide a relatively stable API surface when
+possible, these APIs are subject to change without warning between kernel
+versions. diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 9cee193dab19..fe9a67ffe6b1 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -1,5 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. * Copyright (c) 2022 Tejun Heo * Copyright (c) 2022 David Vernet diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 7dde5e424ac3..f035c87d02f1 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -156,4 +156,5 @@ config SCHED_CLASS_EXT similar to struct sched_class. For more information: + Documentation/scheduler/sched-ext.rst https://github.com/sched-ext/scx diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index f186c576e7d9..f814e84ceeb3 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1,5 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. * Copyright (c) 2022 Tejun Heo * Copyright (c) 2022 David Vernet diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 6555878c5da3..c41d742b5d62 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -1,5 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. * Copyright (c) 2022 Tejun Heo * Copyright (c) 2022 David Vernet diff --git a/tools/sched_ext/README.md b/tools/sched_ext/README.md new file mode 100644 index 000000000000..8efe70cc4363 --- /dev/null +++ b/tools/sched_ext/README.md @@ -0,0 +1,258 @@
+SCHED_EXT EXAMPLE SCHEDULERS
+============================
+
+# Introduction
+
+This directory contains a number of example sched_ext schedulers. These
+schedulers are meant to provide examples of different types of schedulers
+that can be built using sched_ext, and illustrate how various features of
+sched_ext can be used.
+
+Some of the examples are performant, production-ready schedulers. That is, for
+the correct workload and with the correct tuning, they may be deployed in a
+production environment with acceptable or possibly even improved performance.
+Others are just examples that, in practice, would not provide acceptable
+performance (though they could be improved to get there).
This README will describe these example schedulers, including the types of
+workloads or scenarios they're designed to accommodate, and whether or not
+they're production ready. For more details on any of these schedulers, please
+see the header comment in their .bpf.c file.
+
+
+# Compiling the examples
+
+There are a few toolchain dependencies for compiling the example schedulers.
+
+## Toolchain dependencies
+
+1. clang >= 16.0.0
+
+The schedulers are BPF programs, and therefore must be compiled with clang. gcc
+is actively working on adding a BPF backend compiler as well, but it is still
+missing some features such as BTF type tags which are necessary for using
+kptrs.
+
+2. pahole >= 1.25
+
+You may need pahole in order to generate BTF from DWARF.
+
+3. rust >= 1.70.0
+
+Rust schedulers use features present in the Rust toolchain >= 1.70.0. You
+should be able to use the stable build from rustup, but if that doesn't
+work, try using the rustup nightly build.
+
+There are other requirements as well, such as make, but these are the main /
+non-trivial ones.
+
+## Compiling the kernel
+
+In order to run a sched_ext scheduler, you'll have to run a kernel compiled
+with the patches in this repository, and with a minimum set of necessary
+Kconfig options:
+
+```
+CONFIG_BPF=y
+CONFIG_SCHED_CLASS_EXT=y
+CONFIG_BPF_SYSCALL=y
+CONFIG_BPF_JIT=y
+CONFIG_DEBUG_INFO_BTF=y
+```
+
+It's also recommended that you include the following Kconfig options:
+
+```
+CONFIG_BPF_JIT_ALWAYS_ON=y
+CONFIG_BPF_JIT_DEFAULT_ON=y
+CONFIG_PAHOLE_HAS_SPLIT_BTF=y
+CONFIG_PAHOLE_HAS_BTF_TAG=y
+```
+
+There is a `Kconfig` file in this directory whose contents you can append to
+your local `.config` file, as long as there are no conflicts with any existing
+options in the file.
+
+## Getting a vmlinux.h file
+
+You may notice that most of the example schedulers include a "vmlinux.h" file.
+This is a large, auto-generated header file that contains all of the types
+defined in some vmlinux binary that was compiled with
+[BTF](https://docs.kernel.org/bpf/btf.html) (i.e. with the BTF-related Kconfig
+options specified above).
+
+The header file is created using `bpftool`, by passing it a vmlinux binary
+compiled with BTF as follows:
+
+```bash
+$ bpftool btf dump file /path/to/vmlinux format c > vmlinux.h
+```
+
+`bpftool` analyzes all of the BTF encodings in the binary, and produces a
+header file that can be included by BPF programs to access those types. For
+example, using vmlinux.h allows a scheduler to access fields defined directly
+in vmlinux as follows:
+
+```c
+#include "vmlinux.h"
+// vmlinux.h is also implicitly included by scx_common.bpf.h.
+#include "scx_common.bpf.h"
+
+/*
+ * vmlinux.h provides definitions for struct task_struct and
+ * struct scx_enable_args.
+ */
+void BPF_STRUCT_OPS(example_enable, struct task_struct *p,
+                    struct scx_enable_args *args)
+{
+        bpf_printk("Task %s enabled in example scheduler", p->comm);
+}
+
+// vmlinux.h provides the definition for struct sched_ext_ops.
+SEC(".struct_ops.link")
+struct sched_ext_ops example_ops = {
+        .enable = (void *)example_enable,
+        .name = "example",
+};
+```
+
+The scheduler build system will generate this vmlinux.h file as part of the
+scheduler build pipeline. It looks for a vmlinux file in the following
+dependency order:
+
+1. If the O= environment variable is defined, at `$O/vmlinux`
+2. If the KBUILD_OUTPUT= environment variable is defined, at
+   `$KBUILD_OUTPUT/vmlinux`
+3. At `../../vmlinux` (i.e.
at the root of the kernel tree where you're
+   compiling the schedulers)
+4. `/sys/kernel/btf/vmlinux`
+5. `/boot/vmlinux-$(uname -r)`
+
+In other words, if you have compiled a kernel in your local repo, its vmlinux
+file will be used to generate vmlinux.h. Otherwise, it will be the vmlinux of
+the kernel you're currently running on. This means that if you're running on a
+kernel with sched_ext support, you may not need to compile a local kernel at
+all.
+
+### Aside on CO-RE
+
+One of the cooler features of BPF is that it supports
+[CO-RE](https://nakryiko.com/posts/bpf-core-reference-guide/) (Compile Once Run
+Everywhere). This feature allows you to reference fields inside of structs with
+types defined internal to the kernel, and not have to recompile if you load the
+BPF program on a different kernel with the field at a different offset. In our
+example above, we print out a task name with `p->comm`. CO-RE would perform
+relocations for that access when the program is loaded to ensure that it's
+referencing the correct offset for the currently running kernel.
+
+## Compiling the schedulers
+
+Once you have your toolchain set up, and a vmlinux that can be used to generate
+a full vmlinux.h file, you can compile the schedulers using `make`:
+
+```bash
+$ make -j$(nproc)
+```
+
+# Example schedulers
+
+This directory contains the following example schedulers. These schedulers are
+for testing and demonstrating different aspects of sched_ext. While some may be
+useful in limited scenarios, they are not intended to be practical.
+
+For more scheduler implementations, tools and documentation, visit
+https://github.com/sched-ext/scx.
+
+## scx_simple
+
+A simple scheduler that provides an example of a minimal sched_ext scheduler.
+scx_simple can be run in either global weighted vtime mode, or FIFO mode.
+
+Though very simple, in limited scenarios, this scheduler can perform reasonably
+well on single-socket systems with a unified L3 cache.
+
+## scx_qmap
+
+Another simple, yet slightly more complex scheduler that provides an example of
+a basic weighted FIFO queuing policy. It also provides examples of some common
+useful BPF features, such as sleepable per-task storage allocation in the
+`ops.prep_enable()` callback, and using the `BPF_MAP_TYPE_QUEUE` map type to
+enqueue tasks. It also illustrates how core-sched support could be implemented.
+
+## scx_central
+
+A "central" scheduler where scheduling decisions are made from a single CPU.
+This scheduler illustrates how scheduling decisions can be dispatched from a
+single CPU, allowing other cores to run with infinite slices, without timer
+ticks, and without having to incur the overhead of making scheduling decisions.
+
+The approach demonstrated by this scheduler may be useful for any workload that
+benefits from minimizing scheduling overhead and timer ticks. An example of
+where this could be particularly useful is running VMs, where running with
+infinite slices and no timer ticks allows the VM to avoid unnecessary expensive
+vmexits.
+
+
+# Troubleshooting
+
+There are a number of common issues that you may run into when building the
+schedulers. We'll go over some of the common ones here.
+
+## Build Failures
+
+### Old version of clang
+
+```
+error: static assertion failed due to requirement 'SCX_DSQ_FLAG_BUILTIN': bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole
+        _Static_assert(SCX_DSQ_FLAG_BUILTIN,
+        ^~~~~~~~~~~~~~~~~~~~
+1 error generated.
+```
+
+This means you built the kernel or the schedulers with an older version of
+clang than what's supported (i.e. older than 16.0.0). To remediate this:
+
+1. Run `which clang` to make sure you're using a sufficiently new version of
+   clang.
+
+2. Run `make fullclean` in the root path of the repository to remove stale
+   build artifacts.
+
+3. Rebuild the kernel, and then your example schedulers.
+
+The schedulers are also cleaned if you invoke `make mrproper` in the root
+directory of the tree.
+
+### Stale kernel build / incomplete vmlinux.h file
+
+As described above, you'll need a `vmlinux.h` file that was generated from a
+vmlinux built with BTF, and with sched_ext support enabled. If you don't,
+you'll see errors such as the following which indicate that a type being
+referenced in a scheduler is unknown:
+
+```
+/path/to/sched_ext/tools/sched_ext/user_exit_info.h:25:23: note: forward declaration of 'struct scx_exit_info'
+
+const struct scx_exit_info *ei)
+
+^
+```
+
+In order to resolve this, please follow the steps above in
+[Getting a vmlinux.h file](#getting-a-vmlinuxh-file) in order to ensure your
+schedulers are using a vmlinux.h file that includes the requisite types.
+
+## Misc
+
+### llvm: [OFF]
+
+You may see the following output when building the schedulers:
+
+```
+Auto-detecting system features:
+... clang-bpf-co-re: [ on ]
+... llvm: [ OFF ]
+... libcap: [ on ]
+... libbfd: [ on ]
+```
+
+Seeing `llvm: [ OFF ]` here is not an issue. You can safely ignore it. -- cgit v1.2.3 From a5db7817af780db6a7f290c79677eff4fd13c5fa Mon Sep 17 00:00:00 2001 From: David Vernet Date: Tue, 18 Jun 2024 10:09:21 -1000 Subject: sched_ext: Add selftests Add basic selftests. Signed-off-by: David Vernet Acked-by: Tejun Heo --- tools/testing/selftests/sched_ext/.gitignore | 6 + tools/testing/selftests/sched_ext/Makefile | 218 +++++++++++++++++++++ tools/testing/selftests/sched_ext/config | 9 + tools/testing/selftests/sched_ext/create_dsq.bpf.c | 58 ++++++ tools/testing/selftests/sched_ext/create_dsq.c | 57 ++++++ .../selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c | 42 ++++ .../selftests/sched_ext/ddsp_bogus_dsq_fail.c | 57 ++++++ .../selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c | 39 ++++ .../selftests/sched_ext/ddsp_vtimelocal_fail.c | 56 ++++++ .../testing/selftests/sched_ext/dsp_local_on.bpf.c | 65 ++++++ tools/testing/selftests/sched_ext/dsp_local_on.c | 58 ++++++ .../sched_ext/enq_last_no_enq_fails.bpf.c | 21 ++ .../selftests/sched_ext/enq_last_no_enq_fails.c | 60 ++++++ .../selftests/sched_ext/enq_select_cpu_fails.bpf.c | 43 ++++ .../selftests/sched_ext/enq_select_cpu_fails.c | 61 ++++++ tools/testing/selftests/sched_ext/exit.bpf.c | 84 ++++++++ tools/testing/selftests/sched_ext/exit.c | 55 ++++++ tools/testing/selftests/sched_ext/exit_test.h | 20 ++ tools/testing/selftests/sched_ext/hotplug.bpf.c | 61 ++++++ tools/testing/selftests/sched_ext/hotplug.c | 168 ++++++++++++++++ tools/testing/selftests/sched_ext/hotplug_test.h | 15 ++ .../selftests/sched_ext/init_enable_count.bpf.c | 53 +++++ .../selftests/sched_ext/init_enable_count.c | 166 ++++++++++++++++ tools/testing/selftests/sched_ext/maximal.bpf.c | 132 +++++++++++++ tools/testing/selftests/sched_ext/maximal.c | 51 +++++ tools/testing/selftests/sched_ext/maybe_null.bpf.c | 36 ++++ tools/testing/selftests/sched_ext/maybe_null.c | 49 +++++ .../selftests/sched_ext/maybe_null_fail_dsp.bpf.c | 25 +++ .../selftests/sched_ext/maybe_null_fail_yld.bpf.c | 28 +++ tools/testing/selftests/sched_ext/minimal.bpf.c | 21 ++
tools/testing/selftests/sched_ext/minimal.c | 58 ++++++ tools/testing/selftests/sched_ext/prog_run.bpf.c | 32 +++ tools/testing/selftests/sched_ext/prog_run.c | 78 ++++++++ tools/testing/selftests/sched_ext/reload_loop.c | 75 +++++++ tools/testing/selftests/sched_ext/runner.c | 201 +++++++++++++++++++ tools/testing/selftests/sched_ext/scx_test.h | 131 +++++++++++++ .../selftests/sched_ext/select_cpu_dfl.bpf.c | 40 ++++ tools/testing/selftests/sched_ext/select_cpu_dfl.c | 72 +++++++ .../sched_ext/select_cpu_dfl_nodispatch.bpf.c | 89 +++++++++ .../sched_ext/select_cpu_dfl_nodispatch.c | 72 +++++++ .../selftests/sched_ext/select_cpu_dispatch.bpf.c | 41 ++++ .../selftests/sched_ext/select_cpu_dispatch.c | 70 +++++++ .../sched_ext/select_cpu_dispatch_bad_dsq.bpf.c | 37 ++++ .../sched_ext/select_cpu_dispatch_bad_dsq.c | 56 ++++++ .../sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c | 38 ++++ .../sched_ext/select_cpu_dispatch_dbl_dsp.c | 56 ++++++ .../selftests/sched_ext/select_cpu_vtime.bpf.c | 92 +++++++++ .../testing/selftests/sched_ext/select_cpu_vtime.c | 59 ++++++ tools/testing/selftests/sched_ext/test_example.c | 49 +++++ tools/testing/selftests/sched_ext/util.c | 71 +++++++ tools/testing/selftests/sched_ext/util.h | 13 ++ 51 files changed, 3244 insertions(+) create mode 100644 tools/testing/selftests/sched_ext/.gitignore create mode 100644 tools/testing/selftests/sched_ext/Makefile create mode 100644 tools/testing/selftests/sched_ext/config create mode 100644 tools/testing/selftests/sched_ext/create_dsq.bpf.c create mode 100644 tools/testing/selftests/sched_ext/create_dsq.c create mode 100644 tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c create mode 100644 tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c create mode 100644 tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c create mode 100644 tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c create mode 100644 tools/testing/selftests/sched_ext/dsp_local_on.bpf.c create mode 100644 tools/testing/selftests/sched_ext/dsp_local_on.c create mode 100644 tools/testing/selftests/sched_ext/enq_last_no_enq_fails.bpf.c create mode 100644 tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c create mode 100644 tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c create mode 100644 tools/testing/selftests/sched_ext/enq_select_cpu_fails.c create mode 100644 tools/testing/selftests/sched_ext/exit.bpf.c create mode 100644 tools/testing/selftests/sched_ext/exit.c create mode 100644 tools/testing/selftests/sched_ext/exit_test.h create mode 100644 tools/testing/selftests/sched_ext/hotplug.bpf.c create mode 100644 tools/testing/selftests/sched_ext/hotplug.c create mode 100644 tools/testing/selftests/sched_ext/hotplug_test.h create mode 100644 tools/testing/selftests/sched_ext/init_enable_count.bpf.c create mode 100644 tools/testing/selftests/sched_ext/init_enable_count.c create mode 100644 tools/testing/selftests/sched_ext/maximal.bpf.c create mode 100644 tools/testing/selftests/sched_ext/maximal.c create mode 100644 tools/testing/selftests/sched_ext/maybe_null.bpf.c create mode 100644 tools/testing/selftests/sched_ext/maybe_null.c create mode 100644 tools/testing/selftests/sched_ext/maybe_null_fail_dsp.bpf.c create mode 100644 tools/testing/selftests/sched_ext/maybe_null_fail_yld.bpf.c create mode 100644 tools/testing/selftests/sched_ext/minimal.bpf.c create mode 100644 tools/testing/selftests/sched_ext/minimal.c create mode 100644 tools/testing/selftests/sched_ext/prog_run.bpf.c create mode 100644 
tools/testing/selftests/sched_ext/prog_run.c create mode 100644 tools/testing/selftests/sched_ext/reload_loop.c create mode 100644 tools/testing/selftests/sched_ext/runner.c create mode 100644 tools/testing/selftests/sched_ext/scx_test.h create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl.c create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch.c create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c create mode 100644 tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c create mode 100644 tools/testing/selftests/sched_ext/select_cpu_vtime.c create mode 100644 tools/testing/selftests/sched_ext/test_example.c create mode 100644 tools/testing/selftests/sched_ext/util.c create mode 100644 tools/testing/selftests/sched_ext/util.h (limited to 'tools') diff --git a/tools/testing/selftests/sched_ext/.gitignore b/tools/testing/selftests/sched_ext/.gitignore new file mode 100644 index 000000000000..ae5491a114c0 --- /dev/null +++ b/tools/testing/selftests/sched_ext/.gitignore @@ -0,0 +1,6 @@ +* +!*.c +!*.h +!Makefile +!.gitignore +!config diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile new file mode 100644 index 000000000000..0754a2c110a1 --- /dev/null +++ b/tools/testing/selftests/sched_ext/Makefile @@ -0,0 +1,218 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2022 Meta Platforms, Inc. and affiliates. +include ../../../build/Build.include +include ../../../scripts/Makefile.arch +include ../../../scripts/Makefile.include +include ../lib.mk + +ifneq ($(LLVM),) +ifneq ($(filter %/,$(LLVM)),) +LLVM_PREFIX := $(LLVM) +else ifneq ($(filter -%,$(LLVM)),) +LLVM_SUFFIX := $(LLVM) +endif + +CC := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) $(CLANG_FLAGS) -fintegrated-as +else +CC := gcc +endif # LLVM + +ifneq ($(CROSS_COMPILE),) +$(error CROSS_COMPILE not supported for scx selftests) +endif # CROSS_COMPILE + +CURDIR := $(abspath .) +REPOROOT := $(abspath ../../../..) 
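+# All paths below are derived from the kernel tree root so that the selftests
+# can reuse the in-tree libbpf, bpftool and sched_ext headers.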
+TOOLSDIR := $(REPOROOT)/tools +LIBDIR := $(TOOLSDIR)/lib +BPFDIR := $(LIBDIR)/bpf +TOOLSINCDIR := $(TOOLSDIR)/include +BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool +APIDIR := $(TOOLSINCDIR)/uapi +GENDIR := $(REPOROOT)/include/generated +GENHDR := $(GENDIR)/autoconf.h +SCXTOOLSDIR := $(TOOLSDIR)/sched_ext +SCXTOOLSINCDIR := $(TOOLSDIR)/sched_ext/include + +OUTPUT_DIR := $(CURDIR)/build +OBJ_DIR := $(OUTPUT_DIR)/obj +INCLUDE_DIR := $(OUTPUT_DIR)/include +BPFOBJ_DIR := $(OBJ_DIR)/libbpf +SCXOBJ_DIR := $(OBJ_DIR)/sched_ext +BPFOBJ := $(BPFOBJ_DIR)/libbpf.a +LIBBPF_OUTPUT := $(OBJ_DIR)/libbpf/libbpf.a +DEFAULT_BPFTOOL := $(OUTPUT_DIR)/sbin/bpftool +HOST_BUILD_DIR := $(OBJ_DIR) +HOST_OUTPUT_DIR := $(OUTPUT_DIR) + +VMLINUX_BTF_PATHS ?= ../../../../vmlinux \ + /sys/kernel/btf/vmlinux \ + /boot/vmlinux-$(shell uname -r) +VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) +ifeq ($(VMLINUX_BTF),) +$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)") +endif + +BPFTOOL ?= $(DEFAULT_BPFTOOL) + +ifneq ($(wildcard $(GENHDR)),) + GENFLAGS := -DHAVE_GENHDR +endif + +CFLAGS += -g -O2 -rdynamic -pthread -Wall -Werror $(GENFLAGS) \ + -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ + -I$(TOOLSINCDIR) -I$(APIDIR) -I$(CURDIR)/include -I$(SCXTOOLSINCDIR) + +# Silence some warnings when compiled with clang +ifneq ($(LLVM),) +CFLAGS += -Wno-unused-command-line-argument +endif + +LDFLAGS = -lelf -lz -lpthread -lzstd + +IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - &1 \ + | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ +$(shell $(1) -dM -E - $@ +else + $(call msg,CP,,$@) + $(Q)cp "$(VMLINUX_H)" $@ +endif + +$(SCXOBJ_DIR)/%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h | $(BPFOBJ) $(SCXOBJ_DIR) + $(call msg,CLNG-BPF,,$(notdir $@)) + $(Q)$(CLANG) $(BPF_CFLAGS) -target bpf -c $< -o $@ + +$(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BPFTOOL) | $(INCLUDE_DIR) + $(eval sched=$(notdir $@)) + $(call msg,GEN-SKEL,,$(sched)) + $(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $< + $(Q)$(BPFTOOL) gen object $(<:.o=.linked2.o) $(<:.o=.linked1.o) + $(Q)$(BPFTOOL) gen object $(<:.o=.linked3.o) $(<:.o=.linked2.o) + $(Q)diff $(<:.o=.linked2.o) $(<:.o=.linked3.o) + $(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $@ + $(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $(@:.skel.h=.subskel.h) + +################ +# C schedulers # +################ + +override define CLEAN + rm -rf $(OUTPUT_DIR) + rm -f *.o *.bpf.o *.bpf.skel.h *.bpf.subskel.h + rm -f $(TEST_GEN_PROGS) + rm -f runner +endef + +# Every testcase takes all of the BPF progs are dependencies by default. This +# allows testcases to load any BPF scheduler, which is useful for testcases +# that don't need their own prog to run their test. 
+all_test_bpfprogs := $(foreach prog,$(wildcard *.bpf.c),$(INCLUDE_DIR)/$(patsubst %.c,%.skel.h,$(prog))) + +auto-test-targets := \ + create_dsq \ + enq_last_no_enq_fails \ + enq_select_cpu_fails \ + ddsp_bogus_dsq_fail \ + ddsp_vtimelocal_fail \ + dsp_local_on \ + exit \ + hotplug \ + init_enable_count \ + maximal \ + maybe_null \ + minimal \ + prog_run \ + reload_loop \ + select_cpu_dfl \ + select_cpu_dfl_nodispatch \ + select_cpu_dispatch \ + select_cpu_dispatch_bad_dsq \ + select_cpu_dispatch_dbl_dsp \ + select_cpu_vtime \ + test_example \ + +testcase-targets := $(addsuffix .o,$(addprefix $(SCXOBJ_DIR)/,$(auto-test-targets))) + +$(SCXOBJ_DIR)/runner.o: runner.c | $(SCXOBJ_DIR) + $(CC) $(CFLAGS) -c $< -o $@ + +# Create all of the test targets object files, whose testcase objects will be +# registered into the runner in ELF constructors. +# +# Note that we must do double expansion here in order to support conditionally +# compiling BPF object files only if one is present, as the wildcard Make +# function doesn't support using implicit rules otherwise. +$(testcase-targets): $(SCXOBJ_DIR)/%.o: %.c $(SCXOBJ_DIR)/runner.o $(all_test_bpfprogs) | $(SCXOBJ_DIR) + $(eval test=$(patsubst %.o,%.c,$(notdir $@))) + $(CC) $(CFLAGS) -c $< -o $@ $(SCXOBJ_DIR)/runner.o + +$(SCXOBJ_DIR)/util.o: util.c | $(SCXOBJ_DIR) + $(CC) $(CFLAGS) -c $< -o $@ + +runner: $(SCXOBJ_DIR)/runner.o $(SCXOBJ_DIR)/util.o $(BPFOBJ) $(testcase-targets) + @echo "$(testcase-targets)" + $(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS) + +TEST_GEN_PROGS := runner + +all: runner + +.PHONY: all clean help + +.DEFAULT_GOAL := all + +.DELETE_ON_ERROR: + +.SECONDARY: diff --git a/tools/testing/selftests/sched_ext/config b/tools/testing/selftests/sched_ext/config new file mode 100644 index 000000000000..0de9b4ee249d --- /dev/null +++ b/tools/testing/selftests/sched_ext/config @@ -0,0 +1,9 @@ +CONFIG_SCHED_DEBUG=y +CONFIG_SCHED_CLASS_EXT=y +CONFIG_CGROUPS=y +CONFIG_CGROUP_SCHED=y +CONFIG_EXT_GROUP_SCHED=y +CONFIG_BPF=y +CONFIG_BPF_SYSCALL=y +CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_BTF=y diff --git a/tools/testing/selftests/sched_ext/create_dsq.bpf.c b/tools/testing/selftests/sched_ext/create_dsq.bpf.c new file mode 100644 index 000000000000..23f79ed343f0 --- /dev/null +++ b/tools/testing/selftests/sched_ext/create_dsq.bpf.c @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Create and destroy DSQs in a loop. + * + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2024 David Vernet + */ + +#include + +char _license[] SEC("license") = "GPL"; + +void BPF_STRUCT_OPS(create_dsq_exit_task, struct task_struct *p, + struct scx_exit_task_args *args) +{ + scx_bpf_destroy_dsq(p->pid); +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(create_dsq_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + s32 err; + + err = scx_bpf_create_dsq(p->pid, -1); + if (err) + scx_bpf_error("Failed to create DSQ for %s[%d]", + p->comm, p->pid); + + return err; +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(create_dsq_init) +{ + u32 i; + s32 err; + + bpf_for(i, 0, 1024) { + err = scx_bpf_create_dsq(i, -1); + if (err) { + scx_bpf_error("Failed to create DSQ %d", i); + return 0; + } + } + + bpf_for(i, 0, 1024) { + scx_bpf_destroy_dsq(i); + } + + return 0; +} + +SEC(".struct_ops.link") +struct sched_ext_ops create_dsq_ops = { + .init_task = create_dsq_init_task, + .exit_task = create_dsq_exit_task, + .init = create_dsq_init, + .name = "create_dsq", +}; diff --git a/tools/testing/selftests/sched_ext/create_dsq.c b/tools/testing/selftests/sched_ext/create_dsq.c new file mode 100644 index 000000000000..fa946d9146d4 --- /dev/null +++ b/tools/testing/selftests/sched_ext/create_dsq.c @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + */ +#include +#include +#include +#include +#include "create_dsq.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct create_dsq *skel; + + skel = create_dsq__open_and_load(); + if (!skel) { + SCX_ERR("Failed to open and load skel"); + return SCX_TEST_FAIL; + } + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct create_dsq *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.create_dsq_ops); + if (!link) { + SCX_ERR("Failed to attach scheduler"); + return SCX_TEST_FAIL; + } + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct create_dsq *skel = ctx; + + create_dsq__destroy(skel); +} + +struct scx_test create_dsq = { + .name = "create_dsq", + .description = "Create and destroy a dsq in a loop", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&create_dsq) diff --git a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c new file mode 100644 index 000000000000..e97ad41d354a --- /dev/null +++ b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ +#include + +char _license[] SEC("license") = "GPL"; + +UEI_DEFINE(uei); + +s32 BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + s32 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + + if (cpu >= 0) { + /* + * If we dispatch to a bogus DSQ that will fall back to the + * builtin global DSQ, we fail gracefully. 
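+	 * The DSQ ID 0xcafef00d below was never created, so the core is
+	 * expected to flag the dispatch and exit the scheduler with
+	 * SCX_EXIT_ERROR, which the userspace half of this test verifies.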
+ */ + scx_bpf_dispatch_vtime(p, 0xcafef00d, SCX_SLICE_DFL, + p->scx.dsq_vtime, 0); + return cpu; + } + + return prev_cpu; +} + +void BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops ddsp_bogus_dsq_fail_ops = { + .select_cpu = ddsp_bogus_dsq_fail_select_cpu, + .exit = ddsp_bogus_dsq_fail_exit, + .name = "ddsp_bogus_dsq_fail", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c new file mode 100644 index 000000000000..e65d22f23f3b --- /dev/null +++ b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ +#include +#include +#include +#include +#include "ddsp_bogus_dsq_fail.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct ddsp_bogus_dsq_fail *skel; + + skel = ddsp_bogus_dsq_fail__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct ddsp_bogus_dsq_fail *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.ddsp_bogus_dsq_fail_ops); + SCX_FAIL_IF(!link, "Failed to attach struct_ops"); + + sleep(1); + + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct ddsp_bogus_dsq_fail *skel = ctx; + + ddsp_bogus_dsq_fail__destroy(skel); +} + +struct scx_test ddsp_bogus_dsq_fail = { + .name = "ddsp_bogus_dsq_fail", + .description = "Verify we gracefully fail, and fall back to using a " + "built-in DSQ, if we do a direct dispatch to an invalid" + " DSQ in ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&ddsp_bogus_dsq_fail) diff --git a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c new file mode 100644 index 000000000000..dde7e7dafbfb --- /dev/null +++ b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ +#include + +char _license[] SEC("license") = "GPL"; + +UEI_DEFINE(uei); + +s32 BPF_STRUCT_OPS(ddsp_vtimelocal_fail_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + s32 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + + if (cpu >= 0) { + /* Shouldn't be allowed to vtime dispatch to a builtin DSQ. 
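+		 * Built-in DSQs such as SCX_DSQ_LOCAL only support FIFO
+		 * dispatching, so this vtime dispatch is expected to abort the
+		 * scheduler with SCX_EXIT_ERROR, which the userspace half of
+		 * this test verifies.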
*/ + scx_bpf_dispatch_vtime(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, + p->scx.dsq_vtime, 0); + return cpu; + } + + return prev_cpu; +} + +void BPF_STRUCT_OPS(ddsp_vtimelocal_fail_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops ddsp_vtimelocal_fail_ops = { + .select_cpu = ddsp_vtimelocal_fail_select_cpu, + .exit = ddsp_vtimelocal_fail_exit, + .name = "ddsp_vtimelocal_fail", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c new file mode 100644 index 000000000000..abafee587cd6 --- /dev/null +++ b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ +#include +#include +#include +#include "ddsp_vtimelocal_fail.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct ddsp_vtimelocal_fail *skel; + + skel = ddsp_vtimelocal_fail__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct ddsp_vtimelocal_fail *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.ddsp_vtimelocal_fail_ops); + SCX_FAIL_IF(!link, "Failed to attach struct_ops"); + + sleep(1); + + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct ddsp_vtimelocal_fail *skel = ctx; + + ddsp_vtimelocal_fail__destroy(skel); +} + +struct scx_test ddsp_vtimelocal_fail = { + .name = "ddsp_vtimelocal_fail", + .description = "Verify we gracefully fail, and fall back to using a " + "built-in DSQ, if we do a direct vtime dispatch to a " + "built-in DSQ from DSQ in ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&ddsp_vtimelocal_fail) diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c new file mode 100644 index 000000000000..efb4672decb4 --- /dev/null +++ b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2024 David Vernet
+ */
+#include
+
+char _license[] SEC("license") = "GPL";
+const volatile s32 nr_cpus;
+
+UEI_DEFINE(uei);
+
+struct {
+	__uint(type, BPF_MAP_TYPE_QUEUE);
+	__uint(max_entries, 8192);
+	__type(value, s32);
+} queue SEC(".maps");
+
+s32 BPF_STRUCT_OPS(dsp_local_on_select_cpu, struct task_struct *p,
+		   s32 prev_cpu, u64 wake_flags)
+{
+	return prev_cpu;
+}
+
+void BPF_STRUCT_OPS(dsp_local_on_enqueue, struct task_struct *p,
+		    u64 enq_flags)
+{
+	s32 pid = p->pid;
+
+	if (bpf_map_push_elem(&queue, &pid, 0))
+		scx_bpf_error("Failed to enqueue %s[%d]", p->comm, p->pid);
+}
+
+void BPF_STRUCT_OPS(dsp_local_on_dispatch, s32 cpu, struct task_struct *prev)
+{
+	s32 pid, target;
+	struct task_struct *p;
+
+	if (bpf_map_pop_elem(&queue, &pid))
+		return;
+
+	p = bpf_task_from_pid(pid);
+	if (!p)
+		return;
+
+	target = bpf_get_prandom_u32() % nr_cpus;
+
+	scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | target, SCX_SLICE_DFL, 0);
+	bpf_task_release(p);
+}
+
+void BPF_STRUCT_OPS(dsp_local_on_exit, struct scx_exit_info *ei)
+{
+	UEI_RECORD(uei, ei);
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops dsp_local_on_ops = {
+	.select_cpu = dsp_local_on_select_cpu,
+	.enqueue = dsp_local_on_enqueue,
+	.dispatch = dsp_local_on_dispatch,
+	.exit = dsp_local_on_exit,
+	.name = "dsp_local_on",
+	.timeout_ms = 1000U,
+}; diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.c b/tools/testing/selftests/sched_ext/dsp_local_on.c new file mode 100644 index 000000000000..472851b56854 --- /dev/null +++ b/tools/testing/selftests/sched_ext/dsp_local_on.c @@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2024 David Vernet
+ */
+#include
+#include
+#include
+#include "dsp_local_on.bpf.skel.h"
+#include "scx_test.h"
+
+static enum scx_test_status setup(void **ctx)
+{
+	struct dsp_local_on *skel;
+
+	skel = dsp_local_on__open();
+	SCX_FAIL_IF(!skel, "Failed to open");
+
+	skel->rodata->nr_cpus = libbpf_num_possible_cpus();
+	SCX_FAIL_IF(dsp_local_on__load(skel), "Failed to load skel");
+	*ctx = skel;
+
+	return SCX_TEST_PASS;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+	struct dsp_local_on *skel = ctx;
+	struct bpf_link *link;
+
+	link = bpf_map__attach_struct_ops(skel->maps.dsp_local_on_ops);
+	SCX_FAIL_IF(!link, "Failed to attach struct_ops");
+
+	/* Just sleeping is fine, plenty of scheduling events happening */
+	sleep(1);
+
+	SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR));
+	bpf_link__destroy(link);
+
+	return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+	struct dsp_local_on *skel = ctx;
+
+	dsp_local_on__destroy(skel);
+}
+
+struct scx_test dsp_local_on = {
+	.name = "dsp_local_on",
+	.description = "Verify we can directly dispatch tasks to local DSQs "
+		       "from ops.dispatch()",
+	.setup = setup,
+	.run = run,
+	.cleanup = cleanup,
+};
+REGISTER_SCX_TEST(&dsp_local_on) diff --git a/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.bpf.c b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.bpf.c new file mode 100644 index 000000000000..b0b99531d5d5 --- /dev/null +++ b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.bpf.c @@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * A scheduler that validates that loading fails if the SCX_OPS_ENQ_LAST flag
+ * is specified without also defining ops.enqueue().
+ *
+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates.
diff --git a/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.bpf.c b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.bpf.c new file mode 100644 index 000000000000..b0b99531d5d5 --- /dev/null +++ b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.bpf.c @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that verifies that loading fails if the SCX_OPS_ENQ_LAST flag + * is specified without an ops.enqueue() implementation. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +SEC(".struct_ops.link") +struct sched_ext_ops enq_last_no_enq_fails_ops = { + .name = "enq_last_no_enq_fails", + /* Need to define ops.enqueue() with SCX_OPS_ENQ_LAST */ + .flags = SCX_OPS_ENQ_LAST, + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c new file mode 100644 index 000000000000..2a3eda5e2c0b --- /dev/null +++ b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include <bpf/bpf.h> +#include <scx/common.h> +#include <sys/wait.h> +#include <unistd.h> +#include "enq_last_no_enq_fails.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct enq_last_no_enq_fails *skel; + + skel = enq_last_no_enq_fails__open_and_load(); + if (!skel) { + SCX_ERR("Failed to open and load skel"); + return SCX_TEST_FAIL; + } + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct enq_last_no_enq_fails *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.enq_last_no_enq_fails_ops); + if (link) { + SCX_ERR("Incorrectly succeeded in attaching scheduler"); + return SCX_TEST_FAIL; + } + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct enq_last_no_enq_fails *skel = ctx; + + enq_last_no_enq_fails__destroy(skel); +} + +struct scx_test enq_last_no_enq_fails = { + .name = "enq_last_no_enq_fails", + .description = "Verify we fail to load a scheduler if we specify " + "the SCX_OPS_ENQ_LAST flag without defining " + "ops.enqueue()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&enq_last_no_enq_fails)
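For reference, the flag being tested changes how the last runnable task on a CPU is handled: with SCX_OPS_ENQ_LAST set, such a task is handed to ops.enqueue() with SCX_ENQ_LAST instead of being kept running by default, so an enqueue implementation has to exist and has to put the task somewhere runnable. A sketch of an ops.enqueue() that would satisfy the flag; illustrative, not part of the patch:

	void BPF_STRUCT_OPS(example_enqueue, struct task_struct *p, u64 enq_flags)
	{
		/* keep the CPU's last runnable task running locally */
		if (enq_flags & SCX_ENQ_LAST)
			scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
		else
			scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
	}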
diff --git a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c new file mode 100644 index 000000000000..b3dfc1033cd6 --- /dev/null +++ b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +/* Manually specify the signature until the kfunc is added to the scx repo. */ +s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, + bool *found) __ksym; + +s32 BPF_STRUCT_OPS(enq_select_cpu_fails_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + return prev_cpu; +} + +void BPF_STRUCT_OPS(enq_select_cpu_fails_enqueue, struct task_struct *p, + u64 enq_flags) +{ + /* + * Need to initialize the variable or the verifier will fail to load. + * Improving these semantics is actively being worked on. + */ + bool found = false; + + /* Can only call from ops.select_cpu() */ + scx_bpf_select_cpu_dfl(p, 0, 0, &found); + + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); +} + +SEC(".struct_ops.link") +struct sched_ext_ops enq_select_cpu_fails_ops = { + .select_cpu = enq_select_cpu_fails_select_cpu, + .enqueue = enq_select_cpu_fails_enqueue, + .name = "enq_select_cpu_fails", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c new file mode 100644 index 000000000000..dd1350e5f002 --- /dev/null +++ b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include <bpf/bpf.h> +#include <scx/common.h> +#include <sys/wait.h> +#include <unistd.h> +#include "enq_select_cpu_fails.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct enq_select_cpu_fails *skel; + + skel = enq_select_cpu_fails__open_and_load(); + if (!skel) { + SCX_ERR("Failed to open and load skel"); + return SCX_TEST_FAIL; + } + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct enq_select_cpu_fails *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.enq_select_cpu_fails_ops); + if (!link) { + SCX_ERR("Failed to attach scheduler"); + return SCX_TEST_FAIL; + } + + sleep(1); + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct enq_select_cpu_fails *skel = ctx; + + enq_select_cpu_fails__destroy(skel); +} + +struct scx_test enq_select_cpu_fails = { + .name = "enq_select_cpu_fails", + .description = "Verify we fail to call scx_bpf_select_cpu_dfl() " + "from ops.enqueue()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&enq_select_cpu_fails) diff --git a/tools/testing/selftests/sched_ext/exit.bpf.c b/tools/testing/selftests/sched_ext/exit.bpf.c new file mode 100644 index 000000000000..ae12ddaac921 --- /dev/null +++ b/tools/testing/selftests/sched_ext/exit.bpf.c @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
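The valid home for scx_bpf_select_cpu_dfl() is ops.select_cpu(), where direct dispatch is also permitted; calling it from ops.enqueue() as above makes the kernel abort the scheduler. A sketch of the correct usage:

	s32 BPF_STRUCT_OPS(example_select_cpu, struct task_struct *p,
			   s32 prev_cpu, u64 wake_flags)
	{
		bool found = false;
		s32 cpu;

		cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &found);
		if (found)	/* an idle CPU was claimed; dispatch directly */
			scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);

		return cpu;
	}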
+ * Copyright (c) 2024 David Vernet + */ + +#include + +char _license[] SEC("license") = "GPL"; + +#include "exit_test.h" + +const volatile int exit_point; +UEI_DEFINE(uei); + +#define EXIT_CLEANLY() scx_bpf_exit(exit_point, "%d", exit_point) + +s32 BPF_STRUCT_OPS(exit_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + bool found; + + if (exit_point == EXIT_SELECT_CPU) + EXIT_CLEANLY(); + + return scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &found); +} + +void BPF_STRUCT_OPS(exit_enqueue, struct task_struct *p, u64 enq_flags) +{ + if (exit_point == EXIT_ENQUEUE) + EXIT_CLEANLY(); + + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); +} + +void BPF_STRUCT_OPS(exit_dispatch, s32 cpu, struct task_struct *p) +{ + if (exit_point == EXIT_DISPATCH) + EXIT_CLEANLY(); + + scx_bpf_consume(SCX_DSQ_GLOBAL); +} + +void BPF_STRUCT_OPS(exit_enable, struct task_struct *p) +{ + if (exit_point == EXIT_ENABLE) + EXIT_CLEANLY(); +} + +s32 BPF_STRUCT_OPS(exit_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + if (exit_point == EXIT_INIT_TASK) + EXIT_CLEANLY(); + + return 0; +} + +void BPF_STRUCT_OPS(exit_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(exit_init) +{ + if (exit_point == EXIT_INIT) + EXIT_CLEANLY(); + + return 0; +} + +SEC(".struct_ops.link") +struct sched_ext_ops exit_ops = { + .select_cpu = exit_select_cpu, + .enqueue = exit_enqueue, + .dispatch = exit_dispatch, + .init_task = exit_init_task, + .enable = exit_enable, + .exit = exit_exit, + .init = exit_init, + .name = "exit", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/exit.c b/tools/testing/selftests/sched_ext/exit.c new file mode 100644 index 000000000000..31bcd06e21cd --- /dev/null +++ b/tools/testing/selftests/sched_ext/exit.c @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + */ +#include +#include +#include +#include +#include +#include "exit.bpf.skel.h" +#include "scx_test.h" + +#include "exit_test.h" + +static enum scx_test_status run(void *ctx) +{ + enum exit_test_case tc; + + for (tc = 0; tc < NUM_EXITS; tc++) { + struct exit *skel; + struct bpf_link *link; + char buf[16]; + + skel = exit__open(); + skel->rodata->exit_point = tc; + exit__load(skel); + link = bpf_map__attach_struct_ops(skel->maps.exit_ops); + if (!link) { + SCX_ERR("Failed to attach scheduler"); + exit__destroy(skel); + return SCX_TEST_FAIL; + } + + /* Assumes uei.kind is written last */ + while (skel->data->uei.kind == EXIT_KIND(SCX_EXIT_NONE)) + sched_yield(); + + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG_BPF)); + SCX_EQ(skel->data->uei.exit_code, tc); + sprintf(buf, "%d", tc); + SCX_ASSERT(!strcmp(skel->data->uei.msg, buf)); + bpf_link__destroy(link); + exit__destroy(skel); + } + + return SCX_TEST_PASS; +} + +struct scx_test exit_test = { + .name = "exit", + .description = "Verify we can cleanly exit a scheduler in multiple places", + .run = run, +}; +REGISTER_SCX_TEST(&exit_test) diff --git a/tools/testing/selftests/sched_ext/exit_test.h b/tools/testing/selftests/sched_ext/exit_test.h new file mode 100644 index 000000000000..94f0268b9cb8 --- /dev/null +++ b/tools/testing/selftests/sched_ext/exit_test.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
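exit.bpf.c wires one guard per callback, so extending coverage to another op follows a fixed recipe: add an enum entry in exit_test.h (below) before NUM_EXITS, guard the new callback, and list it in exit_ops. A hypothetical EXIT_RUNNABLE case would look like:

	/* exit_test.h: add EXIT_RUNNABLE just before NUM_EXITS */

	void BPF_STRUCT_OPS(exit_runnable, struct task_struct *p, u64 enq_flags)
	{
		if (exit_point == EXIT_RUNNABLE)
			EXIT_CLEANLY();
	}

	/* exit.bpf.c: add .runnable = exit_runnable to exit_ops */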
+ * Copyright (c) 2024 David Vernet + */ + +#ifndef __EXIT_TEST_H__ +#define __EXIT_TEST_H__ + +enum exit_test_case { + EXIT_SELECT_CPU, + EXIT_ENQUEUE, + EXIT_DISPATCH, + EXIT_ENABLE, + EXIT_INIT_TASK, + EXIT_INIT, + NUM_EXITS, +}; + +#endif // # __EXIT_TEST_H__ diff --git a/tools/testing/selftests/sched_ext/hotplug.bpf.c b/tools/testing/selftests/sched_ext/hotplug.bpf.c new file mode 100644 index 000000000000..8f2601db39f3 --- /dev/null +++ b/tools/testing/selftests/sched_ext/hotplug.bpf.c @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + */ + +#include + +char _license[] SEC("license") = "GPL"; + +#include "hotplug_test.h" + +UEI_DEFINE(uei); + +void BPF_STRUCT_OPS(hotplug_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +static void exit_from_hotplug(s32 cpu, bool onlining) +{ + /* + * Ignored, just used to verify that we can invoke blocking kfuncs + * from the hotplug path. + */ + scx_bpf_create_dsq(0, -1); + + s64 code = SCX_ECODE_ACT_RESTART | HOTPLUG_EXIT_RSN; + + if (onlining) + code |= HOTPLUG_ONLINING; + + scx_bpf_exit(code, "hotplug event detected (%d going %s)", cpu, + onlining ? "online" : "offline"); +} + +void BPF_STRUCT_OPS_SLEEPABLE(hotplug_cpu_online, s32 cpu) +{ + exit_from_hotplug(cpu, true); +} + +void BPF_STRUCT_OPS_SLEEPABLE(hotplug_cpu_offline, s32 cpu) +{ + exit_from_hotplug(cpu, false); +} + +SEC(".struct_ops.link") +struct sched_ext_ops hotplug_cb_ops = { + .cpu_online = hotplug_cpu_online, + .cpu_offline = hotplug_cpu_offline, + .exit = hotplug_exit, + .name = "hotplug_cbs", + .timeout_ms = 1000U, +}; + +SEC(".struct_ops.link") +struct sched_ext_ops hotplug_nocb_ops = { + .exit = hotplug_exit, + .name = "hotplug_nocbs", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/hotplug.c b/tools/testing/selftests/sched_ext/hotplug.c new file mode 100644 index 000000000000..87bf220b1bce --- /dev/null +++ b/tools/testing/selftests/sched_ext/hotplug.c @@ -0,0 +1,168 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + */ +#include +#include +#include +#include +#include +#include + +#include "hotplug_test.h" +#include "hotplug.bpf.skel.h" +#include "scx_test.h" +#include "util.h" + +const char *online_path = "/sys/devices/system/cpu/cpu1/online"; + +static bool is_cpu_online(void) +{ + return file_read_long(online_path) > 0; +} + +static void toggle_online_status(bool online) +{ + long val = online ? 1 : 0; + int ret; + + ret = file_write_long(online_path, val); + if (ret != 0) + fprintf(stderr, "Failed to bring CPU %s (%s)", + online ? 
"online" : "offline", strerror(errno)); +} + +static enum scx_test_status setup(void **ctx) +{ + if (!is_cpu_online()) + return SCX_TEST_SKIP; + + return SCX_TEST_PASS; +} + +static enum scx_test_status test_hotplug(bool onlining, bool cbs_defined) +{ + struct hotplug *skel; + struct bpf_link *link; + long kind, code; + + SCX_ASSERT(is_cpu_online()); + + skel = hotplug__open_and_load(); + SCX_ASSERT(skel); + + /* Testing the offline -> online path, so go offline before starting */ + if (onlining) + toggle_online_status(0); + + if (cbs_defined) { + kind = SCX_KIND_VAL(SCX_EXIT_UNREG_BPF); + code = SCX_ECODE_VAL(SCX_ECODE_ACT_RESTART) | HOTPLUG_EXIT_RSN; + if (onlining) + code |= HOTPLUG_ONLINING; + } else { + kind = SCX_KIND_VAL(SCX_EXIT_UNREG_KERN); + code = SCX_ECODE_VAL(SCX_ECODE_ACT_RESTART) | + SCX_ECODE_VAL(SCX_ECODE_RSN_HOTPLUG); + } + + if (cbs_defined) + link = bpf_map__attach_struct_ops(skel->maps.hotplug_cb_ops); + else + link = bpf_map__attach_struct_ops(skel->maps.hotplug_nocb_ops); + + if (!link) { + SCX_ERR("Failed to attach scheduler"); + hotplug__destroy(skel); + return SCX_TEST_FAIL; + } + + toggle_online_status(onlining ? 1 : 0); + + while (!UEI_EXITED(skel, uei)) + sched_yield(); + + SCX_EQ(skel->data->uei.kind, kind); + SCX_EQ(UEI_REPORT(skel, uei), code); + + if (!onlining) + toggle_online_status(1); + + bpf_link__destroy(link); + hotplug__destroy(skel); + + return SCX_TEST_PASS; +} + +static enum scx_test_status test_hotplug_attach(void) +{ + struct hotplug *skel; + struct bpf_link *link; + enum scx_test_status status = SCX_TEST_PASS; + long kind, code; + + SCX_ASSERT(is_cpu_online()); + SCX_ASSERT(scx_hotplug_seq() > 0); + + skel = SCX_OPS_OPEN(hotplug_nocb_ops, hotplug); + SCX_ASSERT(skel); + + SCX_OPS_LOAD(skel, hotplug_nocb_ops, hotplug, uei); + + /* + * Take the CPU offline to increment the global hotplug seq, which + * should cause attach to fail due to us setting the hotplug seq above + */ + toggle_online_status(0); + link = bpf_map__attach_struct_ops(skel->maps.hotplug_nocb_ops); + + toggle_online_status(1); + + SCX_ASSERT(link); + while (!UEI_EXITED(skel, uei)) + sched_yield(); + + kind = SCX_KIND_VAL(SCX_EXIT_UNREG_KERN); + code = SCX_ECODE_VAL(SCX_ECODE_ACT_RESTART) | + SCX_ECODE_VAL(SCX_ECODE_RSN_HOTPLUG); + SCX_EQ(skel->data->uei.kind, kind); + SCX_EQ(UEI_REPORT(skel, uei), code); + + bpf_link__destroy(link); + hotplug__destroy(skel); + + return status; +} + +static enum scx_test_status run(void *ctx) +{ + +#define HP_TEST(__onlining, __cbs_defined) ({ \ + if (test_hotplug(__onlining, __cbs_defined) != SCX_TEST_PASS) \ + return SCX_TEST_FAIL; \ +}) + + HP_TEST(true, true); + HP_TEST(false, true); + HP_TEST(true, false); + HP_TEST(false, false); + +#undef HP_TEST + + return test_hotplug_attach(); +} + +static void cleanup(void *ctx) +{ + toggle_online_status(1); +} + +struct scx_test hotplug_test = { + .name = "hotplug", + .description = "Verify hotplug behavior", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&hotplug_test) diff --git a/tools/testing/selftests/sched_ext/hotplug_test.h b/tools/testing/selftests/sched_ext/hotplug_test.h new file mode 100644 index 000000000000..73d236f90787 --- /dev/null +++ b/tools/testing/selftests/sched_ext/hotplug_test.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2024 David Vernet + */ + +#ifndef __HOTPLUG_TEST_H__ +#define __HOTPLUG_TEST_H__ + +enum hotplug_test_flags { + HOTPLUG_EXIT_RSN = 1LLU << 0, + HOTPLUG_ONLINING = 1LLU << 1, +}; + +#endif // # __HOTPLUG_TEST_H__ diff --git a/tools/testing/selftests/sched_ext/init_enable_count.bpf.c b/tools/testing/selftests/sched_ext/init_enable_count.bpf.c new file mode 100644 index 000000000000..47ea89a626c3 --- /dev/null +++ b/tools/testing/selftests/sched_ext/init_enable_count.bpf.c @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that verifies that we do proper counting of init, enable, etc + * callbacks. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +u64 init_task_cnt, exit_task_cnt, enable_cnt, disable_cnt; +u64 init_fork_cnt, init_transition_cnt; + +s32 BPF_STRUCT_OPS_SLEEPABLE(cnt_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + __sync_fetch_and_add(&init_task_cnt, 1); + + if (args->fork) + __sync_fetch_and_add(&init_fork_cnt, 1); + else + __sync_fetch_and_add(&init_transition_cnt, 1); + + return 0; +} + +void BPF_STRUCT_OPS(cnt_exit_task, struct task_struct *p) +{ + __sync_fetch_and_add(&exit_task_cnt, 1); +} + +void BPF_STRUCT_OPS(cnt_enable, struct task_struct *p) +{ + __sync_fetch_and_add(&enable_cnt, 1); +} + +void BPF_STRUCT_OPS(cnt_disable, struct task_struct *p) +{ + __sync_fetch_and_add(&disable_cnt, 1); +} + +SEC(".struct_ops.link") +struct sched_ext_ops init_enable_count_ops = { + .init_task = cnt_init_task, + .exit_task = cnt_exit_task, + .enable = cnt_enable, + .disable = cnt_disable, + .name = "init_enable_count", +}; diff --git a/tools/testing/selftests/sched_ext/init_enable_count.c b/tools/testing/selftests/sched_ext/init_enable_count.c new file mode 100644 index 000000000000..97d45f1e5597 --- /dev/null +++ b/tools/testing/selftests/sched_ext/init_enable_count.c @@ -0,0 +1,166 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include +#include +#include "scx_test.h" +#include "init_enable_count.bpf.skel.h" + +#define SCHED_EXT 7 + +static struct init_enable_count * +open_load_prog(bool global) +{ + struct init_enable_count *skel; + + skel = init_enable_count__open(); + SCX_BUG_ON(!skel, "Failed to open skel"); + + if (!global) + skel->struct_ops.init_enable_count_ops->flags |= SCX_OPS_SWITCH_PARTIAL; + + SCX_BUG_ON(init_enable_count__load(skel), "Failed to load skel"); + + return skel; +} + +static enum scx_test_status run_test(bool global) +{ + struct init_enable_count *skel; + struct bpf_link *link; + const u32 num_children = 5, num_pre_forks = 1024; + int ret, i, status; + struct sched_param param = {}; + pid_t pids[num_pre_forks]; + + skel = open_load_prog(global); + + /* + * Fork a bunch of children before we attach the scheduler so that we + * ensure (at least in practical terms) that there are more tasks that + * transition from SCHED_OTHER -> SCHED_EXT than there are tasks that + * take the fork() path either below or in other processes. 
+ */ + for (i = 0; i < num_pre_forks; i++) { + pids[i] = fork(); + SCX_FAIL_IF(pids[i] < 0, "Failed to fork child"); + if (pids[i] == 0) { + sleep(1); + exit(0); + } + } + + link = bpf_map__attach_struct_ops(skel->maps.init_enable_count_ops); + SCX_FAIL_IF(!link, "Failed to attach struct_ops"); + + for (i = 0; i < num_pre_forks; i++) { + SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i], + "Failed to wait for pre-forked child\n"); + + SCX_FAIL_IF(status != 0, "Pre-forked child %d exited with status %d\n", i, + status); + } + + bpf_link__destroy(link); + SCX_GE(skel->bss->init_task_cnt, num_pre_forks); + SCX_GE(skel->bss->exit_task_cnt, num_pre_forks); + + link = bpf_map__attach_struct_ops(skel->maps.init_enable_count_ops); + SCX_FAIL_IF(!link, "Failed to attach struct_ops"); + + /* SCHED_EXT children */ + for (i = 0; i < num_children; i++) { + pids[i] = fork(); + SCX_FAIL_IF(pids[i] < 0, "Failed to fork child"); + + if (pids[i] == 0) { + ret = sched_setscheduler(0, SCHED_EXT, ¶m); + SCX_BUG_ON(ret, "Failed to set sched to sched_ext"); + + /* + * Reset to SCHED_OTHER for half of them. Counts for + * everything should still be the same regardless, as + * ops.disable() is invoked even if a task is still on + * SCHED_EXT before it exits. + */ + if (i % 2 == 0) { + ret = sched_setscheduler(0, SCHED_OTHER, ¶m); + SCX_BUG_ON(ret, "Failed to reset sched to normal"); + } + exit(0); + } + } + for (i = 0; i < num_children; i++) { + SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i], + "Failed to wait for SCX child\n"); + + SCX_FAIL_IF(status != 0, "SCX child %d exited with status %d\n", i, + status); + } + + /* SCHED_OTHER children */ + for (i = 0; i < num_children; i++) { + pids[i] = fork(); + if (pids[i] == 0) + exit(0); + } + + for (i = 0; i < num_children; i++) { + SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i], + "Failed to wait for normal child\n"); + + SCX_FAIL_IF(status != 0, "Normal child %d exited with status %d\n", i, + status); + } + + bpf_link__destroy(link); + + SCX_GE(skel->bss->init_task_cnt, 2 * num_children); + SCX_GE(skel->bss->exit_task_cnt, 2 * num_children); + + if (global) { + SCX_GE(skel->bss->enable_cnt, 2 * num_children); + SCX_GE(skel->bss->disable_cnt, 2 * num_children); + } else { + SCX_EQ(skel->bss->enable_cnt, num_children); + SCX_EQ(skel->bss->disable_cnt, num_children); + } + /* + * We forked a ton of tasks before we attached the scheduler above, so + * this should be fine. Technically it could be flaky if a ton of forks + * are happening at the same time in other processes, but that should + * be exceedingly unlikely. + */ + SCX_GT(skel->bss->init_transition_cnt, skel->bss->init_fork_cnt); + SCX_GE(skel->bss->init_fork_cnt, 2 * num_children); + + init_enable_count__destroy(skel); + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + enum scx_test_status status; + + status = run_test(true); + if (status != SCX_TEST_PASS) + return status; + + return run_test(false); +} + +struct scx_test init_enable_count = { + .name = "init_enable_count", + .description = "Verify we do the correct amount of counting of init, " + "enable, etc callbacks.", + .run = run, +}; +REGISTER_SCX_TEST(&init_enable_count) diff --git a/tools/testing/selftests/sched_ext/maximal.bpf.c b/tools/testing/selftests/sched_ext/maximal.bpf.c new file mode 100644 index 000000000000..44612fdaf399 --- /dev/null +++ b/tools/testing/selftests/sched_ext/maximal.bpf.c @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler with every callback defined. 
+ * + * This scheduler defines every callback. + * + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + */ + +#include + +char _license[] SEC("license") = "GPL"; + +s32 BPF_STRUCT_OPS(maximal_select_cpu, struct task_struct *p, s32 prev_cpu, + u64 wake_flags) +{ + return prev_cpu; +} + +void BPF_STRUCT_OPS(maximal_enqueue, struct task_struct *p, u64 enq_flags) +{ + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); +} + +void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags) +{} + +void BPF_STRUCT_OPS(maximal_dispatch, s32 cpu, struct task_struct *prev) +{ + scx_bpf_consume(SCX_DSQ_GLOBAL); +} + +void BPF_STRUCT_OPS(maximal_runnable, struct task_struct *p, u64 enq_flags) +{} + +void BPF_STRUCT_OPS(maximal_running, struct task_struct *p) +{} + +void BPF_STRUCT_OPS(maximal_stopping, struct task_struct *p, bool runnable) +{} + +void BPF_STRUCT_OPS(maximal_quiescent, struct task_struct *p, u64 deq_flags) +{} + +bool BPF_STRUCT_OPS(maximal_yield, struct task_struct *from, + struct task_struct *to) +{ + return false; +} + +bool BPF_STRUCT_OPS(maximal_core_sched_before, struct task_struct *a, + struct task_struct *b) +{ + return false; +} + +void BPF_STRUCT_OPS(maximal_set_weight, struct task_struct *p, u32 weight) +{} + +void BPF_STRUCT_OPS(maximal_set_cpumask, struct task_struct *p, + const struct cpumask *cpumask) +{} + +void BPF_STRUCT_OPS(maximal_update_idle, s32 cpu, bool idle) +{} + +void BPF_STRUCT_OPS(maximal_cpu_acquire, s32 cpu, + struct scx_cpu_acquire_args *args) +{} + +void BPF_STRUCT_OPS(maximal_cpu_release, s32 cpu, + struct scx_cpu_release_args *args) +{} + +void BPF_STRUCT_OPS(maximal_cpu_online, s32 cpu) +{} + +void BPF_STRUCT_OPS(maximal_cpu_offline, s32 cpu) +{} + +s32 BPF_STRUCT_OPS(maximal_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + return 0; +} + +void BPF_STRUCT_OPS(maximal_enable, struct task_struct *p) +{} + +void BPF_STRUCT_OPS(maximal_exit_task, struct task_struct *p, + struct scx_exit_task_args *args) +{} + +void BPF_STRUCT_OPS(maximal_disable, struct task_struct *p) +{} + +s32 BPF_STRUCT_OPS_SLEEPABLE(maximal_init) +{ + return 0; +} + +void BPF_STRUCT_OPS(maximal_exit, struct scx_exit_info *info) +{} + +SEC(".struct_ops.link") +struct sched_ext_ops maximal_ops = { + .select_cpu = maximal_select_cpu, + .enqueue = maximal_enqueue, + .dequeue = maximal_dequeue, + .dispatch = maximal_dispatch, + .runnable = maximal_runnable, + .running = maximal_running, + .stopping = maximal_stopping, + .quiescent = maximal_quiescent, + .yield = maximal_yield, + .core_sched_before = maximal_core_sched_before, + .set_weight = maximal_set_weight, + .set_cpumask = maximal_set_cpumask, + .update_idle = maximal_update_idle, + .cpu_acquire = maximal_cpu_acquire, + .cpu_release = maximal_cpu_release, + .cpu_online = maximal_cpu_online, + .cpu_offline = maximal_cpu_offline, + .init_task = maximal_init_task, + .enable = maximal_enable, + .exit_task = maximal_exit_task, + .disable = maximal_disable, + .init = maximal_init, + .exit = maximal_exit, + .name = "maximal", +}; diff --git a/tools/testing/selftests/sched_ext/maximal.c b/tools/testing/selftests/sched_ext/maximal.c new file mode 100644 index 000000000000..f38fc973c380 --- /dev/null +++ b/tools/testing/selftests/sched_ext/maximal.c @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
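Even with every callback defined, maximal still behaves like a plain global FIFO because each op mirrors the default: wakeups stay on prev_cpu, enqueue feeds SCX_DSQ_GLOBAL, and dispatch consumes it. If one ever needs to confirm that a particular op actually fires, a no-op body is easy to turn into a smoke check; a sketch, not part of the patch:

	u64 nr_running;	/* readable from userspace via skel->bss */

	void BPF_STRUCT_OPS(maximal_running, struct task_struct *p)
	{
		/* callbacks run concurrently on all CPUs; use an atomic add */
		__sync_fetch_and_add(&nr_running, 1);
	}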
+ * Copyright (c) 2024 David Vernet + */ +#include +#include +#include +#include +#include "maximal.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct maximal *skel; + + skel = maximal__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct maximal *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.maximal_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct maximal *skel = ctx; + + maximal__destroy(skel); +} + +struct scx_test maximal = { + .name = "maximal", + .description = "Verify we can load a scheduler with every callback defined", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&maximal) diff --git a/tools/testing/selftests/sched_ext/maybe_null.bpf.c b/tools/testing/selftests/sched_ext/maybe_null.bpf.c new file mode 100644 index 000000000000..27d0f386acfb --- /dev/null +++ b/tools/testing/selftests/sched_ext/maybe_null.bpf.c @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + */ + +#include + +char _license[] SEC("license") = "GPL"; + +u64 vtime_test; + +void BPF_STRUCT_OPS(maybe_null_running, struct task_struct *p) +{} + +void BPF_STRUCT_OPS(maybe_null_success_dispatch, s32 cpu, struct task_struct *p) +{ + if (p != NULL) + vtime_test = p->scx.dsq_vtime; +} + +bool BPF_STRUCT_OPS(maybe_null_success_yield, struct task_struct *from, + struct task_struct *to) +{ + if (to) + bpf_printk("Yielding to %s[%d]", to->comm, to->pid); + + return false; +} + +SEC(".struct_ops.link") +struct sched_ext_ops maybe_null_success = { + .dispatch = maybe_null_success_dispatch, + .yield = maybe_null_success_yield, + .enable = maybe_null_running, + .name = "minimal", +}; diff --git a/tools/testing/selftests/sched_ext/maybe_null.c b/tools/testing/selftests/sched_ext/maybe_null.c new file mode 100644 index 000000000000..31cfafb0cf65 --- /dev/null +++ b/tools/testing/selftests/sched_ext/maybe_null.c @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
+ */ +#include <bpf/bpf.h> +#include <scx/common.h> +#include <unistd.h> +#include "maybe_null.bpf.skel.h" +#include "maybe_null_fail_dsp.bpf.skel.h" +#include "maybe_null_fail_yld.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status run(void *ctx) +{ + struct maybe_null *skel; + struct maybe_null_fail_dsp *fail_dsp; + struct maybe_null_fail_yld *fail_yld; + + skel = maybe_null__open_and_load(); + if (!skel) { + SCX_ERR("Failed to open and load maybe_null skel"); + return SCX_TEST_FAIL; + } + maybe_null__destroy(skel); + + fail_dsp = maybe_null_fail_dsp__open_and_load(); + if (fail_dsp) { + maybe_null_fail_dsp__destroy(fail_dsp); + SCX_ERR("Should have failed to open and load maybe_null_fail_dsp skel"); + return SCX_TEST_FAIL; + } + + fail_yld = maybe_null_fail_yld__open_and_load(); + if (fail_yld) { + maybe_null_fail_yld__destroy(fail_yld); + SCX_ERR("Should have failed to open and load maybe_null_fail_yld skel"); + return SCX_TEST_FAIL; + } + + return SCX_TEST_PASS; +} + +struct scx_test maybe_null = { + .name = "maybe_null", + .description = "Verify that PTR_MAYBE_NULL arguments work for " + ".dispatch and .yield", + .run = run, +}; +REGISTER_SCX_TEST(&maybe_null) diff --git a/tools/testing/selftests/sched_ext/maybe_null_fail_dsp.bpf.c b/tools/testing/selftests/sched_ext/maybe_null_fail_dsp.bpf.c new file mode 100644 index 000000000000..c0641050271d --- /dev/null +++ b/tools/testing/selftests/sched_ext/maybe_null_fail_dsp.bpf.c @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + */ + +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +u64 vtime_test; + +void BPF_STRUCT_OPS(maybe_null_running, struct task_struct *p) +{} + +void BPF_STRUCT_OPS(maybe_null_fail_dispatch, s32 cpu, struct task_struct *p) +{ + vtime_test = p->scx.dsq_vtime; +} + +SEC(".struct_ops.link") +struct sched_ext_ops maybe_null_fail = { + .dispatch = maybe_null_fail_dispatch, + .enable = maybe_null_running, + .name = "maybe_null_fail_dispatch", +}; diff --git a/tools/testing/selftests/sched_ext/maybe_null_fail_yld.bpf.c b/tools/testing/selftests/sched_ext/maybe_null_fail_yld.bpf.c new file mode 100644 index 000000000000..3c1740028e3b --- /dev/null +++ b/tools/testing/selftests/sched_ext/maybe_null_fail_yld.bpf.c @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + */ + +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +u64 vtime_test; + +void BPF_STRUCT_OPS(maybe_null_running, struct task_struct *p) +{} + +bool BPF_STRUCT_OPS(maybe_null_fail_yield, struct task_struct *from, + struct task_struct *to) +{ + bpf_printk("Yielding to %s[%d]", to->comm, to->pid); + + return false; +} + +SEC(".struct_ops.link") +struct sched_ext_ops maybe_null_fail = { + .yield = maybe_null_fail_yield, + .enable = maybe_null_running, + .name = "maybe_null_fail_yield", +}; diff --git a/tools/testing/selftests/sched_ext/minimal.bpf.c b/tools/testing/selftests/sched_ext/minimal.bpf.c new file mode 100644 index 000000000000..6a7eccef0104 --- /dev/null +++ b/tools/testing/selftests/sched_ext/minimal.bpf.c @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A completely minimal scheduler. + * + * This scheduler defines the absolute minimal set of struct sched_ext_ops + * fields: its name. It should _not_ fail to be loaded, and can be used to + * exercise the default scheduling paths in ext.c. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates.
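The fixed counterparts of the two failing schedulers above need only the NULL checks the verifier demands, exactly as maybe_null.bpf.c does; repeated here in isolation for clarity (vtime_test as declared above):

	void BPF_STRUCT_OPS(fixed_dispatch, s32 cpu, struct task_struct *p)
	{
		if (p)	/* p is PTR_MAYBE_NULL in ops.dispatch() */
			vtime_test = p->scx.dsq_vtime;
	}

	bool BPF_STRUCT_OPS(fixed_yield, struct task_struct *from,
			    struct task_struct *to)
	{
		if (to)	/* to is PTR_MAYBE_NULL in ops.yield() */
			bpf_printk("Yielding to %s[%d]", to->comm, to->pid);

		return false;
	}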
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +SEC(".struct_ops.link") +struct sched_ext_ops minimal_ops = { + .name = "minimal", +}; diff --git a/tools/testing/selftests/sched_ext/minimal.c b/tools/testing/selftests/sched_ext/minimal.c new file mode 100644 index 000000000000..6c5db8ebbf8a --- /dev/null +++ b/tools/testing/selftests/sched_ext/minimal.c @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include "minimal.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct minimal *skel; + + skel = minimal__open_and_load(); + if (!skel) { + SCX_ERR("Failed to open and load skel"); + return SCX_TEST_FAIL; + } + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct minimal *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.minimal_ops); + if (!link) { + SCX_ERR("Failed to attach scheduler"); + return SCX_TEST_FAIL; + } + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct minimal *skel = ctx; + + minimal__destroy(skel); +} + +struct scx_test minimal = { + .name = "minimal", + .description = "Verify we can load a fully minimal scheduler", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&minimal) diff --git a/tools/testing/selftests/sched_ext/prog_run.bpf.c b/tools/testing/selftests/sched_ext/prog_run.bpf.c new file mode 100644 index 000000000000..fd2c8f12af16 --- /dev/null +++ b/tools/testing/selftests/sched_ext/prog_run.bpf.c @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates that we can invoke sched_ext kfuncs in + * BPF_PROG_TYPE_SYSCALL programs. + * + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + */ + +#include + +UEI_DEFINE(uei); + +char _license[] SEC("license") = "GPL"; + +SEC("syscall") +int BPF_PROG(prog_run_syscall) +{ + scx_bpf_exit(0xdeadbeef, "Exited from PROG_RUN"); + return 0; +} + +void BPF_STRUCT_OPS(prog_run_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops prog_run_ops = { + .exit = prog_run_exit, + .name = "prog_run", +}; diff --git a/tools/testing/selftests/sched_ext/prog_run.c b/tools/testing/selftests/sched_ext/prog_run.c new file mode 100644 index 000000000000..3cd57ef8daaa --- /dev/null +++ b/tools/testing/selftests/sched_ext/prog_run.c @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
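Unlike the struct_ops programs, the SEC("syscall") program above is never attached; userspace executes it on demand through BPF_PROG_RUN. The core of the invocation, which prog_run.c below performs in full:

	LIBBPF_OPTS(bpf_test_run_opts, topts);
	int prog_fd = bpf_program__fd(skel->progs.prog_run_syscall);

	/* runs the program once in syscall context; no attach involved */
	int err = bpf_prog_test_run_opts(prog_fd, &topts);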
+ * Copyright (c) 2024 David Vernet + */ +#include +#include +#include +#include +#include +#include "prog_run.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct prog_run *skel; + + skel = prog_run__open_and_load(); + if (!skel) { + SCX_ERR("Failed to open and load skel"); + return SCX_TEST_FAIL; + } + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct prog_run *skel = ctx; + struct bpf_link *link; + int prog_fd, err = 0; + + prog_fd = bpf_program__fd(skel->progs.prog_run_syscall); + if (prog_fd < 0) { + SCX_ERR("Failed to get BPF_PROG_RUN prog"); + return SCX_TEST_FAIL; + } + + LIBBPF_OPTS(bpf_test_run_opts, topts); + + link = bpf_map__attach_struct_ops(skel->maps.prog_run_ops); + if (!link) { + SCX_ERR("Failed to attach scheduler"); + close(prog_fd); + return SCX_TEST_FAIL; + } + + err = bpf_prog_test_run_opts(prog_fd, &topts); + SCX_EQ(err, 0); + + /* Assumes uei.kind is written last */ + while (skel->data->uei.kind == EXIT_KIND(SCX_EXIT_NONE)) + sched_yield(); + + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG_BPF)); + SCX_EQ(skel->data->uei.exit_code, 0xdeadbeef); + close(prog_fd); + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct prog_run *skel = ctx; + + prog_run__destroy(skel); +} + +struct scx_test prog_run = { + .name = "prog_run", + .description = "Verify we can call into a scheduler with BPF_PROG_RUN, and invoke kfuncs", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&prog_run) diff --git a/tools/testing/selftests/sched_ext/reload_loop.c b/tools/testing/selftests/sched_ext/reload_loop.c new file mode 100644 index 000000000000..5cfba2d6e056 --- /dev/null +++ b/tools/testing/selftests/sched_ext/reload_loop.c @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
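The reload_loop testcase that follows hammers attach and detach from two threads. It works because only one sched_ext scheduler can be attached host-wide at a time, so the losing thread simply sees a failed attach; the tolerant pattern, isolated:

	struct bpf_link *link;

	/* may fail while the other thread holds the scheduler; that's expected */
	link = bpf_map__attach_struct_ops(skel->maps.maximal_ops);
	if (link)
		bpf_link__destroy(link);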
+ * Copyright (c) 2024 David Vernet + */ +#include +#include +#include +#include +#include +#include "maximal.bpf.skel.h" +#include "scx_test.h" + +static struct maximal *skel; +static pthread_t threads[2]; + +bool force_exit = false; + +static enum scx_test_status setup(void **ctx) +{ + skel = maximal__open_and_load(); + if (!skel) { + SCX_ERR("Failed to open and load skel"); + return SCX_TEST_FAIL; + } + + return SCX_TEST_PASS; +} + +static void *do_reload_loop(void *arg) +{ + u32 i; + + for (i = 0; i < 1024 && !force_exit; i++) { + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.maximal_ops); + if (link) + bpf_link__destroy(link); + } + + return NULL; +} + +static enum scx_test_status run(void *ctx) +{ + int err; + void *ret; + + err = pthread_create(&threads[0], NULL, do_reload_loop, NULL); + SCX_FAIL_IF(err, "Failed to create thread 0"); + + err = pthread_create(&threads[1], NULL, do_reload_loop, NULL); + SCX_FAIL_IF(err, "Failed to create thread 1"); + + SCX_FAIL_IF(pthread_join(threads[0], &ret), "thread 0 failed"); + SCX_FAIL_IF(pthread_join(threads[1], &ret), "thread 1 failed"); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + force_exit = true; + maximal__destroy(skel); +} + +struct scx_test reload_loop = { + .name = "reload_loop", + .description = "Stress test loading and unloading schedulers repeatedly in a tight loop", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&reload_loop) diff --git a/tools/testing/selftests/sched_ext/runner.c b/tools/testing/selftests/sched_ext/runner.c new file mode 100644 index 000000000000..eab48c7ff309 --- /dev/null +++ b/tools/testing/selftests/sched_ext/runner.c @@ -0,0 +1,201 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ +#include +#include +#include +#include +#include +#include "scx_test.h" + +const char help_fmt[] = +"The runner for sched_ext tests.\n" +"\n" +"The runner is statically linked against all testcases, and runs them all serially.\n" +"It's required for the testcases to be serial, as only a single host-wide sched_ext\n" +"scheduler may be loaded at any given time." +"\n" +"Usage: %s [-t TEST] [-h]\n" +"\n" +" -t TEST Only run tests whose name includes this string\n" +" -s Include print output for skipped tests\n" +" -q Don't print the test descriptions during run\n" +" -h Display this help and exit\n"; + +static volatile int exit_req; +static bool quiet, print_skipped; + +#define MAX_SCX_TESTS 2048 + +static struct scx_test __scx_tests[MAX_SCX_TESTS]; +static unsigned __scx_num_tests = 0; + +static void sigint_handler(int simple) +{ + exit_req = 1; +} + +static void print_test_preamble(const struct scx_test *test, bool quiet) +{ + printf("===== START =====\n"); + printf("TEST: %s\n", test->name); + if (!quiet) + printf("DESCRIPTION: %s\n", test->description); + printf("OUTPUT:\n"); +} + +static const char *status_to_result(enum scx_test_status status) +{ + switch (status) { + case SCX_TEST_PASS: + case SCX_TEST_SKIP: + return "ok"; + case SCX_TEST_FAIL: + return "not ok"; + default: + return ""; + } +} + +static void print_test_result(const struct scx_test *test, + enum scx_test_status status, + unsigned int testnum) +{ + const char *result = status_to_result(status); + const char *directive = status == SCX_TEST_SKIP ? 
"SKIP " : ""; + + printf("%s %u %s # %s\n", result, testnum, test->name, directive); + printf("===== END =====\n"); +} + +static bool should_skip_test(const struct scx_test *test, const char * filter) +{ + return !strstr(test->name, filter); +} + +static enum scx_test_status run_test(const struct scx_test *test) +{ + enum scx_test_status status; + void *context = NULL; + + if (test->setup) { + status = test->setup(&context); + if (status != SCX_TEST_PASS) + return status; + } + + status = test->run(context); + + if (test->cleanup) + test->cleanup(context); + + return status; +} + +static bool test_valid(const struct scx_test *test) +{ + if (!test) { + fprintf(stderr, "NULL test detected\n"); + return false; + } + + if (!test->name) { + fprintf(stderr, + "Test with no name found. Must specify test name.\n"); + return false; + } + + if (!test->description) { + fprintf(stderr, "Test %s requires description.\n", test->name); + return false; + } + + if (!test->run) { + fprintf(stderr, "Test %s has no run() callback\n", test->name); + return false; + } + + return true; +} + +int main(int argc, char **argv) +{ + const char *filter = NULL; + unsigned testnum = 0, i; + unsigned passed = 0, skipped = 0, failed = 0; + int opt; + + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + while ((opt = getopt(argc, argv, "qst:h")) != -1) { + switch (opt) { + case 'q': + quiet = true; + break; + case 's': + print_skipped = true; + break; + case 't': + filter = optarg; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + for (i = 0; i < __scx_num_tests; i++) { + enum scx_test_status status; + struct scx_test *test = &__scx_tests[i]; + + if (filter && should_skip_test(test, filter)) { + /* + * Printing the skipped tests and their preambles can + * add a lot of noise to the runner output. Printing + * this is only really useful for CI, so let's skip it + * by default. + */ + if (print_skipped) { + print_test_preamble(test, quiet); + print_test_result(test, SCX_TEST_SKIP, ++testnum); + } + continue; + } + + print_test_preamble(test, quiet); + status = run_test(test); + print_test_result(test, status, ++testnum); + switch (status) { + case SCX_TEST_PASS: + passed++; + break; + case SCX_TEST_SKIP: + skipped++; + break; + case SCX_TEST_FAIL: + failed++; + break; + } + } + printf("\n\n=============================\n\n"); + printf("RESULTS:\n\n"); + printf("PASSED: %u\n", passed); + printf("SKIPPED: %u\n", skipped); + printf("FAILED: %u\n", failed); + + return 0; +} + +void scx_test_register(struct scx_test *test) +{ + SCX_BUG_ON(!test_valid(test), "Invalid test found"); + SCX_BUG_ON(__scx_num_tests >= MAX_SCX_TESTS, "Maximum tests exceeded"); + + __scx_tests[__scx_num_tests++] = *test; +} diff --git a/tools/testing/selftests/sched_ext/scx_test.h b/tools/testing/selftests/sched_ext/scx_test.h new file mode 100644 index 000000000000..90b8d6915bb7 --- /dev/null +++ b/tools/testing/selftests/sched_ext/scx_test.h @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 Tejun Heo + * Copyright (c) 2023 David Vernet + */ + +#ifndef __SCX_TEST_H__ +#define __SCX_TEST_H__ + +#include +#include +#include + +enum scx_test_status { + SCX_TEST_PASS = 0, + SCX_TEST_SKIP, + SCX_TEST_FAIL, +}; + +#define EXIT_KIND(__ent) __COMPAT_ENUM_OR_ZERO("scx_exit_kind", #__ent) + +struct scx_test { + /** + * name - The name of the testcase. 
+ */ + const char *name; + + /** + * description - A description of your testcase: what it tests and is + * meant to validate. + */ + const char *description; + + /* + * setup - Setup the test. + * @ctx: A pointer to a context object that will be passed to run and + * cleanup. + * + * An optional callback that allows a testcase to perform setup for its + * run. A test may return SCX_TEST_SKIP to skip the run. + */ + enum scx_test_status (*setup)(void **ctx); + + /* + * run - Run the test. + * @ctx: Context set in the setup() callback. If @ctx was not set in + * setup(), it is NULL. + * + * The main test. Callers should return one of: + * + * - SCX_TEST_PASS: Test passed + * - SCX_TEST_SKIP: Test should be skipped + * - SCX_TEST_FAIL: Test failed + * + * This callback must be defined. + */ + enum scx_test_status (*run)(void *ctx); + + /* + * cleanup - Perform cleanup following the test + * @ctx: Context set in the setup() callback. If @ctx was not set in + * setup(), it is NULL. + * + * An optional callback that allows a test to perform cleanup after + * being run. This callback is run even if the run() callback returns + * SCX_TEST_SKIP or SCX_TEST_FAIL. It is not run if setup() returns + * SCX_TEST_SKIP or SCX_TEST_FAIL. + */ + void (*cleanup)(void *ctx); +}; + +void scx_test_register(struct scx_test *test); + +#define REGISTER_SCX_TEST(__test) \ + __attribute__((constructor)) \ + static void ___scxregister##__LINE__(void) \ + { \ + scx_test_register(__test); \ + } + +#define SCX_ERR(__fmt, ...) \ + do { \ + fprintf(stderr, "ERR: %s:%d\n", __FILE__, __LINE__); \ + fprintf(stderr, __fmt"\n", ##__VA_ARGS__); \ + } while (0) + +#define SCX_FAIL(__fmt, ...) \ + do { \ + SCX_ERR(__fmt, ##__VA_ARGS__); \ + return SCX_TEST_FAIL; \ + } while (0) + +#define SCX_FAIL_IF(__cond, __fmt, ...) \ + do { \ + if (__cond) \ + SCX_FAIL(__fmt, ##__VA_ARGS__); \ + } while (0) + +#define SCX_GT(_x, _y) SCX_FAIL_IF((_x) <= (_y), "Expected %s > %s (%lu > %lu)", \ + #_x, #_y, (u64)(_x), (u64)(_y)) +#define SCX_GE(_x, _y) SCX_FAIL_IF((_x) < (_y), "Expected %s >= %s (%lu >= %lu)", \ + #_x, #_y, (u64)(_x), (u64)(_y)) +#define SCX_LT(_x, _y) SCX_FAIL_IF((_x) >= (_y), "Expected %s < %s (%lu < %lu)", \ + #_x, #_y, (u64)(_x), (u64)(_y)) +#define SCX_LE(_x, _y) SCX_FAIL_IF((_x) > (_y), "Expected %s <= %s (%lu <= %lu)", \ + #_x, #_y, (u64)(_x), (u64)(_y)) +#define SCX_EQ(_x, _y) SCX_FAIL_IF((_x) != (_y), "Expected %s == %s (%lu == %lu)", \ + #_x, #_y, (u64)(_x), (u64)(_y)) +#define SCX_ASSERT(_x) SCX_FAIL_IF(!(_x), "Expected %s to be true (%lu)", \ + #_x, (u64)(_x)) + +#define SCX_ECODE_VAL(__ecode) ({ \ + u64 __val = 0; \ + bool __found = false; \ + \ + __found = __COMPAT_read_enum("scx_exit_code", #__ecode, &__val); \ + SCX_ASSERT(__found); \ + (s64)__val; \ +}) + +#define SCX_KIND_VAL(__kind) ({ \ + u64 __val = 0; \ + bool __found = false; \ + \ + __found = __COMPAT_read_enum("scx_exit_kind", #__kind, &__val); \ + SCX_ASSERT(__found); \ + __val; \ +}) + +#endif // # __SCX_TEST_H__ diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c new file mode 100644 index 000000000000..2ed2991afafe --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of direct dispatching with a default + * select_cpu implementation. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
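A note on REGISTER_SCX_TEST: because ## suppresses macro expansion of its operands, the pasted function name is literally ___scxregister__LINE__ rather than a line number. That is still safe here since the function is static and the macro is used once per test file. The expansion of REGISTER_SCX_TEST(&minimal) is roughly:

	__attribute__((constructor))
	static void ___scxregister__LINE__(void)
	{
		scx_test_register(&minimal);
	}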
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +bool saw_local = false; + +static bool task_is_test(const struct task_struct *p) +{ + return !bpf_strncmp(p->comm, 9, "select_cpu"); +} + +void BPF_STRUCT_OPS(select_cpu_dfl_enqueue, struct task_struct *p, + u64 enq_flags) +{ + const struct cpumask *idle_mask = scx_bpf_get_idle_cpumask(); + + if (task_is_test(p) && + bpf_cpumask_test_cpu(scx_bpf_task_cpu(p), idle_mask)) { + saw_local = true; + } + scx_bpf_put_idle_cpumask(idle_mask); + + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_dfl_ops = { + .enqueue = select_cpu_dfl_enqueue, + .name = "select_cpu_dfl", +}; diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl.c b/tools/testing/selftests/sched_ext/select_cpu_dfl.c new file mode 100644 index 000000000000..a53a40c2d2f0 --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_dfl.c @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include <bpf/bpf.h> +#include <scx/common.h> +#include <sys/wait.h> +#include <unistd.h> +#include "select_cpu_dfl.bpf.skel.h" +#include "scx_test.h" + +#define NUM_CHILDREN 1028 + +static enum scx_test_status setup(void **ctx) +{ + struct select_cpu_dfl *skel; + + skel = select_cpu_dfl__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_dfl *skel = ctx; + struct bpf_link *link; + pid_t pids[NUM_CHILDREN]; + int i, status; + + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dfl_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + for (i = 0; i < NUM_CHILDREN; i++) { + pids[i] = fork(); + if (pids[i] == 0) { + sleep(1); + exit(0); + } + } + + for (i = 0; i < NUM_CHILDREN; i++) { + SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); + SCX_EQ(status, 0); + } + + SCX_ASSERT(!skel->bss->saw_local); + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct select_cpu_dfl *skel = ctx; + + select_cpu_dfl__destroy(skel); +} + +struct scx_test select_cpu_dfl = { + .name = "select_cpu_dfl", + .description = "Verify the default ops.select_cpu() dispatches tasks " + "when idle cores are found, and skips ops.enqueue()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_dfl) diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c new file mode 100644 index 000000000000..4bb5abb2d369 --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of scx_bpf_select_cpu_dfl() when + * called from ops.select_cpu() without direct dispatch, with the dispatch + * decision deferred to ops.enqueue(). + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +bool saw_local = false; + +/* Per-task scheduling context */ +struct task_ctx { + bool force_local; /* CPU changed by ops.select_cpu() */ +}; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct task_ctx); +} task_ctx_stor SEC(".maps"); + +/* Manually specify the signature until the kfunc is added to the scx repo. */ +s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, + bool *found) __ksym; + +s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + struct task_ctx *tctx; + s32 cpu; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("task_ctx lookup failed"); + return -ESRCH; + } + + cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, + &tctx->force_local); + + return cpu; +} + +void BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_enqueue, struct task_struct *p, + u64 enq_flags) +{ + u64 dsq_id = SCX_DSQ_GLOBAL; + struct task_ctx *tctx; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("task_ctx lookup failed"); + return; + } + + if (tctx->force_local) { + dsq_id = SCX_DSQ_LOCAL; + tctx->force_local = false; + saw_local = true; + } + + scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, enq_flags); +} + +s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_init_task, + struct task_struct *p, struct scx_init_task_args *args) +{ + if (bpf_task_storage_get(&task_ctx_stor, p, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE)) + return 0; + else + return -ENOMEM; +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_dfl_nodispatch_ops = { + .select_cpu = select_cpu_dfl_nodispatch_select_cpu, + .enqueue = select_cpu_dfl_nodispatch_enqueue, + .init_task = select_cpu_dfl_nodispatch_init_task, + .name = "select_cpu_dfl_nodispatch", +}; diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c new file mode 100644 index 000000000000..1d85bf4bf3a3 --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include "select_cpu_dfl_nodispatch.bpf.skel.h" +#include "scx_test.h" + +#define NUM_CHILDREN 1028 + +static enum scx_test_status setup(void **ctx) +{ + struct select_cpu_dfl_nodispatch *skel; + + skel = select_cpu_dfl_nodispatch__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_dfl_nodispatch *skel = ctx; + struct bpf_link *link; + pid_t pids[NUM_CHILDREN]; + int i, status; + + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dfl_nodispatch_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + for (i = 0; i < NUM_CHILDREN; i++) { + pids[i] = fork(); + if (pids[i] == 0) { + sleep(1); + exit(0); + } + } + + for (i = 0; i < NUM_CHILDREN; i++) { + SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); + SCX_EQ(status, 0); + } + + SCX_ASSERT(skel->bss->saw_local); + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct select_cpu_dfl_nodispatch *skel = ctx; + + select_cpu_dfl_nodispatch__destroy(skel); +} + +struct scx_test select_cpu_dfl_nodispatch = { + .name = "select_cpu_dfl_nodispatch", + .description = "Verify behavior of scx_bpf_select_cpu_dfl() in " + "ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_dfl_nodispatch) diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c new file mode 100644 index 000000000000..f0b96a4a04b2 --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of direct dispatching with a default + * select_cpu implementation. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +s32 BPF_STRUCT_OPS(select_cpu_dispatch_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + u64 dsq_id = SCX_DSQ_LOCAL; + s32 cpu = prev_cpu; + + if (scx_bpf_test_and_clear_cpu_idle(cpu)) + goto dispatch; + + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) + goto dispatch; + + dsq_id = SCX_DSQ_GLOBAL; + cpu = prev_cpu; + +dispatch: + scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, 0); + return cpu; +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_dispatch_ops = { + .select_cpu = select_cpu_dispatch_select_cpu, + .name = "select_cpu_dispatch", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch.c new file mode 100644 index 000000000000..0309ca8785b3 --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch.c @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
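select_cpu_dispatch.bpf.c first tries to keep the task on an idle prev_cpu for cache locality, then falls back to scx_bpf_pick_idle_cpu(); both kfuncs claim the idle state they report. A hypothetical variant that additionally prefers wholly idle cores over idle SMT siblings:

	s32 BPF_STRUCT_OPS(example_select_cpu, struct task_struct *p,
			   s32 prev_cpu, u64 wake_flags)
	{
		s32 cpu;

		/* whole idle core first, then any idle CPU */
		cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE);
		if (cpu < 0)
			cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
		if (cpu >= 0) {
			scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
			return cpu;
		}

		return prev_cpu;
	}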
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include "select_cpu_dispatch.bpf.skel.h" +#include "scx_test.h" + +#define NUM_CHILDREN 1028 + +static enum scx_test_status setup(void **ctx) +{ + struct select_cpu_dispatch *skel; + + skel = select_cpu_dispatch__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_dispatch *skel = ctx; + struct bpf_link *link; + pid_t pids[NUM_CHILDREN]; + int i, status; + + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + for (i = 0; i < NUM_CHILDREN; i++) { + pids[i] = fork(); + if (pids[i] == 0) { + sleep(1); + exit(0); + } + } + + for (i = 0; i < NUM_CHILDREN; i++) { + SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); + SCX_EQ(status, 0); + } + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct select_cpu_dispatch *skel = ctx; + + select_cpu_dispatch__destroy(skel); +} + +struct scx_test select_cpu_dispatch = { + .name = "select_cpu_dispatch", + .description = "Test direct dispatching to built-in DSQs from " + "ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_dispatch) diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c new file mode 100644 index 000000000000..7b42ddce0f56 --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of direct dispatching with a default + * select_cpu implementation. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +UEI_DEFINE(uei); + +s32 BPF_STRUCT_OPS(select_cpu_dispatch_bad_dsq_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + /* Dispatching to a random DSQ should fail. */ + scx_bpf_dispatch(p, 0xcafef00d, SCX_SLICE_DFL, 0); + + return prev_cpu; +} + +void BPF_STRUCT_OPS(select_cpu_dispatch_bad_dsq_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_dispatch_bad_dsq_ops = { + .select_cpu = select_cpu_dispatch_bad_dsq_select_cpu, + .exit = select_cpu_dispatch_bad_dsq_exit, + .name = "select_cpu_dispatch_bad_dsq", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c new file mode 100644 index 000000000000..47eb6ed7627d --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include "select_cpu_dispatch_bad_dsq.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct select_cpu_dispatch_bad_dsq *skel; + + skel = select_cpu_dispatch_bad_dsq__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_dispatch_bad_dsq *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_bad_dsq_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + sleep(1); + + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct select_cpu_dispatch_bad_dsq *skel = ctx; + + select_cpu_dispatch_bad_dsq__destroy(skel); +} + +struct scx_test select_cpu_dispatch_bad_dsq = { + .name = "select_cpu_dispatch_bad_dsq", + .description = "Verify graceful failure if we direct-dispatch to a " + "bogus DSQ in ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_dispatch_bad_dsq) diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c new file mode 100644 index 000000000000..653e3dc0b4dc --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of direct dispatching with a default + * select_cpu implementation. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +UEI_DEFINE(uei); + +s32 BPF_STRUCT_OPS(select_cpu_dispatch_dbl_dsp_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + /* Dispatching twice in a row is disallowed. */ + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + + return prev_cpu; +} + +void BPF_STRUCT_OPS(select_cpu_dispatch_dbl_dsp_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_dispatch_dbl_dsp_ops = { + .select_cpu = select_cpu_dispatch_dbl_dsp_select_cpu, + .exit = select_cpu_dispatch_dbl_dsp_exit, + .name = "select_cpu_dispatch_dbl_dsp", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c new file mode 100644 index 000000000000..48ff028a3c46 --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include "select_cpu_dispatch_dbl_dsp.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct select_cpu_dispatch_dbl_dsp *skel; + + skel = select_cpu_dispatch_dbl_dsp__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_dispatch_dbl_dsp *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_dbl_dsp_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + sleep(1); + + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct select_cpu_dispatch_dbl_dsp *skel = ctx; + + select_cpu_dispatch_dbl_dsp__destroy(skel); +} + +struct scx_test select_cpu_dispatch_dbl_dsp = { + .name = "select_cpu_dispatch_dbl_dsp", + .description = "Verify graceful failure if we dispatch twice to a " + "DSQ in ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_dispatch_dbl_dsp) diff --git a/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c new file mode 100644 index 000000000000..7f3ebf4fc2ea --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates that enqueue flags are properly stored and + * applied at dispatch time when a task is directly dispatched from + * ops.select_cpu(). We validate this by using scx_bpf_dispatch_vtime(), and + * making the test a very basic vtime scheduler. + * + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +volatile bool consumed; + +static u64 vtime_now; + +#define VTIME_DSQ 0 + +static inline bool vtime_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +static inline u64 task_vtime(const struct task_struct *p) +{ + u64 vtime = p->scx.dsq_vtime; + + if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL)) + return vtime_now - SCX_SLICE_DFL; + else + return vtime; +} + +s32 BPF_STRUCT_OPS(select_cpu_vtime_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + s32 cpu; + + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) + goto ddsp; + + cpu = prev_cpu; + scx_bpf_test_and_clear_cpu_idle(cpu); +ddsp: + scx_bpf_dispatch_vtime(p, VTIME_DSQ, SCX_SLICE_DFL, task_vtime(p), 0); + return cpu; +} + +void BPF_STRUCT_OPS(select_cpu_vtime_dispatch, s32 cpu, struct task_struct *p) +{ + if (scx_bpf_consume(VTIME_DSQ)) + consumed = true; +} + +void BPF_STRUCT_OPS(select_cpu_vtime_running, struct task_struct *p) +{ + if (vtime_before(vtime_now, p->scx.dsq_vtime)) + vtime_now = p->scx.dsq_vtime; +} + +void BPF_STRUCT_OPS(select_cpu_vtime_stopping, struct task_struct *p, + bool runnable) +{ + p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; +} + +void BPF_STRUCT_OPS(select_cpu_vtime_enable, struct task_struct *p) +{ + p->scx.dsq_vtime = vtime_now; +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(select_cpu_vtime_init) +{ + return scx_bpf_create_dsq(VTIME_DSQ, -1); +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_vtime_ops = { + .select_cpu = select_cpu_vtime_select_cpu, + .dispatch = select_cpu_vtime_dispatch, + .running = select_cpu_vtime_running, + .stopping = select_cpu_vtime_stopping, + .enable = select_cpu_vtime_enable, + .init = select_cpu_vtime_init, + .name = "select_cpu_vtime", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/select_cpu_vtime.c b/tools/testing/selftests/sched_ext/select_cpu_vtime.c new file mode 100644 index 000000000000..b4629c2364f5 --- /dev/null +++ b/tools/testing/selftests/sched_ext/select_cpu_vtime.c @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ +#include +#include +#include +#include +#include "select_cpu_vtime.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct select_cpu_vtime *skel; + + skel = select_cpu_vtime__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_vtime *skel = ctx; + struct bpf_link *link; + + SCX_ASSERT(!skel->bss->consumed); + + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_vtime_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + sleep(1); + + SCX_ASSERT(skel->bss->consumed); + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct select_cpu_vtime *skel = ctx; + + select_cpu_vtime__destroy(skel); +} + +struct scx_test select_cpu_vtime = { + .name = "select_cpu_vtime", + .description = "Test doing direct vtime-dispatching from " + "ops.select_cpu(), to a non-built-in DSQ", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_vtime) diff --git a/tools/testing/selftests/sched_ext/test_example.c b/tools/testing/selftests/sched_ext/test_example.c new file mode 100644 index 000000000000..ce36cdf03cdc --- /dev/null +++ b/tools/testing/selftests/sched_ext/test_example.c @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 Tejun Heo + * Copyright (c) 2024 David Vernet + */ +#include +#include +#include "scx_test.h" + +static bool setup_called = false; +static bool run_called = false; +static bool cleanup_called = false; + +static int context = 10; + +static enum scx_test_status setup(void **ctx) +{ + setup_called = true; + *ctx = &context; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + int *arg = ctx; + + SCX_ASSERT(setup_called); + SCX_ASSERT(!run_called && !cleanup_called); + SCX_EQ(*arg, context); + + run_called = true; + return SCX_TEST_PASS; +} + +static void cleanup (void *ctx) +{ + SCX_BUG_ON(!run_called || cleanup_called, "Wrong callbacks invoked"); +} + +struct scx_test example = { + .name = "example", + .description = "Validate the basic function of the test suite itself", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&example) diff --git a/tools/testing/selftests/sched_ext/util.c b/tools/testing/selftests/sched_ext/util.c new file mode 100644 index 000000000000..e47769c91918 --- /dev/null +++ b/tools/testing/selftests/sched_ext/util.c @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + */ +#include +#include +#include +#include +#include +#include + +/* Returns read len on success, or -errno on failure. */ +static ssize_t read_text(const char *path, char *buf, size_t max_len) +{ + ssize_t len; + int fd; + + fd = open(path, O_RDONLY); + if (fd < 0) + return -errno; + + len = read(fd, buf, max_len - 1); + + if (len >= 0) + buf[len] = 0; + + close(fd); + return len < 0 ? -errno : len; +} + +/* Returns written len on success, or -errno on failure. */ +static ssize_t write_text(const char *path, char *buf, ssize_t len) +{ + int fd; + ssize_t written; + + fd = open(path, O_WRONLY | O_APPEND); + if (fd < 0) + return -errno; + + written = write(fd, buf, len); + close(fd); + return written < 0 ? 
-errno : written; +} + +long file_read_long(const char *path) +{ + char buf[128]; + + if (read_text(path, buf, sizeof(buf)) <= 0) + return -1; + + return atol(buf); +} + +int file_write_long(const char *path, long val) +{ + char buf[64]; + int ret; + + ret = sprintf(buf, "%ld", val); + if (ret < 0) + return ret; + + if (write_text(path, buf, ret) <= 0) + return -1; + + return 0; +} diff --git a/tools/testing/selftests/sched_ext/util.h b/tools/testing/selftests/sched_ext/util.h new file mode 100644 index 000000000000..bc13dfec1267 --- /dev/null +++ b/tools/testing/selftests/sched_ext/util.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + */ + +#ifndef __SCX_TEST_UTIL_H__ +#define __SCX_TEST_UTIL_H__ + +long file_read_long(const char *path); +int file_write_long(const char *path, long val); + +#endif // __SCX_TEST_UTIL_H__ -- cgit v1.2.3 From d86adb4fc0655a0867da811d000df75d2a325ef6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 21 Jun 2024 12:37:22 -1000 Subject: sched_ext: Add cpuperf support sched_ext currently does not integrate with schedutil. When schedutil is the governor, frequencies are left unregulated and usually get stuck close to the highest performance level from running RT tasks. Add CPU performance monitoring and scaling support by integrating into schedutil. The following kfuncs are added: - scx_bpf_cpuperf_cap(): Query the relative performance capacity of different CPUs in the system. - scx_bpf_cpuperf_cur(): Query the current performance level of a CPU relative to its max performance. - scx_bpf_cpuperf_set(): Set the current target performance level of a CPU. This gives direct control over CPU performance setting to the BPF scheduler. The only changes on the schedutil side are accounting for the utilization factor from sched_ext and disabling the frequency-hold heuristics, as they may not apply well to sched_ext schedulers, which may have a much weaker connection between tasks and their current / last CPU. With cpuperf support added, there is no reason to block uclamp. Enable it while at it. A toy implementation of cpuperf is added to scx_qmap as a demonstration of the feature. v2: Ignore cpu_util_cfs_boost() when scx_switched_all() in sugov_get_util() to avoid factoring in a stale util metric. (Christian) Signed-off-by: Tejun Heo Reviewed-by: David Vernet Cc: Rafael J.
Wysocki Cc: Viresh Kumar Cc: Christian Loehle --- kernel/sched/cpufreq_schedutil.c | 12 ++- kernel/sched/ext.c | 83 +++++++++++++++++- kernel/sched/ext.h | 9 ++ kernel/sched/sched.h | 1 + tools/sched_ext/include/scx/common.bpf.h | 3 + tools/sched_ext/scx_qmap.bpf.c | 142 ++++++++++++++++++++++++++++++- tools/sched_ext/scx_qmap.c | 8 ++ 7 files changed, 252 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 972b7dd65af2..e683e5d08daa 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -197,8 +197,10 @@ unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual, static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost) { - unsigned long min, max, util = cpu_util_cfs_boost(sg_cpu->cpu); + unsigned long min, max, util = scx_cpuperf_target(sg_cpu->cpu); + if (!scx_switched_all()) + util += cpu_util_cfs_boost(sg_cpu->cpu); util = effective_cpu_util(sg_cpu->cpu, util, &min, &max); util = max(util, boost); sg_cpu->bw_min = min; @@ -330,6 +332,14 @@ static bool sugov_hold_freq(struct sugov_cpu *sg_cpu) unsigned long idle_calls; bool ret; + /* + * The heuristics in this function are for the fair class. For SCX, the + * performance target comes directly from the BPF scheduler. Let's just + * follow it. + */ + if (scx_switched_all()) + return false; + /* if capped by uclamp_max, always update to be in compliance */ if (uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu))) return false; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 390623a4a376..28f7a4266fde 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -16,6 +16,8 @@ enum scx_consts { SCX_EXIT_BT_LEN = 64, SCX_EXIT_MSG_LEN = 1024, SCX_EXIT_DUMP_DFL_LEN = 32768, + + SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, }; enum scx_exit_kind { @@ -3520,7 +3522,7 @@ DEFINE_SCHED_CLASS(ext) = { .update_curr = update_curr_scx, #ifdef CONFIG_UCLAMP_TASK - .uclamp_enabled = 0, + .uclamp_enabled = 1, #endif }; @@ -4393,7 +4395,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) struct scx_task_iter sti; struct task_struct *p; unsigned long timeout; - int i, ret; + int i, cpu, ret; mutex_lock(&scx_ops_enable_mutex); @@ -4442,6 +4444,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) atomic_long_set(&scx_nr_rejected, 0); + for_each_possible_cpu(cpu) + cpu_rq(cpu)->scx.cpuperf_target = SCX_CPUPERF_ONE; + /* * Keep CPUs stable during enable so that the BPF scheduler can track * online CPUs by watching ->on/offline_cpu() after ->init(). @@ -5835,6 +5840,77 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, ops_dump_flush(); } +/** + * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU + * @cpu: CPU of interest + * + * Return the maximum relative capacity of @cpu in relation to the most + * performant CPU in the system. The return value is in the range [1, + * %SCX_CPUPERF_ONE]. See scx_bpf_cpuperf_cur(). + */ +__bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu) +{ + if (ops_cpu_valid(cpu, NULL)) + return arch_scale_cpu_capacity(cpu); + else + return SCX_CPUPERF_ONE; +} + +/** + * scx_bpf_cpuperf_cur - Query the current relative performance of a CPU + * @cpu: CPU of interest + * + * Return the current relative performance of @cpu in relation to its maximum. + * The return value is in the range [1, %SCX_CPUPERF_ONE].
+ * + * The current performance level of a CPU in relation to the maximum performance + * available in the system can be calculated as follows: + * + * scx_bpf_cpuperf_cap() * scx_bpf_cpuperf_cur() / %SCX_CPUPERF_ONE + * + * The result is in the range [1, %SCX_CPUPERF_ONE]. + */ +__bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu) +{ + if (ops_cpu_valid(cpu, NULL)) + return arch_scale_freq_capacity(cpu); + else + return SCX_CPUPERF_ONE; +} + +/** + * scx_bpf_cpuperf_set - Set the relative performance target of a CPU + * @cpu: CPU of interest + * @perf: target performance level [0, %SCX_CPUPERF_ONE] + * + * Set the target performance level of @cpu to @perf. @perf is in linear + * relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the + * schedutil cpufreq governor chooses the target frequency. + * + * The actual performance level chosen, CPU grouping, and the overhead and + * latency of the operations are dependent on the hardware and cpufreq driver in + * use. Consult hardware and cpufreq documentation for more information. The + * current performance level can be monitored using scx_bpf_cpuperf_cur(). + */ +__bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf) +{ + if (unlikely(perf > SCX_CPUPERF_ONE)) { + scx_ops_error("Invalid cpuperf target %u for CPU %d", perf, cpu); + return; + } + + if (ops_cpu_valid(cpu, NULL)) { + struct rq *rq = cpu_rq(cpu); + + rq->scx.cpuperf_target = perf; + + rcu_read_lock_sched_notrace(); + cpufreq_update_util(rq, 0); + rcu_read_unlock_sched_notrace(); + } +} + /** * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs * @@ -6045,6 +6121,9 @@ BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap) +BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur) +BTF_ID_FLAGS(func, scx_bpf_cpuperf_set) BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids) BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE) diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index bf6f2cfa49d5..0a7b9a34b18f 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -46,6 +46,14 @@ int scx_check_setscheduler(struct task_struct *p, int policy); bool task_should_scx(struct task_struct *p); void init_sched_ext_class(void); +static inline u32 scx_cpuperf_target(s32 cpu) +{ + if (scx_enabled()) + return cpu_rq(cpu)->scx.cpuperf_target; + else + return 0; +} + static inline const struct sched_class *next_active_class(const struct sched_class *class) { class++; @@ -85,6 +93,7 @@ static inline void scx_pre_fork(struct task_struct *p) {} static inline int scx_fork(struct task_struct *p) { return 0; } static inline void scx_post_fork(struct task_struct *p) {} static inline void scx_cancel_fork(struct task_struct *p) {} +static inline u32 scx_cpuperf_target(s32 cpu) { return 0; } static inline bool scx_can_stop_tick(struct rq *rq) { return true; } static inline void scx_rq_activate(struct rq *rq) {} static inline void scx_rq_deactivate(struct rq *rq) {} diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3989bf8f2a1b..963a2fa180ad 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -743,6 +743,7 @@ struct scx_rq { u64 extra_enq_flags; /* see move_task_to_local_dsq() */ u32 nr_running; u32 flags; + u32 cpuperf_target; /* [0, SCHED_CAPACITY_SCALE] */ bool cpu_released; cpumask_var_t
cpus_to_kick; cpumask_var_t cpus_to_kick_if_idle; diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 3fa87084cf17..dbbda0e35c5d 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -42,6 +42,9 @@ void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) __ksym __weak; void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym; void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym __weak; +u32 scx_bpf_cpuperf_cap(s32 cpu) __ksym __weak; +u32 scx_bpf_cpuperf_cur(s32 cpu) __ksym __weak; +void scx_bpf_cpuperf_set(s32 cpu, u32 perf) __ksym __weak; u32 scx_bpf_nr_cpu_ids(void) __ksym __weak; const struct cpumask *scx_bpf_get_possible_cpumask(void) __ksym __weak; const struct cpumask *scx_bpf_get_online_cpumask(void) __ksym __weak; diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index c75c70d6a8eb..b1d0b09c966e 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -68,6 +68,18 @@ struct { }, }; +/* + * If enabled, the CPU performance target is set from the queue index according + * to the following table. + */ +static const u32 qidx_to_cpuperf_target[] = { + [0] = SCX_CPUPERF_ONE * 0 / 4, + [1] = SCX_CPUPERF_ONE * 1 / 4, + [2] = SCX_CPUPERF_ONE * 2 / 4, + [3] = SCX_CPUPERF_ONE * 3 / 4, + [4] = SCX_CPUPERF_ONE * 4 / 4, +}; + /* * Per-queue sequence numbers to implement core-sched ordering. * @@ -95,6 +107,8 @@ struct { struct cpu_ctx { u64 dsp_idx; /* dispatch index */ u64 dsp_cnt; /* remaining count */ + u32 avg_weight; + u32 cpuperf_target; }; struct { @@ -107,6 +121,8 @@ struct { /* Statistics */ u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued; u64 nr_core_sched_execed; +u32 cpuperf_min, cpuperf_avg, cpuperf_max; +u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max; s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) @@ -313,6 +329,29 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) } } +void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p) +{ + struct cpu_ctx *cpuc; + u32 zero = 0; + int idx; + + if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) { + scx_bpf_error("failed to look up cpu_ctx"); + return; + } + + /* + * Use the running avg of weights to select the target cpuperf level. + * This is a demonstration of the cpuperf feature rather than a + * practical strategy to regulate CPU frequency. + */ + cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4; + idx = weight_to_idx(cpuc->avg_weight); + cpuc->cpuperf_target = qidx_to_cpuperf_target[idx]; + + scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target); +} + /* * The distance from the head of the queue scaled by the weight of the queue. * The lower the number, the older the task and the higher the priority.
@@ -422,8 +461,9 @@ void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu))) return; - scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu", - cpuc->dsp_idx, cpuc->dsp_cnt); + scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u", + cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight, + cpuc->cpuperf_target); } void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p) @@ -492,11 +532,106 @@ void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu) print_cpus(); } +struct monitor_timer { + struct bpf_timer timer; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, struct monitor_timer); +} monitor_timer SEC(".maps"); + +/* + * Print out the min, avg and max performance levels of CPUs every second to + * demonstrate the cpuperf interface. + */ +static void monitor_cpuperf(void) +{ + u32 zero = 0, nr_cpu_ids; + u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0; + u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0; + const struct cpumask *online; + int i, nr_online_cpus = 0; + + nr_cpu_ids = scx_bpf_nr_cpu_ids(); + online = scx_bpf_get_online_cpumask(); + + bpf_for(i, 0, nr_cpu_ids) { + struct cpu_ctx *cpuc; + u32 cap, cur; + + if (!bpf_cpumask_test_cpu(i, online)) + continue; + nr_online_cpus++; + + /* collect the capacity and current cpuperf */ + cap = scx_bpf_cpuperf_cap(i); + cur = scx_bpf_cpuperf_cur(i); + + cur_min = cur < cur_min ? cur : cur_min; + cur_max = cur > cur_max ? cur : cur_max; + + /* + * $cur is relative to $cap. Scale it down accordingly so that + * it's in the same scale as other CPUs and $cur_sum/$cap_sum + * makes sense. + */ + cur_sum += cur * cap / SCX_CPUPERF_ONE; + cap_sum += cap; + + if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) { + scx_bpf_error("failed to look up cpu_ctx"); + goto out; + } + + /* collect target */ + cur = cpuc->cpuperf_target; + target_sum += cur; + target_min = cur < target_min ? cur : target_min; + target_max = cur > target_max ? 
cur : target_max; + } + + cpuperf_min = cur_min; + cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum; + cpuperf_max = cur_max; + + cpuperf_target_min = target_min; + cpuperf_target_avg = target_sum / nr_online_cpus; + cpuperf_target_max = target_max; +out: + scx_bpf_put_cpumask(online); +} + +static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer) +{ + monitor_cpuperf(); + + bpf_timer_start(timer, ONE_SEC_IN_NS, 0); + return 0; +} + s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init) { + u32 key = 0; + struct bpf_timer *timer; + s32 ret; + print_cpus(); - return scx_bpf_create_dsq(SHARED_DSQ, -1); + ret = scx_bpf_create_dsq(SHARED_DSQ, -1); + if (ret) + return ret; + + timer = bpf_map_lookup_elem(&monitor_timer, &key); + if (!timer) + return -ESRCH; + + bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC); + bpf_timer_set_callback(timer, monitor_timerfn); + + return bpf_timer_start(timer, ONE_SEC_IN_NS, 0); } void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei) @@ -509,6 +644,7 @@ SCX_OPS_DEFINE(qmap_ops, .enqueue = (void *)qmap_enqueue, .dequeue = (void *)qmap_dequeue, .dispatch = (void *)qmap_dispatch, + .tick = (void *)qmap_tick, .core_sched_before = (void *)qmap_core_sched_before, .cpu_release = (void *)qmap_cpu_release, .init_task = (void *)qmap_init_task, diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index bc36ec4f88a7..4d41c0cb1dab 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -116,6 +116,14 @@ int main(int argc, char **argv) nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched, skel->bss->nr_reenqueued, skel->bss->nr_dequeued, skel->bss->nr_core_sched_execed); + if (__COMPAT_has_ksym("scx_bpf_cpuperf_cur")) + printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n", + skel->bss->cpuperf_min, + skel->bss->cpuperf_avg, + skel->bss->cpuperf_max, + skel->bss->cpuperf_target_min, + skel->bss->cpuperf_target_avg, + skel->bss->cpuperf_target_max); fflush(stdout); sleep(1); } -- cgit v1.2.3 From f97dcd0fcf7a95aaf448a9d1a7ed6c95e16dfcdb Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 27 Jun 2024 18:11:32 +0100 Subject: sched_ext: Fix spelling mistake: "intead" -> "instead" There is a spelling mistake in the help text. Fix it. Signed-off-by: Colin Ian King Signed-off-by: Tejun Heo --- tools/sched_ext/scx_qmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index 4d41c0cb1dab..e4e3ecffc4cf 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -31,7 +31,7 @@ const char help_fmt[] = " -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" " -D LEN Set scx_exit_info.dump buffer length\n" " -S Suppress qmap-specific debug dump\n" -" -p Switch only tasks on SCHED_EXT policy intead of all\n" +" -p Switch only tasks on SCHED_EXT policy instead of all\n" " -v Print libbpf debug messages\n" " -h Display this help and exit\n"; -- cgit v1.2.3 From 6203ef73fa5c0358f7960b038628259be1448724 Mon Sep 17 00:00:00 2001 From: Hongyan Xia Date: Mon, 8 Jul 2024 15:01:18 +0100 Subject: sched/ext: Add BPF function to fetch rq rq contains many useful fields to implement a custom scheduler. For example, various clock signals like clock_task and clock_pelt can be used to track load. It also contains stats in other sched_classes, which are useful to drive scheduling decisions in ext. tj: Put the new helper below scx_bpf_task_*() helpers. 
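As a rough usage sketch (illustrative only, not taken from this patch; the rq field layout comes from vmlinux.h and the NULL return for an invalid CPU must be checked):

	/* hypothetical BPF-side sketch: sample a CPU's task clock */
	struct rq *rq = scx_bpf_cpu_rq(cpu);

	if (rq)
		bpf_printk("cpu%d clock_task=%llu", cpu, rq->clock_task);
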
Signed-off-by: Hongyan Xia Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 13 +++++++++++++ tools/sched_ext/include/scx/common.bpf.h | 1 + 2 files changed, 14 insertions(+) (limited to 'tools') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 33bfb5a88b48..525102f3ff5b 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6113,6 +6113,18 @@ __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p) return task_cpu(p); } +/** + * scx_bpf_cpu_rq - Fetch the rq of a CPU + * @cpu: CPU of the rq + */ +__bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu) +{ + if (!ops_cpu_valid(cpu, NULL)) + return NULL; + + return cpu_rq(cpu); +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_any) @@ -6137,6 +6149,7 @@ BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_cpu_rq) BTF_KFUNCS_END(scx_kfunc_ids_any) static const struct btf_kfunc_id_set scx_kfunc_set_any = { diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index dbbda0e35c5d..965d20324114 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -57,6 +57,7 @@ s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; bool scx_bpf_task_running(const struct task_struct *p) __ksym; s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; +struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; static inline __attribute__((format(printf, 1, 2))) void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} -- cgit v1.2.3 From 650ba21b131ed1f8ee57826b2c6295a3be221132 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Jul 2024 14:30:55 -1000 Subject: sched_ext: Implement DSQ iterator DSQs are very opaque in the consumption path. The BPF scheduler has no way of knowing which tasks are being considered and which is picked. This patch adds BPF DSQ iterator. - Allows iterating tasks queued on a DSQ in the dispatch order or reverse from anywhere using bpf_for_each(scx_dsq) or calling the iterator kfuncs directly. - Has ordering guarantee where only tasks which were already queued when the iteration started are visible and consumable during the iteration. v5: - Add a comment to the naked list_empty(&dsq->list) test in consume_dispatch_q() to explain the reasoning behind the lockless test and by extension why nldsq_next_task() isn't used there. - scx_qmap changes separated into its own patch. v4: - bpf_iter_scx_dsq_new() declaration in common.bpf.h was using the wrong type for the last argument (bool rev instead of u64 flags). Fix it. v3: - Alexei pointed out that the iterator is too big to allocate on stack. Added a prep patch to reduce the size of the cursor. Now bpf_iter_scx_dsq is 48 bytes and bpf_iter_scx_dsq_kern is 40 bytes on 64bit. - u32_before() comparison factored out. v2: - scx_bpf_consume_task() is separated out into a separate patch. - DSQ seq and iter flags don't need to be u64. Use u32. 
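For example (illustrative sketch, not part of the patch; MY_DSQ stands for a DSQ the scheduler created earlier with scx_bpf_create_dsq()), a user DSQ can be walked in dispatch order with bpf_for_each():

	struct task_struct *p;

	bpf_for_each(scx_dsq, p, MY_DSQ, 0)
		bpf_printk("%s[%d]", p->comm, p->pid);

The scx_qmap patch later in this series shows the same pattern with SCX_DSQ_ITER_REV, wrapping the walk in bpf_rcu_read_lock()/bpf_rcu_read_unlock().
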
Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Alexei Starovoitov Cc: bpf@vger.kernel.org --- include/linux/sched/ext.h | 3 + kernel/sched/ext.c | 192 ++++++++++++++++++++++++++++++- tools/sched_ext/include/scx/common.bpf.h | 3 + 3 files changed, 196 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index eb9cfd18a923..593d2f4909dd 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -61,6 +61,7 @@ struct scx_dispatch_q { struct list_head list; /* tasks in dispatch order */ struct rb_root priq; /* used to order by p->scx.dsq_vtime */ u32 nr; + u32 seq; /* used by BPF iter */ u64 id; struct rhash_head hash_node; struct llist_node free_node; @@ -123,6 +124,7 @@ enum scx_kf_mask { struct scx_dsq_list_node { struct list_head node; + bool is_bpf_iter_cursor; }; /* @@ -133,6 +135,7 @@ struct sched_ext_entity { struct scx_dispatch_q *dsq; struct scx_dsq_list_node dsq_list; /* dispatch order */ struct rb_node dsq_priq; /* p->scx.dsq_vtime order */ + u32 dsq_seq; u32 dsq_flags; /* protected by DSQ lock */ u32 flags; /* protected by rq lock */ u32 weight; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 069c2f33883c..f16d72d72635 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -926,6 +926,11 @@ static u32 highest_bit(u32 flags) return ((u64)1 << bit) >> 1; } +static bool u32_before(u32 a, u32 b) +{ + return (s32)(a - b) < 0; +} + /* * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate @@ -1066,6 +1071,73 @@ static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask, return true; } +/** + * nldsq_next_task - Iterate to the next task in a non-local DSQ + * @dsq: user DSQ being iterated + * @cur: current position, %NULL to start iteration + * @rev: walk backwards + * + * Returns %NULL when iteration is finished. + */ +static struct task_struct *nldsq_next_task(struct scx_dispatch_q *dsq, + struct task_struct *cur, bool rev) +{ + struct list_head *list_node; + struct scx_dsq_list_node *dsq_lnode; + + lockdep_assert_held(&dsq->lock); + + if (cur) + list_node = &cur->scx.dsq_list.node; + else + list_node = &dsq->list; + + /* find the next task, need to skip BPF iteration cursors */ + do { + if (rev) + list_node = list_node->prev; + else + list_node = list_node->next; + + if (list_node == &dsq->list) + return NULL; + + dsq_lnode = container_of(list_node, struct scx_dsq_list_node, + node); + } while (dsq_lnode->is_bpf_iter_cursor); + + return container_of(dsq_lnode, struct task_struct, scx.dsq_list); +} + +#define nldsq_for_each_task(p, dsq) \ + for ((p) = nldsq_next_task((dsq), NULL, false); (p); \ + (p) = nldsq_next_task((dsq), (p), false)) + + +/* + * BPF DSQ iterator. Tasks in a non-local DSQ can be iterated in [reverse] + * dispatch order. The BPF-visible iterator is opaque and larger to allow future + * changes without breaking backward compatibility. Can be used with + * bpf_for_each(). See bpf_iter_scx_dsq_*(). + */ +enum scx_dsq_iter_flags { + /* iterate in the reverse dispatch order */ + SCX_DSQ_ITER_REV = 1U << 0, + + __SCX_DSQ_ITER_ALL_FLAGS = SCX_DSQ_ITER_REV, +}; + +struct bpf_iter_scx_dsq_kern { + struct scx_dsq_list_node cursor; + struct scx_dispatch_q *dsq; + u32 dsq_seq; + u32 flags; +} __attribute__((aligned(8))); + +struct bpf_iter_scx_dsq { + u64 __opaque[6]; +} __attribute__((aligned(8))); + /* * SCX task iterator.
@@ -1415,7 +1487,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, * tested easily when adding the first task. */ if (unlikely(RB_EMPTY_ROOT(&dsq->priq) && - !list_empty(&dsq->list))) + nldsq_next_task(dsq, NULL, false))) scx_ops_error("DSQ ID 0x%016llx already had FIFO-enqueued tasks", dsq->id); @@ -1447,6 +1519,10 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, list_add_tail(&p->scx.dsq_list.node, &dsq->list); } + /* seq records the order tasks are queued, used by BPF DSQ iterator */ + dsq->seq++; + p->scx.dsq_seq = dsq->seq; + dsq_mod_nr(dsq, 1); p->scx.dsq = dsq; @@ -2104,12 +2180,17 @@ static bool consume_dispatch_q(struct rq *rq, struct rq_flags *rf, { struct task_struct *p; retry: + /* + * The caller can't expect to successfully consume a task if the task's + * addition to @dsq isn't guaranteed to be visible somehow. Test + * @dsq->list without locking and skip if it seems empty. + */ if (list_empty(&dsq->list)) return false; raw_spin_lock(&dsq->lock); - list_for_each_entry(p, &dsq->list, scx.dsq_list.node) { + nldsq_for_each_task(p, dsq) { struct rq *task_rq = task_rq(p); if (rq == task_rq) { @@ -5705,6 +5786,110 @@ __bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id) destroy_dsq(dsq_id); } +/** + * bpf_iter_scx_dsq_new - Create a DSQ iterator + * @it: iterator to initialize + * @dsq_id: DSQ to iterate + * @flags: %SCX_DSQ_ITER_* + * + * Initialize BPF iterator @it which can be used with bpf_for_each() to walk + * tasks in the DSQ specified by @dsq_id. Iteration using @it only includes + * tasks which are already queued when this function is invoked. + */ +__bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, + u64 flags) +{ + struct bpf_iter_scx_dsq_kern *kit = (void *)it; + + BUILD_BUG_ON(sizeof(struct bpf_iter_scx_dsq_kern) > + sizeof(struct bpf_iter_scx_dsq)); + BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) != + __alignof__(struct bpf_iter_scx_dsq)); + + if (flags & ~__SCX_DSQ_ITER_ALL_FLAGS) + return -EINVAL; + + kit->dsq = find_non_local_dsq(dsq_id); + if (!kit->dsq) + return -ENOENT; + + INIT_LIST_HEAD(&kit->cursor.node); + kit->cursor.is_bpf_iter_cursor = true; + kit->dsq_seq = READ_ONCE(kit->dsq->seq); + kit->flags = flags; + + return 0; +} + +/** + * bpf_iter_scx_dsq_next - Progress a DSQ iterator + * @it: iterator to progress + * + * Return the next task. See bpf_iter_scx_dsq_new(). + */ +__bpf_kfunc struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) +{ + struct bpf_iter_scx_dsq_kern *kit = (void *)it; + bool rev = kit->flags & SCX_DSQ_ITER_REV; + struct task_struct *p; + unsigned long flags; + + if (!kit->dsq) + return NULL; + + raw_spin_lock_irqsave(&kit->dsq->lock, flags); + + if (list_empty(&kit->cursor.node)) + p = NULL; + else + p = container_of(&kit->cursor, struct task_struct, scx.dsq_list); + + /* + * Only tasks which were queued before the iteration started are + * visible. This bounds BPF iterations and guarantees that vtime never + * jumps in the other direction while iterating. 
+ */ + do { + p = nldsq_next_task(kit->dsq, p, rev); + } while (p && unlikely(u32_before(kit->dsq_seq, p->scx.dsq_seq))); + + if (p) { + if (rev) + list_move_tail(&kit->cursor.node, &p->scx.dsq_list.node); + else + list_move(&kit->cursor.node, &p->scx.dsq_list.node); + } else { + list_del_init(&kit->cursor.node); + } + + raw_spin_unlock_irqrestore(&kit->dsq->lock, flags); + + return p; +} + +/** + * bpf_iter_scx_dsq_destroy - Destroy a DSQ iterator + * @it: iterator to destroy + * + * Undo bpf_iter_scx_dsq_new(). + */ +__bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) +{ + struct bpf_iter_scx_dsq_kern *kit = (void *)it; + + if (!kit->dsq) + return; + + if (!list_empty(&kit->cursor.node)) { + unsigned long flags; + + raw_spin_lock_irqsave(&kit->dsq->lock, flags); + list_del_init(&kit->cursor.node); + raw_spin_unlock_irqrestore(&kit->dsq->lock, flags); + } + kit->dsq = NULL; +} + __bpf_kfunc_end_defs(); static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size, @@ -6138,6 +6323,9 @@ BTF_KFUNCS_START(scx_kfunc_ids_any) BTF_ID_FLAGS(func, scx_bpf_kick_cpu) BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) +BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED) +BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS) diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 965d20324114..20280df62857 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -39,6 +39,9 @@ u32 scx_bpf_reenqueue_local(void) __ksym; void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; +int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, u64 flags) __ksym __weak; +struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak; +void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak; void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) __ksym __weak; void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym; void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym __weak; -- cgit v1.2.3 From 6fbd643318a1a5f3caea7f94bfe035efbb293ddb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Jul 2024 14:30:55 -1000 Subject: sched_ext/scx_qmap: Add an example usage of DSQ iterator Implement periodic dumping of the shared DSQ to demonstrate the use of the newly added DSQ iterator.
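With the patch applied, the dump can be observed roughly as follows (paths illustrative; -P and -b are the scx_qmap flags added below, and bpf_printk() output goes to the trace pipe):

	./scx_qmap -P -b 16      # raise the batch count so tasks accumulate
	cat /sys/kernel/tracing/trace_pipe
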
Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Alexei Starovoitov Cc: bpf@vger.kernel.org --- tools/sched_ext/scx_qmap.bpf.c | 25 +++++++++++++++++++++++++ tools/sched_ext/scx_qmap.c | 8 ++++++-- 2 files changed, 31 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index b1d0b09c966e..27e35066a602 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -36,6 +36,7 @@ const volatile u32 stall_user_nth; const volatile u32 stall_kernel_nth; const volatile u32 dsp_inf_loop_after; const volatile u32 dsp_batch; +const volatile bool print_shared_dsq; const volatile s32 disallow_tgid; const volatile bool suppress_dump; @@ -604,10 +605,34 @@ out: scx_bpf_put_cpumask(online); } +/* + * Dump the currently queued tasks in the shared DSQ to demonstrate the usage of + * scx_bpf_dsq_nr_queued() and DSQ iterator. Raise the dispatch batch count to + * see meaningful dumps in the trace pipe. + */ +static void dump_shared_dsq(void) +{ + struct task_struct *p; + s32 nr; + + if (!(nr = scx_bpf_dsq_nr_queued(SHARED_DSQ))) + return; + + bpf_printk("Dumping %d tasks in SHARED_DSQ in reverse order", nr); + + bpf_rcu_read_lock(); + bpf_for_each(scx_dsq, p, SHARED_DSQ, SCX_DSQ_ITER_REV) + bpf_printk("%s[%d]", p->comm, p->pid); + bpf_rcu_read_unlock(); +} + static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer) { monitor_cpuperf(); + if (print_shared_dsq) + dump_shared_dsq(); + bpf_timer_start(timer, ONE_SEC_IN_NS, 0); return 0; } diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index e4e3ecffc4cf..304f0488a386 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -20,7 +20,7 @@ const char help_fmt[] = "See the top-level comment in .bpf.c for more details.\n" "\n" "Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n" -" [-d PID] [-D LEN] [-p] [-v]\n" +" [-P] [-d PID] [-D LEN] [-p] [-v]\n" "\n" " -s SLICE_US Override slice duration\n" " -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" @@ -28,6 +28,7 @@ const char help_fmt[] = " -T COUNT Stall every COUNT'th kernel thread\n" " -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n" " -b COUNT Dispatch upto COUNT tasks together\n" +" -P Print out DSQ content to trace_pipe every second, use with -b\n" " -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" " -D LEN Set scx_exit_info.dump buffer length\n" " -S Suppress qmap-specific debug dump\n" @@ -62,7 +63,7 @@ int main(int argc, char **argv) skel = SCX_OPS_OPEN(qmap_ops, scx_qmap); - while ((opt = getopt(argc, argv, "s:e:t:T:l:b:d:D:Spvh")) != -1) { + while ((opt = getopt(argc, argv, "s:e:t:T:l:b:Pd:D:Spvh")) != -1) { switch (opt) { case 's': skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; @@ -82,6 +83,9 @@ int main(int argc, char **argv) case 'b': skel->rodata->dsp_batch = strtoul(optarg, NULL, 0); break; + case 'P': + skel->rodata->print_shared_dsq = true; + break; case 'd': skel->rodata->disallow_tgid = strtol(optarg, NULL, 0); if (skel->rodata->disallow_tgid < 0) -- cgit v1.2.3 From 1edab907b57d42e2dcf4c16a00185a89209e8700 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 12 Jul 2024 08:20:33 -1000 Subject: sched_ext/scx_qmap: Pick idle CPU for direct dispatch on !wakeup enqueues Because there was no way to directly dispatch to the local DSQ of a remote CPU from ops.enqueue(), scx_qmap skipped looking for an idle CPU on !wakeup enqueues. 
This restriction was removed and sched_ext now allows SCX_DSQ_LOCAL_ON verdicts for direct dispatches. Factor out pick_direct_dispatch_cpu() from ops.select_cpu() and use it to direct dispatch from ops.enqueue() on !wakeup enqueues. Signed-off-by: Tejun Heo Acked-by: David Vernet Cc: Dan Schatzberg Cc: Changwoo Min Cc: Andrea Righi --- tools/sched_ext/scx_qmap.bpf.c | 39 ++++++++++++++++++++++++++++++--------- tools/sched_ext/scx_qmap.c | 5 +++-- 2 files changed, 33 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 27e35066a602..892278f12dce 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -120,11 +120,26 @@ struct { } cpu_ctx_stor SEC(".maps"); /* Statistics */ -u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued; +u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued, nr_ddsp_from_enq; u64 nr_core_sched_execed; u32 cpuperf_min, cpuperf_avg, cpuperf_max; u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max; +static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu) +{ + s32 cpu; + + if (p->nr_cpus_allowed == 1 || + scx_bpf_test_and_clear_cpu_idle(prev_cpu)) + return prev_cpu; + + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) + return cpu; + + return -1; +} + s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) { @@ -137,17 +152,14 @@ s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, return -ESRCH; } - if (p->nr_cpus_allowed == 1 || - scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { + cpu = pick_direct_dispatch_cpu(p, prev_cpu); + + if (cpu >= 0) { tctx->force_local = true; + return cpu; + } else { return prev_cpu; } - - cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); - if (cpu >= 0) - return cpu; - - return prev_cpu; } static int weight_to_idx(u32 weight) @@ -172,6 +184,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) u32 pid = p->pid; int idx = weight_to_idx(p->scx.weight); void *ring; + s32 cpu; if (p->flags & PF_KTHREAD) { if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth)) @@ -207,6 +220,14 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) return; } + /* if !WAKEUP, select_cpu() wasn't called, try direct dispatch */ + if (!(enq_flags & SCX_ENQ_WAKEUP) && + (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) { + __sync_fetch_and_add(&nr_ddsp_from_enq, 1); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags); + return; + } + /* * If the task was re-enqueued due to the CPU being preempted by a * higher priority scheduling class, just re-enqueue the task directly diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index 304f0488a386..c9ca30d62b2b 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -116,10 +116,11 @@ int main(int argc, char **argv) long nr_enqueued = skel->bss->nr_enqueued; long nr_dispatched = skel->bss->nr_dispatched; - printf("stats : enq=%lu dsp=%lu delta=%ld reenq=%"PRIu64" deq=%"PRIu64" core=%"PRIu64"\n", + printf("stats : enq=%lu dsp=%lu delta=%ld reenq=%"PRIu64" deq=%"PRIu64" core=%"PRIu64" enq_ddsp=%"PRIu64"\n", nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched, skel->bss->nr_reenqueued, skel->bss->nr_dequeued, - skel->bss->nr_core_sched_execed); + skel->bss->nr_core_sched_execed, + skel->bss->nr_ddsp_from_enq); if (__COMPAT_has_ksym("scx_bpf_cpuperf_cur")) printf("cpuperf: cur min/avg/max=%u/%u/%u target 
min/avg/max=%u/%u/%u\n", skel->bss->cpuperf_min, -- cgit v1.2.3 From dc86460e77e8cea4896ff19de905001922653311 Mon Sep 17 00:00:00 2001 From: Zhouyi Zhou Date: Wed, 19 Jun 2024 23:06:58 +0000 Subject: rcutorture: Add CFcommon.arch for arch-specific Kconfig options Add CFcommon.arch for arch-specific Kconfig options. In accordance with [1], [2] and [3], move the x86-specific kernel option CONFIG_HYPERVISOR_GUEST to CFcommon.i686 and CFcommon.x86_64, and also move the x86/PowerPC CONFIG_KVM_GUEST Kconfig option to CFcommon.i686, CFcommon.x86_64, and CFcommon.ppc64le. The "arch" in CFcommon.arch is taken from the "uname -m" command. [1] https://lore.kernel.org/all/20240427005626.1365935-1-zhouzhouyi@gmail.com/ [2] https://lore.kernel.org/all/059d36ce-6453-42be-a31e-895abd35d590@paulmck-laptop/ [3] https://lore.kernel.org/all/ZnBkHosMDhsh4H8g@J2N7QTR9R3/ Tested in x86_64 and PPC VM of Open Source Lab of Oregon State University. Fixes: a6fda6dab93c ("rcutorture: Tweak kvm options") Suggested-by: Paul E. McKenney Suggested-by: Mark Rutland Signed-off-by: Zhouyi Zhou Acked-by: Mark Rutland Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh | 2 ++ tools/testing/selftests/rcutorture/configs/rcu/CFcommon | 2 -- tools/testing/selftests/rcutorture/configs/rcu/CFcommon.i686 | 2 ++ tools/testing/selftests/rcutorture/configs/rcu/CFcommon.ppc64le | 1 + tools/testing/selftests/rcutorture/configs/rcu/CFcommon.x86_64 | 2 ++ 5 files changed, 7 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/rcutorture/configs/rcu/CFcommon.i686 create mode 100644 tools/testing/selftests/rcutorture/configs/rcu/CFcommon.ppc64le create mode 100644 tools/testing/selftests/rcutorture/configs/rcu/CFcommon.x86_64 (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index b33cd8753689..ad79784e552d 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -68,6 +68,8 @@ config_override_param "--gdb options" KcList "$TORTURE_KCONFIG_GDB_ARG" config_override_param "--kasan options" KcList "$TORTURE_KCONFIG_KASAN_ARG" config_override_param "--kcsan options" KcList "$TORTURE_KCONFIG_KCSAN_ARG" config_override_param "--kconfig argument" KcList "$TORTURE_KCONFIG_ARG" +config_override_param "$config_dir/CFcommon.$(uname -m)" KcList \ + "`cat $config_dir/CFcommon.$(uname -m) 2> /dev/null`" cp $T/KcList $resdir/ConfigFragment base_resdir=`echo $resdir | sed -e 's/\.[0-9]\+$//'` diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon index 0e92d85313aa..217597e84905 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon +++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon @@ -1,7 +1,5 @@ CONFIG_RCU_TORTURE_TEST=y CONFIG_PRINTK_TIME=y -CONFIG_HYPERVISOR_GUEST=y CONFIG_PARAVIRT=y -CONFIG_KVM_GUEST=y CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=n CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.i686 b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.i686 new file mode 100644 index 000000000000..d8b2f555686f --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.i686 @@ -0,0 +1,2 @@ +CONFIG_HYPERVISOR_GUEST=y +CONFIG_KVM_GUEST=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.ppc64le 
b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.ppc64le new file mode 100644 index 000000000000..133da04247ee --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.ppc64le @@ -0,0 +1 @@ +CONFIG_KVM_GUEST=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.x86_64 b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.x86_64 new file mode 100644 index 000000000000..d8b2f555686f --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon.x86_64 @@ -0,0 +1,2 @@ +CONFIG_HYPERVISOR_GUEST=y +CONFIG_KVM_GUEST=y -- cgit v1.2.3 From 9a13a324f40fc3846cf7867daffaccd785beb10c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Jul 2024 19:51:02 -0700 Subject: tools/rcu: Remove RCU Tasks Rude asynchronous APIs from rcu-updaters.sh The call_rcu_tasks_rude() and rcu_barrier_tasks_rude() APIs are no longer available. This commit therefore removes them from the rcu-updaters.sh script. Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- tools/rcu/rcu-updaters.sh | 2 -- 1 file changed, 2 deletions(-) (limited to 'tools') diff --git a/tools/rcu/rcu-updaters.sh b/tools/rcu/rcu-updaters.sh index 4ef1397927bb..8a5df3f22550 100755 --- a/tools/rcu/rcu-updaters.sh +++ b/tools/rcu/rcu-updaters.sh @@ -21,12 +21,10 @@ fi bpftrace -e 'kprobe:kvfree_call_rcu, kprobe:call_rcu, kprobe:call_rcu_tasks, - kprobe:call_rcu_tasks_rude, kprobe:call_rcu_tasks_trace, kprobe:call_srcu, kprobe:rcu_barrier, kprobe:rcu_barrier_tasks, - kprobe:rcu_barrier_tasks_rude, kprobe:rcu_barrier_tasks_trace, kprobe:srcu_barrier, kprobe:synchronize_rcu, -- cgit v1.2.3 From 6ea2987c9a7b6c5f37d08a3eaa664c9ff7467670 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Thu, 25 Jul 2024 18:54:18 +0200 Subject: tools/nolibc: include arch.h from string.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit string.h tests for the macros NOLIBC_ARCH_HAS_$FUNC to use the architecture-optimized function variants. However, if string.h is included before the arch.h header, that check does not work, leading to duplicate function definitions. Fixes: 553845eebd60 ("tools/nolibc: x86-64: Use `rep movsb` for `memcpy()` and `memmove()`") Fixes: 12108aa8c1a1 ("tools/nolibc: x86-64: Use `rep stosb` for `memset()`") Cc: stable@vger.kernel.org Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20240725-arch-has-func-v1-1-5521ed354acd@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/string.h | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/include/nolibc/string.h b/tools/include/nolibc/string.h index f9ab28421e6d..9ec9c24f38c0 100644 --- a/tools/include/nolibc/string.h +++ b/tools/include/nolibc/string.h @@ -7,6 +7,7 @@ #ifndef _NOLIBC_STRING_H #define _NOLIBC_STRING_H +#include "arch.h" #include "std.h" static void *malloc(size_t len); -- cgit v1.2.3 From ae1f550efc11eaf1496c431d9c6e784cb49124c5 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Thu, 25 Jul 2024 19:10:44 +0200 Subject: tools/nolibc: add stdbool.h header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stdbool.h is very simple. Provide an implementation for user convenience.
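A minimal consumer would look like this (sketch only, assuming the usual nolibc include setup described in nolibc.h):

	#include <stdbool.h>

	int main(void)
	{
		bool ok = true;	/* expands to _Bool per the header below */
		return ok ? 0 : 1;
	}
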
Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20240725-nolibc-stdbool-v1-1-a6ee2c80bcde@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/Makefile | 1 + tools/include/nolibc/nolibc.h | 3 ++- tools/include/nolibc/stdbool.h | 16 ++++++++++++++++ 3 files changed, 19 insertions(+), 1 deletion(-) create mode 100644 tools/include/nolibc/stdbool.h (limited to 'tools') diff --git a/tools/include/nolibc/Makefile b/tools/include/nolibc/Makefile index e69c26abe1ea..a1f55fb24bb3 100644 --- a/tools/include/nolibc/Makefile +++ b/tools/include/nolibc/Makefile @@ -35,6 +35,7 @@ all_files := \ stackprotector.h \ std.h \ stdarg.h \ + stdbool.h \ stdint.h \ stdlib.h \ string.h \ diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index 989e707263a4..92436b1e4441 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -74,7 +74,8 @@ * -I../nolibc -o hello hello.c -lgcc * * The available standard (but limited) include files are: - * ctype.h, errno.h, signal.h, stdarg.h, stdio.h, stdlib.h, string.h, time.h + * ctype.h, errno.h, signal.h, stdarg.h, stdbool.h, stdio.h, stdlib.h, + * string.h, time.h * * In addition, the following ones are expected to be provided by the compiler: * float.h, stddef.h diff --git a/tools/include/nolibc/stdbool.h b/tools/include/nolibc/stdbool.h new file mode 100644 index 000000000000..60feece22f17 --- /dev/null +++ b/tools/include/nolibc/stdbool.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * Boolean types support for NOLIBC + * Copyright (C) 2024 Thomas Weißschuh + */ + +#ifndef _NOLIBC_STDBOOL_H +#define _NOLIBC_STDBOOL_H + +#define bool _Bool +#define true 1 +#define false 0 + +#define __bool_true_false_are_defined 1 + +#endif /* _NOLIBC_STDBOOL_H */ -- cgit v1.2.3 From a63507f3b11de30fd7711ec244fe354fb4a01a09 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 18 Jul 2024 14:22:29 +0800 Subject: selftests/bpf: Drop type of connect_to_fd_opts The "type" parameter of connect_to_fd_opts() is redundant with "server_fd": the type can be obtained internally by invoking getsockopt(SO_TYPE), without being passed in as a parameter. This patch drops the "type" parameter of connect_to_fd_opts() and updates its callers.
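Callers change mechanically, e.g. (sketch mirroring the hunks below):

	/* before: socket type passed explicitly */
	fd = connect_to_fd_opts(server_fd, SOCK_STREAM, &opts);

	/* after: type derived from server_fd via getsockopt(SO_TYPE) */
	fd = connect_to_fd_opts(server_fd, &opts);
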
Suggested-by: Martin KaFai Lau Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/50d8ce7ab7ab0c0f4d211fc7cc4ebe3d3f63424c.1721282219.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/network_helpers.c | 21 +++++++++++---------- tools/testing/selftests/bpf/network_helpers.h | 2 +- tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c | 2 +- .../testing/selftests/bpf/prog_tests/cgroup_v1v2.c | 4 ++-- 4 files changed, 15 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index e0cba4178e41..15e0e0bb7553 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -328,14 +328,21 @@ error_close: return -1; } -int connect_to_fd_opts(int server_fd, int type, const struct network_helper_opts *opts) +int connect_to_fd_opts(int server_fd, const struct network_helper_opts *opts) { struct sockaddr_storage addr; - socklen_t addrlen; + socklen_t addrlen, optlen; + int type; if (!opts) opts = &default_opts; + optlen = sizeof(type); + if (getsockopt(server_fd, SOL_SOCKET, SO_TYPE, &type, &optlen)) { + log_err("getsockopt(SOL_TYPE)"); + return -1; + } + addrlen = sizeof(addr); if (getsockname(server_fd, (struct sockaddr *)&addr, &addrlen)) { log_err("Failed to get server addr"); @@ -350,14 +357,8 @@ int connect_to_fd(int server_fd, int timeout_ms) struct network_helper_opts opts = { .timeout_ms = timeout_ms, }; - int type, protocol; socklen_t optlen; - - optlen = sizeof(type); - if (getsockopt(server_fd, SOL_SOCKET, SO_TYPE, &type, &optlen)) { - log_err("getsockopt(SOL_TYPE)"); - return -1; - } + int protocol; optlen = sizeof(protocol); if (getsockopt(server_fd, SOL_SOCKET, SO_PROTOCOL, &protocol, &optlen)) { @@ -366,7 +367,7 @@ int connect_to_fd(int server_fd, int timeout_ms) } opts.proto = protocol; - return connect_to_fd_opts(server_fd, type, &opts); + return connect_to_fd_opts(server_fd, &opts); } int connect_fd_to_fd(int client_fd, int server_fd, int timeout_ms) diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index aac5b94d6379..5b548c0c60de 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -71,7 +71,7 @@ int client_socket(int family, int type, int connect_to_addr(int type, const struct sockaddr_storage *addr, socklen_t len, const struct network_helper_opts *opts); int connect_to_fd(int server_fd, int timeout_ms); -int connect_to_fd_opts(int server_fd, int type, const struct network_helper_opts *opts); +int connect_to_fd_opts(int server_fd, const struct network_helper_opts *opts); int connect_fd_to_fd(int client_fd, int server_fd, int timeout_ms); int fastopen_connect(int server_fd, const char *data, unsigned int data_len, int timeout_ms); diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index 63422f4f3896..1d494b4453f4 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -49,7 +49,7 @@ static bool start_test(char *addr_str, goto err; /* connect to server */ - *cli_fd = connect_to_fd_opts(*srv_fd, SOCK_STREAM, cli_opts); + *cli_fd = connect_to_fd_opts(*srv_fd, cli_opts); if (!ASSERT_NEQ(*cli_fd, -1, "connect_to_fd_opts")) goto err; diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_v1v2.c 
b/tools/testing/selftests/bpf/prog_tests/cgroup_v1v2.c index 9709c8db7275..addf720428f7 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_v1v2.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_v1v2.c @@ -32,7 +32,7 @@ static int run_test(int cgroup_fd, int server_fd, bool classid) goto out; } - fd = connect_to_fd_opts(server_fd, SOCK_STREAM, &opts); + fd = connect_to_fd_opts(server_fd, &opts); if (fd < 0) err = -1; else @@ -52,7 +52,7 @@ void test_cgroup_v1v2(void) server_fd = start_server(AF_INET, SOCK_STREAM, NULL, port, 0); if (!ASSERT_GE(server_fd, 0, "server_fd")) return; - client_fd = connect_to_fd_opts(server_fd, SOCK_STREAM, &opts); + client_fd = connect_to_fd_opts(server_fd, &opts); if (!ASSERT_GE(client_fd, 0, "client_fd")) { close(server_fd); return; -- cgit v1.2.3 From e1ee5a48b5b27e3e7bb294f80f7429c3d0466d19 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 18 Jul 2024 14:22:30 +0800 Subject: selftests/bpf: Drop must_fail from network_helper_opts The struct member "must_fail" of network_helper_opts is only used in the cgroup_v1v2 tests, so it makes sense to drop it from network_helper_opts. The return value (fd) of connect_to_fd_opts() and the expected errno (EPERM) can be checked in cgroup_v1v2.c directly; there is no need to check them in connect_fd_to_addr() in network_helpers.c. This also makes the connect_fd_to_addr() function useless; it can be replaced by connect(). Suggested-by: Martin KaFai Lau Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/3faf336019a9a48e2e8951f4cdebf19e3ac6e441.1721282219.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/network_helpers.c | 42 +++++----------------- tools/testing/selftests/bpf/network_helpers.h | 1 - .../testing/selftests/bpf/prog_tests/cgroup_v1v2.c | 14 ++++---- 3 files changed, 16 insertions(+), 41 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 15e0e0bb7553..bc4947afa0b4 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -277,33 +277,6 @@ error_close: return -1; } -static int connect_fd_to_addr(int fd, - const struct sockaddr_storage *addr, - socklen_t addrlen, const bool must_fail) -{ - int ret; - - errno = 0; - ret = connect(fd, (const struct sockaddr *)addr, addrlen); - if (must_fail) { - if (!ret) { - log_err("Unexpected success to connect to server"); - return -1; - } - if (errno != EPERM) { - log_err("Unexpected error from connect to server"); - return -1; - } - } else { - if (ret) { - log_err("Failed to connect to server"); - return -1; - } - } - - return 0; -} - int connect_to_addr(int type, const struct sockaddr_storage *addr, socklen_t addrlen, const struct network_helper_opts *opts) { @@ -318,14 +291,13 @@ int connect_to_addr(int type, const struct sockaddr_storage *addr, socklen_t add return -1; } - if (connect_fd_to_addr(fd, addr, addrlen, opts->must_fail)) - goto error_close; + if (connect(fd, (const struct sockaddr *)addr, addrlen)) { + log_err("Failed to connect to server"); + save_errno_close(fd); + return -1; + } return fd; - -error_close: - save_errno_close(fd); - return -1; } int connect_to_fd_opts(int server_fd, const struct network_helper_opts *opts) { struct sockaddr_storage addr; @@ -383,8 +355,10 @@ int connect_fd_to_fd(int client_fd, int server_fd, int timeout_ms) return -1; } - if (connect_fd_to_addr(client_fd, &addr, len, false)) + if (connect(client_fd, (const struct sockaddr *)&addr, len)) { +
log_err("Failed to connect to server"); return -1; + } return 0; } diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index 5b548c0c60de..f39eeb5a4594 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -23,7 +23,6 @@ typedef __u16 __sum16; struct network_helper_opts { int timeout_ms; - bool must_fail; int proto; /* +ve: Passed to listen() as-is. * 0: Default when the test does not set diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_v1v2.c b/tools/testing/selftests/bpf/prog_tests/cgroup_v1v2.c index addf720428f7..64abba72ac10 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_v1v2.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_v1v2.c @@ -9,9 +9,6 @@ static int run_test(int cgroup_fd, int server_fd, bool classid) { - struct network_helper_opts opts = { - .must_fail = true, - }; struct connect4_dropper *skel; int fd, err = 0; @@ -32,11 +29,16 @@ static int run_test(int cgroup_fd, int server_fd, bool classid) goto out; } - fd = connect_to_fd_opts(server_fd, &opts); - if (fd < 0) + errno = 0; + fd = connect_to_fd_opts(server_fd, NULL); + if (fd >= 0) { + log_err("Unexpected success to connect to server"); err = -1; - else close(fd); + } else if (errno != EPERM) { + log_err("Unexpected errno from connect to server"); + err = -1; + } out: connect4_dropper__destroy(skel); return err; -- cgit v1.2.3 From c70b2d9027ca39e84d4ee01da78a70308ce1fd4f Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 18 Jul 2024 14:22:31 +0800 Subject: selftests/bpf: Add connect_to_addr_str helper Similar to the connect_to_addr() helper, which connects to a server at a given sockaddr_storage type address, this patch adds a new helper named connect_to_addr_str() for connecting to a server at a given string type address "addr_str", with "family" and "port" as its other parameters. In connect_to_addr_str(), the parameters "family", "addr_str" and "port" are used to create a sockaddr_storage type address "addr" by invoking make_sockaddr(). This "addr" is then passed, together with "addrlen", "type" and "opts", to connect_to_addr().
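A hypothetical caller (an illustration only, not part of the patch) might look like:

#include <unistd.h>
#include "network_helpers.h"

/* Sketch: connect a TCP client to 127.0.0.1:8080; passing NULL for
 * opts makes the helper fall back to its default_opts.
 */
static int demo_connect(void)
{
	int fd;

	fd = connect_to_addr_str(AF_INET, SOCK_STREAM, "127.0.0.1", 8080, NULL);
	if (fd < 0)
		return -1;	/* make_sockaddr() or connect_to_addr() failed */
	close(fd);
	return 0;
}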
Suggested-by: Martin KaFai Lau Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/647e82170831558dbde132a7a3d86df660dba2c4.1721282219.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/network_helpers.c | 15 +++++++++++++++ tools/testing/selftests/bpf/network_helpers.h | 2 ++ 2 files changed, 17 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index bc4947afa0b4..9c98a60cf1e2 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -300,6 +300,21 @@ int connect_to_addr(int type, const struct sockaddr_storage *addr, socklen_t add return fd; } +int connect_to_addr_str(int family, int type, const char *addr_str, __u16 port, + const struct network_helper_opts *opts) +{ + struct sockaddr_storage addr; + socklen_t addrlen; + + if (!opts) + opts = &default_opts; + + if (make_sockaddr(family, addr_str, port, &addr, &addrlen)) + return -1; + + return connect_to_addr(type, &addr, addrlen, opts); +} + int connect_to_fd_opts(int server_fd, const struct network_helper_opts *opts) { struct sockaddr_storage addr; diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index f39eeb5a4594..cce56955371f 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -69,6 +69,8 @@ int client_socket(int family, int type, const struct network_helper_opts *opts); int connect_to_addr(int type, const struct sockaddr_storage *addr, socklen_t len, const struct network_helper_opts *opts); +int connect_to_addr_str(int family, int type, const char *addr_str, __u16 port, + const struct network_helper_opts *opts); int connect_to_fd(int server_fd, int timeout_ms); int connect_to_fd_opts(int server_fd, const struct network_helper_opts *opts); int connect_fd_to_fd(int client_fd, int server_fd, int timeout_ms); -- cgit v1.2.3 From 844f7315e77a017fe90652b411e88b119c782a1f Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Thu, 18 Jul 2024 22:57:43 +0000 Subject: selftests/bpf: Use auto-dependencies for test objects Make use of -M compiler options when building .test.o objects to generate .d files and avoid re-building all tests every time. Previously, if a single test bpf program under selftests/bpf/progs/*.c had changed, make would rebuild all the *.bpf.o, *.skel.h and *.test.o objects, which is a lot of unnecessary work. A typical dependency chain is: progs/x.c -> x.bpf.o -> x.skel.h -> x.test.o -> trunner_binary However, for many tests it's not a 1:1 mapping by name, and so far %.test.o have been simply dependent on all %.skel.h files, and %.skel.h files on all %.bpf.o objects. Avoid full rebuilds by instructing the compiler (via -MMD) to produce *.d files with real dependencies, and appropriately including them. Exploit the make feature that rebuilds included makefiles when they change by setting %.test.d as a prerequisite for %.test.o files. A couple of examples of compilation time speedup (after the first clean build): $ touch progs/verifier_and.c && time make -j8 Before: real 0m16.651s After: real 0m2.245s $ touch progs/read_vsyscall.c && time make -j8 Before: real 0m15.743s After: real 0m1.575s A drawback of this change is the overhead of make processing many .d files, which may slow down unrelated targets.
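For reference, the approach is the classic compiler-generated dependency recipe; a minimal standalone sketch (illustrative only, not the selftests Makefile itself) looks like:

# -MMD makes the compiler emit foo.d next to foo.o; including the .d
# files afterwards makes each object depend on the headers it uses.
objs := foo.o bar.o

prog: $(objs)
	$(CC) $^ -o $@

%.o: %.c
	$(CC) $(CFLAGS) -MMD -c $< -o $@

-include $(objs:.o=.d)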
However, the time to make all from scratch hasn't changed significantly: $ make clean && time make -j8 Before: real 1m31.148s After: real 1m30.309s Suggested-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/VJihUTnvtwEgv_mOnpfy7EgD9D2MPNoHO-MlANeLIzLJPGhDeyOuGKIYyKgk0O6KPjfM-MuhtvPwZcngN8WFqbTnTRyCSMc2aMZ1ODm1T_g=@pm.me --- tools/testing/selftests/bpf/.gitignore | 1 + tools/testing/selftests/bpf/Makefile | 43 +++++++++++++++++++++++++--------- 2 files changed, 33 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 5025401323af..4e4aae8aa7ec 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -31,6 +31,7 @@ test_tcp_check_syncookie_user test_sysctl xdping test_cpp +*.d *.subskel.h *.skel.h *.lskel.h diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index dd49c1d23a60..05b234248b38 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -477,7 +477,8 @@ xsk_xdp_progs.skel.h-deps := xsk_xdp_progs.bpf.o xdp_hw_metadata.skel.h-deps := xdp_hw_metadata.bpf.o xdp_features.skel.h-deps := xdp_features.bpf.o -LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.c,$(foreach skel,$(LINKED_SKELS),$($(skel)-deps))) +LINKED_BPF_OBJS := $(foreach skel,$(LINKED_SKELS),$($(skel)-deps)) +LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.c,$(LINKED_BPF_OBJS)) # Set up extra TRUNNER_XXX "temporary" variables in the environment (relies on # $eval()) and pass control to DEFINE_TEST_RUNNER_RULES. @@ -556,7 +557,11 @@ $(TRUNNER_BPF_LSKELS): %.lskel.h: %.bpf.o $(BPFTOOL) | $(TRUNNER_OUTPUT) $(Q)$$(BPFTOOL) gen skeleton -L $$(<:.o=.llinked3.o) name $$(notdir $$(<:.bpf.o=_lskel)) > $$@ $(Q)rm -f $$(<:.o=.llinked1.o) $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o) -$(TRUNNER_BPF_SKELS_LINKED): $(TRUNNER_BPF_OBJS) $(BPFTOOL) | $(TRUNNER_OUTPUT) +$(LINKED_BPF_OBJS): %: $(TRUNNER_OUTPUT)/% + +# .SECONDEXPANSION here allows to correctly expand %-deps variables as prerequisites +.SECONDEXPANSION: +$(TRUNNER_BPF_SKELS_LINKED): $(TRUNNER_OUTPUT)/%: $$$$(%-deps) $(BPFTOOL) | $(TRUNNER_OUTPUT) $$(call msg,LINK-BPF,$(TRUNNER_BINARY),$$(@:.skel.h=.bpf.o)) $(Q)$$(BPFTOOL) gen object $$(@:.skel.h=.linked1.o) $$(addprefix $(TRUNNER_OUTPUT)/,$$($$(@F)-deps)) $(Q)$$(BPFTOOL) gen object $$(@:.skel.h=.linked2.o) $$(@:.skel.h=.linked1.o) @@ -566,6 +571,14 @@ $(TRUNNER_BPF_SKELS_LINKED): $(TRUNNER_BPF_OBJS) $(BPFTOOL) | $(TRUNNER_OUTPUT) $(Q)$$(BPFTOOL) gen skeleton $$(@:.skel.h=.linked3.o) name $$(notdir $$(@:.skel.h=)) > $$@ $(Q)$$(BPFTOOL) gen subskeleton $$(@:.skel.h=.linked3.o) name $$(notdir $$(@:.skel.h=)) > $$(@:.skel.h=.subskel.h) $(Q)rm -f $$(@:.skel.h=.linked1.o) $$(@:.skel.h=.linked2.o) $$(@:.skel.h=.linked3.o) + +# When the compiler generates a %.d file, only skel basenames (not +# full paths) are specified as prerequisites for corresponding %.o +# file. This target makes %.skel.h basename dependent on full paths, +# linking generated %.d dependency with actual %.skel.h files.
+$(notdir %.skel.h): $(TRUNNER_OUTPUT)/%.skel.h + @true + endif # ensure we set up tests.h header generation rule just once @@ -583,14 +596,19 @@ endif # Note: we cd into output directory to ensure embedded BPF object is found $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o: \ $(TRUNNER_TESTS_DIR)/%.c \ - $(TRUNNER_EXTRA_HDRS) \ - $(TRUNNER_BPF_OBJS) \ - $(TRUNNER_BPF_SKELS) \ - $(TRUNNER_BPF_LSKELS) \ - $(TRUNNER_BPF_SKELS_LINKED) \ - $$(BPFOBJ) | $(TRUNNER_OUTPUT) + $(TRUNNER_OUTPUT)/%.test.d $$(call msg,TEST-OBJ,$(TRUNNER_BINARY),$$@) - $(Q)cd $$(@D) && $$(CC) -I. $$(CFLAGS) -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F) + $(Q)cd $$(@D) && $$(CC) -I. $$(CFLAGS) -MMD -MT $$@ -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F) + +$(TRUNNER_TEST_OBJS:.o=.d): $(TRUNNER_OUTPUT)/%.test.d: \ + $(TRUNNER_TESTS_DIR)/%.c \ + $(TRUNNER_EXTRA_HDRS) \ + $(TRUNNER_BPF_SKELS) \ + $(TRUNNER_BPF_LSKELS) \ + $(TRUNNER_BPF_SKELS_LINKED) \ + $$(BPFOBJ) | $(TRUNNER_OUTPUT) + +include $(wildcard $(TRUNNER_TEST_OBJS:.o=.d)) $(TRUNNER_EXTRA_OBJS): $(TRUNNER_OUTPUT)/%.o: \ %.c \ @@ -608,6 +626,9 @@ ifneq ($2:$(OUTPUT),:$(shell pwd)) $(Q)rsync -aq $$^ $(TRUNNER_OUTPUT)/ endif +# some X.test.o files have runtime dependencies on Y.bpf.o files +$(OUTPUT)/$(TRUNNER_BINARY): | $(TRUNNER_BPF_OBJS) + $(OUTPUT)/$(TRUNNER_BINARY): $(TRUNNER_TEST_OBJS) \ $(TRUNNER_EXTRA_OBJS) $$(BPFOBJ) \ $(RESOLVE_BTFIDS) \ @@ -768,8 +789,8 @@ $(OUTPUT)/uprobe_multi: uprobe_multi.c EXTRA_CLEAN := $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \ prog_tests/tests.h map_tests/tests.h verifier/tests.h \ - feature bpftool \ - $(addprefix $(OUTPUT)/,*.o *.skel.h *.lskel.h *.subskel.h \ + feature bpftool \ + $(addprefix $(OUTPUT)/,*.o *.d *.skel.h *.lskel.h *.subskel.h \ no_alu32 cpuv4 bpf_gcc bpf_testmod.ko \ bpf_test_no_cfi.ko \ liburandom_read.so) -- cgit v1.2.3 From 4bf79f9be434e000c8e12fe83b2f4402480f1460 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Jul 2024 13:23:53 -0700 Subject: bpf: Track equal scalars history on per-instruction level Use bpf_verifier_state->jmp_history to track which registers were updated by find_equal_scalars() (renamed to collect_linked_regs()) when conditional jump was verified. Use recorded information in backtrack_insn() to propagate precision. E.g. for the following program: while verifying instructions 1: r1 = r0 | 2: if r1 < 8 goto ... | push r0,r1 as linked registers in jmp_history 3: if r0 > 16 goto ... | push r0,r1 as linked registers in jmp_history 4: r2 = r10 | 5: r2 += r0 v mark_chain_precision(r0) while doing mark_chain_precision(r0) 5: r2 += r0 | mark r0 precise 4: r2 = r10 | 3: if r0 > 16 goto ... | mark r0,r1 as precise 2: if r1 < 8 goto ... | mark r0,r1 as precise 1: r1 = r0 v Technically, do this as follows: - Use 10 bits to identify each register that gains range because of sync_linked_regs(): - 3 bits for frame number; - 6 bits for register or stack slot number; - 1 bit to indicate if register is spilled. - Use u64 as a vector of 6 such records + 4 bits for vector length. - Augment struct bpf_jmp_history_entry with a field 'linked_regs' representing such vector. - When doing check_cond_jmp_op() remember up to 6 registers that gain range because of sync_linked_regs() in such a vector. - Don't propagate range information and reset IDs for registers that don't fit in 6-value vector. - Push a pair {instruction index, linked registers vector} to bpf_verifier_state->jmp_history. 
- When doing backtrack_insn() check if any of recorded linked registers is currently marked precise, if so mark all linked registers as precise. This also requires fixes for two test_verifier tests: - precise: test 1 - precise: test 2 Both tests contain the following instruction sequence: 19: (bf) r2 = r9 ; R2=scalar(id=3) R9=scalar(id=3) 20: (a5) if r2 < 0x8 goto pc+1 ; R2=scalar(id=3,umin=8) 21: (95) exit 22: (07) r2 += 1 ; R2_w=scalar(id=3+1,...) 23: (bf) r1 = r10 ; R1_w=fp0 R10=fp0 24: (07) r1 += -8 ; R1_w=fp-8 25: (b7) r3 = 0 ; R3_w=0 26: (85) call bpf_probe_read_kernel#113 The call to bpf_probe_read_kernel() at (26) forces r2 to be precise. Previously, this forced all registers with same id to become precise immediately when mark_chain_precision() is called. After this change, the precision is propagated to registers sharing same id only when 'if' instruction is backtracked. Hence verification log for both tests is changed: regs=r2,r9 -> regs=r2 for instructions 25..20. Fixes: 904e6ddf4133 ("bpf: Use scalar ids in mark_chain_precision()") Reported-by: Hao Sun Suggested-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240718202357.1746514-2-eddyz87@gmail.com Closes: https://lore.kernel.org/bpf/CAEf4BzZ0xidVCqB47XnkXcNhkPWF6_nTV7yt+_Lf0kcFEut2Mg@mail.gmail.com/ --- include/linux/bpf_verifier.h | 4 + kernel/bpf/verifier.c | 245 +++++++++++++++++++-- .../bpf/progs/verifier_subprog_precision.c | 2 +- tools/testing/selftests/bpf/verifier/precise.c | 20 +- 4 files changed, 239 insertions(+), 32 deletions(-) (limited to 'tools') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 6503c85b10a3..731a0a4ac13c 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -371,6 +371,10 @@ struct bpf_jmp_history_entry { u32 prev_idx : 22; /* special flags, e.g., whether insn is doing register stack spill/load */ u32 flags : 10; + /* additional registers that need precision tracking when this + * jump is backtracked, vector of six 10-bit records + */ + u64 linked_regs; }; /* Maximum number of register states that can exist at once */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4cb5441ad75f..8dade7b51430 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3335,9 +3335,87 @@ static bool is_jmp_point(struct bpf_verifier_env *env, int insn_idx) return env->insn_aux_data[insn_idx].jmp_point; } +#define LR_FRAMENO_BITS 3 +#define LR_SPI_BITS 6 +#define LR_ENTRY_BITS (LR_SPI_BITS + LR_FRAMENO_BITS + 1) +#define LR_SIZE_BITS 4 +#define LR_FRAMENO_MASK ((1ull << LR_FRAMENO_BITS) - 1) +#define LR_SPI_MASK ((1ull << LR_SPI_BITS) - 1) +#define LR_SIZE_MASK ((1ull << LR_SIZE_BITS) - 1) +#define LR_SPI_OFF LR_FRAMENO_BITS +#define LR_IS_REG_OFF (LR_SPI_BITS + LR_FRAMENO_BITS) +#define LINKED_REGS_MAX 6 + +struct linked_reg { + u8 frameno; + union { + u8 spi; + u8 regno; + }; + bool is_reg; +}; + +struct linked_regs { + int cnt; + struct linked_reg entries[LINKED_REGS_MAX]; +}; + +static struct linked_reg *linked_regs_push(struct linked_regs *s) +{ + if (s->cnt < LINKED_REGS_MAX) + return &s->entries[s->cnt++]; + + return NULL; +} + +/* Use u64 as a vector of 6 10-bit values, use first 4-bits to track + * number of elements currently in stack. 
+ * Pack one history entry for linked registers as 10 bits in the following format: + * - 3-bits frameno + * - 6-bits spi_or_reg + * - 1-bit is_reg + */ +static u64 linked_regs_pack(struct linked_regs *s) +{ + u64 val = 0; + int i; + + for (i = 0; i < s->cnt; ++i) { + struct linked_reg *e = &s->entries[i]; + u64 tmp = 0; + + tmp |= e->frameno; + tmp |= e->spi << LR_SPI_OFF; + tmp |= (e->is_reg ? 1 : 0) << LR_IS_REG_OFF; + + val <<= LR_ENTRY_BITS; + val |= tmp; + } + val <<= LR_SIZE_BITS; + val |= s->cnt; + return val; +} + +static void linked_regs_unpack(u64 val, struct linked_regs *s) +{ + int i; + + s->cnt = val & LR_SIZE_MASK; + val >>= LR_SIZE_BITS; + + for (i = 0; i < s->cnt; ++i) { + struct linked_reg *e = &s->entries[i]; + + e->frameno = val & LR_FRAMENO_MASK; + e->spi = (val >> LR_SPI_OFF) & LR_SPI_MASK; + e->is_reg = (val >> LR_IS_REG_OFF) & 0x1; + val >>= LR_ENTRY_BITS; + } +} + /* for any branch, call, exit record the history of jmps in the given state */ static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, - int insn_flags) + int insn_flags, u64 linked_regs) { u32 cnt = cur->jmp_history_cnt; struct bpf_jmp_history_entry *p; @@ -3353,6 +3431,10 @@ static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_st "verifier insn history bug: insn_idx %d cur flags %x new flags %x\n", env->insn_idx, env->cur_hist_ent->flags, insn_flags); env->cur_hist_ent->flags |= insn_flags; + WARN_ONCE(env->cur_hist_ent->linked_regs != 0, + "verifier insn history bug: insn_idx %d linked_regs != 0: %#llx\n", + env->insn_idx, env->cur_hist_ent->linked_regs); + env->cur_hist_ent->linked_regs = linked_regs; return 0; } @@ -3367,6 +3449,7 @@ static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_st p->idx = env->insn_idx; p->prev_idx = env->prev_insn_idx; p->flags = insn_flags; + p->linked_regs = linked_regs; cur->jmp_history_cnt = cnt; env->cur_hist_ent = p; @@ -3532,6 +3615,11 @@ static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg) return bt->reg_masks[bt->frame] & (1 << reg); } +static inline bool bt_is_frame_reg_set(struct backtrack_state *bt, u32 frame, u32 reg) +{ + return bt->reg_masks[frame] & (1 << reg); +} + static inline bool bt_is_frame_slot_set(struct backtrack_state *bt, u32 frame, u32 slot) { return bt->stack_masks[frame] & (1ull << slot); @@ -3576,6 +3664,42 @@ static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask) } } +/* If any register R in hist->linked_regs is marked as precise in bt, + * do bt_set_frame_{reg,slot}(bt, R) for all registers in hist->linked_regs. 
+ */ +static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_history_entry *hist) +{ + struct linked_regs linked_regs; + bool some_precise = false; + int i; + + if (!hist || hist->linked_regs == 0) + return; + + linked_regs_unpack(hist->linked_regs, &linked_regs); + for (i = 0; i < linked_regs.cnt; ++i) { + struct linked_reg *e = &linked_regs.entries[i]; + + if ((e->is_reg && bt_is_frame_reg_set(bt, e->frameno, e->regno)) || + (!e->is_reg && bt_is_frame_slot_set(bt, e->frameno, e->spi))) { + some_precise = true; + break; + } + } + + if (!some_precise) + return; + + for (i = 0; i < linked_regs.cnt; ++i) { + struct linked_reg *e = &linked_regs.entries[i]; + + if (e->is_reg) + bt_set_frame_reg(bt, e->frameno, e->regno); + else + bt_set_frame_slot(bt, e->frameno, e->spi); + } +} + static bool calls_callback(struct bpf_verifier_env *env, int insn_idx); /* For given verifier state backtrack_insn() is called from the last insn to @@ -3615,6 +3739,12 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); } + /* If there is a history record that some registers gained range at this insn, + * propagate precision marks to those registers, so that bt_is_reg_set() + * accounts for these registers. + */ + bt_sync_linked_regs(bt, hist); + if (class == BPF_ALU || class == BPF_ALU64) { if (!bt_is_reg_set(bt, dreg)) return 0; @@ -3844,7 +3974,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, */ bt_set_reg(bt, dreg); bt_set_reg(bt, sreg); - /* else dreg K + } else if (BPF_SRC(insn->code) == BPF_K) { + /* dreg K * Only dreg still needs precision before * this insn, so for the K-based conditional * there is nothing new to be marked. @@ -3862,6 +3993,10 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, /* to be analyzed */ return -ENOTSUPP; } + /* Propagate precision marks to linked registers, to account for + * registers marked as precise in this function. + */ + bt_sync_linked_regs(bt, hist); return 0; } @@ -4456,7 +4591,7 @@ static void assign_scalar_id_before_mov(struct bpf_verifier_env *env, if (!src_reg->id && !tnum_is_const(src_reg->var_off)) /* Ensure that src_reg has a valid ID that will be copied to - * dst_reg and then will be used by find_equal_scalars() to + * dst_reg and then will be used by sync_linked_regs() to * propagate min/max range. 
*/ src_reg->id = ++env->id_gen; @@ -4625,7 +4760,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, } if (insn_flags) - return push_jmp_history(env, env->cur_state, insn_flags); + return push_jmp_history(env, env->cur_state, insn_flags, 0); return 0; } @@ -4930,7 +5065,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, insn_flags = 0; /* we are not restoring spilled register */ } if (insn_flags) - return push_jmp_history(env, env->cur_state, insn_flags); + return push_jmp_history(env, env->cur_state, insn_flags, 0); return 0; } @@ -14099,7 +14234,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, u64 val = reg_const_value(src_reg, alu32); if ((dst_reg->id & BPF_ADD_CONST) || - /* prevent overflow in find_equal_scalars() later */ + /* prevent overflow in sync_linked_regs() later */ val > (u32)S32_MAX) { /* * If the register already went through rX += val @@ -14114,7 +14249,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, } else { /* * Make sure ID is cleared otherwise dst_reg min/max could be - * incorrectly propagated into other registers by find_equal_scalars() + * incorrectly propagated into other registers by sync_linked_regs() */ dst_reg->id = 0; } @@ -14264,7 +14399,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) copy_register_state(dst_reg, src_reg); /* Make sure ID is cleared if src_reg is not in u32 * range otherwise dst_reg min/max could be incorrectly - * propagated into src_reg by find_equal_scalars() + * propagated into src_reg by sync_linked_regs() */ if (!is_src_reg_u32) dst_reg->id = 0; @@ -15087,14 +15222,66 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, return true; } -static void find_equal_scalars(struct bpf_verifier_state *vstate, - struct bpf_reg_state *known_reg) +static void __collect_linked_regs(struct linked_regs *reg_set, struct bpf_reg_state *reg, + u32 id, u32 frameno, u32 spi_or_reg, bool is_reg) +{ + struct linked_reg *e; + + if (reg->type != SCALAR_VALUE || (reg->id & ~BPF_ADD_CONST) != id) + return; + + e = linked_regs_push(reg_set); + if (e) { + e->frameno = frameno; + e->is_reg = is_reg; + e->regno = spi_or_reg; + } else { + reg->id = 0; + } +} + +/* For all R being scalar registers or spilled scalar registers + * in verifier state, save R in linked_regs if R->id == id. + * If there are too many Rs sharing same id, reset id for leftover Rs. + */ +static void collect_linked_regs(struct bpf_verifier_state *vstate, u32 id, + struct linked_regs *linked_regs) +{ + struct bpf_func_state *func; + struct bpf_reg_state *reg; + int i, j; + + id = id & ~BPF_ADD_CONST; + for (i = vstate->curframe; i >= 0; i--) { + func = vstate->frame[i]; + for (j = 0; j < BPF_REG_FP; j++) { + reg = &func->regs[j]; + __collect_linked_regs(linked_regs, reg, id, i, j, true); + } + for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) { + if (!is_spilled_reg(&func->stack[j])) + continue; + reg = &func->stack[j].spilled_ptr; + __collect_linked_regs(linked_regs, reg, id, i, j, false); + } + } +} + +/* For all R in linked_regs, copy known_reg range into R + * if R->id == known_reg->id. 
+ */ +static void sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_state *known_reg, + struct linked_regs *linked_regs) { struct bpf_reg_state fake_reg; - struct bpf_func_state *state; struct bpf_reg_state *reg; + struct linked_reg *e; + int i; - bpf_for_each_reg_in_vstate(vstate, state, reg, ({ + for (i = 0; i < linked_regs->cnt; ++i) { + e = &linked_regs->entries[i]; + reg = e->is_reg ? &vstate->frame[e->frameno]->regs[e->regno] + : &vstate->frame[e->frameno]->stack[e->spi].spilled_ptr; if (reg->type != SCALAR_VALUE || reg == known_reg) continue; if ((reg->id & ~BPF_ADD_CONST) != (known_reg->id & ~BPF_ADD_CONST)) @@ -15112,7 +15299,7 @@ static void find_equal_scalars(struct bpf_verifier_state *vstate, copy_register_state(reg, known_reg); /* * Must preserve off, id and add_const flag, - * otherwise another find_equal_scalars() will be incorrect. + * otherwise another sync_linked_regs() will be incorrect. */ reg->off = saved_off; @@ -15120,7 +15307,7 @@ static void find_equal_scalars(struct bpf_verifier_state *vstate, scalar_min_max_add(reg, &fake_reg); reg->var_off = tnum_add(reg->var_off, fake_reg.var_off); } - })); + } } static int check_cond_jmp_op(struct bpf_verifier_env *env, @@ -15131,6 +15318,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs; struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL; struct bpf_reg_state *eq_branch_regs; + struct linked_regs linked_regs = {}; u8 opcode = BPF_OP(insn->code); bool is_jmp32; int pred = -1; @@ -15245,6 +15433,21 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return 0; } + /* Push scalar registers sharing same ID to jump history, + * do this before creating 'other_branch', so that both + * 'this_branch' and 'other_branch' share this history + * if parent state is created. 
+ */ + if (BPF_SRC(insn->code) == BPF_X && src_reg->type == SCALAR_VALUE && src_reg->id) + collect_linked_regs(this_branch, src_reg->id, &linked_regs); + if (dst_reg->type == SCALAR_VALUE && dst_reg->id) + collect_linked_regs(this_branch, dst_reg->id, &linked_regs); + if (linked_regs.cnt > 1) { + err = push_jmp_history(env, this_branch, 0, linked_regs_pack(&linked_regs)); + if (err) + return err; + } + other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, false); if (!other_branch) @@ -15275,13 +15478,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (BPF_SRC(insn->code) == BPF_X && src_reg->type == SCALAR_VALUE && src_reg->id && !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) { - find_equal_scalars(this_branch, src_reg); - find_equal_scalars(other_branch, &other_branch_regs[insn->src_reg]); + sync_linked_regs(this_branch, src_reg, &linked_regs); + sync_linked_regs(other_branch, &other_branch_regs[insn->src_reg], &linked_regs); } if (dst_reg->type == SCALAR_VALUE && dst_reg->id && !WARN_ON_ONCE(dst_reg->id != other_branch_regs[insn->dst_reg].id)) { - find_equal_scalars(this_branch, dst_reg); - find_equal_scalars(other_branch, &other_branch_regs[insn->dst_reg]); + sync_linked_regs(this_branch, dst_reg, &linked_regs); + sync_linked_regs(other_branch, &other_branch_regs[insn->dst_reg], &linked_regs); } /* if one pointer register is compared to another pointer @@ -16770,7 +16973,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, * * First verification path is [1-6]: * - at (4) same bpf_reg_state::id (b) would be assigned to r6 and r7; - * - at (5) r6 would be marked <= X, find_equal_scalars() would also mark + * - at (5) r6 would be marked <= X, sync_linked_regs() would also mark * r7 <= X, because r6 and r7 share same id. * Next verification path is [1-4, 6]. * @@ -17563,7 +17766,7 @@ hit: * the current state. */ if (is_jmp_point(env, env->insn_idx)) - err = err ? : push_jmp_history(env, cur, 0); + err = err ? : push_jmp_history(env, cur, 0, 0); err = err ? 
: propagate_precision(env, &sl->state); if (err) return err; @@ -17831,7 +18034,7 @@ static int do_check(struct bpf_verifier_env *env) } if (is_jmp_point(env, env->insn_idx)) { - err = push_jmp_history(env, state, 0); + err = push_jmp_history(env, state, 0, 0); if (err) return err; } diff --git a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c index 6a6fad625f7e..9d415f7ce599 100644 --- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c +++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c @@ -278,7 +278,7 @@ __msg("mark_precise: frame0: last_idx 14 first_idx 9") __msg("mark_precise: frame0: regs=r6 stack= before 13: (bf) r1 = r7") __msg("mark_precise: frame0: regs=r6 stack= before 12: (27) r6 *= 4") __msg("mark_precise: frame0: regs=r6 stack= before 11: (25) if r6 > 0x3 goto pc+4") -__msg("mark_precise: frame0: regs=r6 stack= before 10: (bf) r6 = r0") +__msg("mark_precise: frame0: regs=r0,r6 stack= before 10: (bf) r6 = r0") __msg("mark_precise: frame0: regs=r0 stack= before 9: (85) call bpf_loop") /* State entering callback body popped from states stack */ __msg("from 9 to 17: frame1:") diff --git a/tools/testing/selftests/bpf/verifier/precise.c b/tools/testing/selftests/bpf/verifier/precise.c index 90643ccc221d..64d722199e8f 100644 --- a/tools/testing/selftests/bpf/verifier/precise.c +++ b/tools/testing/selftests/bpf/verifier/precise.c @@ -39,11 +39,11 @@ .result = VERBOSE_ACCEPT, .errstr = "mark_precise: frame0: last_idx 26 first_idx 20\ - mark_precise: frame0: regs=r2,r9 stack= before 25\ - mark_precise: frame0: regs=r2,r9 stack= before 24\ - mark_precise: frame0: regs=r2,r9 stack= before 23\ - mark_precise: frame0: regs=r2,r9 stack= before 22\ - mark_precise: frame0: regs=r2,r9 stack= before 20\ + mark_precise: frame0: regs=r2 stack= before 25\ + mark_precise: frame0: regs=r2 stack= before 24\ + mark_precise: frame0: regs=r2 stack= before 23\ + mark_precise: frame0: regs=r2 stack= before 22\ + mark_precise: frame0: regs=r2 stack= before 20\ mark_precise: frame0: parent state regs=r2,r9 stack=:\ mark_precise: frame0: last_idx 19 first_idx 10\ mark_precise: frame0: regs=r2,r9 stack= before 19\ @@ -100,11 +100,11 @@ .errstr = "26: (85) call bpf_probe_read_kernel#113\ mark_precise: frame0: last_idx 26 first_idx 22\ - mark_precise: frame0: regs=r2,r9 stack= before 25\ - mark_precise: frame0: regs=r2,r9 stack= before 24\ - mark_precise: frame0: regs=r2,r9 stack= before 23\ - mark_precise: frame0: regs=r2,r9 stack= before 22\ - mark_precise: frame0: parent state regs=r2,r9 stack=:\ + mark_precise: frame0: regs=r2 stack= before 25\ + mark_precise: frame0: regs=r2 stack= before 24\ + mark_precise: frame0: regs=r2 stack= before 23\ + mark_precise: frame0: regs=r2 stack= before 22\ + mark_precise: frame0: parent state regs=r2 stack=:\ mark_precise: frame0: last_idx 20 first_idx 20\ mark_precise: frame0: regs=r2,r9 stack= before 20\ mark_precise: frame0: parent state regs=r2,r9 stack=:\ -- cgit v1.2.3 From 842edb5507a1038e009d27e69d13b94b6f085763 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Jul 2024 13:23:54 -0700 Subject: bpf: Remove mark_precise_scalar_ids() Function mark_precise_scalar_ids() is superseded by bt_sync_linked_regs() and equal scalars tracking in jump history. 
mark_precise_scalar_ids() propagates precision over registers sharing same ID on parent/child state boundaries, while jump history records allow bt_sync_linked_regs() to propagate same information with instruction level granularity, which is strictly more precise. This commit removes mark_precise_scalar_ids() and updates test cases in progs/verifier_scalar_ids to reflect new verifier behavior. The tests are updated in the following manner: - mark_precise_scalar_ids() propagated precision regardless of presence of conditional jumps, while new jump history based logic only kicks in when conditional jumps are present. Hence test cases are augmented with conditional jumps to still trigger precision propagation. - As equal scalars tracking no longer relies on parent/child state boundaries some test cases are no longer interesting, such test cases are removed, namely: - precision_same_state and precision_cross_state are superseded by linked_regs_bpf_k; - precision_same_state_broken_link and equal_scalars_broken_link are superseded by linked_regs_broken_link. Signed-off-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240718202357.1746514-3-eddyz87@gmail.com --- kernel/bpf/verifier.c | 115 -------------- .../selftests/bpf/progs/verifier_scalar_ids.c | 171 +++++++-------------- tools/testing/selftests/bpf/verifier/precise.c | 8 +- 3 files changed, 59 insertions(+), 235 deletions(-) (limited to 'tools') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8dade7b51430..f8c474e8e597 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4124,96 +4124,6 @@ static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_ } } -static bool idset_contains(struct bpf_idset *s, u32 id) -{ - u32 i; - - for (i = 0; i < s->count; ++i) - if (s->ids[i] == (id & ~BPF_ADD_CONST)) - return true; - - return false; -} - -static int idset_push(struct bpf_idset *s, u32 id) -{ - if (WARN_ON_ONCE(s->count >= ARRAY_SIZE(s->ids))) - return -EFAULT; - s->ids[s->count++] = id & ~BPF_ADD_CONST; - return 0; -} - -static void idset_reset(struct bpf_idset *s) -{ - s->count = 0; -} - -/* Collect a set of IDs for all registers currently marked as precise in env->bt. - * Mark all registers with these IDs as precise. 
- */ -static int mark_precise_scalar_ids(struct bpf_verifier_env *env, struct bpf_verifier_state *st) -{ - struct bpf_idset *precise_ids = &env->idset_scratch; - struct backtrack_state *bt = &env->bt; - struct bpf_func_state *func; - struct bpf_reg_state *reg; - DECLARE_BITMAP(mask, 64); - int i, fr; - - idset_reset(precise_ids); - - for (fr = bt->frame; fr >= 0; fr--) { - func = st->frame[fr]; - - bitmap_from_u64(mask, bt_frame_reg_mask(bt, fr)); - for_each_set_bit(i, mask, 32) { - reg = &func->regs[i]; - if (!reg->id || reg->type != SCALAR_VALUE) - continue; - if (idset_push(precise_ids, reg->id)) - return -EFAULT; - } - - bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr)); - for_each_set_bit(i, mask, 64) { - if (i >= func->allocated_stack / BPF_REG_SIZE) - break; - if (!is_spilled_scalar_reg(&func->stack[i])) - continue; - reg = &func->stack[i].spilled_ptr; - if (!reg->id) - continue; - if (idset_push(precise_ids, reg->id)) - return -EFAULT; - } - } - - for (fr = 0; fr <= st->curframe; ++fr) { - func = st->frame[fr]; - - for (i = BPF_REG_0; i < BPF_REG_10; ++i) { - reg = &func->regs[i]; - if (!reg->id) - continue; - if (!idset_contains(precise_ids, reg->id)) - continue; - bt_set_frame_reg(bt, fr, i); - } - for (i = 0; i < func->allocated_stack / BPF_REG_SIZE; ++i) { - if (!is_spilled_scalar_reg(&func->stack[i])) - continue; - reg = &func->stack[i].spilled_ptr; - if (!reg->id) - continue; - if (!idset_contains(precise_ids, reg->id)) - continue; - bt_set_frame_slot(bt, fr, i); - } - } - - return 0; -} - /* * __mark_chain_precision() backtracks BPF program instruction sequence and * chain of verifier states making sure that register *regno* (if regno >= 0) @@ -4346,31 +4256,6 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) bt->frame, last_idx, first_idx, subseq_idx); } - /* If some register with scalar ID is marked as precise, - * make sure that all registers sharing this ID are also precise. - * This is needed to estimate effect of find_equal_scalars(). - * Do this at the last instruction of each state, - * bpf_reg_state::id fields are valid for these instructions. - * - * Allows to track precision in situation like below: - * - * r2 = unknown value - * ... - * --- state #0 --- - * ... - * r1 = r2 // r1 and r2 now share the same ID - * ... - * --- state #1 {r1.id = A, r2.id = A} --- - * ... - * if (r2 > 10) goto exit; // find_equal_scalars() assigns range to r1 - * ... - * --- state #2 {r1.id = A, r2.id = A} --- - * r3 = r10 - * r3 += r1 // need to mark both r1 and r2 - */ - if (mark_precise_scalar_ids(env, st)) - return -EFAULT; - if (last_idx < 0) { /* we are at the entry into subprog, which * is expected for global funcs, but only if diff --git a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c index 13b29a7faa71..b642d5c24154 100644 --- a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c +++ b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c @@ -5,54 +5,27 @@ #include "bpf_misc.h" /* Check that precision marks propagate through scalar IDs. - * Registers r{0,1,2} have the same scalar ID at the moment when r0 is - * marked to be precise, this mark is immediately propagated to r{1,2}. + * Registers r{0,1,2} have the same scalar ID. + * Range information is propagated for scalars sharing same ID. + * Check that precision mark for r0 causes precision marks for r{1,2} + * when range information is propagated for 'if ' insn. 
*/ SEC("socket") __success __log_level(2) -__msg("frame0: regs=r0,r1,r2 stack= before 4: (bf) r3 = r10") -__msg("frame0: regs=r0,r1,r2 stack= before 3: (bf) r2 = r0") -__msg("frame0: regs=r0,r1 stack= before 2: (bf) r1 = r0") -__msg("frame0: regs=r0 stack= before 1: (57) r0 &= 255") -__msg("frame0: regs=r0 stack= before 0: (85) call bpf_ktime_get_ns") -__flag(BPF_F_TEST_STATE_FREQ) -__naked void precision_same_state(void) -{ - asm volatile ( - /* r0 = random number up to 0xff */ - "call %[bpf_ktime_get_ns];" - "r0 &= 0xff;" - /* tie r0.id == r1.id == r2.id */ - "r1 = r0;" - "r2 = r0;" - /* force r0 to be precise, this immediately marks r1 and r2 as - * precise as well because of shared IDs - */ - "r3 = r10;" - "r3 += r0;" - "r0 = 0;" - "exit;" - : - : __imm(bpf_ktime_get_ns) - : __clobber_all); -} - -/* Same as precision_same_state, but mark propagates through state / - * parent state boundary. - */ -SEC("socket") -__success __log_level(2) -__msg("frame0: last_idx 6 first_idx 5 subseq_idx -1") -__msg("frame0: regs=r0,r1,r2 stack= before 5: (bf) r3 = r10") +/* first 'if' branch */ +__msg("6: (0f) r3 += r0") +__msg("frame0: regs=r0 stack= before 4: (25) if r1 > 0x7 goto pc+0") __msg("frame0: parent state regs=r0,r1,r2 stack=:") -__msg("frame0: regs=r0,r1,r2 stack= before 4: (05) goto pc+0") __msg("frame0: regs=r0,r1,r2 stack= before 3: (bf) r2 = r0") -__msg("frame0: regs=r0,r1 stack= before 2: (bf) r1 = r0") -__msg("frame0: regs=r0 stack= before 1: (57) r0 &= 255") -__msg("frame0: parent state regs=r0 stack=:") -__msg("frame0: regs=r0 stack= before 0: (85) call bpf_ktime_get_ns") +/* second 'if' branch */ +__msg("from 4 to 5: ") +__msg("6: (0f) r3 += r0") +__msg("frame0: regs=r0 stack= before 5: (bf) r3 = r10") +__msg("frame0: regs=r0 stack= before 4: (25) if r1 > 0x7 goto pc+0") +/* parent state already has r{0,1,2} as precise */ +__msg("frame0: parent state regs= stack=:") __flag(BPF_F_TEST_STATE_FREQ) -__naked void precision_cross_state(void) +__naked void linked_regs_bpf_k(void) { asm volatile ( /* r0 = random number up to 0xff */ @@ -61,9 +34,8 @@ __naked void precision_cross_state(void) /* tie r0.id == r1.id == r2.id */ "r1 = r0;" "r2 = r0;" - /* force checkpoint */ - "goto +0;" - /* force r0 to be precise, this immediately marks r1 and r2 as + "if r1 > 7 goto +0;" + /* force r0 to be precise, this eventually marks r1 and r2 as * precise as well because of shared IDs */ "r3 = r10;" @@ -75,59 +47,18 @@ __naked void precision_cross_state(void) : __clobber_all); } -/* Same as precision_same_state, but break one of the +/* Same as linked_regs_bpf_k, but break one of the * links, note that r1 is absent from regs=... in __msg below. 
*/ SEC("socket") __success __log_level(2) -__msg("frame0: regs=r0,r2 stack= before 5: (bf) r3 = r10") -__msg("frame0: regs=r0,r2 stack= before 4: (b7) r1 = 0") -__msg("frame0: regs=r0,r2 stack= before 3: (bf) r2 = r0") -__msg("frame0: regs=r0 stack= before 2: (bf) r1 = r0") -__msg("frame0: regs=r0 stack= before 1: (57) r0 &= 255") -__msg("frame0: regs=r0 stack= before 0: (85) call bpf_ktime_get_ns") -__flag(BPF_F_TEST_STATE_FREQ) -__naked void precision_same_state_broken_link(void) -{ - asm volatile ( - /* r0 = random number up to 0xff */ - "call %[bpf_ktime_get_ns];" - "r0 &= 0xff;" - /* tie r0.id == r1.id == r2.id */ - "r1 = r0;" - "r2 = r0;" - /* break link for r1, this is the only line that differs - * compared to the previous test - */ - "r1 = 0;" - /* force r0 to be precise, this immediately marks r1 and r2 as - * precise as well because of shared IDs - */ - "r3 = r10;" - "r3 += r0;" - "r0 = 0;" - "exit;" - : - : __imm(bpf_ktime_get_ns) - : __clobber_all); -} - -/* Same as precision_same_state_broken_link, but with state / - * parent state boundary. - */ -SEC("socket") -__success __log_level(2) -__msg("frame0: regs=r0,r2 stack= before 6: (bf) r3 = r10") -__msg("frame0: regs=r0,r2 stack= before 5: (b7) r1 = 0") -__msg("frame0: parent state regs=r0,r2 stack=:") -__msg("frame0: regs=r0,r1,r2 stack= before 4: (05) goto pc+0") -__msg("frame0: regs=r0,r1,r2 stack= before 3: (bf) r2 = r0") -__msg("frame0: regs=r0,r1 stack= before 2: (bf) r1 = r0") -__msg("frame0: regs=r0 stack= before 1: (57) r0 &= 255") +__msg("7: (0f) r3 += r0") +__msg("frame0: regs=r0 stack= before 6: (bf) r3 = r10") __msg("frame0: parent state regs=r0 stack=:") -__msg("frame0: regs=r0 stack= before 0: (85) call bpf_ktime_get_ns") +__msg("frame0: regs=r0 stack= before 5: (25) if r0 > 0x7 goto pc+0") +__msg("frame0: parent state regs=r0,r2 stack=:") __flag(BPF_F_TEST_STATE_FREQ) -__naked void precision_cross_state_broken_link(void) +__naked void linked_regs_broken_link(void) { asm volatile ( /* r0 = random number up to 0xff */ @@ -136,18 +67,13 @@ __naked void precision_cross_state_broken_link(void) /* tie r0.id == r1.id == r2.id */ "r1 = r0;" "r2 = r0;" - /* force checkpoint, although link between r1 and r{0,2} is - * broken by the next statement current precision tracking - * algorithm can't react to it and propagates mark for r1 to - * the parent state. 
- */ - "goto +0;" /* break link for r1, this is the only line that differs - * compared to precision_cross_state() + * compared to the previous test */ "r1 = 0;" - /* force r0 to be precise, this immediately marks r1 and r2 as - * precise as well because of shared IDs + "if r0 > 7 goto +0;" + /* force r0 to be precise, + * this eventually marks r2 as precise because of shared IDs */ "r3 = r10;" "r3 += r0;" @@ -164,10 +90,16 @@ __naked void precision_cross_state_broken_link(void) */ SEC("socket") __success __log_level(2) -__msg("11: (0f) r2 += r1") +__msg("12: (0f) r2 += r1") /* Current state */ -__msg("frame2: last_idx 11 first_idx 10 subseq_idx -1") -__msg("frame2: regs=r1 stack= before 10: (bf) r2 = r10") +__msg("frame2: last_idx 12 first_idx 11 subseq_idx -1 ") +__msg("frame2: regs=r1 stack= before 11: (bf) r2 = r10") +__msg("frame2: parent state regs=r1 stack=") +__msg("frame1: parent state regs= stack=") +__msg("frame0: parent state regs= stack=") +/* Parent state */ +__msg("frame2: last_idx 10 first_idx 10 subseq_idx 11 ") +__msg("frame2: regs=r1 stack= before 10: (25) if r1 > 0x7 goto pc+0") __msg("frame2: parent state regs=r1 stack=") /* frame1.r{6,7} are marked because mark_precise_scalar_ids() * looks for all registers with frame2.r1.id in the current state @@ -192,7 +124,7 @@ __msg("frame1: regs=r1 stack= before 4: (85) call pc+1") __msg("frame0: parent state regs=r1,r6 stack=") /* Parent state */ __msg("frame0: last_idx 3 first_idx 1 subseq_idx 4") -__msg("frame0: regs=r0,r1,r6 stack= before 3: (bf) r6 = r0") +__msg("frame0: regs=r1,r6 stack= before 3: (bf) r6 = r0") __msg("frame0: regs=r0,r1 stack= before 2: (bf) r1 = r0") __msg("frame0: regs=r0 stack= before 1: (57) r0 &= 255") __flag(BPF_F_TEST_STATE_FREQ) @@ -230,7 +162,8 @@ static __naked __noinline __used void precision_many_frames__bar(void) { asm volatile ( - /* force r1 to be precise, this immediately marks: + "if r1 > 7 goto +0;" + /* force r1 to be precise, this eventually marks: * - bar frame r1 * - foo frame r{1,6,7} * - main frame r{1,6} @@ -247,14 +180,16 @@ void precision_many_frames__bar(void) */ SEC("socket") __success __log_level(2) +__msg("11: (0f) r2 += r1") /* foo frame */ -__msg("frame1: regs=r1 stack=-8,-16 before 9: (bf) r2 = r10") +__msg("frame1: regs=r1 stack= before 10: (bf) r2 = r10") +__msg("frame1: regs=r1 stack= before 9: (25) if r1 > 0x7 goto pc+0") __msg("frame1: regs=r1 stack=-8,-16 before 8: (7b) *(u64 *)(r10 -16) = r1") __msg("frame1: regs=r1 stack=-8 before 7: (7b) *(u64 *)(r10 -8) = r1") __msg("frame1: regs=r1 stack= before 4: (85) call pc+2") /* main frame */ -__msg("frame0: regs=r0,r1 stack=-8 before 3: (7b) *(u64 *)(r10 -8) = r1") -__msg("frame0: regs=r0,r1 stack= before 2: (bf) r1 = r0") +__msg("frame0: regs=r1 stack=-8 before 3: (7b) *(u64 *)(r10 -8) = r1") +__msg("frame0: regs=r1 stack= before 2: (bf) r1 = r0") __msg("frame0: regs=r0 stack= before 1: (57) r0 &= 255") __flag(BPF_F_TEST_STATE_FREQ) __naked void precision_stack(void) @@ -283,7 +218,8 @@ void precision_stack__foo(void) */ "*(u64*)(r10 - 8) = r1;" "*(u64*)(r10 - 16) = r1;" - /* force r1 to be precise, this immediately marks: + "if r1 > 7 goto +0;" + /* force r1 to be precise, this eventually marks: * - foo frame r1,fp{-8,-16} * - main frame r1,fp{-8} */ @@ -299,15 +235,17 @@ void precision_stack__foo(void) SEC("socket") __success __log_level(2) /* r{6,7} */ -__msg("11: (0f) r3 += r7") -__msg("frame0: regs=r6,r7 stack= before 10: (bf) r3 = r10") +__msg("12: (0f) r3 += r7") +__msg("frame0: regs=r7 stack= before 11: (bf) r3 
= r10") +__msg("frame0: regs=r7 stack= before 9: (25) if r7 > 0x7 goto pc+0") /* ... skip some insns ... */ __msg("frame0: regs=r6,r7 stack= before 3: (bf) r7 = r0") __msg("frame0: regs=r0,r6 stack= before 2: (bf) r6 = r0") /* r{8,9} */ -__msg("12: (0f) r3 += r9") -__msg("frame0: regs=r8,r9 stack= before 11: (0f) r3 += r7") +__msg("13: (0f) r3 += r9") +__msg("frame0: regs=r9 stack= before 12: (0f) r3 += r7") /* ... skip some insns ... */ +__msg("frame0: regs=r9 stack= before 10: (25) if r9 > 0x7 goto pc+0") __msg("frame0: regs=r8,r9 stack= before 7: (bf) r9 = r0") __msg("frame0: regs=r0,r8 stack= before 6: (bf) r8 = r0") __flag(BPF_F_TEST_STATE_FREQ) @@ -328,8 +266,9 @@ __naked void precision_two_ids(void) "r9 = r0;" /* clear r0 id */ "r0 = 0;" - /* force checkpoint */ - "goto +0;" + /* propagate equal scalars precision */ + "if r7 > 7 goto +0;" + "if r9 > 7 goto +0;" "r3 = r10;" /* force r7 to be precise, this also marks r6 */ "r3 += r7;" diff --git a/tools/testing/selftests/bpf/verifier/precise.c b/tools/testing/selftests/bpf/verifier/precise.c index 64d722199e8f..59a020c35647 100644 --- a/tools/testing/selftests/bpf/verifier/precise.c +++ b/tools/testing/selftests/bpf/verifier/precise.c @@ -106,7 +106,7 @@ mark_precise: frame0: regs=r2 stack= before 22\ mark_precise: frame0: parent state regs=r2 stack=:\ mark_precise: frame0: last_idx 20 first_idx 20\ - mark_precise: frame0: regs=r2,r9 stack= before 20\ + mark_precise: frame0: regs=r2 stack= before 20\ mark_precise: frame0: parent state regs=r2,r9 stack=:\ mark_precise: frame0: last_idx 19 first_idx 17\ mark_precise: frame0: regs=r2,r9 stack= before 19\ @@ -183,10 +183,10 @@ .prog_type = BPF_PROG_TYPE_XDP, .flags = BPF_F_TEST_STATE_FREQ, .errstr = "mark_precise: frame0: last_idx 7 first_idx 7\ - mark_precise: frame0: parent state regs=r4 stack=-8:\ + mark_precise: frame0: parent state regs=r4 stack=:\ mark_precise: frame0: last_idx 6 first_idx 4\ - mark_precise: frame0: regs=r4 stack=-8 before 6: (b7) r0 = -1\ - mark_precise: frame0: regs=r4 stack=-8 before 5: (79) r4 = *(u64 *)(r10 -8)\ + mark_precise: frame0: regs=r4 stack= before 6: (b7) r0 = -1\ + mark_precise: frame0: regs=r4 stack= before 5: (79) r4 = *(u64 *)(r10 -8)\ mark_precise: frame0: regs= stack=-8 before 4: (7b) *(u64 *)(r3 -8) = r0\ mark_precise: frame0: parent state regs=r0 stack=:\ mark_precise: frame0: last_idx 3 first_idx 3\ -- cgit v1.2.3 From bebc17b1c03b224a0b4aec6a171815e39f8ba9bc Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Jul 2024 13:23:55 -0700 Subject: selftests/bpf: Tests for per-insn sync_linked_regs() precision tracking Add a few test cases to verify precision tracking for scalars gaining range because of sync_linked_regs(): - check what happens when more than 6 registers might gain range in sync_linked_regs(); - check if precision is propagated correctly when operand of conditional jump gained range in sync_linked_regs() and one of linked registers is marked precise; - check if precision is propagated correctly when operand of conditional jump gained range in sync_linked_regs() and a other-linked operand of the conditional jump is marked precise; - add a minimized reproducer for precision tracking bug reported in [0]; - Check that mark_chain_precision() for one of the conditional jump operands does not trigger equal scalars precision propagation. 
[0] https://lore.kernel.org/bpf/CAEf4BzZ0xidVCqB47XnkXcNhkPWF6_nTV7yt+_Lf0kcFEut2Mg@mail.gmail.com/ Signed-off-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240718202357.1746514-4-eddyz87@gmail.com --- .../selftests/bpf/progs/verifier_scalar_ids.c | 165 +++++++++++++++++++++ 1 file changed, 165 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c index b642d5c24154..2ecf77b623e0 100644 --- a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c +++ b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c @@ -47,6 +47,72 @@ __naked void linked_regs_bpf_k(void) : __clobber_all); } +/* Registers r{0,1,2} share same ID when 'if r1 > ...' insn is processed, + * check that verifier marks r{1,2} as precise while backtracking + * 'if r1 > ...' with r0 already marked. + */ +SEC("socket") +__success __log_level(2) +__flag(BPF_F_TEST_STATE_FREQ) +__msg("frame0: regs=r0 stack= before 5: (2d) if r1 > r3 goto pc+0") +__msg("frame0: parent state regs=r0,r1,r2,r3 stack=:") +__msg("frame0: regs=r0,r1,r2,r3 stack= before 4: (b7) r3 = 7") +__naked void linked_regs_bpf_x_src(void) +{ + asm volatile ( + /* r0 = random number up to 0xff */ + "call %[bpf_ktime_get_ns];" + "r0 &= 0xff;" + /* tie r0.id == r1.id == r2.id */ + "r1 = r0;" + "r2 = r0;" + "r3 = 7;" + "if r1 > r3 goto +0;" + /* force r0 to be precise, this eventually marks r1 and r2 as + * precise as well because of shared IDs + */ + "r4 = r10;" + "r4 += r0;" + "r0 = 0;" + "exit;" + : + : __imm(bpf_ktime_get_ns) + : __clobber_all); +} + +/* Registers r{0,1,2} share same ID when 'if r1 > r3' insn is processed, + * check that verifier marks r{0,1,2} as precise while backtracking + * 'if r1 > r3' with r3 already marked. + */ +SEC("socket") +__success __log_level(2) +__flag(BPF_F_TEST_STATE_FREQ) +__msg("frame0: regs=r3 stack= before 5: (2d) if r1 > r3 goto pc+0") +__msg("frame0: parent state regs=r0,r1,r2,r3 stack=:") +__msg("frame0: regs=r0,r1,r2,r3 stack= before 4: (b7) r3 = 7") +__naked void linked_regs_bpf_x_dst(void) +{ + asm volatile ( + /* r0 = random number up to 0xff */ + "call %[bpf_ktime_get_ns];" + "r0 &= 0xff;" + /* tie r0.id == r1.id == r2.id */ + "r1 = r0;" + "r2 = r0;" + "r3 = 7;" + "if r1 > r3 goto +0;" + /* force r0 to be precise, this eventually marks r1 and r2 as + * precise as well because of shared IDs + */ + "r4 = r10;" + "r4 += r3;" + "r0 = 0;" + "exit;" + : + : __imm(bpf_ktime_get_ns) + : __clobber_all); +} + /* Same as linked_regs_bpf_k, but break one of the * links, note that r1 is absent from regs=... in __msg below. */ @@ -280,6 +346,105 @@ __naked void precision_two_ids(void) : __clobber_all); } +SEC("socket") +__success __log_level(2) +__flag(BPF_F_TEST_STATE_FREQ) +/* check that r0 and r6 have different IDs after 'if', + * collect_linked_regs() can't tie more than 6 registers for a single insn.
+ */ +__msg("8: (25) if r0 > 0x7 goto pc+0 ; R0=scalar(id=1") +__msg("9: (bf) r6 = r6 ; R6_w=scalar(id=2") +/* check that r{0-5} are marked precise after 'if' */ +__msg("frame0: regs=r0 stack= before 8: (25) if r0 > 0x7 goto pc+0") +__msg("frame0: parent state regs=r0,r1,r2,r3,r4,r5 stack=:") +__naked void linked_regs_too_many_regs(void) +{ + asm volatile ( + /* r0 = random number up to 0xff */ + "call %[bpf_ktime_get_ns];" + "r0 &= 0xff;" + /* tie r{0-6} IDs */ + "r1 = r0;" + "r2 = r0;" + "r3 = r0;" + "r4 = r0;" + "r5 = r0;" + "r6 = r0;" + /* propagate range for r{0-6} */ + "if r0 > 7 goto +0;" + /* make r6 appear in the log */ + "r6 = r6;" + /* force r0 to be precise, + * this would cause r{0-4} to be precise because of shared IDs + */ + "r7 = r10;" + "r7 += r0;" + "r0 = 0;" + "exit;" + : + : __imm(bpf_ktime_get_ns) + : __clobber_all); +} + +SEC("socket") +__failure __log_level(2) +__flag(BPF_F_TEST_STATE_FREQ) +__msg("regs=r7 stack= before 5: (3d) if r8 >= r0") +__msg("parent state regs=r0,r7,r8") +__msg("regs=r0,r7,r8 stack= before 4: (25) if r0 > 0x1") +__msg("div by zero") +__naked void linked_regs_broken_link_2(void) +{ + asm volatile ( + "call %[bpf_get_prandom_u32];" + "r7 = r0;" + "r8 = r0;" + "call %[bpf_get_prandom_u32];" + "if r0 > 1 goto +0;" + /* r7.id == r8.id, + * thus r7 precision implies r8 precision, + * which implies r0 precision because of the conditional below. + */ + "if r8 >= r0 goto 1f;" + /* break id relation between r7 and r8 */ + "r8 += r8;" + /* make r7 precise */ + "if r7 == 0 goto 1f;" + "r0 /= 0;" +"1:" + "r0 = 42;" + "exit;" + : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +/* Check that mark_chain_precision() for one of the conditional jump + * operands does not trigger equal scalars precision propagation. + */ +SEC("socket") +__success __log_level(2) +__msg("3: (25) if r1 > 0x100 goto pc+0") +__msg("frame0: regs=r1 stack= before 2: (bf) r1 = r0") +__naked void cjmp_no_linked_regs_trigger(void) +{ + asm volatile ( + /* r0 = random number up to 0xff */ + "call %[bpf_ktime_get_ns];" + "r0 &= 0xff;" + /* tie r0.id == r1.id */ + "r1 = r0;" + /* the jump below would be predicted, thus r1 would be marked precise, + * this should not imply precision mark for r0 + */ + "if r1 > 256 goto +0;" + "r0 = 0;" + "exit;" + : + : __imm(bpf_ktime_get_ns) + : __clobber_all); +} + /* Verify that check_ids() is used by regsafe() for scalars. * * r9 = ... some pointer with range X ... -- cgit v1.2.3 From cfbf25481d6dec0089c99c9d33a2ea634fe8f008 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Jul 2024 13:23:56 -0700 Subject: selftests/bpf: Update comments find_equal_scalars->sync_linked_regs find_equal_scalars() is renamed to sync_linked_regs(), this commit updates existing references in the selftests comments. Signed-off-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240718202357.1746514-5-eddyz87@gmail.com --- tools/testing/selftests/bpf/progs/verifier_spill_fill.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c index 85e48069c9e6..9d288ec7a168 100644 --- a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c +++ b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c @@ -402,7 +402,7 @@ __naked void spill_32bit_of_64bit_fail(void) *(u32*)(r10 - 8) = r1; \ /* 32-bit fill r2 from stack. 
*/ \ r2 = *(u32*)(r10 - 8); \ - /* Compare r2 with another register to trigger find_equal_scalars.\ + /* Compare r2 with another register to trigger sync_linked_regs.\ * Having one random bit is important here, otherwise the verifier cuts\ * the corners. If the ID was mistakenly preserved on spill, this would\ * cause the verifier to think that r1 is also equal to zero in one of\ @@ -441,7 +441,7 @@ __naked void spill_16bit_of_32bit_fail(void) *(u16*)(r10 - 8) = r1; \ /* 16-bit fill r2 from stack. */ \ r2 = *(u16*)(r10 - 8); \ - /* Compare r2 with another register to trigger find_equal_scalars.\ + /* Compare r2 with another register to trigger sync_linked_regs.\ * Having one random bit is important here, otherwise the verifier cuts\ * the corners. If the ID was mistakenly preserved on spill, this would\ * cause the verifier to think that r1 is also equal to zero in one of\ @@ -833,7 +833,7 @@ __naked void spill_64bit_of_64bit_ok(void) *(u64*)(r10 - 8) = r0; \ /* 64-bit fill r1 from stack - should preserve the ID. */\ r1 = *(u64*)(r10 - 8); \ - /* Compare r1 with another register to trigger find_equal_scalars.\ + /* Compare r1 with another register to trigger sync_linked_regs.\ * Having one random bit is important here, otherwise the verifier cuts\ * the corners. \ */ \ @@ -866,7 +866,7 @@ __naked void spill_32bit_of_32bit_ok(void) *(u32*)(r10 - 8) = r0; \ /* 32-bit fill r1 from stack - should preserve the ID. */\ r1 = *(u32*)(r10 - 8); \ - /* Compare r1 with another register to trigger find_equal_scalars.\ + /* Compare r1 with another register to trigger sync_linked_regs.\ * Having one random bit is important here, otherwise the verifier cuts\ * the corners. \ */ \ @@ -899,7 +899,7 @@ __naked void spill_16bit_of_16bit_ok(void) *(u16*)(r10 - 8) = r0; \ /* 16-bit fill r1 from stack - should preserve the ID. */\ r1 = *(u16*)(r10 - 8); \ - /* Compare r1 with another register to trigger find_equal_scalars.\ + /* Compare r1 with another register to trigger sync_linked_regs.\ * Having one random bit is important here, otherwise the verifier cuts\ * the corners. \ */ \ @@ -932,7 +932,7 @@ __naked void spill_8bit_of_8bit_ok(void) *(u8*)(r10 - 8) = r0; \ /* 8-bit fill r1 from stack - should preserve the ID. */\ r1 = *(u8*)(r10 - 8); \ - /* Compare r1 with another register to trigger find_equal_scalars.\ + /* Compare r1 with another register to trigger sync_linked_regs.\ * Having one random bit is important here, otherwise the verifier cuts\ * the corners. \ */ \ @@ -1029,7 +1029,7 @@ __naked void fill_32bit_after_spill_64bit_preserve_id(void) "r1 = *(u32*)(r10 - 4);" #endif " \ - /* Compare r1 with another register to trigger find_equal_scalars. */\ + /* Compare r1 with another register to trigger sync_linked_regs. */\ r2 = 0; \ if r1 != r2 goto l0_%=; \ /* The result of this comparison is predefined. */\ @@ -1070,7 +1070,7 @@ __naked void fill_32bit_after_spill_64bit_clear_id(void) "r2 = *(u32*)(r10 - 4);" #endif " \ - /* Compare r2 with another register to trigger find_equal_scalars.\ + /* Compare r2 with another register to trigger sync_linked_regs.\ * Having one random bit is important here, otherwise the verifier cuts\ * the corners. 
If the ID was mistakenly preserved on fill, this would\ * cause the verifier to think that r1 is also equal to zero in one of\ -- cgit v1.2.3 From b83b936f3e9a3c63896852198a1814e90e68eef5 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Sun, 14 Jul 2024 20:39:02 +0800 Subject: selftests/bpf: Add testcases for tailcall hierarchy fixing Add some test cases to confirm the tailcall hierarchy issue has been fixed. On x64, the selftests result is: cd tools/testing/selftests/bpf && ./test_progs -t tailcalls 327/18 tailcalls/tailcall_bpf2bpf_hierarchy_1:OK 327/19 tailcalls/tailcall_bpf2bpf_hierarchy_fentry:OK 327/20 tailcalls/tailcall_bpf2bpf_hierarchy_fexit:OK 327/21 tailcalls/tailcall_bpf2bpf_hierarchy_fentry_fexit:OK 327/22 tailcalls/tailcall_bpf2bpf_hierarchy_fentry_entry:OK 327/23 tailcalls/tailcall_bpf2bpf_hierarchy_2:OK 327/24 tailcalls/tailcall_bpf2bpf_hierarchy_3:OK 327 tailcalls:OK Summary: 1/24 PASSED, 0 SKIPPED, 0 FAILED On arm64, the selftests result is: cd tools/testing/selftests/bpf && ./test_progs -t tailcalls 327/18 tailcalls/tailcall_bpf2bpf_hierarchy_1:OK 327/19 tailcalls/tailcall_bpf2bpf_hierarchy_fentry:OK 327/20 tailcalls/tailcall_bpf2bpf_hierarchy_fexit:OK 327/21 tailcalls/tailcall_bpf2bpf_hierarchy_fentry_fexit:OK 327/22 tailcalls/tailcall_bpf2bpf_hierarchy_fentry_entry:OK 327/23 tailcalls/tailcall_bpf2bpf_hierarchy_2:OK 327/24 tailcalls/tailcall_bpf2bpf_hierarchy_3:OK 327 tailcalls:OK Summary: 1/24 PASSED, 0 SKIPPED, 0 FAILED Acked-by: Eduard Zingerman Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20240714123902.32305-4-hffilwlqm@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/prog_tests/tailcalls.c | 320 +++++++++++++++++++++ .../bpf/progs/tailcall_bpf2bpf_hierarchy1.c | 34 +++ .../bpf/progs/tailcall_bpf2bpf_hierarchy2.c | 70 +++++ .../bpf/progs/tailcall_bpf2bpf_hierarchy3.c | 62 ++++ .../bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c | 35 +++ tools/testing/selftests/bpf/progs/tc_dummy.c | 12 + 6 files changed, 533 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy1.c create mode 100644 tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c create mode 100644 tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c create mode 100644 tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c create mode 100644 tools/testing/selftests/bpf/progs/tc_dummy.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/tailcalls.c b/tools/testing/selftests/bpf/prog_tests/tailcalls.c index 59993fc9c0d7..e01fabb8cc41 100644 --- a/tools/testing/selftests/bpf/prog_tests/tailcalls.c +++ b/tools/testing/selftests/bpf/prog_tests/tailcalls.c @@ -3,6 +3,8 @@ #include #include #include "tailcall_poke.skel.h" +#include "tailcall_bpf2bpf_hierarchy2.skel.h" +#include "tailcall_bpf2bpf_hierarchy3.skel.h" /* test_tailcall_1 checks basic functionality by patching multiple locations @@ -1187,6 +1189,312 @@ out: tailcall_poke__destroy(call); } +static void test_tailcall_hierarchy_count(const char *which, bool test_fentry, + bool test_fexit, + bool test_fentry_entry) +{ + int err, map_fd, prog_fd, main_data_fd, fentry_data_fd, fexit_data_fd, i, val; + struct bpf_object *obj = NULL, *fentry_obj = NULL, *fexit_obj = NULL; + struct bpf_link *fentry_link = NULL, *fexit_link = NULL; + struct bpf_program *prog, *fentry_prog; + struct bpf_map *prog_array, *data_map; + int fentry_prog_fd; + char buff[128] = {}; + + LIBBPF_OPTS(bpf_test_run_opts, topts, + 
.data_in = buff, + .data_size_in = sizeof(buff), + .repeat = 1, + ); + + err = bpf_prog_test_load(which, BPF_PROG_TYPE_SCHED_CLS, &obj, + &prog_fd); + if (!ASSERT_OK(err, "load obj")) + return; + + prog = bpf_object__find_program_by_name(obj, "entry"); + if (!ASSERT_OK_PTR(prog, "find entry prog")) + goto out; + + prog_fd = bpf_program__fd(prog); + if (!ASSERT_GE(prog_fd, 0, "prog_fd")) + goto out; + + if (test_fentry_entry) { + fentry_obj = bpf_object__open_file("tailcall_bpf2bpf_hierarchy_fentry.bpf.o", + NULL); + if (!ASSERT_OK_PTR(fentry_obj, "open fentry_obj file")) + goto out; + + fentry_prog = bpf_object__find_program_by_name(fentry_obj, + "fentry"); + if (!ASSERT_OK_PTR(prog, "find fentry prog")) + goto out; + + err = bpf_program__set_attach_target(fentry_prog, prog_fd, + "entry"); + if (!ASSERT_OK(err, "set_attach_target entry")) + goto out; + + err = bpf_object__load(fentry_obj); + if (!ASSERT_OK(err, "load fentry_obj")) + goto out; + + fentry_link = bpf_program__attach_trace(fentry_prog); + if (!ASSERT_OK_PTR(fentry_link, "attach_trace")) + goto out; + + fentry_prog_fd = bpf_program__fd(fentry_prog); + if (!ASSERT_GE(fentry_prog_fd, 0, "fentry_prog_fd")) + goto out; + + prog_array = bpf_object__find_map_by_name(fentry_obj, "jmp_table"); + if (!ASSERT_OK_PTR(prog_array, "find jmp_table")) + goto out; + + map_fd = bpf_map__fd(prog_array); + if (!ASSERT_GE(map_fd, 0, "map_fd")) + goto out; + + i = 0; + err = bpf_map_update_elem(map_fd, &i, &fentry_prog_fd, BPF_ANY); + if (!ASSERT_OK(err, "update jmp_table")) + goto out; + + data_map = bpf_object__find_map_by_name(fentry_obj, ".bss"); + if (!ASSERT_FALSE(!data_map || !bpf_map__is_internal(data_map), + "find data_map")) + goto out; + + } else { + prog_array = bpf_object__find_map_by_name(obj, "jmp_table"); + if (!ASSERT_OK_PTR(prog_array, "find jmp_table")) + goto out; + + map_fd = bpf_map__fd(prog_array); + if (!ASSERT_GE(map_fd, 0, "map_fd")) + goto out; + + i = 0; + err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY); + if (!ASSERT_OK(err, "update jmp_table")) + goto out; + + data_map = bpf_object__find_map_by_name(obj, ".bss"); + if (!ASSERT_FALSE(!data_map || !bpf_map__is_internal(data_map), + "find data_map")) + goto out; + } + + if (test_fentry) { + fentry_obj = bpf_object__open_file("tailcall_bpf2bpf_fentry.bpf.o", + NULL); + if (!ASSERT_OK_PTR(fentry_obj, "open fentry_obj file")) + goto out; + + prog = bpf_object__find_program_by_name(fentry_obj, "fentry"); + if (!ASSERT_OK_PTR(prog, "find fentry prog")) + goto out; + + err = bpf_program__set_attach_target(prog, prog_fd, + "subprog_tail"); + if (!ASSERT_OK(err, "set_attach_target subprog_tail")) + goto out; + + err = bpf_object__load(fentry_obj); + if (!ASSERT_OK(err, "load fentry_obj")) + goto out; + + fentry_link = bpf_program__attach_trace(prog); + if (!ASSERT_OK_PTR(fentry_link, "attach_trace")) + goto out; + } + + if (test_fexit) { + fexit_obj = bpf_object__open_file("tailcall_bpf2bpf_fexit.bpf.o", + NULL); + if (!ASSERT_OK_PTR(fexit_obj, "open fexit_obj file")) + goto out; + + prog = bpf_object__find_program_by_name(fexit_obj, "fexit"); + if (!ASSERT_OK_PTR(prog, "find fexit prog")) + goto out; + + err = bpf_program__set_attach_target(prog, prog_fd, + "subprog_tail"); + if (!ASSERT_OK(err, "set_attach_target subprog_tail")) + goto out; + + err = bpf_object__load(fexit_obj); + if (!ASSERT_OK(err, "load fexit_obj")) + goto out; + + fexit_link = bpf_program__attach_trace(prog); + if (!ASSERT_OK_PTR(fexit_link, "attach_trace")) + goto out; + } + + err = 
bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "tailcall"); + ASSERT_EQ(topts.retval, 1, "tailcall retval"); + + main_data_fd = bpf_map__fd(data_map); + if (!ASSERT_GE(main_data_fd, 0, "main_data_fd")) + goto out; + + i = 0; + err = bpf_map_lookup_elem(main_data_fd, &i, &val); + ASSERT_OK(err, "tailcall count"); + ASSERT_EQ(val, 34, "tailcall count"); + + if (test_fentry) { + data_map = bpf_object__find_map_by_name(fentry_obj, ".bss"); + if (!ASSERT_FALSE(!data_map || !bpf_map__is_internal(data_map), + "find tailcall_bpf2bpf_fentry.bss map")) + goto out; + + fentry_data_fd = bpf_map__fd(data_map); + if (!ASSERT_GE(fentry_data_fd, 0, + "find tailcall_bpf2bpf_fentry.bss map fd")) + goto out; + + i = 0; + err = bpf_map_lookup_elem(fentry_data_fd, &i, &val); + ASSERT_OK(err, "fentry count"); + ASSERT_EQ(val, 68, "fentry count"); + } + + if (test_fexit) { + data_map = bpf_object__find_map_by_name(fexit_obj, ".bss"); + if (!ASSERT_FALSE(!data_map || !bpf_map__is_internal(data_map), + "find tailcall_bpf2bpf_fexit.bss map")) + goto out; + + fexit_data_fd = bpf_map__fd(data_map); + if (!ASSERT_GE(fexit_data_fd, 0, + "find tailcall_bpf2bpf_fexit.bss map fd")) + goto out; + + i = 0; + err = bpf_map_lookup_elem(fexit_data_fd, &i, &val); + ASSERT_OK(err, "fexit count"); + ASSERT_EQ(val, 68, "fexit count"); + } + + i = 0; + err = bpf_map_delete_elem(map_fd, &i); + if (!ASSERT_OK(err, "delete_elem from jmp_table")) + goto out; + + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "tailcall"); + ASSERT_EQ(topts.retval, 1, "tailcall retval"); + + i = 0; + err = bpf_map_lookup_elem(main_data_fd, &i, &val); + ASSERT_OK(err, "tailcall count"); + ASSERT_EQ(val, 35, "tailcall count"); + + if (test_fentry) { + i = 0; + err = bpf_map_lookup_elem(fentry_data_fd, &i, &val); + ASSERT_OK(err, "fentry count"); + ASSERT_EQ(val, 70, "fentry count"); + } + + if (test_fexit) { + i = 0; + err = bpf_map_lookup_elem(fexit_data_fd, &i, &val); + ASSERT_OK(err, "fexit count"); + ASSERT_EQ(val, 70, "fexit count"); + } + +out: + bpf_link__destroy(fentry_link); + bpf_link__destroy(fexit_link); + bpf_object__close(fentry_obj); + bpf_object__close(fexit_obj); + bpf_object__close(obj); +} + +/* test_tailcall_bpf2bpf_hierarchy_1 checks that the count value of the tail + * call limit enforcement matches with expectations when tailcalls are preceded + * with two bpf2bpf calls. + * + * subprog --tailcall-> entry + * entry < + * subprog --tailcall-> entry + */ +static void test_tailcall_bpf2bpf_hierarchy_1(void) +{ + test_tailcall_hierarchy_count("tailcall_bpf2bpf_hierarchy1.bpf.o", + false, false, false); +} + +/* test_tailcall_bpf2bpf_hierarchy_fentry checks that the count value of the + * tail call limit enforcement matches with expectations when tailcalls are + * preceded with two bpf2bpf calls, and the two subprogs are traced by fentry. + */ +static void test_tailcall_bpf2bpf_hierarchy_fentry(void) +{ + test_tailcall_hierarchy_count("tailcall_bpf2bpf_hierarchy1.bpf.o", + true, false, false); +} + +/* test_tailcall_bpf2bpf_hierarchy_fexit checks that the count value of the tail + * call limit enforcement matches with expectations when tailcalls are preceded + * with two bpf2bpf calls, and the two subprogs are traced by fexit. 
+ */ +static void test_tailcall_bpf2bpf_hierarchy_fexit(void) +{ + test_tailcall_hierarchy_count("tailcall_bpf2bpf_hierarchy1.bpf.o", + false, true, false); +} + +/* test_tailcall_bpf2bpf_hierarchy_fentry_fexit checks that the count value of + * the tail call limit enforcement matches with expectations when tailcalls are + * preceded with two bpf2bpf calls, and the two subprogs are traced by both + * fentry and fexit. + */ +static void test_tailcall_bpf2bpf_hierarchy_fentry_fexit(void) +{ + test_tailcall_hierarchy_count("tailcall_bpf2bpf_hierarchy1.bpf.o", + true, true, false); +} + +/* test_tailcall_bpf2bpf_hierarchy_fentry_entry checks that the count value of + * the tail call limit enforcement matches with expectations when tailcalls are + * preceded with two bpf2bpf calls in fentry. + */ +static void test_tailcall_bpf2bpf_hierarchy_fentry_entry(void) +{ + test_tailcall_hierarchy_count("tc_dummy.bpf.o", false, false, true); +} + +/* test_tailcall_bpf2bpf_hierarchy_2 checks that the count value of the tail + * call limit enforcement matches with expectations: + * + * subprog_tail0 --tailcall-> classifier_0 -> subprog_tail0 + * entry < + * subprog_tail1 --tailcall-> classifier_1 -> subprog_tail1 + */ +static void test_tailcall_bpf2bpf_hierarchy_2(void) +{ + RUN_TESTS(tailcall_bpf2bpf_hierarchy2); +} + +/* test_tailcall_bpf2bpf_hierarchy_3 checks that the count value of the tail + * call limit enforcement matches with expectations: + * + * subprog with jmp_table0 to classifier_0 + * entry --tailcall-> classifier_0 < + * subprog with jmp_table1 to classifier_0 + */ +static void test_tailcall_bpf2bpf_hierarchy_3(void) +{ + RUN_TESTS(tailcall_bpf2bpf_hierarchy3); +} + void test_tailcalls(void) { if (test__start_subtest("tailcall_1")) @@ -1223,4 +1531,16 @@ void test_tailcalls(void) test_tailcall_bpf2bpf_fentry_entry(); if (test__start_subtest("tailcall_poke")) test_tailcall_poke(); + if (test__start_subtest("tailcall_bpf2bpf_hierarchy_1")) + test_tailcall_bpf2bpf_hierarchy_1(); + if (test__start_subtest("tailcall_bpf2bpf_hierarchy_fentry")) + test_tailcall_bpf2bpf_hierarchy_fentry(); + if (test__start_subtest("tailcall_bpf2bpf_hierarchy_fexit")) + test_tailcall_bpf2bpf_hierarchy_fexit(); + if (test__start_subtest("tailcall_bpf2bpf_hierarchy_fentry_fexit")) + test_tailcall_bpf2bpf_hierarchy_fentry_fexit(); + if (test__start_subtest("tailcall_bpf2bpf_hierarchy_fentry_entry")) + test_tailcall_bpf2bpf_hierarchy_fentry_entry(); + test_tailcall_bpf2bpf_hierarchy_2(); + test_tailcall_bpf2bpf_hierarchy_3(); } diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy1.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy1.c new file mode 100644 index 000000000000..327ca395e860 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy1.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "bpf_legacy.h" + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} jmp_table SEC(".maps"); + +int count = 0; + +static __noinline +int subprog_tail(struct __sk_buff *skb) +{ + bpf_tail_call_static(skb, &jmp_table, 0); + return 0; +} + +SEC("tc") +int entry(struct __sk_buff *skb) +{ + int ret = 1; + + count++; + subprog_tail(skb); + subprog_tail(skb); + + return ret; +} + +char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c 
b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c new file mode 100644 index 000000000000..37604b0b97af --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "bpf_misc.h" + +int classifier_0(struct __sk_buff *skb); +int classifier_1(struct __sk_buff *skb); + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 2); + __uint(key_size, sizeof(__u32)); + __array(values, void (void)); +} jmp_table SEC(".maps") = { + .values = { + [0] = (void *) &classifier_0, + [1] = (void *) &classifier_1, + }, +}; + +int count0 = 0; +int count1 = 0; + +static __noinline +int subprog_tail0(struct __sk_buff *skb) +{ + bpf_tail_call_static(skb, &jmp_table, 0); + return 0; +} + +__auxiliary +SEC("tc") +int classifier_0(struct __sk_buff *skb) +{ + count0++; + subprog_tail0(skb); + return 0; +} + +static __noinline +int subprog_tail1(struct __sk_buff *skb) +{ + bpf_tail_call_static(skb, &jmp_table, 1); + return 0; +} + +__auxiliary +SEC("tc") +int classifier_1(struct __sk_buff *skb) +{ + count1++; + subprog_tail1(skb); + return 0; +} + +__success +__retval(33) +SEC("tc") +int tailcall_bpf2bpf_hierarchy_2(struct __sk_buff *skb) +{ + volatile int ret = 0; + + subprog_tail0(skb); + subprog_tail1(skb); + + asm volatile (""::"r+"(ret)); + return (count1 << 16) | count0; +} + +char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c new file mode 100644 index 000000000000..0cdbb781fcbc --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "bpf_misc.h" + +int classifier_0(struct __sk_buff *skb); + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __array(values, void (void)); +} jmp_table0 SEC(".maps") = { + .values = { + [0] = (void *) &classifier_0, + }, +}; + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __array(values, void (void)); +} jmp_table1 SEC(".maps") = { + .values = { + [0] = (void *) &classifier_0, + }, +}; + +int count = 0; + +static __noinline +int subprog_tail(struct __sk_buff *skb, void *jmp_table) +{ + bpf_tail_call_static(skb, jmp_table, 0); + return 0; +} + +__auxiliary +SEC("tc") +int classifier_0(struct __sk_buff *skb) +{ + count++; + subprog_tail(skb, &jmp_table0); + subprog_tail(skb, &jmp_table1); + return count; +} + +__success +__retval(33) +SEC("tc") +int tailcall_bpf2bpf_hierarchy_3(struct __sk_buff *skb) +{ + volatile int ret = 0; + + bpf_tail_call_static(skb, &jmp_table0, 0); + + asm volatile (""::"r+"(ret)); + return ret; +} + +char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c new file mode 100644 index 000000000000..c87f9ca982d3 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright Leon Hwang */ + +#include "vmlinux.h" +#include +#include + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} jmp_table SEC(".maps"); + +int count = 0; 
+ +static __noinline +int subprog_tail(void *ctx) +{ + bpf_tail_call_static(ctx, &jmp_table, 0); + return 0; +} + +SEC("fentry/dummy") +int BPF_PROG(fentry, struct sk_buff *skb) +{ + count++; + subprog_tail(ctx); + subprog_tail(ctx); + + return 0; +} + + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/tc_dummy.c b/tools/testing/selftests/bpf/progs/tc_dummy.c new file mode 100644 index 000000000000..69a3d0dc8787 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tc_dummy.c @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "bpf_legacy.h" + +SEC("tc") +int entry(struct __sk_buff *skb) +{ + return 1; +} + +char __license[] SEC("license") = "GPL"; -- cgit v1.2.3 From b7264f87f76c59698c7fd019a8797378fa17e7e3 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Sun, 21 Jul 2024 22:33:50 +0800 Subject: bpftool: Refactor xdp attach/detach type judgment This commit no logical changed, just increases code readability and facilitates TCX prog expansion, which will be implemented in the next patch. Signed-off-by: Tao Chen Signed-off-by: Daniel Borkmann Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20240721143353.95980-2-chen.dylane@gmail.com Signed-off-by: Andrii Nakryiko --- tools/bpf/bpftool/net.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c index 968714b4c3d4..1b9f4225b394 100644 --- a/tools/bpf/bpftool/net.c +++ b/tools/bpf/bpftool/net.c @@ -684,10 +684,18 @@ static int do_attach(int argc, char **argv) } } + switch (attach_type) { /* attach xdp prog */ - if (is_prefix("xdp", attach_type_strings[attach_type])) - err = do_attach_detach_xdp(progfd, attach_type, ifindex, - overwrite); + case NET_ATTACH_TYPE_XDP: + case NET_ATTACH_TYPE_XDP_GENERIC: + case NET_ATTACH_TYPE_XDP_DRIVER: + case NET_ATTACH_TYPE_XDP_OFFLOAD: + err = do_attach_detach_xdp(progfd, attach_type, ifindex, overwrite); + break; + default: + break; + } + if (err) { p_err("interface %s attach failed: %s", attach_type_strings[attach_type], strerror(-err)); @@ -721,10 +729,18 @@ static int do_detach(int argc, char **argv) if (ifindex < 1) return -EINVAL; + switch (attach_type) { /* detach xdp prog */ - progfd = -1; - if (is_prefix("xdp", attach_type_strings[attach_type])) + case NET_ATTACH_TYPE_XDP: + case NET_ATTACH_TYPE_XDP_GENERIC: + case NET_ATTACH_TYPE_XDP_DRIVER: + case NET_ATTACH_TYPE_XDP_OFFLOAD: + progfd = -1; err = do_attach_detach_xdp(progfd, attach_type, ifindex, NULL); + break; + default: + break; + } if (err < 0) { p_err("interface %s detach failed: %s", -- cgit v1.2.3 From 3b9d4fee8ad32599ba20ab36da7694feef022b48 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Sun, 21 Jul 2024 22:42:21 +0800 Subject: bpftool: Add net attach/detach command to tcx prog Now, attach/detach tcx prog supported in libbpf, so we can add new command 'bpftool attach/detach tcx' to attach tcx prog with bpftool for user. # bpftool prog load tc_prog.bpf.o /sys/fs/bpf/tc_prog # bpftool prog show ... 192: sched_cls name tc_prog tag 187aeb611ad00cfc gpl loaded_at 2024-07-11T15:58:16+0800 uid 0 xlated 152B jited 97B memlock 4096B map_ids 100,99,97 btf_id 260 # bpftool net attach tcx_ingress name tc_prog dev lo # bpftool net ... tc: lo(1) tcx/ingress tc_prog prog_id 29 # bpftool net detach tcx_ingress dev lo # bpftool net ... 
tc: # bpftool net attach tcx_ingress name tc_prog dev lo # bpftool net tc: lo(1) tcx/ingress tc_prog prog_id 29 Test environment: ubuntu_22_04, 6.7.0-060700-generic Signed-off-by: Tao Chen Signed-off-by: Daniel Borkmann Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20240721144221.96228-1-chen.dylane@gmail.com Signed-off-by: Andrii Nakryiko --- tools/bpf/bpftool/net.c | 43 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c index 1b9f4225b394..2a51f1c25732 100644 --- a/tools/bpf/bpftool/net.c +++ b/tools/bpf/bpftool/net.c @@ -67,6 +67,8 @@ enum net_attach_type { NET_ATTACH_TYPE_XDP_GENERIC, NET_ATTACH_TYPE_XDP_DRIVER, NET_ATTACH_TYPE_XDP_OFFLOAD, + NET_ATTACH_TYPE_TCX_INGRESS, + NET_ATTACH_TYPE_TCX_EGRESS, }; static const char * const attach_type_strings[] = { @@ -74,6 +76,8 @@ static const char * const attach_type_strings[] = { [NET_ATTACH_TYPE_XDP_GENERIC] = "xdpgeneric", [NET_ATTACH_TYPE_XDP_DRIVER] = "xdpdrv", [NET_ATTACH_TYPE_XDP_OFFLOAD] = "xdpoffload", + [NET_ATTACH_TYPE_TCX_INGRESS] = "tcx_ingress", + [NET_ATTACH_TYPE_TCX_EGRESS] = "tcx_egress", }; static const char * const attach_loc_strings[] = { @@ -647,6 +651,32 @@ static int do_attach_detach_xdp(int progfd, enum net_attach_type attach_type, return bpf_xdp_attach(ifindex, progfd, flags, NULL); } +static int get_tcx_type(enum net_attach_type attach_type) +{ + switch (attach_type) { + case NET_ATTACH_TYPE_TCX_INGRESS: + return BPF_TCX_INGRESS; + case NET_ATTACH_TYPE_TCX_EGRESS: + return BPF_TCX_EGRESS; + default: + return -1; + } +} + +static int do_attach_tcx(int progfd, enum net_attach_type attach_type, int ifindex) +{ + int type = get_tcx_type(attach_type); + + return bpf_prog_attach(progfd, ifindex, type, 0); +} + +static int do_detach_tcx(int targetfd, enum net_attach_type attach_type) +{ + int type = get_tcx_type(attach_type); + + return bpf_prog_detach(targetfd, type); +} + static int do_attach(int argc, char **argv) { enum net_attach_type attach_type; @@ -692,6 +722,11 @@ static int do_attach(int argc, char **argv) case NET_ATTACH_TYPE_XDP_OFFLOAD: err = do_attach_detach_xdp(progfd, attach_type, ifindex, overwrite); break; + /* attach tcx prog */ + case NET_ATTACH_TYPE_TCX_INGRESS: + case NET_ATTACH_TYPE_TCX_EGRESS: + err = do_attach_tcx(progfd, attach_type, ifindex); + break; default: break; } @@ -738,6 +773,11 @@ static int do_detach(int argc, char **argv) progfd = -1; err = do_attach_detach_xdp(progfd, attach_type, ifindex, NULL); break; + /* detach tcx prog */ + case NET_ATTACH_TYPE_TCX_INGRESS: + case NET_ATTACH_TYPE_TCX_EGRESS: + err = do_detach_tcx(ifindex, attach_type); + break; default: break; } @@ -944,7 +984,8 @@ static int do_help(int argc, char **argv) " %1$s %2$s help\n" "\n" " " HELP_SPEC_PROGRAM "\n" - " ATTACH_TYPE := { xdp | xdpgeneric | xdpdrv | xdpoffload }\n" + " ATTACH_TYPE := { xdp | xdpgeneric | xdpdrv | xdpoffload | tcx_ingress\n" + " | tcx_egress }\n" " " HELP_SPEC_OPTIONS " }\n" "\n" "Note: Only xdp, tcx, tc, netkit, flow_dissector and netfilter attachments\n" -- cgit v1.2.3 From 4f88dde0e1525ee69d29ba235c3965685439d0e3 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Sun, 21 Jul 2024 22:42:38 +0800 Subject: bpftool: Add bash-completion for tcx subcommand This commit adds bash-completion for attaching tcx program on interface. 
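Stripped of bpftool's plumbing, the tcx attach support added above is a thin wrapper around two libbpf calls. A hypothetical standalone equivalent of the example session (reusing the /sys/fs/bpf/tc_prog pin; error handling trimmed) might look like:

#include <net/if.h>
#include <bpf/bpf.h>

int main(void)
{
	int prog_fd = bpf_obj_get("/sys/fs/bpf/tc_prog");
	int ifindex = if_nametoindex("lo");

	if (prog_fd < 0 || !ifindex)
		return 1;
	/* what do_attach_tcx() issues for tcx_ingress */
	if (bpf_prog_attach(prog_fd, ifindex, BPF_TCX_INGRESS, 0))
		return 1;
	/* the matching do_detach_tcx() counterpart */
	if (bpf_prog_detach(ifindex, BPF_TCX_INGRESS))
		return 1;
	return 0;
}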
Signed-off-by: Tao Chen Signed-off-by: Daniel Borkmann Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20240721144238.96246-1-chen.dylane@gmail.com Signed-off-by: Andrii Nakryiko --- tools/bpf/bpftool/bash-completion/bpftool | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index be99d49b8714..0c541498c301 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -1079,7 +1079,7 @@ _bpftool() esac ;; net) - local ATTACH_TYPES='xdp xdpgeneric xdpdrv xdpoffload' + local ATTACH_TYPES='xdp xdpgeneric xdpdrv xdpoffload tcx_ingress tcx_egress' case $command in show|list) [[ $prev != "$command" ]] && return 0 -- cgit v1.2.3 From 0d7c06125cea53b3d86d685d790b03b9ae9d6336 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Sun, 21 Jul 2024 22:42:52 +0800 Subject: bpftool: Add document for net attach/detach on tcx subcommand This commit adds sample output for net attach/detach on tcx subcommand. Signed-off-by: Tao Chen Signed-off-by: Daniel Borkmann Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20240721144252.96264-1-chen.dylane@gmail.com Signed-off-by: Andrii Nakryiko --- tools/bpf/bpftool/Documentation/bpftool-net.rst | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst index 348812881297..4a8cb5e0d94b 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-net.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst @@ -29,7 +29,7 @@ NET COMMANDS | **bpftool** **net help** | | *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } -| *ATTACH_TYPE* := { **xdp** | **xdpgeneric** | **xdpdrv** | **xdpoffload** } +| *ATTACH_TYPE* := { **xdp** | **xdpgeneric** | **xdpdrv** | **xdpoffload** | **tcx_ingress** | **tcx_egress** } DESCRIPTION =========== @@ -69,6 +69,8 @@ bpftool net attach *ATTACH_TYPE* *PROG* dev *NAME* [ overwrite ] **xdpgeneric** - Generic XDP. runs at generic XDP hook when packet already enters receive path as skb; **xdpdrv** - Native XDP. runs earliest point in driver's receive path; **xdpoffload** - Offload XDP. runs directly on NIC on each packet reception; + **tcx_ingress** - Ingress TCX. runs on ingress net traffic; + **tcx_egress** - Egress TCX. runs on egress net traffic; bpftool net detach *ATTACH_TYPE* dev *NAME* Detach bpf program attached to network interface *NAME* with type specified @@ -178,3 +180,21 @@ EXAMPLES :: xdp: + +| +| **# bpftool net attach tcx_ingress name tc_prog dev lo** +| **# bpftool net** +| + +:: + tc: + lo(1) tcx/ingress tc_prog prog_id 29 + +| +| **# bpftool net attach tcx_ingress name tc_prog dev lo** +| **# bpftool net detach tcx_ingress dev lo** +| **# bpftool net** +| + +:: + tc: -- cgit v1.2.3 From 0af02a8e356fc6d3b1eebb32fae7d35625127835 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 10 Jun 2024 18:42:06 +0200 Subject: selftests/timers/posix_timers: Simplify error handling No point in returning to main() on fatal errors. Just exit right away. 
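The resulting pattern can be sketched in isolation (illustrative only; the helper the patch actually adds is __fatal_error() in the diff below, and the include path here is an assumption): a failed check terminates the test run on the spot instead of threading error codes back to main().

#include <errno.h>
#include <string.h>
#include <sys/time.h>
#include "../kselftest.h"

static void fatal(const char *what)
{
	char buf[64];

	strerror_r(errno, buf, sizeof(buf));
	ksft_exit_fail_msg("%s: %s\n", what, buf);	/* does not return */
}

static void check_something(void)
{
	struct timeval tv;

	/* previously: err = gettimeofday(...); if (err < 0) ... return -1; */
	if (gettimeofday(&tv, NULL) < 0)
		fatal("gettimeofday()");
}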
Signed-off-by: Thomas Gleixner Signed-off-by: Frederic Weisbecker Reviewed-by: Anna-Maria Behnsen --- tools/testing/selftests/timers/posix_timers.c | 151 +++++++++----------------- 1 file changed, 52 insertions(+), 99 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index 07c81c0093c0..38db82cf2a3a 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -19,6 +20,20 @@ #define DELAY 2 #define USECS_PER_SEC 1000000 +static void __fatal_error(const char *test, const char *name, const char *what) +{ + char buf[64]; + + strerror_r(errno, buf, sizeof(buf)); + + if (name && strlen(name)) + ksft_exit_fail_msg("%s %s %s %s\n", test, name, what, buf); + else + ksft_exit_fail_msg("%s %s %s\n", test, what, buf); +} + +#define fatal_error(name, what) __fatal_error(__func__, name, what) + static volatile int done; /* Busy loop in userspace to elapse ITIMER_VIRTUAL */ @@ -74,24 +89,13 @@ static int check_diff(struct timeval start, struct timeval end) return 0; } -static int check_itimer(int which) +static void check_itimer(int which, const char *name) { - const char *name; - int err; struct timeval start, end; struct itimerval val = { .it_value.tv_sec = DELAY, }; - if (which == ITIMER_VIRTUAL) - name = "ITIMER_VIRTUAL"; - else if (which == ITIMER_PROF) - name = "ITIMER_PROF"; - else if (which == ITIMER_REAL) - name = "ITIMER_REAL"; - else - return -1; - done = 0; if (which == ITIMER_VIRTUAL) @@ -101,17 +105,11 @@ static int check_itimer(int which) else if (which == ITIMER_REAL) signal(SIGALRM, sig_handler); - err = gettimeofday(&start, NULL); - if (err < 0) { - ksft_perror("Can't call gettimeofday()"); - return -1; - } + if (gettimeofday(&start, NULL) < 0) + fatal_error(name, "gettimeofday()"); - err = setitimer(which, &val, NULL); - if (err < 0) { - ksft_perror("Can't set timer"); - return -1; - } + if (setitimer(which, &val, NULL) < 0) + fatal_error(name, "setitimer()"); if (which == ITIMER_VIRTUAL) user_loop(); @@ -120,68 +118,41 @@ static int check_itimer(int which) else if (which == ITIMER_REAL) idle_loop(); - err = gettimeofday(&end, NULL); - if (err < 0) { - ksft_perror("Can't call gettimeofday()"); - return -1; - } + if (gettimeofday(&end, NULL) < 0) + fatal_error(name, "gettimeofday()"); ksft_test_result(check_diff(start, end) == 0, "%s\n", name); - - return 0; } -static int check_timer_create(int which) +static void check_timer_create(int which, const char *name) { - const char *type; - int err; - timer_t id; struct timeval start, end; struct itimerspec val = { .it_value.tv_sec = DELAY, }; - - if (which == CLOCK_THREAD_CPUTIME_ID) { - type = "thread"; - } else if (which == CLOCK_PROCESS_CPUTIME_ID) { - type = "process"; - } else { - ksft_print_msg("Unknown timer_create() type %d\n", which); - return -1; - } + timer_t id; done = 0; - err = timer_create(which, NULL, &id); - if (err < 0) { - ksft_perror("Can't create timer"); - return -1; - } - signal(SIGALRM, sig_handler); - err = gettimeofday(&start, NULL); - if (err < 0) { - ksft_perror("Can't call gettimeofday()"); - return -1; - } + if (timer_create(which, NULL, &id) < 0) + fatal_error(name, "timer_create()"); - err = timer_settime(id, 0, &val, NULL); - if (err < 0) { - ksft_perror("Can't set timer"); - return -1; - } + if (signal(SIGALRM, sig_handler) == SIG_ERR) + fatal_error(name, "signal()"); + + if 
(gettimeofday(&start, NULL) < 0) + fatal_error(name, "gettimeofday()"); + + if (timer_settime(id, 0, &val, NULL) < 0) + fatal_error(name, "timer_settime()"); user_loop(); - err = gettimeofday(&end, NULL); - if (err < 0) { - ksft_perror("Can't call gettimeofday()"); - return -1; - } + if (gettimeofday(&end, NULL) < 0) + fatal_error(name, "gettimeofday()"); ksft_test_result(check_diff(start, end) == 0, - "timer_create() per %s\n", type); - - return 0; + "timer_create() per %s\n", name); } static pthread_t ctd_thread; @@ -209,15 +180,14 @@ static void *ctd_thread_func(void *arg) ctd_count = 100; if (timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id)) - return "Can't create timer\n"; + fatal_error(NULL, "timer_create()"); if (timer_settime(id, 0, &val, NULL)) - return "Can't set timer\n"; - + fatal_error(NULL, "timer_settime()"); while (ctd_count > 0 && !ctd_failed) ; if (timer_delete(id)) - return "Can't delete timer\n"; + fatal_error(NULL, "timer_delete()"); return NULL; } @@ -225,19 +195,16 @@ static void *ctd_thread_func(void *arg) /* * Test that only the running thread receives the timer signal. */ -static int check_timer_distribution(void) +static void check_timer_distribution(void) { - const char *errmsg; - - signal(SIGALRM, ctd_sighandler); + if (signal(SIGALRM, ctd_sighandler) == SIG_ERR) + fatal_error(NULL, "signal()"); - errmsg = "Can't create thread\n"; if (pthread_create(&ctd_thread, NULL, ctd_thread_func, NULL)) - goto err; + fatal_error(NULL, "pthread_create()"); - errmsg = "Can't join thread\n"; - if (pthread_join(ctd_thread, (void **)&errmsg) || errmsg) - goto err; + if (pthread_join(ctd_thread, NULL)) + fatal_error(NULL, "pthread_join()"); if (!ctd_failed) ksft_test_result_pass("check signal distribution\n"); @@ -245,10 +212,6 @@ static int check_timer_distribution(void) ksft_test_result_fail("check signal distribution\n"); else ksft_test_result_skip("check signal distribution (old kernel)\n"); - return 0; -err: - ksft_print_msg("%s", errmsg); - return -1; } int main(int argc, char **argv) @@ -259,17 +222,10 @@ int main(int argc, char **argv) ksft_print_msg("Testing posix timers. False negative may happen on CPU execution \n"); ksft_print_msg("based timers if other threads run on the CPU...\n"); - if (check_itimer(ITIMER_VIRTUAL) < 0) - ksft_exit_fail(); - - if (check_itimer(ITIMER_PROF) < 0) - ksft_exit_fail(); - - if (check_itimer(ITIMER_REAL) < 0) - ksft_exit_fail(); - - if (check_timer_create(CLOCK_THREAD_CPUTIME_ID) < 0) - ksft_exit_fail(); + check_itimer(ITIMER_VIRTUAL, "ITIMER_VIRTUAL"); + check_itimer(ITIMER_PROF, "ITIMER_PROF"); + check_itimer(ITIMER_REAL, "ITIMER_REAL"); + check_timer_create(CLOCK_THREAD_CPUTIME_ID, "CLOCK_THREAD_CPUTIME_ID"); /* * It's unfortunately hard to reliably test a timer expiration @@ -280,11 +236,8 @@ int main(int argc, char **argv) * to ensure true parallelism. So test only one thread until we * find a better solution. */ - if (check_timer_create(CLOCK_PROCESS_CPUTIME_ID) < 0) - ksft_exit_fail(); - - if (check_timer_distribution() < 0) - ksft_exit_fail(); + check_timer_create(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); + check_timer_distribution(); ksft_finished(); } -- cgit v1.2.3 From 45c4225c3dcc7db2c0cdbf889cc7a9c72a53f742 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 10 Jun 2024 18:42:07 +0200 Subject: selftests/timers/posix_timers: Add SIG_IGN test Add a test case to validate correct behaviour vs. SIG_IGN. 
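As background for the quoted rule below, the discard behaviour itself can be demonstrated with plain signals and no timers at all (a minimal sketch, assuming Linux/glibc; error checking omitted):

#include <signal.h>
#include <stdio.h>

int main(void)
{
	sigset_t set, pending;

	sigemptyset(&set);
	sigaddset(&set, SIGUSR1);
	sigprocmask(SIG_BLOCK, &set, NULL);

	raise(SIGUSR1);			/* blocked, so it stays pending */
	sigpending(&pending);
	printf("%d\n", sigismember(&pending, SIGUSR1));	/* prints 1 */

	signal(SIGUSR1, SIG_IGN);	/* discards the pending signal */
	sigpending(&pending);
	printf("%d\n", sigismember(&pending, SIGUSR1));	/* prints 0 */
	return 0;
}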
The posix specification states: "Setting a signal action to SIG_IGN for a signal that is pending shall cause the pending signal to be discarded, whether or not it is blocked." The kernel implements this in the signal handling code, but due to the way how posix timers are handling SIG_IGN for periodic timers, the behaviour after installing a real handler again is inconsistent and suprising. The following sequence is expected to deliver a signal: install_handler(SIG); block_signal(SIG); timer_create(...); <- Should send SIG timer_settime(value=100ms, interval=100ms); sleep(1); <- Timer expires and queues signal, timer is not rearmed as that should happen in the signal delivery path ignore_signal(SIG); <- Discards queued signal install_handler(SIG); <- Restore handler, should rearm but does not sleep(1); unblock_signal(SIG); <- Should deliver one signal with overrun count set in siginfo This fails because nothing rearms the timer when the signal handler is restored. Add a test for this case which fails until the signal and posix timer code is fixed. Signed-off-by: Thomas Gleixner Signed-off-by: Frederic Weisbecker --- tools/testing/selftests/timers/posix_timers.c | 127 +++++++++++++++++++++++++- 1 file changed, 125 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index 38db82cf2a3a..8a6139a1d27a 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -6,8 +6,9 @@ * * Kernel loop code stolen from Steven Rostedt */ - +#define _GNU_SOURCE #include +#include #include #include #include @@ -214,10 +215,129 @@ static void check_timer_distribution(void) ksft_test_result_skip("check signal distribution (old kernel)\n"); } +struct tmrsig { + int signals; + int overruns; +}; + +static void siginfo_handler(int sig, siginfo_t *si, void *uc) +{ + struct tmrsig *tsig = si ? 
si->si_ptr : NULL; + + if (tsig) { + tsig->signals++; + tsig->overruns += si->si_overrun; + } +} + +static void *ignore_thread(void *arg) +{ + unsigned int *tid = arg; + sigset_t set; + + sigemptyset(&set); + sigaddset(&set, SIGUSR1); + if (sigprocmask(SIG_BLOCK, &set, NULL)) + fatal_error(NULL, "sigprocmask(SIG_BLOCK)"); + + *tid = gettid(); + sleep(100); + + if (sigprocmask(SIG_UNBLOCK, &set, NULL)) + fatal_error(NULL, "sigprocmask(SIG_UNBLOCK)"); + return NULL; +} + +static void check_sig_ign(int thread) +{ + struct tmrsig tsig = { }; + struct itimerspec its; + unsigned int tid = 0; + struct sigaction sa; + struct sigevent sev; + pthread_t pthread; + timer_t timerid; + sigset_t set; + + if (thread) { + if (pthread_create(&pthread, NULL, ignore_thread, &tid)) + fatal_error(NULL, "pthread_create()"); + sleep(1); + } + + sa.sa_flags = SA_SIGINFO; + sa.sa_sigaction = siginfo_handler; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGUSR1, &sa, NULL)) + fatal_error(NULL, "sigaction()"); + + /* Block the signal */ + sigemptyset(&set); + sigaddset(&set, SIGUSR1); + if (sigprocmask(SIG_BLOCK, &set, NULL)) + fatal_error(NULL, "sigprocmask(SIG_BLOCK)"); + + memset(&sev, 0, sizeof(sev)); + sev.sigev_notify = SIGEV_SIGNAL; + sev.sigev_signo = SIGUSR1; + sev.sigev_value.sival_ptr = &tsig; + if (thread) { + sev.sigev_notify = SIGEV_THREAD_ID; + sev._sigev_un._tid = tid; + } + + if (timer_create(CLOCK_MONOTONIC, &sev, &timerid)) + fatal_error(NULL, "timer_create()"); + + /* Start the timer to expire in 100ms and 100ms intervals */ + its.it_value.tv_sec = 0; + its.it_value.tv_nsec = 100000000; + its.it_interval.tv_sec = 0; + its.it_interval.tv_nsec = 100000000; + timer_settime(timerid, 0, &its, NULL); + + sleep(1); + + /* Set the signal to be ignored */ + if (signal(SIGUSR1, SIG_IGN) == SIG_ERR) + fatal_error(NULL, "signal(SIG_IGN)"); + + sleep(1); + + if (thread) { + /* Stop the thread first. No signal should be delivered to it */ + if (pthread_cancel(pthread)) + fatal_error(NULL, "pthread_cancel()"); + if (pthread_join(pthread, NULL)) + fatal_error(NULL, "pthread_join()"); + } + + /* Restore the handler */ + if (sigaction(SIGUSR1, &sa, NULL)) + fatal_error(NULL, "sigaction()"); + + sleep(1); + + /* Unblock it, which should deliver the signal in the !thread case*/ + if (sigprocmask(SIG_UNBLOCK, &set, NULL)) + fatal_error(NULL, "sigprocmask(SIG_UNBLOCK)"); + + if (timer_delete(timerid)) + fatal_error(NULL, "timer_delete()"); + + if (!thread) { + ksft_test_result(tsig.signals == 1 && tsig.overruns == 29, + "check_sig_ign SIGEV_SIGNAL\n"); + } else { + ksft_test_result(tsig.signals == 0 && tsig.overruns == 0, + "check_sig_ign SIGEV_THREAD_ID\n"); + } +} + int main(int argc, char **argv) { ksft_print_header(); - ksft_set_plan(6); + ksft_set_plan(8); ksft_print_msg("Testing posix timers. False negative may happen on CPU execution \n"); ksft_print_msg("based timers if other threads run on the CPU...\n"); @@ -239,5 +359,8 @@ int main(int argc, char **argv) check_timer_create(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); check_timer_distribution(); + check_sig_ign(0); + check_sig_ign(1); + ksft_finished(); } -- cgit v1.2.3 From e65bb03e442751ccf1631382516dcca5b3a20122 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 10 Jun 2024 18:42:09 +0200 Subject: selftests/timers/posix_timers: Validate signal rules Add a test case to validate correct behaviour vs. timer reprogramming and deletion. The handling of queued signals in case of timer reprogramming or deletion is inconsistent at best. 
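Concretely, the deletion half of the expectation can be sketched as a standalone program (hypothetical; error handling omitted, link with -lrt on older glibc): once the timer is gone, the signal it queued while blocked should be gone too.

#include <signal.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
	struct sigevent sev = {
		.sigev_notify = SIGEV_SIGNAL,
		.sigev_signo  = SIGUSR1,
	};
	struct itimerspec its = {
		.it_value.tv_nsec    = 100000000,	/* 100ms */
		.it_interval.tv_nsec = 100000000,
	};
	sigset_t set, pending;
	timer_t timerid;

	sigemptyset(&set);
	sigaddset(&set, SIGUSR1);
	sigprocmask(SIG_BLOCK, &set, NULL);	/* keep the signal queued */

	timer_create(CLOCK_MONOTONIC, &sev, &timerid);
	timer_settime(timerid, 0, &its, NULL);
	sleep(1);				/* ~10 expirations queue a signal */

	timer_delete(timerid);
	sigpending(&pending);
	/* reasonable expectation: 0; without the fix this still reports 1 */
	printf("%d\n", sigismember(&pending, SIGUSR1));
	return 0;
}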
POSIX does not really specify the behaviour for that: - "The effect of disarming or resetting a timer with pending expiration notifications is unspecified." - "The disposition of pending signals for the deleted timer is unspecified." In both cases it is reasonable to expect that pending signals are discarded. Especially in the reprogramming case it does not make sense to account for previous overruns or to deliver a signal for a timer which has been disarmed. Add tests to validate that no unexpected signals are delivered. They fail for now until the signal and posix timer code is updated. Signed-off-by: Thomas Gleixner Signed-off-by: Frederic Weisbecker --- tools/testing/selftests/timers/posix_timers.c | 108 +++++++++++++++++++++++++- 1 file changed, 107 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index 8a6139a1d27a..41b43d1327eb 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -334,10 +334,114 @@ static void check_sig_ign(int thread) } } +static void check_rearm(void) +{ + struct tmrsig tsig = { }; + struct itimerspec its; + struct sigaction sa; + struct sigevent sev; + timer_t timerid; + sigset_t set; + + sa.sa_flags = SA_SIGINFO; + sa.sa_sigaction = siginfo_handler; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGUSR1, &sa, NULL)) + fatal_error(NULL, "sigaction()"); + + /* Block the signal */ + sigemptyset(&set); + sigaddset(&set, SIGUSR1); + if (sigprocmask(SIG_BLOCK, &set, NULL)) + fatal_error(NULL, "sigprocmask(SIG_BLOCK)"); + + memset(&sev, 0, sizeof(sev)); + sev.sigev_notify = SIGEV_SIGNAL; + sev.sigev_signo = SIGUSR1; + sev.sigev_value.sival_ptr = &tsig; + if (timer_create(CLOCK_MONOTONIC, &sev, &timerid)) + fatal_error(NULL, "timer_create()"); + + /* Start the timer to expire in 100ms and 100ms intervals */ + its.it_value.tv_sec = 0; + its.it_value.tv_nsec = 100000000; + its.it_interval.tv_sec = 0; + its.it_interval.tv_nsec = 100000000; + if (timer_settime(timerid, 0, &its, NULL)) + fatal_error(NULL, "timer_settime()"); + + sleep(1); + + /* Reprogram the timer to single shot */ + its.it_value.tv_sec = 10; + its.it_value.tv_nsec = 0; + its.it_interval.tv_sec = 0; + its.it_interval.tv_nsec = 0; + if (timer_settime(timerid, 0, &its, NULL)) + fatal_error(NULL, "timer_settime()"); + + /* Unblock it, which should not deliver a signal */ + if (sigprocmask(SIG_UNBLOCK, &set, NULL)) + fatal_error(NULL, "sigprocmask(SIG_UNBLOCK)"); + + if (timer_delete(timerid)) + fatal_error(NULL, "timer_delete()"); + + ksft_test_result(!tsig.signals, "check_rearm\n"); +} + +static void check_delete(void) +{ + struct tmrsig tsig = { }; + struct itimerspec its; + struct sigaction sa; + struct sigevent sev; + timer_t timerid; + sigset_t set; + + sa.sa_flags = SA_SIGINFO; + sa.sa_sigaction = siginfo_handler; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGUSR1, &sa, NULL)) + fatal_error(NULL, "sigaction()"); + + /* Block the signal */ + sigemptyset(&set); + sigaddset(&set, SIGUSR1); + if (sigprocmask(SIG_BLOCK, &set, NULL)) + fatal_error(NULL, "sigprocmask(SIG_BLOCK)"); + + memset(&sev, 0, sizeof(sev)); + sev.sigev_notify = SIGEV_SIGNAL; + sev.sigev_signo = SIGUSR1; + sev.sigev_value.sival_ptr = &tsig; + if (timer_create(CLOCK_MONOTONIC, &sev, &timerid)) + fatal_error(NULL, "timer_create()"); + + /* Start the timer to expire in 100ms and 100ms intervals */ + its.it_value.tv_sec = 0; + its.it_value.tv_nsec = 100000000; + 
its.it_interval.tv_sec = 0; + its.it_interval.tv_nsec = 100000000; + if (timer_settime(timerid, 0, &its, NULL)) + fatal_error(NULL, "timer_settime()"); + + sleep(1); + + if (timer_delete(timerid)) + fatal_error(NULL, "timer_delete()"); + + /* Unblock it, which should not deliver a signal */ + if (sigprocmask(SIG_UNBLOCK, &set, NULL)) + fatal_error(NULL, "sigprocmask(SIG_UNBLOCK)"); + + ksft_test_result(!tsig.signals, "check_delete\n"); +} + int main(int argc, char **argv) { ksft_print_header(); - ksft_set_plan(8); + ksft_set_plan(10); ksft_print_msg("Testing posix timers. False negative may happen on CPU execution \n"); ksft_print_msg("based timers if other threads run on the CPU...\n"); @@ -361,6 +465,8 @@ int main(int argc, char **argv) check_sig_ign(0); check_sig_ign(1); + check_rearm(); + check_delete(); ksft_finished(); } -- cgit v1.2.3 From 2c2b56132bb74b6b801dcb82f57489ae1cf81a91 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 10 Jun 2024 18:42:10 +0200 Subject: selftests/timers/posix-timers: Validate SIGEV_NONE Posix timers with a delivery mode of SIGEV_NONE deliver no signals but the remaining expiry time must be readable via timer_gettime() for both one shot and interval timers. That's implemented correctly for regular posix timers but broken for posix CPU timers. Add a self test so the fixes can be verified. Signed-off-by: Thomas Gleixner Signed-off-by: Frederic Weisbecker --- tools/testing/selftests/timers/posix_timers.c | 53 ++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index 41b43d1327eb..097a132615f6 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -20,6 +21,7 @@ #define DELAY 2 #define USECS_PER_SEC 1000000 +#define NSECS_PER_SEC 1000000000 static void __fatal_error(const char *test, const char *name, const char *what) { @@ -438,10 +440,57 @@ static void check_delete(void) ksft_test_result(!tsig.signals, "check_delete\n"); } +static inline int64_t calcdiff_ns(struct timespec t1, struct timespec t2) +{ + int64_t diff; + + diff = NSECS_PER_SEC * (int64_t)((int) t1.tv_sec - (int) t2.tv_sec); + diff += ((int) t1.tv_nsec - (int) t2.tv_nsec); + return diff; +} + +static void check_sigev_none(int which, const char *name) +{ + struct timespec start, now; + struct itimerspec its; + struct sigevent sev; + timer_t timerid; + + memset(&sev, 0, sizeof(sev)); + sev.sigev_notify = SIGEV_NONE; + + if (timer_create(which, &sev, &timerid)) + fatal_error(name, "timer_create()"); + + /* Start the timer to expire in 100ms and 100ms intervals */ + its.it_value.tv_sec = 0; + its.it_value.tv_nsec = 100000000; + its.it_interval.tv_sec = 0; + its.it_interval.tv_nsec = 100000000; + timer_settime(timerid, 0, &its, NULL); + + if (clock_gettime(which, &start)) + fatal_error(name, "clock_gettime()"); + + do { + if (clock_gettime(which, &now)) + fatal_error(name, "clock_gettime()"); + } while (calcdiff_ns(now, start) < NSECS_PER_SEC); + + if (timer_gettime(timerid, &its)) + fatal_error(name, "timer_gettime()"); + + if (timer_delete(timerid)) + fatal_error(name, "timer_delete()"); + + ksft_test_result(its.it_value.tv_sec || its.it_value.tv_nsec, + "check_sigev_none %s\n", name); +} + int main(int argc, char **argv) { ksft_print_header(); - ksft_set_plan(10); + ksft_set_plan(12); 
ksft_print_msg("Testing posix timers. False negative may happen on CPU execution \n"); ksft_print_msg("based timers if other threads run on the CPU...\n"); @@ -467,6 +516,8 @@ int main(int argc, char **argv) check_sig_ign(1); check_rearm(); check_delete(); + check_sigev_none(CLOCK_MONOTONIC, "CLOCK_MONOTONIC"); + check_sigev_none(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); ksft_finished(); } -- cgit v1.2.3 From f924f868ed05d954d79db419e97ba11293110d52 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 10 Jun 2024 18:42:11 +0200 Subject: selftests/timers/posix-timers: Validate timer_gettime() timer_gettime() must return the correct expiry time for interval timers even when the timer is not armed, which is the case when a signal is pending but blocked. Works correctly for regular posix timers, but not for posix CPU timers. Add a selftest to validate the fixes. Signed-off-by: Thomas Gleixner Signed-off-by: Frederic Weisbecker --- tools/testing/selftests/timers/posix_timers.c | 58 ++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index 097a132615f6..4c993dbbdbf2 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -487,10 +487,63 @@ static void check_sigev_none(int which, const char *name) "check_sigev_none %s\n", name); } +static void check_gettime(int which, const char *name) +{ + struct itimerspec its, prev; + struct timespec start, now; + struct sigevent sev; + timer_t timerid; + int wraps = 0; + sigset_t set; + + /* Block the signal */ + sigemptyset(&set); + sigaddset(&set, SIGUSR1); + if (sigprocmask(SIG_BLOCK, &set, NULL)) + fatal_error(name, "sigprocmask(SIG_BLOCK)"); + + memset(&sev, 0, sizeof(sev)); + sev.sigev_notify = SIGEV_SIGNAL; + sev.sigev_signo = SIGUSR1; + + if (timer_create(which, &sev, &timerid)) + fatal_error(name, "timer_create()"); + + /* Start the timer to expire in 100ms and 100ms intervals */ + its.it_value.tv_sec = 0; + its.it_value.tv_nsec = 100000000; + its.it_interval.tv_sec = 0; + its.it_interval.tv_nsec = 100000000; + if (timer_settime(timerid, 0, &its, NULL)) + fatal_error(name, "timer_settime()"); + + if (timer_gettime(timerid, &prev)) + fatal_error(name, "timer_gettime()"); + + if (clock_gettime(which, &start)) + fatal_error(name, "clock_gettime()"); + + do { + if (clock_gettime(which, &now)) + fatal_error(name, "clock_gettime()"); + if (timer_gettime(timerid, &its)) + fatal_error(name, "timer_gettime()"); + if (its.it_value.tv_nsec > prev.it_value.tv_nsec) + wraps++; + prev = its; + + } while (calcdiff_ns(now, start) < NSECS_PER_SEC); + + if (timer_delete(timerid)) + fatal_error(name, "timer_delete()"); + + ksft_test_result(wraps > 1, "check_gettime %s\n", name); +} + int main(int argc, char **argv) { ksft_print_header(); - ksft_set_plan(12); + ksft_set_plan(15); ksft_print_msg("Testing posix timers. 
False negative may happen on CPU execution \n"); ksft_print_msg("based timers if other threads run on the CPU...\n"); @@ -518,6 +571,9 @@ int main(int argc, char **argv) check_delete(); check_sigev_none(CLOCK_MONOTONIC, "CLOCK_MONOTONIC"); check_sigev_none(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); + check_gettime(CLOCK_MONOTONIC, "CLOCK_MONOTONIC"); + check_gettime(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); + check_gettime(CLOCK_THREAD_CPUTIME_ID, "CLOCK_THREAD_CPUTIME_ID"); ksft_finished(); } -- cgit v1.2.3 From 73339b82f86521eeadbb781d8d7e04813cbd0998 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 10 Jun 2024 18:42:12 +0200 Subject: selftests/timers/posix-timers: Validate overrun after unblock When a timer signal is blocked and later unblocked then one signal should be delivered with the correct number of overruns since the timer was queued. Validate that behaviour. Signed-off-by: Thomas Gleixner Signed-off-by: Frederic Weisbecker --- tools/testing/selftests/timers/posix_timers.c | 61 ++++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index 4c993dbbdbf2..16bd49492efa 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -540,10 +540,66 @@ static void check_gettime(int which, const char *name) ksft_test_result(wraps > 1, "check_gettime %s\n", name); } +static void check_overrun(int which, const char *name) +{ + struct timespec start, now; + struct tmrsig tsig = { }; + struct itimerspec its; + struct sigaction sa; + struct sigevent sev; + timer_t timerid; + sigset_t set; + + sa.sa_flags = SA_SIGINFO; + sa.sa_sigaction = siginfo_handler; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGUSR1, &sa, NULL)) + fatal_error(name, "sigaction()"); + + /* Block the signal */ + sigemptyset(&set); + sigaddset(&set, SIGUSR1); + if (sigprocmask(SIG_BLOCK, &set, NULL)) + fatal_error(name, "sigprocmask(SIG_BLOCK)"); + + memset(&sev, 0, sizeof(sev)); + sev.sigev_notify = SIGEV_SIGNAL; + sev.sigev_signo = SIGUSR1; + sev.sigev_value.sival_ptr = &tsig; + if (timer_create(which, &sev, &timerid)) + fatal_error(name, "timer_create()"); + + /* Start the timer to expire in 100ms and 100ms intervals */ + its.it_value.tv_sec = 0; + its.it_value.tv_nsec = 100000000; + its.it_interval.tv_sec = 0; + its.it_interval.tv_nsec = 100000000; + if (timer_settime(timerid, 0, &its, NULL)) + fatal_error(name, "timer_settime()"); + + if (clock_gettime(which, &start)) + fatal_error(name, "clock_gettime()"); + + do { + if (clock_gettime(which, &now)) + fatal_error(name, "clock_gettime()"); + } while (calcdiff_ns(now, start) < NSECS_PER_SEC); + + /* Unblock it, which should deliver a signal */ + if (sigprocmask(SIG_UNBLOCK, &set, NULL)) + fatal_error(name, "sigprocmask(SIG_UNBLOCK)"); + + if (timer_delete(timerid)) + fatal_error(name, "timer_delete()"); + + ksft_test_result(tsig.signals == 1 && tsig.overruns == 9, + "check_overrun %s\n", name); +} + int main(int argc, char **argv) { ksft_print_header(); - ksft_set_plan(15); + ksft_set_plan(18); ksft_print_msg("Testing posix timers. 
False negative may happen on CPU execution \n"); ksft_print_msg("based timers if other threads run on the CPU...\n"); @@ -574,6 +630,9 @@ int main(int argc, char **argv) check_gettime(CLOCK_MONOTONIC, "CLOCK_MONOTONIC"); check_gettime(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); check_gettime(CLOCK_THREAD_CPUTIME_ID, "CLOCK_THREAD_CPUTIME_ID"); + check_overrun(CLOCK_MONOTONIC, "CLOCK_MONOTONIC"); + check_overrun(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); + check_overrun(CLOCK_THREAD_CPUTIME_ID, "CLOCK_THREAD_CPUTIME_ID"); ksft_finished(); } -- cgit v1.2.3 From e44b4fc40cb429f6dedc4518a16221115035f654 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 22 Jul 2024 11:30:46 -0700 Subject: selftests/bpf: Fix the missing tramp_1 to tramp_40 ops in cfi_stubs The tramp_1 to tramp_40 ops is not set in the cfi_stubs in the bpf_testmod_ops. It fails the struct_ops_multi_pages test after retiring the unsupported_ops in the earlier patch. This patch initializes them in a loop during the bpf_testmod_init(). Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240722183049.2254692-3-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index fd28c1157bd3..3687a40b61c6 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -1024,6 +1024,11 @@ static void bpf_testmod_test_2(int a, int b) { } +static int bpf_testmod_tramp(int value) +{ + return 0; +} + static int bpf_testmod_ops__test_maybe_null(int dummy, struct task_struct *task__nullable) { @@ -1080,6 +1085,7 @@ static int bpf_testmod_init(void) .kfunc_btf_id = bpf_testmod_dtor_ids[1] }, }; + void **tramp; int ret; ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &bpf_testmod_common_kfunc_set); @@ -1103,6 +1109,14 @@ static int bpf_testmod_init(void) ret = register_bpf_testmod_uprobe(); if (ret < 0) return ret; + + /* Ensure nothing is between tramp_1..tramp_40 */ + BUILD_BUG_ON(offsetof(struct bpf_testmod_ops, tramp_1) + 40 * sizeof(long) != + offsetofend(struct bpf_testmod_ops, tramp_40)); + tramp = (void **)&__bpf_testmod_ops.tramp_1; + while (tramp <= (void **)&__bpf_testmod_ops.tramp_40) + *tramp++ = bpf_testmod_tramp; + return 0; } -- cgit v1.2.3 From 4009c95fede6b783802ad01f264a7a0541f5ea60 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 22 Jul 2024 11:30:47 -0700 Subject: selftests/bpf: Ensure the unsupported struct_ops prog cannot be loaded There is an existing "bpf_tcp_ca/unsupp_cong_op" test to ensure the unsupported tcp-cc "get_info" struct_ops prog cannot be loaded. This patch adds a new test in the bpf_testmod such that the unsupported ops test does not depend on other kernel subsystem where its supporting ops may be changed in the future. 
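As an aside, the BUILD_BUG_ON()/fill-loop pairing used in bpf_testmod_init() earlier generalizes to any run of same-typed function-pointer members. A hedged kernel-style sketch with hypothetical names (struct my_ops, stub_fn), not code from the patch:

struct my_ops {
	int (*fn_1)(int);
	int (*fn_2)(int);
	int (*fn_3)(int);
};

static int stub_fn(int v) { return 0; }

static void fill_stubs(struct my_ops *ops)
{
	void **p;

	/* Build fails if padding or another member sneaks in between. */
	BUILD_BUG_ON(offsetof(struct my_ops, fn_1) + 3 * sizeof(long) !=
		     offsetofend(struct my_ops, fn_3));
	for (p = (void **)&ops->fn_1; p <= (void **)&ops->fn_3; p++)
		*p = stub_fn;
}

The compile-time check is what makes the pointer-walk safe: the loop treats the members as a contiguous array, and the BUILD_BUG_ON proves that assumption instead of trusting it.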
Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240722183049.2254692-4-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- .../selftests/bpf/bpf_testmod/bpf_testmod.h | 1 + .../bpf/prog_tests/test_struct_ops_module.c | 2 ++ .../testing/selftests/bpf/progs/unsupported_ops.c | 22 ++++++++++++++++++++++ 3 files changed, 25 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/unsupported_ops.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h index 23fa1872ee67..fe0d402b0d65 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h @@ -35,6 +35,7 @@ struct bpf_testmod_ops { void (*test_2)(int a, int b); /* Used to test nullable arguments. */ int (*test_maybe_null)(int dummy, struct task_struct *task); + int (*unsupported_ops)(void); /* The following fields are used to test shadow copies. */ char onebyte; diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c index bbcf12696a6b..75a0dea511b3 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c +++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c @@ -9,6 +9,7 @@ #include "struct_ops_nulled_out_cb.skel.h" #include "struct_ops_forgotten_cb.skel.h" #include "struct_ops_detach.skel.h" +#include "unsupported_ops.skel.h" static void check_map_info(struct bpf_map_info *info) { @@ -311,5 +312,6 @@ void serial_test_struct_ops_module(void) test_struct_ops_forgotten_cb(); if (test__start_subtest("test_detach_link")) test_detach_link(); + RUN_TESTS(unsupported_ops); } diff --git a/tools/testing/selftests/bpf/progs/unsupported_ops.c b/tools/testing/selftests/bpf/progs/unsupported_ops.c new file mode 100644 index 000000000000..9180365a3568 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/unsupported_ops.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ + +#include <vmlinux.h> +#include <bpf/bpf_tracing.h> +#include "bpf_misc.h" +#include "../bpf_testmod/bpf_testmod.h" + +char _license[] SEC("license") = "GPL"; + +SEC("struct_ops/unsupported_ops") +__failure +__msg("attach to unsupported member unsupported_ops of struct bpf_testmod_ops") +int BPF_PROG(unsupported_ops) +{ + return 0; +} + +SEC(".struct_ops.link") +struct bpf_testmod_ops testmod = { + .unsupported_ops = (void *)unsupported_ops, +}; -- cgit v1.2.3 From 4dc7556490d77d5046844e7731d5d0a3055a3448 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Fri, 19 Jul 2024 19:00:56 +0800 Subject: selftests/bpf: Avoid load failure for token_lsm.c The compiler optimized the two bpf progs in token_lsm.c to build the return value from the bool variable even in the "return -1" path, causing an unexpected rejection: 0: R1=ctx() R10=fp0 ; int BPF_PROG(bpf_token_capable, struct bpf_token *token, int cap) @ bpf_lsm.c:17 0: (b7) r6 = 0 ; R6_w=0 ; if (my_pid == 0 || my_pid != (bpf_get_current_pid_tgid() >> 32)) @ bpf_lsm.c:19 1: (18) r1 = 0xffffc9000102a000 ; R1_w=map_value(map=bpf_lsm.bss,ks=4,vs=5) 3: (61) r7 = *(u32 *)(r1 +0) ; R1_w=map_value(map=bpf_lsm.bss,ks=4,vs=5) R7_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) 4: (15) if r7 == 0x0 goto pc+11 ; R7_w=scalar(smin=umin=umin32=1,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) 5: (67) r7 <<= 32 ; R7_w=scalar(smax=0x7fffffff00000000,umax=0xffffffff00000000,smin32=0,smax32=umax32=0,var_off=(0x0; 0xffffffff00000000)) 6: (c7) r7 s>>= 32 ; R7_w=scalar(smin=0xffffffff80000000,smax=0x7fffffff) 7: (85) call bpf_get_current_pid_tgid#14 ; R0=scalar() 8: (77) r0 >>= 32 ; R0_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) 9: (5d) if r0 != r7 goto pc+6 ; R0_w=scalar(smin=smin32=0,smax=umax=umax32=0x7fffffff,var_off=(0x0; 0x7fffffff)) R7=scalar(smin=smin32=0,smax=umax=umax32=0x7fffffff,var_off=(0x0; 0x7fffffff)) ; if (reject_capable) @ bpf_lsm.c:21 10: (18) r1 = 0xffffc9000102a004 ; R1_w=map_value(map=bpf_lsm.bss,ks=4,vs=5,off=4) 12: (71) r6 = *(u8 *)(r1 +0) ; R1_w=map_value(map=bpf_lsm.bss,ks=4,vs=5,off=4) R6_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=255,var_off=(0x0; 0xff)) ; @ bpf_lsm.c:0 13: (87) r6 = -r6 ; R6_w=scalar() 14: (67) r6 <<= 56 ; R6_w=scalar(smax=0x7f00000000000000,umax=0xff00000000000000,smin32=0,smax32=umax32=0,var_off=(0x0; 0xff00000000000000)) 15: (c7) r6 s>>= 56 ; R6_w=scalar(smin=smin32=-128,smax=smax32=127) ; int BPF_PROG(bpf_token_capable, struct bpf_token *token, int cap) @ bpf_lsm.c:17 16: (bf) r0 = r6 ; R0_w=scalar(id=1,smin=smin32=-128,smax=smax32=127) R6_w=scalar(id=1,smin=smin32=-128,smax=smax32=127) 17: (95) exit At program exit the register R0 has smin=-128 smax=127 should have been in [-4095, 0] To avoid this failure, change the variable type from bool to int.
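For reference, the shape of the fixed prog as a minimal sketch (prog and variable names follow the verifier log above; the surrounding file boilerplate is assumed):

int my_pid;
int reject_capable;	/* int, not bool */

SEC("lsm/bpf_token_capable")
int BPF_PROG(token_capable, struct bpf_token *token, int cap)
{
	if (my_pid == 0 || my_pid != (bpf_get_current_pid_tgid() >> 32))
		return 0;
	/* Returning the int flag directly avoids the 8-bit negate and
	 * sign-extend sequence, so the verifier can prove R0 stays in
	 * the LSM-required [-4095, 0] range. */
	if (reject_capable)
		return -1;
	return 0;
}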
Signed-off-by: Xu Kuohai Link: https://lore.kernel.org/r/20240719110059.797546-7-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/progs/token_lsm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/token_lsm.c b/tools/testing/selftests/bpf/progs/token_lsm.c index e4d59b6ba743..a6002d073b1b 100644 --- a/tools/testing/selftests/bpf/progs/token_lsm.c +++ b/tools/testing/selftests/bpf/progs/token_lsm.c @@ -8,8 +8,8 @@ char _license[] SEC("license") = "GPL"; int my_pid; -bool reject_capable; -bool reject_cmd; +int reject_capable; +int reject_cmd; SEC("lsm/bpf_token_capable") int BPF_PROG(token_capable, struct bpf_token *token, int cap) -- cgit v1.2.3 From 2b23b6c0f03c94dbacff2abdfd45490eca9d7f6e Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Fri, 19 Jul 2024 19:00:57 +0800 Subject: selftests/bpf: Add return value checks for failed tests The return ranges of some bpf lsm test progs can not be deduced by the verifier accurately. To avoid erroneous rejections, add explicit return value checks for these progs. Signed-off-by: Xu Kuohai Link: https://lore.kernel.org/r/20240719110059.797546-8-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/progs/err.h | 10 ++++++++++ tools/testing/selftests/bpf/progs/test_sig_in_xattr.c | 4 ++++ tools/testing/selftests/bpf/progs/test_verify_pkcs7_sig.c | 8 ++++++-- tools/testing/selftests/bpf/progs/verifier_global_subprogs.c | 7 ++++++- 4 files changed, 26 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/err.h b/tools/testing/selftests/bpf/progs/err.h index d66d283d9e59..38529779a236 100644 --- a/tools/testing/selftests/bpf/progs/err.h +++ b/tools/testing/selftests/bpf/progs/err.h @@ -5,6 +5,16 @@ #define MAX_ERRNO 4095 #define IS_ERR_VALUE(x) (unsigned long)(void *)(x) >= (unsigned long)-MAX_ERRNO +#define __STR(x) #x + +#define set_if_not_errno_or_zero(x, y) \ +({ \ + asm volatile ("if %0 s< -4095 goto +1\n" \ + "if %0 s<= 0 goto +1\n" \ + "%0 = " __STR(y) "\n" \ + : "+r"(x)); \ +}) + static inline int IS_ERR_OR_NULL(const void *ptr) { return !ptr || IS_ERR_VALUE((unsigned long)ptr); diff --git a/tools/testing/selftests/bpf/progs/test_sig_in_xattr.c b/tools/testing/selftests/bpf/progs/test_sig_in_xattr.c index 2f0eb1334d65..8ef6b39335b6 100644 --- a/tools/testing/selftests/bpf/progs/test_sig_in_xattr.c +++ b/tools/testing/selftests/bpf/progs/test_sig_in_xattr.c @@ -6,6 +6,7 @@ #include #include #include "bpf_kfuncs.h" +#include "err.h" char _license[] SEC("license") = "GPL"; @@ -79,5 +80,8 @@ int BPF_PROG(test_file_open, struct file *f) ret = bpf_verify_pkcs7_signature(&digest_ptr, &sig_ptr, trusted_keyring); bpf_key_put(trusted_keyring); + + set_if_not_errno_or_zero(ret, -EFAULT); + return ret; } diff --git a/tools/testing/selftests/bpf/progs/test_verify_pkcs7_sig.c b/tools/testing/selftests/bpf/progs/test_verify_pkcs7_sig.c index f42e9f3831a1..12034a73ee2d 100644 --- a/tools/testing/selftests/bpf/progs/test_verify_pkcs7_sig.c +++ b/tools/testing/selftests/bpf/progs/test_verify_pkcs7_sig.c @@ -11,6 +11,7 @@ #include #include #include "bpf_kfuncs.h" +#include "err.h" #define MAX_DATA_SIZE (1024 * 1024) #define MAX_SIG_SIZE 1024 @@ -55,12 +56,12 @@ int BPF_PROG(bpf, int cmd, union bpf_attr *attr, unsigned int size) ret = bpf_probe_read_kernel(&value, sizeof(value), &attr->value); if (ret) - return 
ret; + goto out; ret = bpf_copy_from_user(data_val, sizeof(struct data), (void *)(unsigned long)value); if (ret) - return ret; + goto out; if (data_val->data_len > sizeof(data_val->data)) return -EINVAL; @@ -84,5 +85,8 @@ int BPF_PROG(bpf, int cmd, union bpf_attr *attr, unsigned int size) bpf_key_put(trusted_keyring); +out: + set_if_not_errno_or_zero(ret, -EFAULT); + return ret; } diff --git a/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c b/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c index a9fc30ed4d73..20904cd2baa2 100644 --- a/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c +++ b/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c @@ -7,6 +7,7 @@ #include "bpf_misc.h" #include "xdp_metadata.h" #include "bpf_kfuncs.h" +#include "err.h" /* The compiler may be able to detect the access to uninitialized memory in the routines performing out of bound memory accesses and @@ -331,7 +332,11 @@ SEC("?lsm/bpf") __success __log_level(2) int BPF_PROG(arg_tag_ctx_lsm) { - return tracing_subprog_void(ctx) + tracing_subprog_u64(ctx); + int ret; + + ret = tracing_subprog_void(ctx) + tracing_subprog_u64(ctx); + set_if_not_errno_or_zero(ret, -1); + return ret; } SEC("?struct_ops/test_1") -- cgit v1.2.3 From d463dd9c9aa24b17ccb8ed76bdd7768baf857b48 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Fri, 19 Jul 2024 19:00:58 +0800 Subject: selftests/bpf: Add test for lsm tail call Add test for lsm tail call to ensure tail call can only be used between bpf lsm progs attached to the same hook. Signed-off-by: Xu Kuohai Link: https://lore.kernel.org/r/20240719110059.797546-9-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/prog_tests/test_lsm.c | 46 ++++++++++++++++++++++- tools/testing/selftests/bpf/progs/lsm_tailcall.c | 34 +++++++++++++++++ 2 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/progs/lsm_tailcall.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/test_lsm.c b/tools/testing/selftests/bpf/prog_tests/test_lsm.c index 16175d579bc7..2a27f3714f5c 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_lsm.c +++ b/tools/testing/selftests/bpf/prog_tests/test_lsm.c @@ -12,6 +12,7 @@ #include #include "lsm.skel.h" +#include "lsm_tailcall.skel.h" char *CMD_ARGS[] = {"true", NULL}; @@ -95,7 +96,7 @@ static int test_lsm(struct lsm *skel) return 0; } -void test_test_lsm(void) +static void test_lsm_basic(void) { struct lsm *skel = NULL; int err; @@ -114,3 +115,46 @@ void test_test_lsm(void) close_prog: lsm__destroy(skel); } + +static void test_lsm_tailcall(void) +{ + struct lsm_tailcall *skel = NULL; + int map_fd, prog_fd; + int err, key; + + skel = lsm_tailcall__open_and_load(); + if (!ASSERT_OK_PTR(skel, "lsm_tailcall__skel_load")) + goto close_prog; + + map_fd = bpf_map__fd(skel->maps.jmp_table); + if (CHECK_FAIL(map_fd < 0)) + goto close_prog; + + prog_fd = bpf_program__fd(skel->progs.lsm_file_permission_prog); + if (CHECK_FAIL(prog_fd < 0)) + goto close_prog; + + key = 0; + err = bpf_map_update_elem(map_fd, &key, &prog_fd, BPF_ANY); + if (CHECK_FAIL(!err)) + goto close_prog; + + prog_fd = bpf_program__fd(skel->progs.lsm_file_alloc_security_prog); + if (CHECK_FAIL(prog_fd < 0)) + goto close_prog; + + err = bpf_map_update_elem(map_fd, &key, &prog_fd, BPF_ANY); + if (CHECK_FAIL(err)) + goto close_prog; + +close_prog: + lsm_tailcall__destroy(skel); +} + +void test_test_lsm(void) +{ + if 
(test__start_subtest("lsm_basic")) + test_lsm_basic(); + if (test__start_subtest("lsm_tailcall")) + test_lsm_tailcall(); +} diff --git a/tools/testing/selftests/bpf/progs/lsm_tailcall.c b/tools/testing/selftests/bpf/progs/lsm_tailcall.c new file mode 100644 index 000000000000..49c075ce2d4c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/lsm_tailcall.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Huawei Technologies Co., Ltd */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} jmp_table SEC(".maps"); + +SEC("lsm/file_permission") +int lsm_file_permission_prog(void *ctx) +{ + return 0; +} + +SEC("lsm/file_alloc_security") +int lsm_file_alloc_security_prog(void *ctx) +{ + return 0; +} + +SEC("lsm/file_alloc_security") +int lsm_file_alloc_security_entry(void *ctx) +{ + bpf_tail_call_static(ctx, &jmp_table, 0); + return 0; +} -- cgit v1.2.3 From 04d8243b1f83251bd599fdd3ea91e89039b6a557 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Fri, 19 Jul 2024 19:00:59 +0800 Subject: selftests/bpf: Add verifier tests for bpf lsm Add verifier tests to check bpf lsm return values and disabled hooks. Signed-off-by: Xu Kuohai Link: https://lore.kernel.org/r/20240719110059.797546-10-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/prog_tests/verifier.c | 2 + tools/testing/selftests/bpf/progs/verifier_lsm.c | 162 ++++++++++++++++++++++ 2 files changed, 164 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_lsm.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 9dc3687bc406..ff1c7da1d06e 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -88,6 +88,7 @@ #include "verifier_xdp.skel.h" #include "verifier_xdp_direct_packet_access.skel.h" #include "verifier_bits_iter.skel.h" +#include "verifier_lsm.skel.h" #define MAX_ENTRIES 11 @@ -206,6 +207,7 @@ void test_verifier_xadd(void) { RUN(verifier_xadd); } void test_verifier_xdp(void) { RUN(verifier_xdp); } void test_verifier_xdp_direct_packet_access(void) { RUN(verifier_xdp_direct_packet_access); } void test_verifier_bits_iter(void) { RUN(verifier_bits_iter); } +void test_verifier_lsm(void) { RUN(verifier_lsm); } static int init_test_val_map(struct bpf_object *obj, char *map_name) { diff --git a/tools/testing/selftests/bpf/progs/verifier_lsm.c b/tools/testing/selftests/bpf/progs/verifier_lsm.c new file mode 100644 index 000000000000..32e5e779cb96 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_lsm.c @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include "bpf_misc.h" + +SEC("lsm/file_alloc_security") +__description("lsm bpf prog with -4095~0 retval. test 1") +__success +__naked int errno_zero_retval_test1(void *ctx) +{ + asm volatile ( + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/file_alloc_security") +__description("lsm bpf prog with -4095~0 retval. test 2") +__success +__naked int errno_zero_retval_test2(void *ctx) +{ + asm volatile ( + "r0 = -4095;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/file_mprotect") +__description("lsm bpf prog with -4095~0 retval.
test 4") +__failure __msg("R0 has smin=-4096 smax=-4096 should have been in [-4095, 0]") +__naked int errno_zero_retval_test4(void *ctx) +{ + asm volatile ( + "r0 = -4096;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/file_mprotect") +__description("lsm bpf prog with -4095~0 retval. test 5") +__failure __msg("R0 has smin=4096 smax=4096 should have been in [-4095, 0]") +__naked int errno_zero_retval_test5(void *ctx) +{ + asm volatile ( + "r0 = 4096;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/file_mprotect") +__description("lsm bpf prog with -4095~0 retval. test 6") +__failure __msg("R0 has smin=1 smax=1 should have been in [-4095, 0]") +__naked int errno_zero_retval_test6(void *ctx) +{ + asm volatile ( + "r0 = 1;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/audit_rule_known") +__description("lsm bpf prog with bool retval. test 1") +__success +__naked int bool_retval_test1(void *ctx) +{ + asm volatile ( + "r0 = 1;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/audit_rule_known") +__description("lsm bpf prog with bool retval. test 2") +__success +__success +__naked int bool_retval_test2(void *ctx) +{ + asm volatile ( + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/audit_rule_known") +__description("lsm bpf prog with bool retval. test 3") +__failure __msg("R0 has smin=-1 smax=-1 should have been in [0, 1]") +__naked int bool_retval_test3(void *ctx) +{ + asm volatile ( + "r0 = -1;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/audit_rule_known") +__description("lsm bpf prog with bool retval. test 4") +__failure __msg("R0 has smin=2 smax=2 should have been in [0, 1]") +__naked int bool_retval_test4(void *ctx) +{ + asm volatile ( + "r0 = 2;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/file_free_security") +__success +__description("lsm bpf prog with void retval. test 1") +__naked int void_retval_test1(void *ctx) +{ + asm volatile ( + "r0 = -4096;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/file_free_security") +__success +__description("lsm bpf prog with void retval. test 2") +__naked int void_retval_test2(void *ctx) +{ + asm volatile ( + "r0 = 4096;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/getprocattr") +__description("lsm disabled hook: getprocattr") +__failure __msg("points to disabled hook") +__naked int disabled_hook_test1(void *ctx) +{ + asm volatile ( + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/setprocattr") +__description("lsm disabled hook: setprocattr") +__failure __msg("points to disabled hook") +__naked int disabled_hook_test2(void *ctx) +{ + asm volatile ( + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC("lsm/ismaclabel") +__description("lsm disabled hook: ismaclabel") +__failure __msg("points to disabled hook") +__naked int disabled_hook_test3(void *ctx) +{ + asm volatile ( + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From aa8ebb270c66cea1f56a25d0f938036e91ad085a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 22 Jul 2024 19:08:15 -0700 Subject: selftests/bpf: Workaround strict bpf_lsm return value check. 
test_progs-no_alu32 -t libbpf_get_fd_by_id_opts is being rejected by the verifier with the following error due to compiler optimization: 6: (67) r0 <<= 62 ; R0_w=scalar(smax=0x4000000000000000,umax=0xc000000000000000,smin32=0,smax32=umax32=0,var_off=(0x0; 0xc000000000000000)) 7: (c7) r0 s>>= 63 ; R0_w=scalar(smin=smin32=-1,smax=smax32=0) ; @ test_libbpf_get_fd_by_id_opts.c:0 8: (57) r0 &= -13 ; R0_w=scalar(smax=0x7ffffffffffffff3,umax=0xfffffffffffffff3,smax32=0x7ffffff3,umax32=0xfffffff3,var_off=(0x0; 0xfffffffffffffff3)) ; int BPF_PROG(check_access, struct bpf_map *map, fmode_t fmode) @ test_libbpf_get_fd_by_id_opts.c:27 9: (95) exit At program exit the register R0 has smax=9223372036854775795 should have been in [-4095, 0] Workaround by adding barrier(). Eventually the verifier will be able to recognize it. Fixes: 5d99e198be27 ("bpf, lsm: Add check for BPF LSM return value") Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/progs/test_libbpf_get_fd_by_id_opts.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/test_libbpf_get_fd_by_id_opts.c b/tools/testing/selftests/bpf/progs/test_libbpf_get_fd_by_id_opts.c index f5ac5f3e8919..568816307f71 100644 --- a/tools/testing/selftests/bpf/progs/test_libbpf_get_fd_by_id_opts.c +++ b/tools/testing/selftests/bpf/progs/test_libbpf_get_fd_by_id_opts.c @@ -31,6 +31,7 @@ int BPF_PROG(check_access, struct bpf_map *map, fmode_t fmode) if (fmode & FMODE_WRITE) return -EACCES; + barrier(); return 0; } -- cgit v1.2.3 From af994e31b75e30941ef32796d0aa8510db00b32a Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sat, 20 Jul 2024 19:40:03 +0800 Subject: selftests/bpf: Drop make_client in sk_lookup This patch uses the new helper connect_to_addr_str() in sk_lookup.c to create the client socket and connect to the server, instead of using local defined function make_client(). This local function can be dropped then. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/058199d7ab46802249dae066ca22c98f6be508ee.1721475357.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/prog_tests/sk_lookup.c | 34 +++++----------------- 1 file changed, 7 insertions(+), 27 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c index ae87c00867ba..9510a3f83787 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c @@ -229,27 +229,6 @@ fail: return -1; } -static int make_client(int sotype, const char *ip, int port) -{ - struct sockaddr_storage addr = {0}; - int err, fd; - - fd = make_socket(sotype, ip, port, &addr); - if (fd < 0) - return -1; - - err = connect(fd, (void *)&addr, inetaddr_len(&addr)); - if (CHECK(err, "make_client", "connect")) { - log_err("failed to connect client socket"); - goto fail; - } - - return fd; -fail: - close(fd); - return -1; -} - static __u64 socket_cookie(int fd) { __u64 cookie; @@ -646,8 +625,9 @@ static void run_lookup_prog(const struct test *t) goto close; } - client_fd = make_client(t->sotype, t->connect_to.ip, t->connect_to.port); - if (client_fd < 0) + client_fd = connect_to_addr_str(is_ipv6(t->connect_to.ip) ? 
AF_INET6 : AF_INET, + t->sotype, t->connect_to.ip, t->connect_to.port, NULL); + if (!ASSERT_OK_FD(client_fd, "connect_to_addr_str")) goto close; if (t->sotype == SOCK_STREAM) @@ -1152,8 +1132,8 @@ static void run_sk_assign_connected(struct test_sk_lookup *skel, if (server_fd < 0) return; - connected_fd = make_client(sotype, EXT_IP4, EXT_PORT); - if (connected_fd < 0) + connected_fd = connect_to_addr_str(AF_INET, sotype, EXT_IP4, EXT_PORT, NULL); + if (!ASSERT_OK_FD(connected_fd, "connect_to_addr_str")) goto out_close_server; /* Put a connected socket in redirect map */ @@ -1166,8 +1146,8 @@ static void run_sk_assign_connected(struct test_sk_lookup *skel, goto out_close_connected; /* Try to redirect TCP SYN / UDP packet to a connected socket */ - client_fd = make_client(sotype, EXT_IP4, EXT_PORT); - if (client_fd < 0) + client_fd = connect_to_addr_str(AF_INET, sotype, EXT_IP4, EXT_PORT, NULL); + if (!ASSERT_OK_FD(client_fd, "connect_to_addr_str")) goto out_unlink_prog; if (sotype == SOCK_DGRAM) { send_byte(client_fd); -- cgit v1.2.3 From 01c2f776ed372155f9e80bd787324f26601914de Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sat, 20 Jul 2024 19:40:04 +0800 Subject: selftests/bpf: Drop make_socket in sk_lookup This patch uses the public network helers client_socket() + make_sockaddr() in sk_lookup.c to create the client socket, set the timeout sockopts, and make the connecting address. The local defined function make_socket() can be dropped then. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/588771977ac48c27f73526d8421a84b91d7cf218.1721475357.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/prog_tests/sk_lookup.c | 61 ++++++---------------- 1 file changed, 17 insertions(+), 44 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c index 9510a3f83787..88f2f9efeea5 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c @@ -47,8 +47,6 @@ #define INT_IP6 "fd00::2" #define INT_PORT 8008 -#define IO_TIMEOUT_SEC 3 - enum server { SERVER_A = 0, SERVER_B = 1, @@ -114,40 +112,6 @@ static socklen_t inetaddr_len(const struct sockaddr_storage *addr) addr->ss_family == AF_INET6 ? sizeof(struct sockaddr_in6) : 0); } -static int make_socket(int sotype, const char *ip, int port, - struct sockaddr_storage *addr) -{ - struct timeval timeo = { .tv_sec = IO_TIMEOUT_SEC }; - int err, family, fd; - - family = is_ipv6(ip) ? AF_INET6 : AF_INET; - err = make_sockaddr(family, ip, port, addr, NULL); - if (CHECK(err, "make_address", "failed\n")) - return -1; - - fd = socket(addr->ss_family, sotype, 0); - if (CHECK(fd < 0, "socket", "failed\n")) { - log_err("failed to make socket"); - return -1; - } - - err = setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo)); - if (CHECK(err, "setsockopt(SO_SNDTIMEO)", "failed\n")) { - log_err("failed to set SNDTIMEO"); - close(fd); - return -1; - } - - err = setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo)); - if (CHECK(err, "setsockopt(SO_RCVTIMEO)", "failed\n")) { - log_err("failed to set RCVTIMEO"); - close(fd); - return -1; - } - - return fd; -} - static int setsockopts(int fd, void *opts) { struct cb_opts *co = (struct cb_opts *)opts; @@ -842,6 +806,7 @@ static void test_redirect_lookup(struct test_sk_lookup *skel) static void drop_on_lookup(const struct test *t) { + int family = is_ipv6(t->connect_to.ip) ? 
AF_INET6 : AF_INET; struct sockaddr_storage dst = {}; int client_fd, server_fd, err; struct bpf_link *lookup_link; @@ -856,11 +821,13 @@ static void drop_on_lookup(const struct test *t) if (server_fd < 0) goto detach; - client_fd = make_socket(t->sotype, t->connect_to.ip, - t->connect_to.port, &dst); - if (client_fd < 0) + client_fd = client_socket(family, t->sotype, NULL); + if (!ASSERT_OK_FD(client_fd, "client_socket")) goto close_srv; + err = make_sockaddr(family, t->connect_to.ip, t->connect_to.port, &dst, NULL); + if (!ASSERT_OK(err, "make_sockaddr")) + goto close_all; err = connect(client_fd, (void *)&dst, inetaddr_len(&dst)); if (t->sotype == SOCK_DGRAM) { err = send_byte(client_fd); @@ -956,6 +923,7 @@ static void test_drop_on_lookup(struct test_sk_lookup *skel) static void drop_on_reuseport(const struct test *t) { + int family = is_ipv6(t->connect_to.ip) ? AF_INET6 : AF_INET; struct sockaddr_storage dst = { 0 }; int client, server1, server2, err; struct bpf_link *lookup_link; @@ -980,11 +948,13 @@ static void drop_on_reuseport(const struct test *t) if (server2 < 0) goto close_srv1; - client = make_socket(t->sotype, t->connect_to.ip, - t->connect_to.port, &dst); - if (client < 0) + client = client_socket(family, t->sotype, NULL); + if (!ASSERT_OK_FD(client, "client_socket")) goto close_srv2; + err = make_sockaddr(family, t->connect_to.ip, t->connect_to.port, &dst, NULL); + if (!ASSERT_OK(err, "make_sockaddr")) + goto close_all; err = connect(client, (void *)&dst, inetaddr_len(&dst)); if (t->sotype == SOCK_DGRAM) { err = send_byte(client); @@ -1228,10 +1198,13 @@ static void run_multi_prog_lookup(const struct test_multi_prog *t) if (err) goto out_close_server; - client_fd = make_socket(SOCK_STREAM, EXT_IP4, EXT_PORT, &dst); - if (client_fd < 0) + client_fd = client_socket(AF_INET, SOCK_STREAM, NULL); + if (!ASSERT_OK_FD(client_fd, "client_socket")) goto out_close_server; + err = make_sockaddr(AF_INET, EXT_IP4, EXT_PORT, &dst, NULL); + if (!ASSERT_OK(err, "make_sockaddr")) + goto out_close_client; err = connect(client_fd, (void *)&dst, inetaddr_len(&dst)); if (CHECK(err && !t->expect_errno, "connect", "unexpected error %d\n", errno)) -- cgit v1.2.3 From c3c41e016cca7cc6b6d9cd1322fdf9be845ccd06 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sat, 20 Jul 2024 19:40:05 +0800 Subject: selftests/bpf: Drop inetaddr_len in sk_lookup No need to use a dedicated helper inetaddr_len() to get the length of the IPv4 or IPv6 address, it can be got by make_sockaddr(), this patch drops it. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/32e2a4122921051da38a6e4fbb2ebee5f0af5a4e.1721475357.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/prog_tests/sk_lookup.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c index 88f2f9efeea5..96f3863da8bc 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c @@ -106,12 +106,6 @@ static int attach_reuseport(int sock_fd, struct bpf_program *reuseport_prog) return 0; } -static socklen_t inetaddr_len(const struct sockaddr_storage *addr) -{ - return (addr->ss_family == AF_INET ? sizeof(struct sockaddr_in) : - addr->ss_family == AF_INET6 ? 
sizeof(struct sockaddr_in6) : 0); -} - static int setsockopts(int fd, void *opts) { struct cb_opts *co = (struct cb_opts *)opts; @@ -810,6 +804,7 @@ static void drop_on_lookup(const struct test *t) struct sockaddr_storage dst = {}; int client_fd, server_fd, err; struct bpf_link *lookup_link; + socklen_t len; ssize_t n; lookup_link = attach_lookup_prog(t->lookup_prog); @@ -825,10 +820,10 @@ static void drop_on_lookup(const struct test *t) if (!ASSERT_OK_FD(client_fd, "client_socket")) goto close_srv; - err = make_sockaddr(family, t->connect_to.ip, t->connect_to.port, &dst, NULL); + err = make_sockaddr(family, t->connect_to.ip, t->connect_to.port, &dst, &len); if (!ASSERT_OK(err, "make_sockaddr")) goto close_all; - err = connect(client_fd, (void *)&dst, inetaddr_len(&dst)); + err = connect(client_fd, (void *)&dst, len); if (t->sotype == SOCK_DGRAM) { err = send_byte(client_fd); if (err) @@ -927,6 +922,7 @@ static void drop_on_reuseport(const struct test *t) struct sockaddr_storage dst = { 0 }; int client, server1, server2, err; struct bpf_link *lookup_link; + socklen_t len; ssize_t n; lookup_link = attach_lookup_prog(t->lookup_prog); @@ -952,10 +948,10 @@ static void drop_on_reuseport(const struct test *t) if (!ASSERT_OK_FD(client, "client_socket")) goto close_srv2; - err = make_sockaddr(family, t->connect_to.ip, t->connect_to.port, &dst, NULL); + err = make_sockaddr(family, t->connect_to.ip, t->connect_to.port, &dst, &len); if (!ASSERT_OK(err, "make_sockaddr")) goto close_all; - err = connect(client, (void *)&dst, inetaddr_len(&dst)); + err = connect(client, (void *)&dst, len); if (t->sotype == SOCK_DGRAM) { err = send_byte(client); if (err) @@ -1169,6 +1165,7 @@ static void run_multi_prog_lookup(const struct test_multi_prog *t) int map_fd, server_fd, client_fd; struct bpf_link *link1, *link2; int prog_idx, done, err; + socklen_t len; map_fd = bpf_map__fd(t->run_map); @@ -1202,10 +1199,10 @@ static void run_multi_prog_lookup(const struct test_multi_prog *t) if (!ASSERT_OK_FD(client_fd, "client_socket")) goto out_close_server; - err = make_sockaddr(AF_INET, EXT_IP4, EXT_PORT, &dst, NULL); + err = make_sockaddr(AF_INET, EXT_IP4, EXT_PORT, &dst, &len); if (!ASSERT_OK(err, "make_sockaddr")) goto out_close_client; - err = connect(client_fd, (void *)&dst, inetaddr_len(&dst)); + err = connect(client_fd, (void *)&dst, len); if (CHECK(err && !t->expect_errno, "connect", "unexpected error %d\n", errno)) goto out_close_client; -- cgit v1.2.3 From 71a2fbaf9c91b0d863e11ec03f7168893904fbab Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sat, 20 Jul 2024 19:40:06 +0800 Subject: selftests/bpf: Drop __start_server in network_helpers The helper start_server_addr() is a wrapper of __start_server(), the only difference between them is __start_server() accepts a sockaddr type address parameter, but start_server_addr() accepts a sockaddr_storage one. This patch drops __start_server(), and updates the callers to invoke start_server_addr() instead. 
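A minimal usage sketch of the consolidated helper pair described above (the address and port are illustrative, not from the patch):

static int tcp_server_on(const char *ip, __u16 port)
{
	struct sockaddr_storage addr;
	socklen_t addrlen;

	if (make_sockaddr(AF_INET, ip, port, &addr, &addrlen))
		return -1;
	/* Passing NULL opts falls back to default_opts inside the helper. */
	return start_server_addr(SOCK_STREAM, &addr, addrlen, NULL);
}

Callers that previously reached for __start_server() follow the same two-step pattern: build the sockaddr_storage once, then hand it to start_server_addr().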
Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/31399df7cb957b7c233e79963b0aa0dc4278d273.1721475357.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/network_helpers.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 9c98a60cf1e2..a3f0a49fb26f 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -80,12 +80,15 @@ int settimeo(int fd, int timeout_ms) #define save_errno_close(fd) ({ int __save = errno; close(fd); errno = __save; }) -static int __start_server(int type, const struct sockaddr *addr, socklen_t addrlen, - const struct network_helper_opts *opts) +int start_server_addr(int type, const struct sockaddr_storage *addr, socklen_t addrlen, + const struct network_helper_opts *opts) { int fd; - fd = socket(addr->sa_family, type, opts->proto); + if (!opts) + opts = &default_opts; + + fd = socket(addr->ss_family, type, opts->proto); if (fd < 0) { log_err("Failed to create server socket"); return -1; @@ -100,7 +103,7 @@ static int __start_server(int type, const struct sockaddr *addr, socklen_t addrl goto error_close; } - if (bind(fd, addr, addrlen) < 0) { + if (bind(fd, (struct sockaddr *)addr, addrlen) < 0) { log_err("Failed to bind socket"); goto error_close; } @@ -131,7 +134,7 @@ int start_server_str(int family, int type, const char *addr_str, __u16 port, if (make_sockaddr(family, addr_str, port, &addr, &addrlen)) return -1; - return __start_server(type, (struct sockaddr *)&addr, addrlen, opts); + return start_server_addr(type, &addr, addrlen, opts); } int start_server(int family, int type, const char *addr_str, __u16 port, @@ -173,7 +176,7 @@ int *start_reuseport_server(int family, int type, const char *addr_str, if (!fds) return NULL; - fds[0] = __start_server(type, (struct sockaddr *)&addr, addrlen, &opts); + fds[0] = start_server_addr(type, &addr, addrlen, &opts); if (fds[0] == -1) goto close_fds; nr_fds = 1; @@ -182,7 +185,7 @@ int *start_reuseport_server(int family, int type, const char *addr_str, goto close_fds; for (; nr_fds < nr_listens; nr_fds++) { - fds[nr_fds] = __start_server(type, (struct sockaddr *)&addr, addrlen, &opts); + fds[nr_fds] = start_server_addr(type, &addr, addrlen, &opts); if (fds[nr_fds] == -1) goto close_fds; } @@ -194,15 +197,6 @@ close_fds: return NULL; } -int start_server_addr(int type, const struct sockaddr_storage *addr, socklen_t len, - const struct network_helper_opts *opts) -{ - if (!opts) - opts = &default_opts; - - return __start_server(type, (struct sockaddr *)addr, len, opts); -} - void free_fds(int *fds, unsigned int nr_close_fds) { if (fds) { -- cgit v1.2.3 From c7db4873fbd63918c124a5d67182492c78ba705b Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 22 Jul 2024 22:14:55 -0700 Subject: selftests/bpf: Add a test for mmap-able map in map Regular BPF hash map is not mmap-able from user space. However, map-in-map with outer map of type BPF_MAP_TYPE_HASH_OF_MAPS and mmap-able array as inner map can perform similar operations as a mmap-able hash map. This can be used by applications that benefit from fast accesses to some local data. Add a selftest to show this use case. 
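The core interaction, as a hedged sketch ahead of the full selftest below (the function and fd names are illustrative; assumes sys/mman.h and bpf/bpf.h): userspace mmaps the mmap-able inner array once, then publishes its fd in the outer hash-of-maps so the BPF side can reach the same memory.

static __u64 *share_inner(int outer_fd, int inner_fd, __u32 key, long page_size)
{
	/* inner_fd must be a BPF_MAP_TYPE_ARRAY created with BPF_F_MMAPABLE */
	__u64 *val = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
			  MAP_SHARED, inner_fd, 0);

	if (val == MAP_FAILED)
		return NULL;
	/* BPF-side bpf_map_lookup_elem(&outer_map, &key) now returns this
	 * array; values it writes show up in *val with no syscall on the
	 * read path. */
	if (bpf_map_update_elem(outer_fd, &key, &inner_fd, BPF_ANY))
		return NULL;
	return val;
}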
Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240723051455.1589192-1-song@kernel.org Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko --- .../bpf/prog_tests/test_mmap_inner_array.c | 57 ++++++++++++++++++++++ .../testing/selftests/bpf/progs/mmap_inner_array.c | 57 ++++++++++++++++++++++ 2 files changed, 114 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/test_mmap_inner_array.c create mode 100644 tools/testing/selftests/bpf/progs/mmap_inner_array.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/test_mmap_inner_array.c b/tools/testing/selftests/bpf/prog_tests/test_mmap_inner_array.c new file mode 100644 index 000000000000..ce745776ed18 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_mmap_inner_array.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#include <test_progs.h> +#include <sys/mman.h> +#include "mmap_inner_array.skel.h" + +void test_mmap_inner_array(void) +{ + const long page_size = sysconf(_SC_PAGE_SIZE); + struct mmap_inner_array *skel; + int inner_array_fd, err; + void *tmp; + __u64 *val; + + skel = mmap_inner_array__open_and_load(); + + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + inner_array_fd = bpf_map__fd(skel->maps.inner_array); + tmp = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, inner_array_fd, 0); + if (!ASSERT_OK_PTR(tmp, "inner array mmap")) + goto out; + val = (void *)tmp; + + err = mmap_inner_array__attach(skel); + if (!ASSERT_OK(err, "attach")) + goto out_unmap; + + skel->bss->pid = getpid(); + usleep(1); + + /* pid is set, pid_match == true and outer_map_match == false */ + ASSERT_TRUE(skel->bss->pid_match, "pid match 1"); + ASSERT_FALSE(skel->bss->outer_map_match, "outer map match 1"); + ASSERT_FALSE(skel->bss->done, "done 1"); + ASSERT_EQ(*val, 0, "value match 1"); + + err = bpf_map__update_elem(skel->maps.outer_map, + &skel->bss->pid, sizeof(skel->bss->pid), + &inner_array_fd, sizeof(inner_array_fd), + BPF_ANY); + if (!ASSERT_OK(err, "update elem")) + goto out_unmap; + usleep(1); + + /* outer map key is set, outer_map_match == true */ + ASSERT_TRUE(skel->bss->pid_match, "pid match 2"); + ASSERT_TRUE(skel->bss->outer_map_match, "outer map match 2"); + ASSERT_TRUE(skel->bss->done, "done 2"); + ASSERT_EQ(*val, skel->data->match_value, "value match 2"); + +out_unmap: + munmap(tmp, page_size); +out: + mmap_inner_array__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/mmap_inner_array.c b/tools/testing/selftests/bpf/progs/mmap_inner_array.c new file mode 100644 index 000000000000..90aacbc2938a --- /dev/null +++ b/tools/testing/selftests/bpf/progs/mmap_inner_array.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
*/ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> + +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +struct inner_array_type { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(map_flags, BPF_F_MMAPABLE); + __type(key, __u32); + __type(value, __u64); + __uint(max_entries, 1); +} inner_array SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS); + __uint(key_size, 4); + __uint(value_size, 4); + __uint(max_entries, 1); + __array(values, struct inner_array_type); +} outer_map SEC(".maps"); + +int pid = 0; +__u64 match_value = 0x13572468; +bool done = false; +bool pid_match = false; +bool outer_map_match = false; + +SEC("fentry/" SYS_PREFIX "sys_nanosleep") +int add_to_list_in_inner_array(void *ctx) +{ + __u32 curr_pid, zero = 0; + struct bpf_map *map; + __u64 *value; + + curr_pid = (u32)bpf_get_current_pid_tgid(); + if (done || curr_pid != pid) + return 0; + + pid_match = true; + map = bpf_map_lookup_elem(&outer_map, &curr_pid); + if (!map) + return 0; + + outer_map_match = true; + value = bpf_map_lookup_elem(map, &zero); + if (!value) + return 0; + + *value = match_value; + done = true; + return 0; +} -- cgit v1.2.3 From a0ef659d03d2fc66c270cee791a1c5ebc0ff3bac Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 23 Jul 2024 03:07:00 +0000 Subject: selftests/bpf: Don't include .d files on make clean Ignore generated %.test.o dependencies when the make goal is clean or docs-clean. Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/all/oNTIdax7aWGJdEgabzTqHzF4r-WTERrV1e1cNaPQMp-UhYUQpozXqkbuAlLBulczr6I99-jM5x3dxv56JJowaYBkm765R9Aa9kyrVuCl_kA=@pm.me Link: https://lore.kernel.org/bpf/K69Y8OKMLXBWR0dtOfsC4J46-HxeQfvqoFx1CysCm7u19HRx4MB6yAKOFkM6X-KAx2EFuCcCh_9vYWpsgQXnAer8oQ8PMeDEuiRMYECuGH4=@pm.me --- tools/testing/selftests/bpf/Makefile | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 05b234248b38..74f829952280 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -608,7 +608,9 @@ $(TRUNNER_TEST_OBJS:.o=.d): $(TRUNNER_OUTPUT)/%.test.d: \ $(TRUNNER_BPF_SKELS_LINKED) \ $$(BPFOBJ) | $(TRUNNER_OUTPUT) +ifeq ($(filter clean docs-clean,$(MAKECMDGOALS)),) include $(wildcard $(TRUNNER_TEST_OBJS:.o=.d)) +endif $(TRUNNER_EXTRA_OBJS): $(TRUNNER_OUTPUT)/%.o: \ %.c \ -- cgit v1.2.3 From c7ad90736763b4e6bed10788ec07020a4f15dc32 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 17:13:28 -0700 Subject: selftests/bpf: Add missing system defines for mips Update get_sys_includes in Makefile with missing MIPS-related definitions to fix many, many compilation errors building selftests/bpf.
The following added defines drive conditional logic in system headers for word-size and endianness selection: MIPSEL, MIPSEB, _MIPS_SZPTR, _MIPS_SZLONG, _MIPS_SIM, _ABIO32, _ABIN32, _ABI64. Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/f3cfceaf5299cdd2ac0e0a36072d6ca7be23e603.1721692479.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 74f829952280..a7c797dd075f 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -396,7 +396,8 @@ define get_sys_includes $(shell $(1) $(2) -v -E - </dev/null 2>&1 \ | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ $(shell $(1) $(2) -dM -E - </dev/null ...) -- cgit v1.2.3 From ... Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 17:13:29 -0700 Subject: selftests/bpf: Fix error linking uprobe_multi on mips Linking uprobe_multi.c on mips64el fails due to relocation overflows when the number of GOT entries required exceeds the default maximum. Add a specific CFLAGS (-mxgot) for uprobe_multi.c on MIPS that allows using a larger GOT and avoids errors such as: /tmp/ccBTNQzv.o: in function `bench': uprobe_multi.c:49:(.text+0x1d7720): relocation truncated to fit: R_MIPS_GOT_DISP against `uprobe_multi_func_08188' uprobe_multi.c:49:(.text+0x1d7730): relocation truncated to fit: R_MIPS_GOT_DISP against `uprobe_multi_func_08189' ... collect2: error: ld returned 1 exit status Fixes: 519dfeaf5119 ("selftests/bpf: Add uprobe_multi test program") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/14eb7b70f8ccef9834874d75eb373cb9292129da.1721692479.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/Makefile | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index a7c797dd075f..8e01e4f9b1a7 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -786,6 +786,8 @@ $(OUTPUT)/veristat: $(OUTPUT)/veristat.o $(call msg,BINARY,,$@) $(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@ +# Linking uprobe_multi can fail due to relocation overflows on mips. +$(OUTPUT)/uprobe_multi: CFLAGS += $(if $(filter mips, $(ARCH)),-mxgot) $(OUTPUT)/uprobe_multi: uprobe_multi.c $(call msg,BINARY,,$@) $(Q)$(CC) $(CFLAGS) -O0 $(LDFLAGS) $^ $(LDLIBS) -o $@ -- cgit v1.2.3 From d17f9b370df628a3faaec7fd8be92a89ecc3e33a Mon Sep 17 00:00:00 2001 From: Artem Savkov Date: Tue, 23 Jul 2024 09:10:31 +0200 Subject: selftests/bpf: Fix compilation failure when CONFIG_NET_FOU!=y Without CONFIG_NET_FOU bpf selftests are unable to build because of missing definitions. Add ___local versions of struct bpf_fou_encap and enum bpf_fou_encap_type to fix the issue.
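The fix leans on libbpf's CO-RE "flavor" convention, sketched here with a hypothetical type and kfunc: everything from a triple underscore onward is stripped from the type name when matching against kernel BTF, and preserve_access_index keeps field accesses relocatable, so the prog compiles even when the real definition is configured out.

/* Local flavor of a kernel struct that vmlinux.h may lack. */
struct kernel_thing___local {
	__be16 sport;
	__be16 dport;
} __attribute__((preserve_access_index));

struct kernel_thing;	/* the real (possibly configured-out) type */

/* some_kfunc is hypothetical, standing in for a kfunc like
 * bpf_skb_set_fou_encap() that takes the kernel type. */
int some_kfunc(struct __sk_buff *skb, struct kernel_thing *t) __ksym;

SEC("tc")
int use_flavor(struct __sk_buff *skb)
{
	struct kernel_thing___local enc = { .dport = bpf_htons(5555) };

	/* CO-RE drops "___local" when resolving the type, so field
	 * offsets are fixed up against the real struct at load time. */
	return some_kfunc(skb, (struct kernel_thing *)&enc);
}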
Signed-off-by: Artem Savkov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240723071031.3389423-1-asavkov@redhat.com --- .../testing/selftests/bpf/progs/test_tunnel_kern.c | 27 +++++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c index 3f5abcf3ff13..32127f1cd687 100644 --- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c @@ -26,6 +26,18 @@ */ #define ASSIGNED_ADDR_VETH1 0xac1001c8 +struct bpf_fou_encap___local { + __be16 sport; + __be16 dport; +} __attribute__((preserve_access_index)); + +enum bpf_fou_encap_type___local { + FOU_BPF_ENCAP_FOU___local, + FOU_BPF_ENCAP_GUE___local, +}; + +struct bpf_fou_encap; + int bpf_skb_set_fou_encap(struct __sk_buff *skb_ctx, struct bpf_fou_encap *encap, int type) __ksym; int bpf_skb_get_fou_encap(struct __sk_buff *skb_ctx, @@ -745,7 +757,7 @@ SEC("tc") int ipip_gue_set_tunnel(struct __sk_buff *skb) { struct bpf_tunnel_key key = {}; - struct bpf_fou_encap encap = {}; + struct bpf_fou_encap___local encap = {}; void *data = (void *)(long)skb->data; struct iphdr *iph = data; void *data_end = (void *)(long)skb->data_end; @@ -769,7 +781,9 @@ int ipip_gue_set_tunnel(struct __sk_buff *skb) encap.sport = 0; encap.dport = bpf_htons(5555); - ret = bpf_skb_set_fou_encap(skb, &encap, FOU_BPF_ENCAP_GUE); + ret = bpf_skb_set_fou_encap(skb, (struct bpf_fou_encap *)&encap, + bpf_core_enum_value(enum bpf_fou_encap_type___local, + FOU_BPF_ENCAP_GUE___local)); if (ret < 0) { log_err(ret); return TC_ACT_SHOT; @@ -782,7 +796,7 @@ SEC("tc") int ipip_fou_set_tunnel(struct __sk_buff *skb) { struct bpf_tunnel_key key = {}; - struct bpf_fou_encap encap = {}; + struct bpf_fou_encap___local encap = {}; void *data = (void *)(long)skb->data; struct iphdr *iph = data; void *data_end = (void *)(long)skb->data_end; @@ -806,7 +820,8 @@ int ipip_fou_set_tunnel(struct __sk_buff *skb) encap.sport = 0; encap.dport = bpf_htons(5555); - ret = bpf_skb_set_fou_encap(skb, &encap, FOU_BPF_ENCAP_FOU); + ret = bpf_skb_set_fou_encap(skb, (struct bpf_fou_encap *)&encap, + FOU_BPF_ENCAP_FOU___local); if (ret < 0) { log_err(ret); return TC_ACT_SHOT; @@ -820,7 +835,7 @@ int ipip_encap_get_tunnel(struct __sk_buff *skb) { int ret; struct bpf_tunnel_key key = {}; - struct bpf_fou_encap encap = {}; + struct bpf_fou_encap___local encap = {}; ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0); if (ret < 0) { @@ -828,7 +843,7 @@ int ipip_encap_get_tunnel(struct __sk_buff *skb) return TC_ACT_SHOT; } - ret = bpf_skb_get_fou_encap(skb, &encap); + ret = bpf_skb_get_fou_encap(skb, (struct bpf_fou_encap *)&encap); if (ret < 0) { log_err(ret); return TC_ACT_SHOT; -- cgit v1.2.3 From 30c2980c1092d5d69b2a3f460491145ebe0629ec Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 22 Jul 2024 22:27:57 +0200 Subject: selftests/bpf: Add uprobe fail tests for uprobe multi Adding tests for checking on recovery after failing to attach uprobe. 
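For background on the ref_ctr failure mode exercised below: a ref_ctr_offset points at a USDT-style semaphore, a counter the kernel increments while a probe referencing it is installed, which is why two uprobes on the same address must agree on it. A hedged sketch of the userspace side of that convention (prepare_expensive_args is hypothetical):

short sema_1;	/* kernel bumps this while an attached uprobe references it */

void prepare_expensive_args(long arg);	/* tracing-only work, assumed */

void traced_func(long arg)
{
	/* Pay the argument-setup cost only when a tracer is attached. */
	if (sema_1)
		prepare_expensive_args(arg);
}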
Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240722202758.3889061-2-jolsa@kernel.org --- .../selftests/bpf/prog_tests/uprobe_multi_test.c | 118 +++++++++++++++++++++ 1 file changed, 118 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c index bf6ca8e3eb13..e6255d4df81d 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c @@ -516,6 +516,122 @@ cleanup: uprobe_multi__destroy(skel); } +#ifdef __x86_64__ +noinline void uprobe_multi_error_func(void) +{ + /* + * If --fcf-protection=branch is enabled the gcc generates endbr as + * first instruction, so marking the exact address of int3 with the + * symbol to be used in the attach_uprobe_fail_trap test below. + */ + asm volatile ( + ".globl uprobe_multi_error_func_int3; \n" + "uprobe_multi_error_func_int3: \n" + "int3 \n" + ); +} + +/* + * Attaching uprobe on uprobe_multi_error_func results in error + * because it already starts with int3 instruction. + */ +static void attach_uprobe_fail_trap(struct uprobe_multi *skel) +{ + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts); + const char *syms[4] = { + "uprobe_multi_func_1", + "uprobe_multi_func_2", + "uprobe_multi_func_3", + "uprobe_multi_error_func_int3", + }; + + opts.syms = syms; + opts.cnt = ARRAY_SIZE(syms); + + skel->links.uprobe = bpf_program__attach_uprobe_multi(skel->progs.uprobe, -1, + "/proc/self/exe", NULL, &opts); + if (!ASSERT_ERR_PTR(skel->links.uprobe, "bpf_program__attach_uprobe_multi")) { + bpf_link__destroy(skel->links.uprobe); + skel->links.uprobe = NULL; + } +} +#else +static void attach_uprobe_fail_trap(struct uprobe_multi *skel) { } +#endif + +short sema_1 __used, sema_2 __used; + +static void attach_uprobe_fail_refctr(struct uprobe_multi *skel) +{ + unsigned long *tmp_offsets = NULL, *tmp_ref_ctr_offsets = NULL; + unsigned long offsets[3], ref_ctr_offsets[3]; + LIBBPF_OPTS(bpf_link_create_opts, opts); + const char *path = "/proc/self/exe"; + const char *syms[3] = { + "uprobe_multi_func_1", + "uprobe_multi_func_2", + }; + const char *sema[3] = { + "sema_1", + "sema_2", + }; + int prog_fd, link_fd, err; + + prog_fd = bpf_program__fd(skel->progs.uprobe_extra); + + err = elf_resolve_syms_offsets("/proc/self/exe", 2, (const char **) &syms, + &tmp_offsets, STT_FUNC); + if (!ASSERT_OK(err, "elf_resolve_syms_offsets_func")) + return; + + err = elf_resolve_syms_offsets("/proc/self/exe", 2, (const char **) &sema, + &tmp_ref_ctr_offsets, STT_OBJECT); + if (!ASSERT_OK(err, "elf_resolve_syms_offsets_sema")) + goto cleanup; + + /* + * We attach to 3 uprobes on 2 functions, so 2 uprobes share single function, + * but with different ref_ctr_offset which is not allowed and results in fail. 
+ */ + offsets[0] = tmp_offsets[0]; /* uprobe_multi_func_1 */ + offsets[1] = tmp_offsets[1]; /* uprobe_multi_func_2 */ + offsets[2] = tmp_offsets[1]; /* uprobe_multi_func_2 */ + + ref_ctr_offsets[0] = tmp_ref_ctr_offsets[0]; /* sema_1 */ + ref_ctr_offsets[1] = tmp_ref_ctr_offsets[1]; /* sema_2 */ + ref_ctr_offsets[2] = tmp_ref_ctr_offsets[0]; /* sema_1, error */ + + opts.uprobe_multi.path = path; + opts.uprobe_multi.offsets = (const unsigned long *) &offsets; + opts.uprobe_multi.ref_ctr_offsets = (const unsigned long *) &ref_ctr_offsets; + opts.uprobe_multi.cnt = 3; + + link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_UPROBE_MULTI, &opts); + if (!ASSERT_ERR(link_fd, "link_fd")) + close(link_fd); + +cleanup: + free(tmp_ref_ctr_offsets); + free(tmp_offsets); +} + +static void test_attach_uprobe_fails(void) +{ + struct uprobe_multi *skel = NULL; + + skel = uprobe_multi__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_multi__open_and_load")) + return; + + /* attach fails due to adding uprobe on trap instruction, x86_64 only */ + attach_uprobe_fail_trap(skel); + + /* attach fail due to wrong ref_ctr_offs on one of the uprobes */ + attach_uprobe_fail_refctr(skel); + + uprobe_multi__destroy(skel); +} + static void __test_link_api(struct child *child) { int prog_fd, link1_fd = -1, link2_fd = -1, link3_fd = -1, link4_fd = -1; @@ -703,4 +819,6 @@ void test_uprobe_multi_test(void) test_bench_attach_usdt(); if (test__start_subtest("attach_api_fails")) test_attach_api_fails(); + if (test__start_subtest("attach_uprobe_fails")) + test_attach_uprobe_fails(); } -- cgit v1.2.3 From 98adc743ae1febb401d635bebe46667af2692751 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 22 Jul 2024 22:27:58 +0200 Subject: selftests/bpf: Add uprobe multi consumers test Adding test that attaches/detaches multiple consumers on single uprobe and verifies all were hit as expected. 
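A worked example of the before/after encoding used by this test, derived from the expectations coded below (the concrete values are illustrative):

/* before = 0b0011 -> attach entry consumers 0 and 1
 * after  = 0b1001 -> inside the probed call: keep 0, detach 1,
 *                    attach return consumer 3
 *
 * expected uprobe_result = { 1, 1, 0, 0 }
 *  - progs 0,1: entry uprobes fired before the detach ran
 *  - prog 3:    before & 0b1100 == 0, so no uretprobe was armed when
 *               the function was entered; nothing fires on return
 */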
Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240722202758.3889061-3-jolsa@kernel.org --- .../selftests/bpf/prog_tests/uprobe_multi_test.c | 213 +++++++++++++++++++++ .../selftests/bpf/progs/uprobe_multi_consumers.c | 39 ++++ 2 files changed, 252 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/uprobe_multi_consumers.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c index e6255d4df81d..acb62675ff65 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c @@ -6,6 +6,7 @@ #include "uprobe_multi.skel.h" #include "uprobe_multi_bench.skel.h" #include "uprobe_multi_usdt.skel.h" +#include "uprobe_multi_consumers.skel.h" #include "bpf/libbpf_internal.h" #include "testing_helpers.h" #include "../sdt.h" @@ -731,6 +732,216 @@ static void test_link_api(void) __test_link_api(child); } +static struct bpf_program * +get_program(struct uprobe_multi_consumers *skel, int prog) +{ + switch (prog) { + case 0: + return skel->progs.uprobe_0; + case 1: + return skel->progs.uprobe_1; + case 2: + return skel->progs.uprobe_2; + case 3: + return skel->progs.uprobe_3; + default: + ASSERT_FAIL("get_program"); + return NULL; + } +} + +static struct bpf_link ** +get_link(struct uprobe_multi_consumers *skel, int link) +{ + switch (link) { + case 0: + return &skel->links.uprobe_0; + case 1: + return &skel->links.uprobe_1; + case 2: + return &skel->links.uprobe_2; + case 3: + return &skel->links.uprobe_3; + default: + ASSERT_FAIL("get_link"); + return NULL; + } +} + +static int uprobe_attach(struct uprobe_multi_consumers *skel, int idx) +{ + struct bpf_program *prog = get_program(skel, idx); + struct bpf_link **link = get_link(skel, idx); + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts); + + if (!prog || !link) + return -1; + + /* + * bit/prog: 0,1 uprobe entry + * bit/prog: 2,3 uprobe return + */ + opts.retprobe = idx == 2 || idx == 3; + + *link = bpf_program__attach_uprobe_multi(prog, 0, "/proc/self/exe", + "uprobe_consumer_test", + &opts); + if (!ASSERT_OK_PTR(*link, "bpf_program__attach_uprobe_multi")) + return -1; + return 0; +} + +static void uprobe_detach(struct uprobe_multi_consumers *skel, int idx) +{ + struct bpf_link **link = get_link(skel, idx); + + bpf_link__destroy(*link); + *link = NULL; +} + +static bool test_bit(int bit, unsigned long val) +{ + return val & (1 << bit); +} + +noinline int +uprobe_consumer_test(struct uprobe_multi_consumers *skel, + unsigned long before, unsigned long after) +{ + int idx; + + /* detach uprobe for each unset programs in 'before' state ... */ + for (idx = 0; idx < 4; idx++) { + if (test_bit(idx, before) && !test_bit(idx, after)) + uprobe_detach(skel, idx); + } + + /* ... 
and attach uprobes newly set in 'after' */ + for (idx = 0; idx < 4; idx++) { + if (!test_bit(idx, before) && test_bit(idx, after)) { + if (!ASSERT_OK(uprobe_attach(skel, idx), "uprobe_attach_after")) + return -1; + } + } + return 0; +} + +static void consumer_test(struct uprobe_multi_consumers *skel, + unsigned long before, unsigned long after) +{ + int err, idx; + + printf("consumer_test before %lu after %lu\n", before, after); + + /* for each idx set in 'before', attach the corresponding uprobe */ + for (idx = 0; idx < 4; idx++) { + if (test_bit(idx, before)) { + if (!ASSERT_OK(uprobe_attach(skel, idx), "uprobe_attach_before")) + goto cleanup; + } + } + + err = uprobe_consumer_test(skel, before, after); + if (!ASSERT_EQ(err, 0, "uprobe_consumer_test")) + goto cleanup; + + for (idx = 0; idx < 4; idx++) { + const char *fmt = "BUG"; + __u64 val = 0; + + if (idx < 2) { + /* + * uprobe entry + * +1 if defined in 'before' + */ + if (test_bit(idx, before)) + val++; + fmt = "prog 0/1: uprobe"; + } else { + /* + * uprobe return is tricky ;-) + * + * to trigger a uretprobe consumer, the uretprobe needs to be installed, + * which means one of the 'return' uprobes was alive when the probe was hit: + * + * idxs: 2/3 uprobe return in 'installed' mask + * + * in addition, if the 'after' state removes everything that was installed in + * the 'before' state, then the uprobe kernel object goes away, the return uprobe + * is not installed and we won't hit it even if it's in the 'after' state. + */ + unsigned long had_uretprobes = before & 0b1100; /* is a uretprobe installed */ + unsigned long probe_preserved = before & after; /* did the uprobe survive */ + + if (had_uretprobes && probe_preserved && test_bit(idx, after)) + val++; + fmt = "idx 2/3: uretprobe"; + } + + ASSERT_EQ(skel->bss->uprobe_result[idx], val, fmt); + skel->bss->uprobe_result[idx] = 0; + } + +cleanup: + for (idx = 0; idx < 4; idx++) + uprobe_detach(skel, idx); +} + +static void test_consumers(void) +{ + struct uprobe_multi_consumers *skel; + int before, after; + + skel = uprobe_multi_consumers__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_multi_consumers__open_and_load")) + return; + + /* + * The idea of this test is to try all possible combinations of + * uprobe consumers attached to a single function. + * + * - 2 uprobe entry consumers + * - 2 uprobe exit consumers + * + * The test uses 4 uprobes attached to a single function, but that + * translates into a single uprobe with 4 consumers in the kernel. + * + * The before/after values represent the state of attached consumers + * before and after the probed function runs: + * + * bit/prog 0,1 : uprobe entry + * bit/prog 2,3 : uprobe return + * + * For example, for: + * + * before = 0b0101 + * after = 0b0110 + * + * it means that before we call 'uprobe_consumer_test' we attach the + * uprobes set in the 'before' value: + * + * - bit/prog 0: uprobe entry + * - bit/prog 2: uprobe return + * + * uprobe_consumer_test is called and inside it we attach and detach + * uprobes based on the 'after' value: + * + * - bit/prog 0: stays untouched + * - bit/prog 2: uprobe return is detached + * + * uprobe_consumer_test returns and we check that the counter values + * increased by the bpf programs on each uprobe match the expected + * count based on the before/after bits.
+ */ + + for (before = 0; before < 16; before++) { + for (after = 0; after < 16; after++) + consumer_test(skel, before, after); + } + + uprobe_multi_consumers__destroy(skel); +} + static void test_bench_attach_uprobe(void) { long attach_start_ns = 0, attach_end_ns = 0; @@ -821,4 +1032,6 @@ void test_uprobe_multi_test(void) test_attach_api_fails(); if (test__start_subtest("attach_uprobe_fails")) test_attach_uprobe_fails(); + if (test__start_subtest("consumers")) + test_consumers(); } diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi_consumers.c b/tools/testing/selftests/bpf/progs/uprobe_multi_consumers.c new file mode 100644 index 000000000000..7e0fdcbbd242 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/uprobe_multi_consumers.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <stdbool.h> +#include "bpf_kfuncs.h" +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +__u64 uprobe_result[4]; + +SEC("uprobe.multi") +int uprobe_0(struct pt_regs *ctx) +{ + uprobe_result[0]++; + return 0; +} + +SEC("uprobe.multi") +int uprobe_1(struct pt_regs *ctx) +{ + uprobe_result[1]++; + return 0; +} + +SEC("uprobe.multi") +int uprobe_2(struct pt_regs *ctx) +{ + uprobe_result[2]++; + return 0; +} + +SEC("uprobe.multi") +int uprobe_3(struct pt_regs *ctx) +{ + uprobe_result[3]++; + return 0; +} -- cgit v1.2.3 From 3ece93a4087b2db7b99ebb2412bd60cf26bbbb51 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Fri, 19 Jul 2024 22:25:35 -0700 Subject: selftests/bpf: Fix wrong binary in Makefile log output The make log output incorrectly shows 'test_maps' as the binary name for every 'CLNG-BPF' build step, apparently picking up the last value defined for the $(TRUNNER_BINARY) variable. Update the 'CLANG_BPF_BUILD_RULE' variants to fix this confusing output. Current output: CLNG-BPF [test_maps] access_map_in_map.bpf.o GEN-SKEL [test_progs] access_map_in_map.skel.h ... CLNG-BPF [test_maps] access_map_in_map.bpf.o GEN-SKEL [test_progs-no_alu32] access_map_in_map.skel.h ... CLNG-BPF [test_maps] access_map_in_map.bpf.o GEN-SKEL [test_progs-cpuv4] access_map_in_map.skel.h After fix: CLNG-BPF [test_progs] access_map_in_map.bpf.o GEN-SKEL [test_progs] access_map_in_map.skel.h ... CLNG-BPF [test_progs-no_alu32] access_map_in_map.bpf.o GEN-SKEL [test_progs-no_alu32] access_map_in_map.skel.h ...
CLNG-BPF [test_progs-cpuv4] access_map_in_map.bpf.o GEN-SKEL [test_progs-cpuv4] access_map_in_map.skel.h Fixes: a5d0c26a2784 ("selftests/bpf: Add a cpuv4 test runner for cpu=v4 testing") Fixes: 89ad7420b25c ("selftests/bpf: Drop the need for LLVM's llc") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20240720052535.2185967-1-tony.ambardar@gmail.com --- tools/testing/selftests/bpf/Makefile | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 8e01e4f9b1a7..3318d5425d75 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -428,23 +428,24 @@ $(OUTPUT)/cgroup_getset_retval_hooks.o: cgroup_getset_retval_hooks.h # $1 - input .c file # $2 - output .o file # $3 - CFLAGS +# $4 - binary name define CLANG_BPF_BUILD_RULE - $(call msg,CLNG-BPF,$(TRUNNER_BINARY),$2) + $(call msg,CLNG-BPF,$4,$2) $(Q)$(CLANG) $3 -O2 --target=bpf -c $1 -mcpu=v3 -o $2 endef # Similar to CLANG_BPF_BUILD_RULE, but with disabled alu32 define CLANG_NOALU32_BPF_BUILD_RULE - $(call msg,CLNG-BPF,$(TRUNNER_BINARY),$2) + $(call msg,CLNG-BPF,$4,$2) $(Q)$(CLANG) $3 -O2 --target=bpf -c $1 -mcpu=v2 -o $2 endef # Similar to CLANG_BPF_BUILD_RULE, but with cpu-v4 define CLANG_CPUV4_BPF_BUILD_RULE - $(call msg,CLNG-BPF,$(TRUNNER_BINARY),$2) + $(call msg,CLNG-BPF,$4,$2) $(Q)$(CLANG) $3 -O2 --target=bpf -c $1 -mcpu=v4 -o $2 endef # Build BPF object using GCC define GCC_BPF_BUILD_RULE - $(call msg,GCC-BPF,$(TRUNNER_BINARY),$2) + $(call msg,GCC-BPF,$4,$2) $(Q)$(BPF_GCC) $3 -DBPF_NO_PRESERVE_ACCESS_INDEX -Wno-attributes -O2 -c $1 -o $2 endef @@ -537,7 +538,7 @@ $(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.bpf.o: \ $$(call $(TRUNNER_BPF_BUILD_RULE),$$<,$$@, \ $(TRUNNER_BPF_CFLAGS) \ $$($$<-CFLAGS) \ - $$($$<-$2-CFLAGS)) + $$($$<-$2-CFLAGS),$(TRUNNER_BINARY)) $(TRUNNER_BPF_SKELS): %.skel.h: %.bpf.o $(BPFTOOL) | $(TRUNNER_OUTPUT) $$(call msg,GEN-SKEL,$(TRUNNER_BINARY),$$@) -- cgit v1.2.3 From f86601c3661946721e8f260bdd812b759854ac22 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 17:30:45 -0700 Subject: tools/runqslower: Fix LDFLAGS and add LDLIBS support Actually use the previously defined LDFLAGS during the build, and add LDLIBS support for linking extra standalone libraries, e.g. 'argp', which musl libc does not provide.
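With LDLIBS honored by the link rule, a musl-based build can pass the extra library on the command line, e.g. `make -C tools/bpf/runqslower LDLIBS=-largp` (a hypothetical invocation, assuming the argp-standalone library is installed as libargp).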
Fixes: 585bf4640ebe ("tools: runqslower: Add EXTRA_CFLAGS and EXTRA_LDFLAGS support") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Acked-by: Ilya Leoshkevich Link: https://lore.kernel.org/bpf/20240723003045.2273499-1-tony.ambardar@gmail.com --- tools/bpf/runqslower/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/bpf/runqslower/Makefile b/tools/bpf/runqslower/Makefile index d8288936c912..c4f1f1735af6 100644 --- a/tools/bpf/runqslower/Makefile +++ b/tools/bpf/runqslower/Makefile @@ -15,6 +15,7 @@ INCLUDES := -I$(OUTPUT) -I$(BPF_INCLUDE) -I$(abspath ../../include/uapi) CFLAGS := -g -Wall $(CLANG_CROSS_FLAGS) CFLAGS += $(EXTRA_CFLAGS) LDFLAGS += $(EXTRA_LDFLAGS) +LDLIBS += -lelf -lz # Try to detect best kernel BTF source KERNEL_REL := $(shell uname -r) @@ -51,7 +52,7 @@ clean: libbpf_hdrs: $(BPFOBJ) $(OUTPUT)/runqslower: $(OUTPUT)/runqslower.o $(BPFOBJ) - $(QUIET_LINK)$(CC) $(CFLAGS) $^ -lelf -lz -o $@ + $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $^ $(LDLIBS) -o $@ $(OUTPUT)/runqslower.o: runqslower.h $(OUTPUT)/runqslower.skel.h \ $(OUTPUT)/runqslower.bpf.o | libbpf_hdrs -- cgit v1.2.3 From 63a9936b45859b24780c86c32a32fc7b087c304e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 23 Jul 2024 08:34:44 -0700 Subject: selftests/bpf: Add tests for ldsx of pkt data/data_end/data_meta accesses The following tests are added to verifier_ldsx.c: - sign extension of data/data_end/data_meta for tcx programs. The actual checking is in bpf_skb_is_valid_access(), which is called by sk_filter, cg_skb, lwt, tc(tcx) and sk_skb. - sign extension of data/data_end/data_meta for xdp programs. - sign extension of data/data_end for flow_dissector programs. All newly added tests fail verification with the message "invalid bpf_context access". Without the previous patch, all of these tests passed verification.
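As a minimal illustration of what changes (a hypothetical pair, mirroring the tests in the diff below): a plain 32-bit load of such a context field is accepted, while the sign-extending variant must now be rejected:

	/* accepted: ordinary 32-bit context load */
	"r2 = *(u32 *)(r1 + %[sk_buff_data]);"
	/* rejected: sign-extending (ldsx) load, "invalid bpf_context access" */
	"r2 = *(s32 *)(r1 + %[sk_buff_data]);"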
Acked-by: Eduard Zingerman Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240723153444.2430365-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/progs/verifier_ldsx.c | 112 ++++++++++++++++++++++ 1 file changed, 112 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/verifier_ldsx.c b/tools/testing/selftests/bpf/progs/verifier_ldsx.c index d4427d8e1217..52edee41caf6 100644 --- a/tools/testing/selftests/bpf/progs/verifier_ldsx.c +++ b/tools/testing/selftests/bpf/progs/verifier_ldsx.c @@ -144,6 +144,118 @@ __naked void ldsx_s32_range(void) : __clobber_all); } +SEC("xdp") +__description("LDSX, xdp s32 xdp_md->data") +__failure __msg("invalid bpf_context access") +__naked void ldsx_ctx_1(void) +{ + asm volatile ( + "r2 = *(s32 *)(r1 + %[xdp_md_data]);" + "r0 = 0;" + "exit;" + : + : __imm_const(xdp_md_data, offsetof(struct xdp_md, data)) + : __clobber_all); +} + +SEC("xdp") +__description("LDSX, xdp s32 xdp_md->data_end") +__failure __msg("invalid bpf_context access") +__naked void ldsx_ctx_2(void) +{ + asm volatile ( + "r2 = *(s32 *)(r1 + %[xdp_md_data_end]);" + "r0 = 0;" + "exit;" + : + : __imm_const(xdp_md_data_end, offsetof(struct xdp_md, data_end)) + : __clobber_all); +} + +SEC("xdp") +__description("LDSX, xdp s32 xdp_md->data_meta") +__failure __msg("invalid bpf_context access") +__naked void ldsx_ctx_3(void) +{ + asm volatile ( + "r2 = *(s32 *)(r1 + %[xdp_md_data_meta]);" + "r0 = 0;" + "exit;" + : + : __imm_const(xdp_md_data_meta, offsetof(struct xdp_md, data_meta)) + : __clobber_all); +} + +SEC("tcx/ingress") +__description("LDSX, tcx s32 __sk_buff->data") +__failure __msg("invalid bpf_context access") +__naked void ldsx_ctx_4(void) +{ + asm volatile ( + "r2 = *(s32 *)(r1 + %[sk_buff_data]);" + "r0 = 0;" + "exit;" + : + : __imm_const(sk_buff_data, offsetof(struct __sk_buff, data)) + : __clobber_all); +} + +SEC("tcx/ingress") +__description("LDSX, tcx s32 __sk_buff->data_end") +__failure __msg("invalid bpf_context access") +__naked void ldsx_ctx_5(void) +{ + asm volatile ( + "r2 = *(s32 *)(r1 + %[sk_buff_data_end]);" + "r0 = 0;" + "exit;" + : + : __imm_const(sk_buff_data_end, offsetof(struct __sk_buff, data_end)) + : __clobber_all); +} + +SEC("tcx/ingress") +__description("LDSX, tcx s32 __sk_buff->data_meta") +__failure __msg("invalid bpf_context access") +__naked void ldsx_ctx_6(void) +{ + asm volatile ( + "r2 = *(s32 *)(r1 + %[sk_buff_data_meta]);" + "r0 = 0;" + "exit;" + : + : __imm_const(sk_buff_data_meta, offsetof(struct __sk_buff, data_meta)) + : __clobber_all); +} + +SEC("flow_dissector") +__description("LDSX, flow_dissector s32 __sk_buff->data") +__failure __msg("invalid bpf_context access") +__naked void ldsx_ctx_7(void) +{ + asm volatile ( + "r2 = *(s32 *)(r1 + %[sk_buff_data]);" + "r0 = 0;" + "exit;" + : + : __imm_const(sk_buff_data, offsetof(struct __sk_buff, data)) + : __clobber_all); +} + +SEC("flow_dissector") +__description("LDSX, flow_dissector s32 __sk_buff->data_end") +__failure __msg("invalid bpf_context access") +__naked void ldsx_ctx_8(void) +{ + asm volatile ( + "r2 = *(s32 *)(r1 + %[sk_buff_data_end]);" + "r0 = 0;" + "exit;" + : + : __imm_const(sk_buff_data_end, offsetof(struct __sk_buff, data_end)) + : __clobber_all); +} + #else SEC("socket") -- cgit v1.2.3 From 26672b5caf3cbeeffe8651035dddff95f610e7ec Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 23 Jul 2024 09:29:40 -0700 Subject: selftests/bpf: Add reg_bounds tests for ldsx and 
subreg compare Add a few reg_bounds selftests to test 32/16/8-bit ldsx and subreg comparison. Without the previous patch, all added tests will fail. Acked-by: Eduard Zingerman Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240723162940.2732171-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/prog_tests/reg_bounds.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c index eb74363f9f70..0da4225749bd 100644 --- a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c +++ b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c @@ -433,6 +433,19 @@ static struct range range_refine(enum num_t x_t, struct range x, enum num_t y_t, y_cast = range_cast(y_t, x_t, y); + /* If we know that + * - *x* is in the range of a signed 32-bit value, and + * - *y_cast* range is 32-bit signed non-negative + * then *x* range can be improved with *y_cast* such that *x* range + * is 32-bit signed non-negative. Otherwise, if the new range for *x* + * allows the upper 32 bits to be 0xffffffff, then the eventual new range for + * *x* will be out of the signed 32-bit range, which violates the original + * *x* range. + */ + if (x_t == S64 && y_t == S32 && y_cast.a <= S32_MAX && y_cast.b <= S32_MAX && + (s64)x.a >= S32_MIN && (s64)x.b <= S32_MAX) + return range_improve(x_t, x, y_cast); + /* the case when new range knowledge, *y*, is a 32-bit subregister * range, while previous range knowledge, *x*, is a full register * 64-bit range, needs special treatment to take into account upper 32 @@ -2108,6 +2121,9 @@ static struct subtest_case crafted_cases[] = { {S32, U32, {(u32)S32_MIN, 0}, {0, 0}}, {S32, U32, {(u32)S32_MIN, 0}, {(u32)S32_MIN, (u32)S32_MIN}}, {S32, U32, {(u32)S32_MIN, S32_MAX}, {S32_MAX, S32_MAX}}, + {S64, U32, {0x0, 0x1f}, {0xffffffff80000000ULL, 0x000000007fffffffULL}}, + {S64, U32, {0x0, 0x1f}, {0xffffffffffff8000ULL, 0x0000000000007fffULL}}, + {S64, U32, {0x0, 0x1f}, {0xffffffffffffff80ULL, 0x000000000000007fULL}}, }; /* Go over crafted hard-coded cases. This is fast, so we do it as part of -- cgit v1.2.3 From 424ebaa3678b0d7f653dffb08eae90424740e0f4 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 22 Jul 2024 16:38:38 -0700 Subject: selftests/bpf: extract utility function for BPF disassembly struct bpf_insn *disasm_insn(struct bpf_insn *insn, char *buf, size_t buf_sz); Disassembles instruction 'insn' to a text buffer 'buf'. Removes the insn->code hex prefix added by the kernel disassembly routine. Returns a pointer to the next instruction (increments insn by either 1 or 2).
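A minimal usage sketch for the new helper (the dump_prog wrapper and buffer size are illustrative, not part of the patch):

	#include <stdio.h>
	#include <linux/bpf.h>      /* struct bpf_insn, __u32 */
	#include "disasm_helpers.h"

	static void dump_prog(struct bpf_insn *insn, __u32 cnt)
	{
		struct bpf_insn *end = insn + cnt;
		char buf[64];

		while (insn < end) {
			/* advances by 1, or by 2 for a ld_imm64 instruction */
			insn = disasm_insn(insn, buf, sizeof(buf));
			printf("%s\n", buf);
		}
	}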
Acked-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240722233844.1406874-5-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/Makefile | 1 + tools/testing/selftests/bpf/disasm_helpers.c | 51 +++++++++++++++ tools/testing/selftests/bpf/disasm_helpers.h | 12 ++++ .../testing/selftests/bpf/prog_tests/ctx_rewrite.c | 74 +++------------------- tools/testing/selftests/bpf/testing_helpers.c | 1 + 5 files changed, 75 insertions(+), 64 deletions(-) create mode 100644 tools/testing/selftests/bpf/disasm_helpers.c create mode 100644 tools/testing/selftests/bpf/disasm_helpers.h (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 3318d5425d75..888ba68e6592 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -661,6 +661,7 @@ TRUNNER_EXTRA_SOURCES := test_progs.c \ test_loader.c \ xsk.c \ disasm.c \ + disasm_helpers.c \ json_writer.c \ flow_dissector_load.h \ ip_check_defrag_frags.h diff --git a/tools/testing/selftests/bpf/disasm_helpers.c b/tools/testing/selftests/bpf/disasm_helpers.c new file mode 100644 index 000000000000..96b1f2ffe438 --- /dev/null +++ b/tools/testing/selftests/bpf/disasm_helpers.c @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +#include +#include "disasm.h" + +struct print_insn_context { + char *buf; + size_t sz; +}; + +static void print_insn_cb(void *private_data, const char *fmt, ...) +{ + struct print_insn_context *ctx = private_data; + va_list args; + + va_start(args, fmt); + vsnprintf(ctx->buf, ctx->sz, fmt, args); + va_end(args); +} + +struct bpf_insn *disasm_insn(struct bpf_insn *insn, char *buf, size_t buf_sz) +{ + struct print_insn_context ctx = { + .buf = buf, + .sz = buf_sz, + }; + struct bpf_insn_cbs cbs = { + .cb_print = print_insn_cb, + .private_data = &ctx, + }; + char *tmp, *pfx_end, *sfx_start; + bool double_insn; + int len; + + print_bpf_insn(&cbs, insn, true); + /* We share code with kernel BPF disassembler, it adds '(FF) ' prefix + * for each instruction (FF stands for instruction `code` byte). + * Remove the prefix inplace, and also simplify call instructions. + * E.g.: "(85) call foo#10" -> "call foo". + * Also remove newline in the end (the 'max(strlen(buf) - 1, 0)' thing). + */ + pfx_end = buf + 5; + sfx_start = buf + max((int)strlen(buf) - 1, 0); + if (strncmp(pfx_end, "call ", 5) == 0 && (tmp = strrchr(buf, '#'))) + sfx_start = tmp; + len = sfx_start - pfx_end; + memmove(buf, pfx_end, len); + buf[len] = 0; + double_insn = insn->code == (BPF_LD | BPF_IMM | BPF_DW); + return insn + (double_insn ? 
2 : 1); +} diff --git a/tools/testing/selftests/bpf/disasm_helpers.h b/tools/testing/selftests/bpf/disasm_helpers.h new file mode 100644 index 000000000000..7b26cab70099 --- /dev/null +++ b/tools/testing/selftests/bpf/disasm_helpers.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +#ifndef __DISASM_HELPERS_H +#define __DISASM_HELPERS_H + +#include + +struct bpf_insn; + +struct bpf_insn *disasm_insn(struct bpf_insn *insn, char *buf, size_t buf_sz); + +#endif /* __DISASM_HELPERS_H */ diff --git a/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c b/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c index 08b6391f2f56..dd75ccb03770 100644 --- a/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c +++ b/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c @@ -10,7 +10,8 @@ #include "bpf/btf.h" #include "bpf_util.h" #include "linux/filter.h" -#include "disasm.h" +#include "linux/kernel.h" +#include "disasm_helpers.h" #define MAX_PROG_TEXT_SZ (32 * 1024) @@ -628,63 +629,6 @@ err: return false; } -static void print_insn(void *private_data, const char *fmt, ...) -{ - va_list args; - - va_start(args, fmt); - vfprintf((FILE *)private_data, fmt, args); - va_end(args); -} - -/* Disassemble instructions to a stream */ -static void print_xlated(FILE *out, struct bpf_insn *insn, __u32 len) -{ - const struct bpf_insn_cbs cbs = { - .cb_print = print_insn, - .cb_call = NULL, - .cb_imm = NULL, - .private_data = out, - }; - bool double_insn = false; - int i; - - for (i = 0; i < len; i++) { - if (double_insn) { - double_insn = false; - continue; - } - - double_insn = insn[i].code == (BPF_LD | BPF_IMM | BPF_DW); - print_bpf_insn(&cbs, insn + i, true); - } -} - -/* We share code with kernel BPF disassembler, it adds '(FF) ' prefix - * for each instruction (FF stands for instruction `code` byte). - * This function removes the prefix inplace for each line in `str`. - */ -static void remove_insn_prefix(char *str, int size) -{ - const int prefix_size = 5; - - int write_pos = 0, read_pos = prefix_size; - int len = strlen(str); - char c; - - size = min(size, len); - - while (read_pos < size) { - c = str[read_pos++]; - if (c == 0) - break; - str[write_pos++] = c; - if (c == '\n') - read_pos += prefix_size; - } - str[write_pos] = 0; -} - struct prog_info { char *prog_kind; enum bpf_prog_type prog_type; @@ -699,9 +643,10 @@ static void match_program(struct btf *btf, char *reg_map[][2], bool skip_first_insn) { - struct bpf_insn *buf = NULL; + struct bpf_insn *buf = NULL, *insn, *insn_end; int err = 0, prog_fd = 0; FILE *prog_out = NULL; + char insn_buf[64]; char *text = NULL; __u32 cnt = 0; @@ -739,12 +684,13 @@ static void match_program(struct btf *btf, PRINT_FAIL("Can't open memory stream\n"); goto out; } - if (skip_first_insn) - print_xlated(prog_out, buf + 1, cnt - 1); - else - print_xlated(prog_out, buf, cnt); + insn_end = buf + cnt; + insn = buf + (skip_first_insn ? 
1 : 0); + while (insn < insn_end) { + insn = disasm_insn(insn, insn_buf, sizeof(insn_buf)); + fprintf(prog_out, "%s\n", insn_buf); + } fclose(prog_out); - remove_insn_prefix(text, MAX_PROG_TEXT_SZ); ASSERT_TRUE(match_pattern(btf, pattern, text, reg_map), pinfo->prog_kind); diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c index d5379a0e6da8..ac7c66f4fc7b 100644 --- a/tools/testing/selftests/bpf/testing_helpers.c +++ b/tools/testing/selftests/bpf/testing_helpers.c @@ -7,6 +7,7 @@ #include #include #include +#include "disasm.h" #include "test_progs.h" #include "testing_helpers.h" #include -- cgit v1.2.3 From 203e6aba7692bca18fd97251b1354da0f5e2ba30 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 22 Jul 2024 16:38:39 -0700 Subject: selftests/bpf: print correct offset for pseudo calls in disasm_insn() Adjust disasm_helpers.c:disasm_insn() to account for the following part of verifier.c:jit_subprogs(): for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { /* ... */ if (!bpf_pseudo_call(insn)) continue; insn->off = env->insn_aux_data[i].call_imm; subprog = find_subprog(env, i + insn->off + 1); insn->imm = subprog; } Here the verifier moves the offset of the subprogram into the insn->off field. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240722233844.1406874-6-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/disasm_helpers.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/disasm_helpers.c b/tools/testing/selftests/bpf/disasm_helpers.c index 96b1f2ffe438..f529f1c8c171 100644 --- a/tools/testing/selftests/bpf/disasm_helpers.c +++ b/tools/testing/selftests/bpf/disasm_helpers.c @@ -4,6 +4,7 @@ #include "disasm.h" struct print_insn_context { + char scratch[16]; char *buf; size_t sz; }; @@ -18,6 +19,22 @@ static void print_insn_cb(void *private_data, const char *fmt, ...) va_end(args); } +static const char *print_call_cb(void *private_data, const struct bpf_insn *insn) +{ + struct print_insn_context *ctx = private_data; + + /* For pseudo calls verifier.c:jit_subprogs() hides the original + * imm in insn->off and changes insn->imm to be an index of + * the subprog instead. + */ + if (insn->src_reg == BPF_PSEUDO_CALL) { + snprintf(ctx->scratch, sizeof(ctx->scratch), "%+d", insn->off); + return ctx->scratch; + } + + return NULL; +} + struct bpf_insn *disasm_insn(struct bpf_insn *insn, char *buf, size_t buf_sz) { struct print_insn_context ctx = { @@ -26,6 +43,7 @@ struct bpf_insn *disasm_insn(struct bpf_insn *insn, char *buf, size_t buf_sz) }; struct bpf_insn_cbs cbs = { .cb_print = print_insn_cb, + .cb_call = print_call_cb, .private_data = &ctx, }; char *tmp, *pfx_end, *sfx_start; -- cgit v1.2.3 From 4ef5d6af493558124b7a6c13cace58b938fe27d4 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 22 Jul 2024 16:38:40 -0700 Subject: selftests/bpf: no need to track next_match_pos in struct test_loader The call stack for the validate_case() function looks as follows: - test_loader__run_subtests() - process_subtest() - run_subtest() - prepare_case(), which does 'tester->next_match_pos = 0'; - validate_case(), which increments tester->next_match_pos. Hence, each subtest is run with next_match_pos freshly set to zero, meaning there is no need to persist this variable in struct test_loader; use a local variable instead.
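The resulting matching loop can be sketched with a local cursor (a simplified sketch of the diff that follows; match_all is a hypothetical name, the regex branch and error reporting are omitted):

	#include <stdbool.h>
	#include <string.h>

	static bool match_all(const char *log_buf, struct test_subspec *subspec)
	{
		const char *log = log_buf;   /* fresh cursor for every subtest */

		for (size_t i = 0; i < subspec->expect_msg_cnt; i++) {
			const char *match = strstr(log, subspec->expect_msgs[i].substr);

			if (!match)
				return false;   /* pattern i not found after previous match */
			log += strlen(subspec->expect_msgs[i].substr);
		}
		return true;
	}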
Acked-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240722233844.1406874-7-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/test_loader.c | 19 ++++++++----------- tools/testing/selftests/bpf/test_progs.h | 1 - 2 files changed, 8 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index f14e10b0de96..47508cf66e89 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -434,7 +434,6 @@ static void prepare_case(struct test_loader *tester, bpf_program__set_flags(prog, prog_flags | spec->prog_flags); tester->log_buf[0] = '\0'; - tester->next_match_pos = 0; } static void emit_verifier_log(const char *log_buf, bool force) @@ -450,25 +449,23 @@ static void validate_case(struct test_loader *tester, struct bpf_program *prog, int load_err) { - int i, j, err; - char *match; regmatch_t reg_match[1]; + const char *log = tester->log_buf; + int i, j, err; for (i = 0; i < subspec->expect_msg_cnt; i++) { struct expect_msg *msg = &subspec->expect_msgs[i]; + const char *match = NULL; if (msg->substr) { - match = strstr(tester->log_buf + tester->next_match_pos, msg->substr); + match = strstr(log, msg->substr); if (match) - tester->next_match_pos = match - tester->log_buf + strlen(msg->substr); + log += strlen(msg->substr); } else { - err = regexec(&msg->regex, - tester->log_buf + tester->next_match_pos, 1, reg_match, 0); + err = regexec(&msg->regex, log, 1, reg_match, 0); if (err == 0) { - match = tester->log_buf + tester->next_match_pos + reg_match[0].rm_so; - tester->next_match_pos += reg_match[0].rm_eo; - } else { - match = NULL; + match = log + reg_match[0].rm_so; + log += reg_match[0].rm_eo; } } diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index 51341d50213b..b1e949fb16cf 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -447,7 +447,6 @@ typedef int (*pre_execution_cb)(struct bpf_object *obj); struct test_loader { char *log_buf; size_t log_buf_sz; - size_t next_match_pos; pre_execution_cb pre_execution_cb; struct bpf_object *obj; -- cgit v1.2.3 From 64f01e935ddb26f48baec71883c27878ac4231dc Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 22 Jul 2024 16:38:41 -0700 Subject: selftests/bpf: extract test_loader->expect_msgs as a data structure Non-functional change: use a separate data structure to represent expected messages in test_loader. This allows the same functionality to be reused for the expected set of disassembled instructions in the follow-up commit.
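The container extracted here (shape as in the diff below) is what lets one set of push/validate helpers later serve both verifier-log patterns and xlated-disassembly patterns:

	struct expected_msgs {
		struct expect_msg *patterns;
		size_t cnt;
	};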
Acked-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240722233844.1406874-8-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/test_loader.c | 81 ++++++++++++++++--------------- 1 file changed, 41 insertions(+), 40 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index 47508cf66e89..3f84903558dd 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -55,11 +55,15 @@ struct expect_msg { regex_t regex; }; +struct expected_msgs { + struct expect_msg *patterns; + size_t cnt; +}; + struct test_subspec { char *name; bool expect_failure; - struct expect_msg *expect_msgs; - size_t expect_msg_cnt; + struct expected_msgs expect_msgs; int retval; bool execute; }; @@ -96,44 +100,45 @@ void test_loader_fini(struct test_loader *tester) free(tester->log_buf); } -static void free_test_spec(struct test_spec *spec) +static void free_msgs(struct expected_msgs *msgs) { int i; + for (i = 0; i < msgs->cnt; i++) + if (msgs->patterns[i].regex_str) + regfree(&msgs->patterns[i].regex); + free(msgs->patterns); + msgs->patterns = NULL; + msgs->cnt = 0; +} + +static void free_test_spec(struct test_spec *spec) +{ /* Deallocate expect_msgs arrays. */ - for (i = 0; i < spec->priv.expect_msg_cnt; i++) - if (spec->priv.expect_msgs[i].regex_str) - regfree(&spec->priv.expect_msgs[i].regex); - for (i = 0; i < spec->unpriv.expect_msg_cnt; i++) - if (spec->unpriv.expect_msgs[i].regex_str) - regfree(&spec->unpriv.expect_msgs[i].regex); + free_msgs(&spec->priv.expect_msgs); + free_msgs(&spec->unpriv.expect_msgs); free(spec->priv.name); free(spec->unpriv.name); - free(spec->priv.expect_msgs); - free(spec->unpriv.expect_msgs); - spec->priv.name = NULL; spec->unpriv.name = NULL; - spec->priv.expect_msgs = NULL; - spec->unpriv.expect_msgs = NULL; } -static int push_msg(const char *substr, const char *regex_str, struct test_subspec *subspec) +static int push_msg(const char *substr, const char *regex_str, struct expected_msgs *msgs) { void *tmp; int regcomp_res; char error_msg[100]; struct expect_msg *msg; - tmp = realloc(subspec->expect_msgs, - (1 + subspec->expect_msg_cnt) * sizeof(struct expect_msg)); + tmp = realloc(msgs->patterns, + (1 + msgs->cnt) * sizeof(struct expect_msg)); if (!tmp) { ASSERT_FAIL("failed to realloc memory for messages\n"); return -ENOMEM; } - subspec->expect_msgs = tmp; - msg = &subspec->expect_msgs[subspec->expect_msg_cnt]; + msgs->patterns = tmp; + msg = &msgs->patterns[msgs->cnt]; if (substr) { msg->substr = substr; @@ -150,7 +155,7 @@ static int push_msg(const char *substr, const char *regex_str, struct test_subsp } } - subspec->expect_msg_cnt += 1; + msgs->cnt += 1; return 0; } @@ -272,25 +277,25 @@ static int parse_test_spec(struct test_loader *tester, spec->mode_mask |= UNPRIV; } else if (str_has_pfx(s, TEST_TAG_EXPECT_MSG_PFX)) { msg = s + sizeof(TEST_TAG_EXPECT_MSG_PFX) - 1; - err = push_msg(msg, NULL, &spec->priv); + err = push_msg(msg, NULL, &spec->priv.expect_msgs); if (err) goto cleanup; spec->mode_mask |= PRIV; } else if (str_has_pfx(s, TEST_TAG_EXPECT_MSG_PFX_UNPRIV)) { msg = s + sizeof(TEST_TAG_EXPECT_MSG_PFX_UNPRIV) - 1; - err = push_msg(msg, NULL, &spec->unpriv); + err = push_msg(msg, NULL, &spec->unpriv.expect_msgs); if (err) goto cleanup; spec->mode_mask |= UNPRIV; } else if (str_has_pfx(s, TEST_TAG_EXPECT_REGEX_PFX)) { msg = s + 
sizeof(TEST_TAG_EXPECT_REGEX_PFX) - 1; - err = push_msg(NULL, msg, &spec->priv); + err = push_msg(NULL, msg, &spec->priv.expect_msgs); if (err) goto cleanup; spec->mode_mask |= PRIV; } else if (str_has_pfx(s, TEST_TAG_EXPECT_REGEX_PFX_UNPRIV)) { msg = s + sizeof(TEST_TAG_EXPECT_REGEX_PFX_UNPRIV) - 1; - err = push_msg(NULL, msg, &spec->unpriv); + err = push_msg(NULL, msg, &spec->unpriv.expect_msgs); if (err) goto cleanup; spec->mode_mask |= UNPRIV; @@ -387,11 +392,12 @@ static int parse_test_spec(struct test_loader *tester, spec->unpriv.execute = spec->priv.execute; } - if (!spec->unpriv.expect_msgs) { - for (i = 0; i < spec->priv.expect_msg_cnt; i++) { - struct expect_msg *msg = &spec->priv.expect_msgs[i]; + if (spec->unpriv.expect_msgs.cnt == 0) { + for (i = 0; i < spec->priv.expect_msgs.cnt; i++) { + struct expect_msg *msg = &spec->priv.expect_msgs.patterns[i]; - err = push_msg(msg->substr, msg->regex_str, &spec->unpriv); + err = push_msg(msg->substr, msg->regex_str, + &spec->unpriv.expect_msgs); if (err) goto cleanup; } @@ -443,18 +449,14 @@ static void emit_verifier_log(const char *log_buf, bool force) fprintf(stdout, "VERIFIER LOG:\n=============\n%s=============\n", log_buf); } -static void validate_case(struct test_loader *tester, - struct test_subspec *subspec, - struct bpf_object *obj, - struct bpf_program *prog, - int load_err) +static void validate_msgs(char *log_buf, struct expected_msgs *msgs) { regmatch_t reg_match[1]; - const char *log = tester->log_buf; + const char *log = log_buf; int i, j, err; - for (i = 0; i < subspec->expect_msg_cnt; i++) { - struct expect_msg *msg = &subspec->expect_msgs[i]; + for (i = 0; i < msgs->cnt; i++) { + struct expect_msg *msg = &msgs->patterns[i]; const char *match = NULL; if (msg->substr) { @@ -471,9 +473,9 @@ static void validate_case(struct test_loader *tester, if (!ASSERT_OK_PTR(match, "expect_msg")) { if (env.verbosity == VERBOSE_NONE) - emit_verifier_log(tester->log_buf, true /*force*/); + emit_verifier_log(log_buf, true /*force*/); for (j = 0; j <= i; j++) { - msg = &subspec->expect_msgs[j]; + msg = &msgs->patterns[j]; fprintf(stderr, "%s %s: '%s'\n", j < i ? "MATCHED " : "EXPECTED", msg->substr ? "SUBSTR" : " REGEX", @@ -692,9 +694,8 @@ void run_subtest(struct test_loader *tester, goto tobj_cleanup; } } - emit_verifier_log(tester->log_buf, false /*force*/); - validate_case(tester, subspec, tobj, tprog, err); + validate_msgs(tester->log_buf, &subspec->expect_msgs); if (should_do_test_run(spec, subspec)) { /* For some reason test_verifier executes programs -- cgit v1.2.3 From 9c9f7339131030949a8ef111080427ff1a8085b5 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 22 Jul 2024 16:38:42 -0700 Subject: selftests/bpf: allow checking xlated programs in verifier_* tests Add a macro __xlated("...") for use with test_loader tests. When such annotations are present for the test case: - bpf_prog_get_info_by_fd() is used to get BPF program after all rewrites are applied by verifier. - the program is disassembled and patterns specified in __xlated are searched for in the disassembly text. __xlated matching follows the same mechanics as __msg: each subsequent pattern is matched from the point where previous pattern ended. 
This allows writing tests like the one below, where the goal is to verify the behavior of one of the transformations applied by the verifier: SEC("raw_tp") __xlated("1: w0 = ") __xlated("2: r0 = &(void __percpu *)(r0)") __xlated("3: r0 = *(u32 *)(r0 +0)") __xlated("4: exit") __success __naked void simple(void) { asm volatile ( "call %[bpf_get_smp_processor_id];" "exit;" : : __imm(bpf_get_smp_processor_id) : __clobber_all); } Acked-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240722233844.1406874-9-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/progs/bpf_misc.h | 5 ++ tools/testing/selftests/bpf/test_loader.c | 82 +++++++++++++++++++++++++++- 2 files changed, 84 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h index 81097a3f15eb..a70939c7bc26 100644 --- a/tools/testing/selftests/bpf/progs/bpf_misc.h +++ b/tools/testing/selftests/bpf/progs/bpf_misc.h @@ -26,6 +26,9 @@ * * __regex Same as __msg, but using a regular expression. * __regex_unpriv Same as __msg_unpriv but using a regular expression. + * __xlated Expect a line in a disassembly log after verifier applies rewrites. + * Multiple __xlated attributes could be specified. + * __xlated_unpriv Same as __xlated but for unprivileged mode. * * __success Expect program load success in privileged mode. * __success_unpriv Expect program load success in unprivileged mode. @@ -63,11 +66,13 @@ */ #define __msg(msg) __attribute__((btf_decl_tag("comment:test_expect_msg=" msg))) #define __regex(regex) __attribute__((btf_decl_tag("comment:test_expect_regex=" regex))) +#define __xlated(msg) __attribute__((btf_decl_tag("comment:test_expect_xlated=" msg))) #define __failure __attribute__((btf_decl_tag("comment:test_expect_failure"))) #define __success __attribute__((btf_decl_tag("comment:test_expect_success"))) #define __description(desc) __attribute__((btf_decl_tag("comment:test_description=" desc))) #define __msg_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_msg_unpriv=" msg))) #define __regex_unpriv(regex) __attribute__((btf_decl_tag("comment:test_expect_regex_unpriv=" regex))) +#define __xlated_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_xlated_unpriv=" msg))) #define __failure_unpriv __attribute__((btf_decl_tag("comment:test_expect_failure_unpriv"))) #define __success_unpriv __attribute__((btf_decl_tag("comment:test_expect_success_unpriv"))) #define __log_level(lvl) __attribute__((btf_decl_tag("comment:test_log_level="#lvl))) diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index 3f84903558dd..b44b6a2fc82c 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -7,6 +7,7 @@ #include #include "autoconf_helper.h" +#include "disasm_helpers.h" #include "unpriv_helpers.h" #include "cap_helpers.h" @@ -19,10 +20,12 @@ #define TEST_TAG_EXPECT_SUCCESS "comment:test_expect_success" #define TEST_TAG_EXPECT_MSG_PFX "comment:test_expect_msg=" #define TEST_TAG_EXPECT_REGEX_PFX "comment:test_expect_regex=" +#define TEST_TAG_EXPECT_XLATED_PFX "comment:test_expect_xlated=" #define TEST_TAG_EXPECT_FAILURE_UNPRIV "comment:test_expect_failure_unpriv" #define TEST_TAG_EXPECT_SUCCESS_UNPRIV "comment:test_expect_success_unpriv" #define TEST_TAG_EXPECT_MSG_PFX_UNPRIV "comment:test_expect_msg_unpriv=" #define
TEST_TAG_EXPECT_REGEX_PFX_UNPRIV "comment:test_expect_regex_unpriv=" +#define TEST_TAG_EXPECT_XLATED_PFX_UNPRIV "comment:test_expect_xlated_unpriv=" #define TEST_TAG_LOG_LEVEL_PFX "comment:test_log_level=" #define TEST_TAG_PROG_FLAGS_PFX "comment:test_prog_flags=" #define TEST_TAG_DESCRIPTION_PFX "comment:test_description=" @@ -64,6 +67,7 @@ struct test_subspec { char *name; bool expect_failure; struct expected_msgs expect_msgs; + struct expected_msgs expect_xlated; int retval; bool execute; }; @@ -117,6 +121,8 @@ static void free_test_spec(struct test_spec *spec) /* Deallocate expect_msgs arrays. */ free_msgs(&spec->priv.expect_msgs); free_msgs(&spec->unpriv.expect_msgs); + free_msgs(&spec->priv.expect_xlated); + free_msgs(&spec->unpriv.expect_xlated); free(spec->priv.name); free(spec->unpriv.name); @@ -299,6 +305,18 @@ static int parse_test_spec(struct test_loader *tester, if (err) goto cleanup; spec->mode_mask |= UNPRIV; + } else if (str_has_pfx(s, TEST_TAG_EXPECT_XLATED_PFX)) { + msg = s + sizeof(TEST_TAG_EXPECT_XLATED_PFX) - 1; + err = push_msg(msg, NULL, &spec->priv.expect_xlated); + if (err) + goto cleanup; + spec->mode_mask |= PRIV; + } else if (str_has_pfx(s, TEST_TAG_EXPECT_XLATED_PFX_UNPRIV)) { + msg = s + sizeof(TEST_TAG_EXPECT_XLATED_PFX_UNPRIV) - 1; + err = push_msg(msg, NULL, &spec->unpriv.expect_xlated); + if (err) + goto cleanup; + spec->mode_mask |= UNPRIV; } else if (str_has_pfx(s, TEST_TAG_RETVAL_PFX)) { val = s + sizeof(TEST_TAG_RETVAL_PFX) - 1; err = parse_retval(val, &spec->priv.retval, "__retval"); @@ -402,6 +420,16 @@ static int parse_test_spec(struct test_loader *tester, goto cleanup; } } + if (spec->unpriv.expect_xlated.cnt == 0) { + for (i = 0; i < spec->priv.expect_xlated.cnt; i++) { + struct expect_msg *msg = &spec->priv.expect_xlated.patterns[i]; + + err = push_msg(msg->substr, msg->regex_str, + &spec->unpriv.expect_xlated); + if (err) + goto cleanup; + } + } } spec->valid = true; @@ -449,7 +477,15 @@ static void emit_verifier_log(const char *log_buf, bool force) fprintf(stdout, "VERIFIER LOG:\n=============\n%s=============\n", log_buf); } -static void validate_msgs(char *log_buf, struct expected_msgs *msgs) +static void emit_xlated(const char *xlated, bool force) +{ + if (!force && env.verbosity == VERBOSE_NONE) + return; + fprintf(stdout, "XLATED:\n=============\n%s=============\n", xlated); +} + +static void validate_msgs(char *log_buf, struct expected_msgs *msgs, + void (*emit_fn)(const char *buf, bool force)) { regmatch_t reg_match[1]; const char *log = log_buf; @@ -473,7 +509,7 @@ static void validate_msgs(char *log_buf, struct expected_msgs *msgs) if (!ASSERT_OK_PTR(match, "expect_msg")) { if (env.verbosity == VERBOSE_NONE) - emit_verifier_log(log_buf, true /*force*/); + emit_fn(log_buf, true /*force*/); for (j = 0; j <= i; j++) { msg = &msgs->patterns[j]; fprintf(stderr, "%s %s: '%s'\n", @@ -610,6 +646,37 @@ static bool should_do_test_run(struct test_spec *spec, struct test_subspec *subs return true; } +/* Get a disassembly of BPF program after verifier applies all rewrites */ +static int get_xlated_program_text(int prog_fd, char *text, size_t text_sz) +{ + struct bpf_insn *insn_start = NULL, *insn, *insn_end; + __u32 insns_cnt = 0, i; + char buf[64]; + FILE *out = NULL; + int err; + + err = get_xlated_program(prog_fd, &insn_start, &insns_cnt); + if (!ASSERT_OK(err, "get_xlated_program")) + goto out; + out = fmemopen(text, text_sz, "w"); + if (!ASSERT_OK_PTR(out, "open_memstream")) + goto out; + insn_end = insn_start + insns_cnt; + insn = insn_start; 
+ while (insn < insn_end) { + i = insn - insn_start; + insn = disasm_insn(insn, buf, sizeof(buf)); + fprintf(out, "%d: %s\n", i, buf); + } + fflush(out); + +out: + free(insn_start); + if (out) + fclose(out); + return err; +} + /* this function is forced noinline and has short generic name to look better * in test_progs output (in case of a failure) */ @@ -695,7 +762,16 @@ void run_subtest(struct test_loader *tester, } } emit_verifier_log(tester->log_buf, false /*force*/); - validate_msgs(tester->log_buf, &subspec->expect_msgs); + validate_msgs(tester->log_buf, &subspec->expect_msgs, emit_verifier_log); + + if (subspec->expect_xlated.cnt) { + err = get_xlated_program_text(bpf_program__fd(tprog), + tester->log_buf, tester->log_buf_sz); + if (err) + goto tobj_cleanup; + emit_xlated(tester->log_buf, false /*force*/); + validate_msgs(tester->log_buf, &subspec->expect_xlated, emit_xlated); + } if (should_do_test_run(spec, subspec)) { /* For some reason test_verifier executes programs -- cgit v1.2.3 From ee7fe84468b1732fe65c5af3836437d54ac4c419 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 22 Jul 2024 16:38:43 -0700 Subject: selftests/bpf: __arch_* macro to limit test cases to specific archs Add annotations __arch_x86_64, __arch_arm64, __arch_riscv64 to specify on which architecture the test case should be tested. Several __arch_* annotations could be specified at once. When test case is not run on current arch it is marked as skipped. For example, the following would be tested only on arm64 and riscv64: SEC("raw_tp") __arch_arm64 __arch_riscv64 __xlated("1: *(u64 *)(r10 - 16) = r1") __xlated("2: call") __xlated("3: r1 = *(u64 *)(r10 - 16);") __success __naked void canary_arm64_riscv64(void) { asm volatile ( "r1 = 1;" "*(u64 *)(r10 - 16) = r1;" "call %[bpf_get_smp_processor_id];" "r1 = *(u64 *)(r10 - 16);" "exit;" : : __imm(bpf_get_smp_processor_id) : __clobber_all); } On x86 it would be skipped: #467/2 verifier_nocsr/canary_arm64_riscv64:SKIP Acked-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240722233844.1406874-10-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/progs/bpf_misc.h | 8 ++++++ tools/testing/selftests/bpf/test_loader.c | 43 ++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h index a70939c7bc26..a225cd87897c 100644 --- a/tools/testing/selftests/bpf/progs/bpf_misc.h +++ b/tools/testing/selftests/bpf/progs/bpf_misc.h @@ -63,6 +63,10 @@ * __auxiliary Annotated program is not a separate test, but used as auxiliary * for some other test cases and should always be loaded. * __auxiliary_unpriv Same, but load program in unprivileged mode. + * + * __arch_* Specify on which architecture the test case should be tested. + * Several __arch_* annotations could be specified at once. + * When test case is not run on current arch it is marked as skipped. 
*/ #define __msg(msg) __attribute__((btf_decl_tag("comment:test_expect_msg=" msg))) #define __regex(regex) __attribute__((btf_decl_tag("comment:test_expect_regex=" regex))) @@ -82,6 +86,10 @@ #define __auxiliary __attribute__((btf_decl_tag("comment:test_auxiliary"))) #define __auxiliary_unpriv __attribute__((btf_decl_tag("comment:test_auxiliary_unpriv"))) #define __btf_path(path) __attribute__((btf_decl_tag("comment:test_btf_path=" path))) +#define __arch(arch) __attribute__((btf_decl_tag("comment:test_arch=" arch))) +#define __arch_x86_64 __arch("X86_64") +#define __arch_arm64 __arch("ARM64") +#define __arch_riscv64 __arch("RISCV64") /* Convenience macro for use with 'asm volatile' blocks */ #define __naked __attribute__((naked)) diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index b44b6a2fc82c..12b0c41e8d64 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -34,6 +34,7 @@ #define TEST_TAG_AUXILIARY "comment:test_auxiliary" #define TEST_TAG_AUXILIARY_UNPRIV "comment:test_auxiliary_unpriv" #define TEST_BTF_PATH "comment:test_btf_path=" +#define TEST_TAG_ARCH "comment:test_arch=" /* Warning: duplicated in bpf_misc.h */ #define POINTER_VALUE 0xcafe4all @@ -80,6 +81,7 @@ struct test_spec { int log_level; int prog_flags; int mode_mask; + int arch_mask; bool auxiliary; bool valid; }; @@ -213,6 +215,12 @@ static void update_flags(int *flags, int flag, bool clear) *flags |= flag; } +enum arch { + ARCH_X86_64 = 0x1, + ARCH_ARM64 = 0x2, + ARCH_RISCV64 = 0x4, +}; + /* Uses btf_decl_tag attributes to describe the expected test * behavior, see bpf_misc.h for detailed description of each attribute * and attribute combinations. @@ -226,6 +234,7 @@ static int parse_test_spec(struct test_loader *tester, bool has_unpriv_result = false; bool has_unpriv_retval = false; int func_id, i, err = 0; + u32 arch_mask = 0; struct btf *btf; memset(spec, 0, sizeof(*spec)); @@ -364,11 +373,26 @@ static int parse_test_spec(struct test_loader *tester, goto cleanup; update_flags(&spec->prog_flags, flags, clear); } + } else if (str_has_pfx(s, TEST_TAG_ARCH)) { + val = s + sizeof(TEST_TAG_ARCH) - 1; + if (strcmp(val, "X86_64") == 0) { + arch_mask |= ARCH_X86_64; + } else if (strcmp(val, "ARM64") == 0) { + arch_mask |= ARCH_ARM64; + } else if (strcmp(val, "RISCV64") == 0) { + arch_mask |= ARCH_RISCV64; + } else { + PRINT_FAIL("bad arch spec: '%s'", val); + err = -EINVAL; + goto cleanup; + } } else if (str_has_pfx(s, TEST_BTF_PATH)) { spec->btf_custom_path = s + sizeof(TEST_BTF_PATH) - 1; } } + spec->arch_mask = arch_mask; + if (spec->mode_mask == 0) spec->mode_mask = PRIV; @@ -677,6 +701,20 @@ out: return err; } +static bool run_on_current_arch(int arch_mask) +{ + if (arch_mask == 0) + return true; +#if defined(__x86_64__) + return arch_mask & ARCH_X86_64; +#elif defined(__aarch64__) + return arch_mask & ARCH_ARM64; +#elif defined(__riscv) && __riscv_xlen == 64 + return arch_mask & ARCH_RISCV64; +#endif + return false; +} + /* this function is forced noinline and has short generic name to look better * in test_progs output (in case of a failure) */ @@ -701,6 +739,11 @@ void run_subtest(struct test_loader *tester, if (!test__start_subtest(subspec->name)) return; + if (!run_on_current_arch(spec->arch_mask)) { + test__skip(); + return; + } + if (unpriv) { if (!can_execute_unpriv(tester, spec)) { test__skip(); -- cgit v1.2.3 From d0ad1f8f8846cffebca55abdd1ed275e276a6754 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: 
Mon, 22 Jul 2024 16:38:44 -0700 Subject: selftests/bpf: test no_caller_saved_registers spill/fill removal Tests for no_caller_saved_registers processing logic (see verifier.c:match_and_mark_nocsr_pattern()): - a canary positive test case; - a canary test case for arm64 and riscv64; - various tests with broken patterns; - tests with read/write fixed/varying stack access that violate nocsr stack access contract; - tests with multiple subprograms; - tests using nocsr in combination with may_goto/bpf_loop, as all of these features affect stack depth; - tests for nocsr stack spills below max stack depth. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240722233844.1406874-11-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/prog_tests/verifier.c | 2 + tools/testing/selftests/bpf/progs/verifier_nocsr.c | 796 +++++++++++++++++++++ 2 files changed, 798 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_nocsr.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index ff1c7da1d06e..67a49d12472c 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -53,6 +53,7 @@ #include "verifier_movsx.skel.h" #include "verifier_netfilter_ctx.skel.h" #include "verifier_netfilter_retcode.skel.h" +#include "verifier_nocsr.skel.h" #include "verifier_or_jmp32_k.skel.h" #include "verifier_precision.skel.h" #include "verifier_prevent_map_lookup.skel.h" @@ -173,6 +174,7 @@ void test_verifier_meta_access(void) { RUN(verifier_meta_access); } void test_verifier_movsx(void) { RUN(verifier_movsx); } void test_verifier_netfilter_ctx(void) { RUN(verifier_netfilter_ctx); } void test_verifier_netfilter_retcode(void) { RUN(verifier_netfilter_retcode); } +void test_verifier_nocsr(void) { RUN(verifier_nocsr); } void test_verifier_or_jmp32_k(void) { RUN(verifier_or_jmp32_k); } void test_verifier_precision(void) { RUN(verifier_precision); } void test_verifier_prevent_map_lookup(void) { RUN(verifier_prevent_map_lookup); } diff --git a/tools/testing/selftests/bpf/progs/verifier_nocsr.c b/tools/testing/selftests/bpf/progs/verifier_nocsr.c new file mode 100644 index 000000000000..a7fe277e5167 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_nocsr.c @@ -0,0 +1,796 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "../../../include/linux/filter.h" +#include "bpf_misc.h" + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) __msg("stack depth 8") +__xlated("4: r5 = 5") +__xlated("5: w0 = ") +__xlated("6: r0 = &(void __percpu *)(r0)") +__xlated("7: r0 = *(u32 *)(r0 +0)") +__xlated("8: exit") +__success +__naked void simple(void) +{ + asm volatile ( + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "*(u64 *)(r10 - 16) = r1;" + "*(u64 *)(r10 - 24) = r2;" + "*(u64 *)(r10 - 32) = r3;" + "*(u64 *)(r10 - 40) = r4;" + "*(u64 *)(r10 - 48) = r5;" + "call %[bpf_get_smp_processor_id];" + "r5 = *(u64 *)(r10 - 48);" + "r4 = *(u64 *)(r10 - 40);" + "r3 = *(u64 *)(r10 - 32);" + "r2 = *(u64 *)(r10 - 24);" + "r1 = *(u64 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +/* The logic for detecting and verifying nocsr pattern is the same for + * any arch, however x86 differs from arm64 or riscv64 in a way + * bpf_get_smp_processor_id is rewritten: + * - on x86 it is done by verifier + * - on arm64 and riscv64 it is done by jit 
+ * + * Which leads to different xlated patterns for different archs: + * - on x86 the call is expanded as 3 instructions + * - on arm64 and riscv64 the call remains as is + * (but spills/fills are still removed) + * + * It is really desirable to check instruction indexes in the xlated + * patterns, so add this canary test to check that function rewrite by + * jit is correctly processed by nocsr logic, keep the rest of the + * tests as x86. + */ +SEC("raw_tp") +__arch_arm64 +__arch_riscv64 +__xlated("0: r1 = 1") +__xlated("1: call bpf_get_smp_processor_id") +__xlated("2: exit") +__success +__naked void canary_arm64_riscv64(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: r0 = &(void __percpu *)(r0)") +__xlated("3: exit") +__success +__naked void canary_zero_spills(void) +{ + asm volatile ( + "call %[bpf_get_smp_processor_id];" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) __msg("stack depth 16") +__xlated("1: *(u64 *)(r10 -16) = r1") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("5: r2 = *(u64 *)(r10 -16)") +__success +__naked void wrong_reg_in_pattern1(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r2 = *(u64 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u64 *)(r10 -16) = r6") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("5: r6 = *(u64 *)(r10 -16)") +__success +__naked void wrong_reg_in_pattern2(void) +{ + asm volatile ( + "r6 = 1;" + "*(u64 *)(r10 - 16) = r6;" + "call %[bpf_get_smp_processor_id];" + "r6 = *(u64 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u64 *)(r10 -16) = r0") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("5: r0 = *(u64 *)(r10 -16)") +__success +__naked void wrong_reg_in_pattern3(void) +{ + asm volatile ( + "r0 = 1;" + "*(u64 *)(r10 - 16) = r0;" + "call %[bpf_get_smp_processor_id];" + "r0 = *(u64 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("2: *(u64 *)(r2 -16) = r1") +__xlated("4: r0 = &(void __percpu *)(r0)") +__xlated("6: r1 = *(u64 *)(r10 -16)") +__success +__naked void wrong_base_in_pattern(void) +{ + asm volatile ( + "r1 = 1;" + "r2 = r10;" + "*(u64 *)(r2 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u64 *)(r10 -16) = r1") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("5: r2 = 1") +__success +__naked void wrong_insn_in_pattern(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r2 = 1;" + "r1 = *(u64 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("2: *(u64 *)(r10 -16) = r1") +__xlated("4: r0 = &(void __percpu *)(r0)") +__xlated("6: r1 = *(u64 *)(r10 -8)") +__success +__naked void wrong_off_in_pattern1(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 8) = r1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 
*)(r10 - 8);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u32 *)(r10 -4) = r1") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("5: r1 = *(u32 *)(r10 -4)") +__success +__naked void wrong_off_in_pattern2(void) +{ + asm volatile ( + "r1 = 1;" + "*(u32 *)(r10 - 4) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u32 *)(r10 - 4);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u32 *)(r10 -16) = r1") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("5: r1 = *(u32 *)(r10 -16)") +__success +__naked void wrong_size_in_pattern(void) +{ + asm volatile ( + "r1 = 1;" + "*(u32 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u32 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("2: *(u32 *)(r10 -8) = r1") +__xlated("4: r0 = &(void __percpu *)(r0)") +__xlated("6: r1 = *(u32 *)(r10 -8)") +__success +__naked void partial_pattern(void) +{ + asm volatile ( + "r1 = 1;" + "r2 = 2;" + "*(u32 *)(r10 - 8) = r1;" + "*(u64 *)(r10 - 16) = r2;" + "call %[bpf_get_smp_processor_id];" + "r2 = *(u64 *)(r10 - 16);" + "r1 = *(u32 *)(r10 - 8);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("0: r1 = 1") +__xlated("1: r2 = 2") +/* not patched, spills for -8, -16 not removed */ +__xlated("2: *(u64 *)(r10 -8) = r1") +__xlated("3: *(u64 *)(r10 -16) = r2") +__xlated("5: r0 = &(void __percpu *)(r0)") +__xlated("7: r2 = *(u64 *)(r10 -16)") +__xlated("8: r1 = *(u64 *)(r10 -8)") +/* patched, spills for -24, -32 removed */ +__xlated("10: r0 = &(void __percpu *)(r0)") +__xlated("12: exit") +__success +__naked void min_stack_offset(void) +{ + asm volatile ( + "r1 = 1;" + "r2 = 2;" + /* this call won't be patched */ + "*(u64 *)(r10 - 8) = r1;" + "*(u64 *)(r10 - 16) = r2;" + "call %[bpf_get_smp_processor_id];" + "r2 = *(u64 *)(r10 - 16);" + "r1 = *(u64 *)(r10 - 8);" + /* this call would be patched */ + "*(u64 *)(r10 - 24) = r1;" + "*(u64 *)(r10 - 32) = r2;" + "call %[bpf_get_smp_processor_id];" + "r2 = *(u64 *)(r10 - 32);" + "r1 = *(u64 *)(r10 - 24);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("5: r1 = *(u64 *)(r10 -8)") +__success +__naked void bad_fixed_read(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "r1 = r10;" + "r1 += -8;" + "r1 = *(u64 *)(r1 - 0);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("5: r1 = *(u64 *)(r10 -8)") +__success +__naked void bad_fixed_write(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "r1 = r10;" + "r1 += -8;" + "*(u64 *)(r1 - 0) = r1;" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("6: *(u64 *)(r10 -16) = r1") +__xlated("8: r0 = &(void __percpu *)(r0)") +__xlated("10: r1 = *(u64 *)(r10 -16)") +__success +__naked void bad_varying_read(void) +{ + asm volatile ( + "r6 = *(u64 *)(r1 + 0);" /* random scalar value */ + "r6 &= 0x7;" /* r6 range 
[0..7] */ + "r6 += 0x2;" /* r6 range [2..9] */ + "r7 = 0;" + "r7 -= r6;" /* r7 range [-9..-2] */ + "r1 = 1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + "r1 = r10;" + "r1 += r7;" + "r1 = *(u8 *)(r1 - 0);" /* touches slot [-16..-9] where spills are stored */ + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("6: *(u64 *)(r10 -16) = r1") +__xlated("8: r0 = &(void __percpu *)(r0)") +__xlated("10: r1 = *(u64 *)(r10 -16)") +__success +__naked void bad_varying_write(void) +{ + asm volatile ( + "r6 = *(u64 *)(r1 + 0);" /* random scalar value */ + "r6 &= 0x7;" /* r6 range [0..7] */ + "r6 += 0x2;" /* r6 range [2..9] */ + "r7 = 0;" + "r7 -= r6;" /* r7 range [-9..-2] */ + "r1 = 1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + "r1 = r10;" + "r1 += r7;" + "*(u8 *)(r1 - 0) = r7;" /* touches slot [-16..-9] where spills are stored */ + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("5: r1 = *(u64 *)(r10 -8)") +__success +__naked void bad_write_in_subprog(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "r1 = r10;" + "r1 += -8;" + "call bad_write_in_subprog_aux;" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +__used +__naked static void bad_write_in_subprog_aux(void) +{ + asm volatile ( + "r0 = 1;" + "*(u64 *)(r1 - 0) = r0;" /* invalidates nocsr contract for caller: */ + "exit;" /* caller stack at -8 used outside of the pattern */ + ::: __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("5: r1 = *(u64 *)(r10 -8)") +__success +__naked void bad_helper_write(void) +{ + asm volatile ( + "r1 = 1;" + /* nocsr pattern with stack offset -8 */ + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "r1 = r10;" + "r1 += -8;" + "r2 = 1;" + "r3 = 42;" + /* read dst is fp[-8], thus nocsr rewrite not applied */ + "call %[bpf_probe_read_kernel];" + "exit;" + : + : __imm(bpf_get_smp_processor_id), + __imm(bpf_probe_read_kernel) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +/* main, not patched */ +__xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("5: r1 = *(u64 *)(r10 -8)") +__xlated("9: call pc+1") +__xlated("10: exit") +/* subprogram, patched */ +__xlated("11: r1 = 1") +__xlated("13: r0 = &(void __percpu *)(r0)") +__xlated("15: exit") +__success +__naked void invalidate_one_subprog(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "r1 = r10;" + "r1 += -8;" + "r1 = *(u64 *)(r1 - 0);" + "call invalidate_one_subprog_aux;" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +__used +__naked static void invalidate_one_subprog_aux(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +/* main */ +__xlated("0: r1 = 1") +__xlated("2: r0 = &(void __percpu *)(r0)") +__xlated("4: call pc+1") +__xlated("5: exit") +/* subprogram */ 
+__xlated("6: r1 = 1") +__xlated("8: r0 = &(void __percpu *)(r0)") +__xlated("10: *(u64 *)(r10 -16) = r1") +__xlated("11: exit") +__success +__naked void subprogs_use_independent_offsets(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + "call subprogs_use_independent_offsets_aux;" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +__used +__naked static void subprogs_use_independent_offsets_aux(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 24) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 24);" + "*(u64 *)(r10 - 16) = r1;" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) __msg("stack depth 8") +__xlated("2: r0 = &(void __percpu *)(r0)") +__success +__naked void helper_call_does_not_prevent_nocsr(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_prandom_u32];" + "r1 = *(u64 *)(r10 - 8);" + "exit;" + : + : __imm(bpf_get_smp_processor_id), + __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) __msg("stack depth 16") +/* may_goto counter at -16 */ +__xlated("0: *(u64 *)(r10 -16) =") +__xlated("1: r1 = 1") +__xlated("3: r0 = &(void __percpu *)(r0)") +/* may_goto expansion starts */ +__xlated("5: r11 = *(u64 *)(r10 -16)") +__xlated("6: if r11 == 0x0 goto pc+3") +__xlated("7: r11 -= 1") +__xlated("8: *(u64 *)(r10 -16) = r11") +/* may_goto expansion ends */ +__xlated("9: *(u64 *)(r10 -8) = r1") +__xlated("10: exit") +__success +__naked void may_goto_interaction(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + ".8byte %[may_goto];" + /* just touch some stack at -8 */ + "*(u64 *)(r10 - 8) = r1;" + "exit;" + : + : __imm(bpf_get_smp_processor_id), + __imm_insn(may_goto, BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, +1 /* offset */, 0)) + : __clobber_all); +} + +__used +__naked static void dummy_loop_callback(void) +{ + asm volatile ( + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) __msg("stack depth 32+0") +__xlated("2: r1 = 1") +__xlated("3: w0 =") +__xlated("4: r0 = &(void __percpu *)(r0)") +__xlated("5: r0 = *(u32 *)(r0 +0)") +/* bpf_loop params setup */ +__xlated("6: r2 =") +__xlated("7: r3 = 0") +__xlated("8: r4 = 0") +/* ... part of the inlined bpf_loop */ +__xlated("12: *(u64 *)(r10 -32) = r6") +__xlated("13: *(u64 *)(r10 -24) = r7") +__xlated("14: *(u64 *)(r10 -16) = r8") +/* ... */ +__xlated("21: call pc+8") /* dummy_loop_callback */ +/* ... 
last insns of the bpf_loop_interaction1 */ +__xlated("28: r0 = 0") +__xlated("29: exit") +/* dummy_loop_callback */ +__xlated("30: r0 = 0") +__xlated("31: exit") +__success +__naked int bpf_loop_interaction1(void) +{ + asm volatile ( + "r1 = 1;" + /* nocsr stack region at -16, but could be removed */ + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + "r2 = %[dummy_loop_callback];" + "r3 = 0;" + "r4 = 0;" + "call %[bpf_loop];" + "r0 = 0;" + "exit;" + : + : __imm_ptr(dummy_loop_callback), + __imm(bpf_get_smp_processor_id), + __imm(bpf_loop) + : __clobber_common + ); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) __msg("stack depth 40+0") +/* call bpf_get_smp_processor_id */ +__xlated("2: r1 = 42") +__xlated("3: w0 =") +__xlated("4: r0 = &(void __percpu *)(r0)") +__xlated("5: r0 = *(u32 *)(r0 +0)") +/* call bpf_get_prandom_u32 */ +__xlated("6: *(u64 *)(r10 -16) = r1") +__xlated("7: call") +__xlated("8: r1 = *(u64 *)(r10 -16)") +/* ... */ +/* ... part of the inlined bpf_loop */ +__xlated("15: *(u64 *)(r10 -40) = r6") +__xlated("16: *(u64 *)(r10 -32) = r7") +__xlated("17: *(u64 *)(r10 -24) = r8") +__success +__naked int bpf_loop_interaction2(void) +{ + asm volatile ( + "r1 = 42;" + /* nocsr stack region at -16, cannot be removed */ + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_prandom_u32];" + "r1 = *(u64 *)(r10 - 16);" + "r2 = %[dummy_loop_callback];" + "r3 = 0;" + "r4 = 0;" + "call %[bpf_loop];" + "r0 = 0;" + "exit;" + : + : __imm_ptr(dummy_loop_callback), + __imm(bpf_get_smp_processor_id), + __imm(bpf_get_prandom_u32), + __imm(bpf_loop) + : __clobber_common + ); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) +__msg("stack depth 512+0") +/* just to print xlated version when debugging */ +__xlated("r0 = &(void __percpu *)(r0)") +__success +/* cumulative_stack_depth() stack usage is MAX_BPF_STACK, + * called subprogram uses an additional slot for nocsr spill/fill, + * since nocsr spill/fill could be removed the program still fits + * in MAX_BPF_STACK and should be accepted. 
+ */ +__naked int cumulative_stack_depth(void) +{ + asm volatile( + "r1 = 42;" + "*(u64 *)(r10 - %[max_bpf_stack]) = r1;" + "call cumulative_stack_depth_subprog;" + "exit;" + : + : __imm_const(max_bpf_stack, MAX_BPF_STACK) + : __clobber_all + ); +} + +__used +__naked static void cumulative_stack_depth_subprog(void) +{ + asm volatile ( + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "exit;" + :: __imm(bpf_get_smp_processor_id) : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) +__msg("stack depth 512") +__xlated("0: r1 = 42") +__xlated("1: *(u64 *)(r10 -512) = r1") +__xlated("2: w0 = ") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("4: r0 = *(u32 *)(r0 +0)") +__xlated("5: exit") +__success +__naked int nocsr_max_stack_ok(void) +{ + asm volatile( + "r1 = 42;" + "*(u64 *)(r10 - %[max_bpf_stack]) = r1;" + "*(u64 *)(r10 - %[max_bpf_stack_8]) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - %[max_bpf_stack_8]);" + "exit;" + : + : __imm_const(max_bpf_stack, MAX_BPF_STACK), + __imm_const(max_bpf_stack_8, MAX_BPF_STACK + 8), + __imm(bpf_get_smp_processor_id) + : __clobber_all + ); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) +__msg("stack depth 520") +__failure +__naked int nocsr_max_stack_fail(void) +{ + asm volatile( + "r1 = 42;" + "*(u64 *)(r10 - %[max_bpf_stack]) = r1;" + "*(u64 *)(r10 - %[max_bpf_stack_8]) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - %[max_bpf_stack_8]);" + /* call to prandom blocks nocsr rewrite */ + "*(u64 *)(r10 - %[max_bpf_stack_8]) = r1;" + "call %[bpf_get_prandom_u32];" + "r1 = *(u64 *)(r10 - %[max_bpf_stack_8]);" + "exit;" + : + : __imm_const(max_bpf_stack, MAX_BPF_STACK), + __imm_const(max_bpf_stack_8, MAX_BPF_STACK + 8), + __imm(bpf_get_smp_processor_id), + __imm(bpf_get_prandom_u32) + : __clobber_all + ); +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 6b376e7543dd15faec4e3878bc1a187126532985 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 23 Jul 2024 20:57:43 +0000 Subject: selftests/bpf: Make %.test.d prerequisite order only %.test.o should depend on %.test.d order-only to avoid unnecessary recompilations due to compiler dumping .d and .o files in random order. Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/all/gSoCpn9qV5K0hRvrvYlrw2StRntsvZcrUuDfkZUh1Ang9E6yZ9XJGYDuIP9iCuM2YTVhSEzEXCteQ94_0uIUjx_mXwupFJt64NJaiMr99a0=@pm.me Link: https://lore.kernel.org/all/FnnOUuDMmf0SebqA1bb0fQIW4vguOZ-VcAlPnPMnmT2lJYxMMxFAhcgh77px8MsPS5Fr01I0YQxLJClEJTFWHdpaTBVSQhlmsVTcEsNQbV4=@pm.me Link: https://lore.kernel.org/bpf/yyjJRl5LODbI4-FseU0wIP5e4ik0zAy7Sy-5eGwrzG_UanI8rwWlQPfXAFnn_27hoZFogoUHRSWxFsLk7hPr0b6P5TZ3cRrM30_ggnu555M=@pm.me --- tools/testing/selftests/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 888ba68e6592..083f413a9c98 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -598,7 +598,7 @@ endif # Note: we cd into output directory to ensure embedded BPF object is found $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o: \ $(TRUNNER_TESTS_DIR)/%.c \ - $(TRUNNER_OUTPUT)/%.test.d + | $(TRUNNER_OUTPUT)/%.test.d $$(call msg,TEST-OBJ,$(TRUNNER_BINARY),$$@) $(Q)cd $$(@D) && $$(CC) -I. 
$$(CFLAGS) -MMD -MT $$@ -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F) -- cgit v1.2.3 From ec4fe2f0fa12fd2d0115df7e58414dc26899cc5e Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:28 -0700 Subject: selftests/bpf: Use pid_t consistently in test_progs.c Use pid_t rather than __pid_t when allocating memory for 'worker_pids' in 'struct test_env', as this is its declared type and also avoids compile errors seen building against musl libc on mipsel64: test_progs.c:1738:49: error: '__pid_t' undeclared (first use in this function); did you mean 'pid_t'? 1738 | env.worker_pids = calloc(sizeof(__pid_t), env.workers); | ^~~~~~~ | pid_t test_progs.c:1738:49: note: each undeclared identifier is reported only once for each function it appears in Fixes: 91b2c0afd00c ("selftests/bpf: Add parallelism to test_progs") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Acked-by: Geliang Tang Link: https://lore.kernel.org/bpf/c6447da51a94babc1931711a43e2ceecb135c93d.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/test_progs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 89ff704e9dad..60c5ec0f6abf 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -1731,7 +1731,7 @@ int main(int argc, char **argv) /* launch workers if requested */ env.worker_id = -1; /* main process */ if (env.workers) { - env.worker_pids = calloc(sizeof(__pid_t), env.workers); + env.worker_pids = calloc(sizeof(pid_t), env.workers); env.worker_socks = calloc(sizeof(int), env.workers); if (env.debug) fprintf(stdout, "Launching %d workers.\n", env.workers); -- cgit v1.2.3 From d393f9479d4aaab0fa4c3caf513f28685e831f13 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:29 -0700 Subject: selftests/bpf: Fix compile error from rlim_t in sk_storage_map.c Cast 'rlim_t' argument to match expected type of printf() format and avoid compile errors seen building for mips64el/musl-libc: In file included from map_tests/sk_storage_map.c:20: map_tests/sk_storage_map.c: In function 'test_sk_storage_map_stress_free': map_tests/sk_storage_map.c:414:56: error: format '%lu' expects argument of type 'long unsigned int', but argument 2 has type 'rlim_t' {aka 'long long unsigned int'} [-Werror=format=] 414 | CHECK(err, "setrlimit(RLIMIT_NOFILE)", "rlim_new:%lu errno:%d", | ^~~~~~~~~~~~~~~~~~~~~~~ 415 | rlim_new.rlim_cur, errno); | ~~~~~~~~~~~~~~~~~ | | | rlim_t {aka long long unsigned int} ./test_maps.h:12:24: note: in definition of macro 'CHECK' 12 | printf(format); \ | ^~~~~~ map_tests/sk_storage_map.c:414:68: note: format string is defined here 414 | CHECK(err, "setrlimit(RLIMIT_NOFILE)", "rlim_new:%lu errno:%d", | ~~^ | | | long unsigned int | %llu cc1: all warnings being treated as errors Fixes: 51a0e301a563 ("bpf: Add BPF_MAP_TYPE_SK_STORAGE test to test_maps") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1e00a1fa7acf91b4ca135c4102dc796d518bad86.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/map_tests/sk_storage_map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/map_tests/sk_storage_map.c b/tools/testing/selftests/bpf/map_tests/sk_storage_map.c index 18405c3b7cee..af10c309359a 100644 --- a/tools/testing/selftests/bpf/map_tests/sk_storage_map.c +++ 
b/tools/testing/selftests/bpf/map_tests/sk_storage_map.c @@ -412,7 +412,7 @@ static void test_sk_storage_map_stress_free(void) rlim_new.rlim_max = rlim_new.rlim_cur + 128; err = setrlimit(RLIMIT_NOFILE, &rlim_new); CHECK(err, "setrlimit(RLIMIT_NOFILE)", "rlim_new:%lu errno:%d", - rlim_new.rlim_cur, errno); + (unsigned long) rlim_new.rlim_cur, errno); } err = do_sk_storage_map_stress_free(); -- cgit v1.2.3 From 7b10f0c227ce3fa055d601f058dc411092a62a78 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:30 -0700 Subject: selftests/bpf: Fix error compiling bpf_iter_setsockopt.c with musl libc Existing code calls getsockname() with a 'struct sockaddr_in6 *' argument where a 'struct sockaddr *' argument is declared, yielding compile errors when building for mips64el/musl-libc: bpf_iter_setsockopt.c: In function 'get_local_port': bpf_iter_setsockopt.c:98:30: error: passing argument 2 of 'getsockname' from incompatible pointer type [-Werror=incompatible-pointer-types] 98 | if (!getsockname(fd, &addr, &addrlen)) | ^~~~~ | | | struct sockaddr_in6 * In file included from .../netinet/in.h:10, from .../arpa/inet.h:9, from ./test_progs.h:17, from bpf_iter_setsockopt.c:5: .../sys/socket.h:391:23: note: expected 'struct sockaddr * restrict' but argument is of type 'struct sockaddr_in6 *' 391 | int getsockname (int, struct sockaddr *__restrict, socklen_t *__restrict); | ^ cc1: all warnings being treated as errors This compiled under glibc only because the argument is declared to be a "funky" transparent union which includes both types above. Explicitly cast the argument to allow compiling for both musl and glibc. Fixes: eed92afdd14c ("bpf: selftest: Test batching and bpf_(get|set)sockopt in bpf tcp iter") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Acked-by: Geliang Tang Link: https://lore.kernel.org/bpf/f41def0f17b27a23b1709080e4e3f37f4cc11ca9.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/bpf_iter_setsockopt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter_setsockopt.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter_setsockopt.c index b52ff8ce34db..16bed9dd8e6a 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter_setsockopt.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter_setsockopt.c @@ -95,7 +95,7 @@ static unsigned short get_local_port(int fd) struct sockaddr_in6 addr; socklen_t addrlen = sizeof(addr); - if (!getsockname(fd, &addr, &addrlen)) + if (!getsockname(fd, (struct sockaddr *)&addr, &addrlen)) return ntohs(addr.sin6_port); return 0; -- cgit v1.2.3 From 69f409469c9b1515a5db40d5a36fda372376fa2d Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:31 -0700 Subject: selftests/bpf: Drop unneeded error.h includes The addition of general support for unprivileged tests in test_loader.c breaks building test_verifier on non-glibc (e.g. musl) systems, due to the inclusion of glibc extension '<error.h>' in 'unpriv_helpers.c'. However, the header is actually not needed, so remove it to restore building. Similarly for sk_lookup.c and flow_dissector.c, error.h is not necessary and causes problems, so drop them.
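As an aside, not part of the patch: error(3) is a glibc extension with no musl counterpart, so a test that genuinely needed its behavior, rather than merely including the header, would have to carry a portable fallback. A minimal sketch under that assumption, with an illustrative helper name and without glibc's program-name prefix or printf-style formatting:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Simplified, portable stand-in for glibc's error(3): print msg and
 * the decoded errnum to stderr, then exit when status is non-zero.
 */
static void error_compat(int status, int errnum, const char *msg)
{
	fprintf(stderr, "%s: %s\n", msg, errnum ? strerror(errnum) : "");
	if (status)
		exit(status);
}

None of the files touched here needed that much; dropping the include was sufficient.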
Fixes: 1d56ade032a4 ("selftests/bpf: Unprivileged tests for test_loader.c") Fixes: 0ab5539f8584 ("selftests/bpf: Tests for BPF_SK_LOOKUP attach point") Fixes: 0905beec9f52 ("selftests/bpf: run flow dissector tests in skb-less mode") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/5664367edf5fea4f3f4b4aec3b182bcfc6edff9c.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/flow_dissector.c | 1 - tools/testing/selftests/bpf/prog_tests/sk_lookup.c | 1 - tools/testing/selftests/bpf/unpriv_helpers.c | 1 - 3 files changed, 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index 9e5f38739104..9625e6d21791 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include #include -#include #include #include diff --git a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c index 96f3863da8bc..023c31bde229 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/tools/testing/selftests/bpf/unpriv_helpers.c b/tools/testing/selftests/bpf/unpriv_helpers.c index b6d016461fb0..220f6a963813 100644 --- a/tools/testing/selftests/bpf/unpriv_helpers.c +++ b/tools/testing/selftests/bpf/unpriv_helpers.c @@ -2,7 +2,6 @@ #include #include -#include #include #include #include -- cgit v1.2.3 From d44c93fc2f5a0c47b23fa03d374e45259abd92d2 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:34 -0700 Subject: selftests/bpf: Fix missing ARRAY_SIZE() definition in bench.c Add a "bpf_util.h" include to avoid the following error seen compiling for mips64el with musl libc: bench.c: In function 'find_benchmark': bench.c:590:25: error: implicit declaration of function 'ARRAY_SIZE' [-Werror=implicit-function-declaration] 590 | for (i = 0; i < ARRAY_SIZE(benchs); i++) { | ^~~~~~~~~~ cc1: all warnings being treated as errors Fixes: 8e7c2a023ac0 ("selftests/bpf: Add benchmark runner infrastructure") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/bc4dde77dfcd17a825d8f28f72f3292341966810.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/bench.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index 627b74ae041b..90dc3aca32bd 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -10,6 +10,7 @@ #include #include #include "bench.h" +#include "bpf_util.h" #include "testing_helpers.h" struct env env = { -- cgit v1.2.3 From a2c155131b710959beb508ca6a54769b6b1bd488 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:35 -0700 Subject: selftests/bpf: Fix missing UINT_MAX definitions in benchmarks Include in 'bench.h' to provide a UINT_MAX definition and avoid multiple compile errors against mips64el/musl-libc like: benchs/bench_local_storage.c: In function 'parse_arg': benchs/bench_local_storage.c:40:38: error: 'UINT_MAX' undeclared (first use in this function) 40 | if (ret < 1 || ret > UINT_MAX) { | ^~~~~~~~ benchs/bench_local_storage.c:11:1: note: 'UINT_MAX' is defined in header 
'<limits.h>'; did you forget to '#include <limits.h>'? 10 | #include +++ |+#include <limits.h> 11 | seen with bench_local_storage.c, bench_local_storage_rcu_tasks_trace.c, and bench_bpf_hashmap_lookup.c. Fixes: 73087489250d ("selftests/bpf: Add benchmark for local_storage get") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/8f64a9d9fcff40a7fca090a65a68a9b62a468e16.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/bench.h | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bench.h b/tools/testing/selftests/bpf/bench.h index 68180d8f8558..005c401b3e22 100644 --- a/tools/testing/selftests/bpf/bench.h +++ b/tools/testing/selftests/bpf/bench.h @@ -10,6 +10,7 @@ #include #include #include +#include <limits.h> struct cpu_set { bool *cpus; -- cgit v1.2.3 From 6495eb79ca7d15bd87c38d77307e8f9b6b7bf4ef Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:36 -0700 Subject: selftests/bpf: Fix missing BUILD_BUG_ON() declaration Explicitly include '<linux/build_bug.h>' to fix errors seen compiling with gcc targeting mips64el/musl-libc: user_ringbuf.c: In function 'test_user_ringbuf_loop': user_ringbuf.c:426:9: error: implicit declaration of function 'BUILD_BUG_ON' [-Werror=implicit-function-declaration] 426 | BUILD_BUG_ON(total_samples <= c_max_entries); | ^~~~~~~~~~~~ cc1: all warnings being treated as errors Fixes: e5a9df51c746 ("selftests/bpf: Add selftests validating the user ringbuf") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/b28575f9221ec54871c46a2e87612bb4bbf46ccd.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/user_ringbuf.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c b/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c index e51721df14fc..dfff6feac12c 100644 --- a/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c +++ b/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c @@ -4,6 +4,7 @@ #define _GNU_SOURCE #include #include +#include <linux/build_bug.h> #include #include #include -- cgit v1.2.3 From 21f0b0af977203220ad58aff95e372151288ec47 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:37 -0700 Subject: selftests/bpf: Fix include of <sys/fcntl.h> Update ns_current_pid_tgid.c to use '#include <fcntl.h>' and avoid compile error against mips64el/musl libc: In file included from .../prog_tests/ns_current_pid_tgid.c:14: .../include/sys/fcntl.h:1:2: error: #warning redirecting incorrect #include <sys/fcntl.h> to <fcntl.h> [-Werror=cpp] 1 | #warning redirecting incorrect #include <sys/fcntl.h> to <fcntl.h> | ^~~~~~~ cc1: all warnings being treated as errors Fixes: 09c02d553c49 ("bpf, selftests: Fold test_current_pid_tgid_new_ns into test_progs.") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/8bdc869749177b575025bf69600a4ce591822609.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c index e72d75d6baa7..c29787e092d6 100644 --- a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c +++ b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c @@ -11,7 +11,7 @@ #include #include #include -#include <sys/fcntl.h> +#include <fcntl.h> #include "network_helpers.h" #define STACK_SIZE (1024 * 1024) -- cgit v1.2.3 From
4c329b99ef9c118343379bde9f97e8ce5cac9fc9 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:38 -0700 Subject: selftests/bpf: Fix compiling parse_tcp_hdr_opt.c with musl-libc The GNU version of 'struct tcphdr', with members 'doff' and 'urg_ptr', is not exposed by musl headers unless _GNU_SOURCE is defined. Add this definition to fix errors seen compiling for mips64el/musl-libc: parse_tcp_hdr_opt.c:18:21: error: 'struct tcphdr' has no member named 'urg_ptr' 18 | .pk6_v6.tcp.urg_ptr = 123, | ^~~~~~~ parse_tcp_hdr_opt.c:19:21: error: 'struct tcphdr' has no member named 'doff' 19 | .pk6_v6.tcp.doff = 9, /* 16 bytes of options */ | ^~~~ Fixes: cfa7b011894d ("selftests/bpf: tests for using dynptrs to parse skb and xdp buffers") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/ac5440213c242c62cb4e0d9e0a9cd5058b6a31f6.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/parse_tcp_hdr_opt.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/parse_tcp_hdr_opt.c b/tools/testing/selftests/bpf/prog_tests/parse_tcp_hdr_opt.c index daa952711d8f..e9c07d561ded 100644 --- a/tools/testing/selftests/bpf/prog_tests/parse_tcp_hdr_opt.c +++ b/tools/testing/selftests/bpf/prog_tests/parse_tcp_hdr_opt.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE #include #include #include "test_parse_tcp_hdr_opt.skel.h" -- cgit v1.2.3 From bae9a5ce7d3a9b3a9e07b31ab9e9c58450e3e9fd Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:39 -0700 Subject: selftests/bpf: Fix compiling kfree_skb.c with musl-libc The GNU version of 'struct tcphdr' with member 'doff' is not exposed by musl headers unless _GNU_SOURCE is defined. Add this definition to fix errors seen compiling for mips64el/musl-libc: In file included from kfree_skb.c:2: kfree_skb.c: In function 'on_sample': kfree_skb.c:45:30: error: 'struct tcphdr' has no member named 'doff' 45 | if (CHECK(pkt_v6->tcp.doff != 5, "check_tcp", | ^ Fixes: 580d656d80cf ("selftests/bpf: Add kfree_skb raw_tp test") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/e2d8cedc790959c10d6822a51f01a7a3616bea1b.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/kfree_skb.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/kfree_skb.c b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c index c07991544a78..34f8822fd221 100644 --- a/tools/testing/selftests/bpf/prog_tests/kfree_skb.c +++ b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE #include #include #include "kfree_skb.skel.h" -- cgit v1.2.3 From 5e4c43bcb85973243d7274e0058b6e8f5810e4f7 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:40 -0700 Subject: selftests/bpf: Fix compiling flow_dissector.c with musl-libc The GNU version of 'struct tcphdr' has members 'doff', 'source' and 'dest', which are not exposed by musl libc headers unless _GNU_SOURCE is defined. 
Add this definition to fix errors seen compiling for mips64el/musl-libc: flow_dissector.c:118:30: error: 'struct tcphdr' has no member named 'doff' 118 | .tcp.doff = 5, | ^~~~ flow_dissector.c:119:30: error: 'struct tcphdr' has no member named 'source' 119 | .tcp.source = 80, | ^~~~~~ flow_dissector.c:120:30: error: 'struct tcphdr' has no member named 'dest' 120 | .tcp.dest = 8080, | ^~~~ Fixes: ae173a915785 ("selftests/bpf: support BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/8f7ab21a73f678f9cebd32b26c444a686e57414d.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/flow_dissector.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index 9625e6d21791..3171047414a7 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE #include #include #include -- cgit v1.2.3 From 18826fb0b79c3c3cd1fe765d85f9c6f1a902c722 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:41 -0700 Subject: selftests/bpf: Fix compiling tcp_rtt.c with musl-libc The GNU version of 'struct tcp_info' in 'netinet/tcp.h' is not exposed by musl headers unless _GNU_SOURCE is defined. Add this definition to fix errors seen compiling for mips64el/musl-libc: tcp_rtt.c: In function 'wait_for_ack': tcp_rtt.c:24:25: error: storage size of 'info' isn't known 24 | struct tcp_info info; | ^~~~ tcp_rtt.c:24:25: error: unused variable 'info' [-Werror=unused-variable] cc1: all warnings being treated as errors Fixes: 1f4f80fed217 ("selftests/bpf: test_progs: convert test_tcp_rtt") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/f2329767b15df206f08a5776d35a47c37da855ae.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/tcp_rtt.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c index f2b99d95d916..c38784c1c066 100644 --- a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c +++ b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE #include #include "cgroup_helpers.h" #include "network_helpers.h" -- cgit v1.2.3 From debfa4f628f271f72933bf38d581cc53cfe1def5 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:42 -0700 Subject: selftests/bpf: Fix compiling core_reloc.c with musl-libc The type 'loff_t' is a GNU extension and not exposed by the musl 'fcntl.h' header unless _GNU_SOURCE is defined. 
Add this definition to fix errors seen compiling for mips64el/musl-libc: In file included from tools/testing/selftests/bpf/prog_tests/core_reloc.c:4: ./bpf_testmod/bpf_testmod.h:10:9: error: unknown type name 'loff_t' 10 | loff_t off; | ^~~~~~ ./bpf_testmod/bpf_testmod.h:16:9: error: unknown type name 'loff_t' 16 | loff_t off; | ^~~~~~ Fixes: 6bcd39d366b6 ("selftests/bpf: Add CO-RE relocs selftest relying on kernel module BTF") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/11c3af75a7eb6bcb7ad9acfae6a6f470c572eb82.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/core_reloc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index 47f42e680105..26019313e1fc 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE #include #include "progs/core_reloc_types.h" #include "bpf_testmod/bpf_testmod.h" -- cgit v1.2.3 From 27c4797ce51c8dd51e35e68e9024a892f62d78b2 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:43 -0700 Subject: selftests/bpf: Fix errors compiling lwt_redirect.c with musl libc Remove a redundant include of '' which is already provided in 'lwt_helpers.h'. This avoids errors seen compiling for mips64el/musl-libc: In file included from .../arpa/inet.h:9, from lwt_redirect.c:51: .../netinet/in.h:23:8: error: redefinition of 'struct in6_addr' 23 | struct in6_addr { | ^~~~~~~~ In file included from .../linux/icmp.h:24, from lwt_redirect.c:50: .../linux/in6.h:33:8: note: originally defined here 33 | struct in6_addr { | ^~~~~~~~ .../netinet/in.h:34:8: error: redefinition of 'struct sockaddr_in6' 34 | struct sockaddr_in6 { | ^~~~~~~~~~~~ .../linux/in6.h:50:8: note: originally defined here 50 | struct sockaddr_in6 { | ^~~~~~~~~~~~ .../netinet/in.h:42:8: error: redefinition of 'struct ipv6_mreq' 42 | struct ipv6_mreq { | ^~~~~~~~~ .../linux/in6.h:60:8: note: originally defined here 60 | struct ipv6_mreq { | ^~~~~~~~~ Fixes: 43a7c3ef8a15 ("selftests/bpf: Add lwt_xmit tests for BPF_REDIRECT") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/3869dda876d5206d2f8d4dd67331c739ceb0c7f8.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/lwt_redirect.c | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/lwt_redirect.c b/tools/testing/selftests/bpf/prog_tests/lwt_redirect.c index 835a1d756c16..b6e8d822e8e9 100644 --- a/tools/testing/selftests/bpf/prog_tests/lwt_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/lwt_redirect.c @@ -47,7 +47,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From 1b00f355130a5dfc38a01ad02458ae2cb2ebe609 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:44 -0700 Subject: selftests/bpf: Fix errors compiling decap_sanity.c with musl libc Remove a redundant include of '', whose needed definitions are already provided by 'test_progs.h'. 
This avoids errors seen compiling for mips64el/musl-libc: In file included from .../arpa/inet.h:9, from ./test_progs.h:17, from prog_tests/decap_sanity.c:9: .../netinet/in.h:23:8: error: redefinition of 'struct in6_addr' 23 | struct in6_addr { | ^~~~~~~~ In file included from decap_sanity.c:7: .../linux/in6.h:33:8: note: originally defined here 33 | struct in6_addr { | ^~~~~~~~ .../netinet/in.h:34:8: error: redefinition of 'struct sockaddr_in6' 34 | struct sockaddr_in6 { | ^~~~~~~~~~~~ .../linux/in6.h:50:8: note: originally defined here 50 | struct sockaddr_in6 { | ^~~~~~~~~~~~ .../netinet/in.h:42:8: error: redefinition of 'struct ipv6_mreq' 42 | struct ipv6_mreq { | ^~~~~~~~~ .../linux/in6.h:60:8: note: originally defined here 60 | struct ipv6_mreq { | ^~~~~~~~~ Fixes: 70a00e2f1dba ("selftests/bpf: Test bpf_skb_adjust_room on CHECKSUM_PARTIAL") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/e986ba2d7edccd254b54f7cd049b98f10bafa8c3.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/decap_sanity.c | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/decap_sanity.c b/tools/testing/selftests/bpf/prog_tests/decap_sanity.c index dcb9e5070cc3..d79f398ec6b7 100644 --- a/tools/testing/selftests/bpf/prog_tests/decap_sanity.c +++ b/tools/testing/selftests/bpf/prog_tests/decap_sanity.c @@ -4,7 +4,6 @@ #include #include #include -#include <linux/in6.h> #include "test_progs.h" #include "network_helpers.h" -- cgit v1.2.3 From 9822be702fe6e1c3e0933ef4b68a8c56683d930d Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:45 -0700 Subject: selftests/bpf: Fix errors compiling crypto_sanity.c with musl libc Remove a redundant include of '<linux/in6.h>', whose needed definitions are already provided by 'test_progs.h'.
This avoids errors seen compiling for mips64el/musl-libc: In file included from .../arpa/inet.h:9, from ./test_progs.h:17, from prog_tests/crypto_sanity.c:10: .../netinet/in.h:23:8: error: redefinition of 'struct in6_addr' 23 | struct in6_addr { | ^~~~~~~~ In file included from crypto_sanity.c:7: .../linux/in6.h:33:8: note: originally defined here 33 | struct in6_addr { | ^~~~~~~~ .../netinet/in.h:34:8: error: redefinition of 'struct sockaddr_in6' 34 | struct sockaddr_in6 { | ^~~~~~~~~~~~ .../linux/in6.h:50:8: note: originally defined here 50 | struct sockaddr_in6 { | ^~~~~~~~~~~~ .../netinet/in.h:42:8: error: redefinition of 'struct ipv6_mreq' 42 | struct ipv6_mreq { | ^~~~~~~~~ .../linux/in6.h:60:8: note: originally defined here 60 | struct ipv6_mreq { | ^~~~~~~~~ Fixes: 91541ab192fc ("selftests: bpf: crypto skcipher algo selftests") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Reviewed-by: Vadim Fedorenko Link: https://lore.kernel.org/bpf/911293968f424ad7b462d8805aeb3baee8f4985b.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/crypto_sanity.c | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/crypto_sanity.c b/tools/testing/selftests/bpf/prog_tests/crypto_sanity.c index b1a3a49a822a..42bd07f7218d 100644 --- a/tools/testing/selftests/bpf/prog_tests/crypto_sanity.c +++ b/tools/testing/selftests/bpf/prog_tests/crypto_sanity.c @@ -4,7 +4,6 @@ #include #include #include -#include #include #include "test_progs.h" -- cgit v1.2.3 From 730561d3c08d4a327cceaabf11365958a1c00cec Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 22 Jul 2024 22:54:46 -0700 Subject: selftests/bpf: Fix errors compiling cg_storage_multi.h with musl libc Remove a redundant include of '', whose needed definitions are already included (via '') in cg_storage_multi_egress_only.c, cg_storage_multi_isolated.c, and cg_storage_multi_shared.c. 
This avoids redefinition errors seen compiling for mips64el/musl-libc like: In file included from progs/cg_storage_multi_egress_only.c:13: In file included from progs/cg_storage_multi.h:6: In file included from /usr/mips64el-linux-gnuabi64/include/asm/types.h:23: /usr/include/asm-generic/int-l64.h:29:25: error: typedef redefinition with different types ('long' vs 'long long') 29 | typedef __signed__ long __s64; | ^ /usr/include/asm-generic/int-ll64.h:30:44: note: previous definition is here 30 | __extension__ typedef __signed__ long long __s64; | ^ Fixes: 9e5bd1f7633b ("selftests/bpf: Test CGROUP_STORAGE map can't be used by multiple progs") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/4f4702e9f6115b7f84fea01b2326ca24c6df7ba8.1721713597.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/progs/cg_storage_multi.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/cg_storage_multi.h b/tools/testing/selftests/bpf/progs/cg_storage_multi.h index a0778fe7857a..41d59f0ee606 100644 --- a/tools/testing/selftests/bpf/progs/cg_storage_multi.h +++ b/tools/testing/selftests/bpf/progs/cg_storage_multi.h @@ -3,8 +3,6 @@ #ifndef __PROGS_CG_STORAGE_MULTI_H #define __PROGS_CG_STORAGE_MULTI_H -#include - struct cgroup_value { __u32 egress_pkts; __u32 ingress_pkts; -- cgit v1.2.3 From 04a94133f1b3cccb19e056c26f056c50b4e5b3b1 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Wed, 24 Jul 2024 12:14:58 -0500 Subject: libbpf: Don't take direct pointers into BTF data from st_ops In struct bpf_struct_ops, we have take a pointer to a BTF type name, and a struct btf_type. This was presumably done for convenience, but can actually result in subtle and confusing bugs given that BTF data can be invalidated before a program is loaded. For example, in sched_ext, we may sometimes resize a data section after a skeleton has been opened, but before the struct_ops scheduler map has been loaded. This may cause the BTF data to be realloc'd, which can then cause a UAF when loading the program because the struct_ops map has pointers directly into the BTF data. We're already storing the BTF type_id in struct bpf_struct_ops. Because type_id is stable, we can therefore just update the places where we were looking at those pointers to instead do the lookups we need from the type_id. Fixes: 590a00888250 ("bpf: libbpf: Add STRUCT_OPS support") Signed-off-by: David Vernet Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240724171459.281234-1-void@manifault.com --- tools/lib/bpf/libbpf.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index a3be6f8fac09..e55353887439 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -496,8 +496,6 @@ struct bpf_program { }; struct bpf_struct_ops { - const char *tname; - const struct btf_type *type; struct bpf_program **progs; __u32 *kern_func_off; /* e.g. 
struct tcp_congestion_ops in bpf_prog's btf format */ @@ -1083,11 +1081,14 @@ static int bpf_object_adjust_struct_ops_autoload(struct bpf_object *obj) continue; for (j = 0; j < obj->nr_maps; ++j) { + const struct btf_type *type; + map = &obj->maps[j]; if (!bpf_map__is_struct_ops(map)) continue; - vlen = btf_vlen(map->st_ops->type); + type = btf__type_by_id(obj->btf, map->st_ops->type_id); + vlen = btf_vlen(type); for (k = 0; k < vlen; ++k) { slot_prog = map->st_ops->progs[k]; if (prog != slot_prog) @@ -1121,8 +1122,8 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map) int err; st_ops = map->st_ops; - type = st_ops->type; - tname = st_ops->tname; + type = btf__type_by_id(btf, st_ops->type_id); + tname = btf__name_by_offset(btf, type->name_off); err = find_struct_ops_kern_types(obj, tname, &mod_btf, &kern_type, &kern_type_id, &kern_vtype, &kern_vtype_id, @@ -1423,8 +1424,6 @@ static int init_struct_ops_maps(struct bpf_object *obj, const char *sec_name, memcpy(st_ops->data, data->d_buf + vsi->offset, type->size); - st_ops->tname = tname; - st_ops->type = type; st_ops->type_id = type_id; pr_debug("struct_ops init: struct %s(type_id=%u) %s found at offset %u\n", @@ -8445,11 +8444,13 @@ static int bpf_object__resolve_externs(struct bpf_object *obj, static void bpf_map_prepare_vdata(const struct bpf_map *map) { + const struct btf_type *type; struct bpf_struct_ops *st_ops; __u32 i; st_ops = map->st_ops; - for (i = 0; i < btf_vlen(st_ops->type); i++) { + type = btf__type_by_id(map->obj->btf, st_ops->type_id); + for (i = 0; i < btf_vlen(type); i++) { struct bpf_program *prog = st_ops->progs[i]; void *kern_data; int prog_fd; @@ -9712,6 +9713,7 @@ static struct bpf_map *find_struct_ops_map_by_offset(struct bpf_object *obj, static int bpf_object__collect_st_ops_relos(struct bpf_object *obj, Elf64_Shdr *shdr, Elf_Data *data) { + const struct btf_type *type; const struct btf_member *member; struct bpf_struct_ops *st_ops; struct bpf_program *prog; @@ -9771,13 +9773,14 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj, } insn_idx = sym->st_value / BPF_INSN_SZ; - member = find_member_by_offset(st_ops->type, moff * 8); + type = btf__type_by_id(btf, st_ops->type_id); + member = find_member_by_offset(type, moff * 8); if (!member) { pr_warn("struct_ops reloc %s: cannot find member at moff %u\n", map->name, moff); return -EINVAL; } - member_idx = member - btf_members(st_ops->type); + member_idx = member - btf_members(type); name = btf__name_by_offset(btf, member->name_off); if (!resolve_func_ptr(btf, member->type, NULL)) { -- cgit v1.2.3 From 0bfdda9db88990539b63a11028c84c4c069e8a0f Mon Sep 17 00:00:00 2001 From: "Alexis Lothoré (eBPF Foundation)" Date: Sun, 21 Jul 2024 21:33:03 +0200 Subject: selftests/bpf: Update xdp_redirect_map prog sections for libbpf MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit xdp_redirect_map.c is a bpf program used by test_xdp_veth.sh, which is not handled by the generic test runner (test_progs). To allow converting this test to test_progs, the corresponding program must be updated to allow handling it through skeletons generated by bpftool and libbpf. Update programs section names to allow to manipulate those with libbpf. 
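To illustrate why the section names matter: libbpf derives the program type from the SEC() prefix at open time, so "xdp" maps to BPF_PROG_TYPE_XDP while an arbitrary name like "redirect_map_0" maps to no known type and generic loading fails. A sketch of the loader-side pattern the recognized sections enable; the function and variable names here are illustrative, not taken from the patch:

#include <bpf/libbpf.h>

/* Open a BPF object, load it, and attach one program as XDP. This
 * works without specifying a program type by hand only because
 * SEC("xdp") already told libbpf what the programs are.
 */
static struct bpf_link *attach_prog_xdp(const char *obj_path,
					const char *prog_name, int ifindex)
{
	struct bpf_object *obj = bpf_object__open_file(obj_path, NULL);
	struct bpf_program *prog;
	struct bpf_link *link;

	if (!obj || bpf_object__load(obj))
		goto err;
	prog = bpf_object__find_program_by_name(obj, prog_name);
	if (!prog)
		goto err;
	link = bpf_program__attach_xdp(prog, ifindex);
	if (link)
		return link; /* caller keeps obj alive, closes both later */
err:
	bpf_object__close(obj);
	return NULL;
}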
Signed-off-by: Alexis Lothoré (eBPF Foundation) Signed-off-by: Daniel Borkmann Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20240721-convert_test_xdp_veth-v4-1-23bdba21b2f9@bootlin.com Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/progs/xdp_redirect_map.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/xdp_redirect_map.c b/tools/testing/selftests/bpf/progs/xdp_redirect_map.c index d037262c8937..682dda8dabbc 100644 --- a/tools/testing/selftests/bpf/progs/xdp_redirect_map.c +++ b/tools/testing/selftests/bpf/progs/xdp_redirect_map.c @@ -10,19 +10,19 @@ struct { __uint(value_size, sizeof(int)); } tx_port SEC(".maps"); -SEC("redirect_map_0") +SEC("xdp") int xdp_redirect_map_0(struct xdp_md *xdp) { return bpf_redirect_map(&tx_port, 0, 0); } -SEC("redirect_map_1") +SEC("xdp") int xdp_redirect_map_1(struct xdp_md *xdp) { return bpf_redirect_map(&tx_port, 1, 0); } -SEC("redirect_map_2") +SEC("xdp") int xdp_redirect_map_2(struct xdp_md *xdp) { return bpf_redirect_map(&tx_port, 2, 0); -- cgit v1.2.3 From 41b01a0271fd0387171eb9ad4692c22e37c8c80a Mon Sep 17 00:00:00 2001 From: "Alexis Lothoré (eBPF Foundation)" Date: Sun, 21 Jul 2024 21:33:04 +0200 Subject: selftests/bpf: Integrate test_xdp_veth into test_progs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_xdp_veth.sh tests that XDP return codes work as expected, by bringing up multiple veth pairs isolated in different namespaces, attaching specific xdp programs to each interface, and ensuring that the whole chain allows to ping one end interface from the first one. The test runs well but is currently not integrated in test_progs, which prevents it from being run automatically in the CI infrastructure. Rewrite it as a C test relying on libbpf to allow running it in the CI infrastructure. The new code brings up the same network infrastructure and reuses the same eBPF programs as test_xdp_veth.sh, for which skeletons are already generated by the bpf tests makefile. 
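The skeleton lifecycle the new test relies on is the standard test_progs pattern; condensed to a single program it looks roughly like the sketch below. This is schematic rather than the test code itself, with error paths trimmed and ifindex assumed to be resolved already:

#include "test_progs.h"
#include "xdp_tx.skel.h"

static void example(int ifindex)
{
	struct xdp_tx *skel = xdp_tx__open_and_load();

	if (!ASSERT_OK_PTR(skel, "xdp_tx__open_and_load"))
		return;
	/* Stashing the link in the skeleton lets destroy() detach it. */
	skel->links.xdp_tx = bpf_program__attach_xdp(skel->progs.xdp_tx,
						     ifindex);
	if (ASSERT_OK_PTR(skel->links.xdp_tx, "attach xdp_tx")) {
		/* ... exercise traffic here ... */
	}
	xdp_tx__destroy(skel);
}

The full test below applies the same open/attach/destroy sequence to three skeletons at once and wires the interfaces together through the tx_port map.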
Signed-off-by: Alexis Lothoré (eBPF Foundation) Signed-off-by: Daniel Borkmann Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20240721-convert_test_xdp_veth-v4-2-23bdba21b2f9@bootlin.com Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/Makefile | 1 - .../selftests/bpf/prog_tests/test_xdp_veth.c | 213 +++++++++++++++++++++ tools/testing/selftests/bpf/test_xdp_veth.sh | 121 ------------ 3 files changed, 213 insertions(+), 122 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/test_xdp_veth.c delete mode 100755 tools/testing/selftests/bpf/test_xdp_veth.sh (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 083f413a9c98..774c6270e377 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -115,7 +115,6 @@ TEST_PROGS := test_kmod.sh \ test_xdp_redirect.sh \ test_xdp_redirect_multi.sh \ test_xdp_meta.sh \ - test_xdp_veth.sh \ test_tunnel.sh \ test_lwt_seg6local.sh \ test_lirc_mode2.sh \ diff --git a/tools/testing/selftests/bpf/prog_tests/test_xdp_veth.c b/tools/testing/selftests/bpf/prog_tests/test_xdp_veth.c new file mode 100644 index 000000000000..8d75424fe6bc --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_xdp_veth.c @@ -0,0 +1,213 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* Create 3 namespaces with 3 veth peers, and forward packets in-between using + * native XDP + * + * XDP_TX + * NS1(veth11) NS2(veth22) NS3(veth33) + * | | | + * | | | + * (veth1, (veth2, (veth3, + * id:111) id:122) id:133) + * ^ | ^ | ^ | + * | | XDP_REDIRECT | | XDP_REDIRECT | | + * | ------------------ ------------------ | + * ----------------------------------------- + * XDP_REDIRECT + */ + +#define _GNU_SOURCE +#include +#include "test_progs.h" +#include "network_helpers.h" +#include "xdp_dummy.skel.h" +#include "xdp_redirect_map.skel.h" +#include "xdp_tx.skel.h" + +#define VETH_PAIRS_COUNT 3 +#define NS_SUFFIX_LEN 6 +#define VETH_NAME_MAX_LEN 16 +#define IP_SRC "10.1.1.11" +#define IP_DST "10.1.1.33" +#define IP_CMD_MAX_LEN 128 + +struct skeletons { + struct xdp_dummy *xdp_dummy; + struct xdp_tx *xdp_tx; + struct xdp_redirect_map *xdp_redirect_maps; +}; + +struct veth_configuration { + char local_veth[VETH_NAME_MAX_LEN]; /* Interface in main namespace */ + char remote_veth[VETH_NAME_MAX_LEN]; /* Peer interface in dedicated namespace*/ + const char *namespace; /* Namespace for the remote veth */ + char next_veth[VETH_NAME_MAX_LEN]; /* Local interface to redirect traffic to */ + char *remote_addr; /* IP address of the remote veth */ +}; + +static struct veth_configuration config[VETH_PAIRS_COUNT] = { + { + .local_veth = "veth1", + .remote_veth = "veth11", + .next_veth = "veth2", + .remote_addr = IP_SRC, + .namespace = "ns-veth11" + }, + { + .local_veth = "veth2", + .remote_veth = "veth22", + .next_veth = "veth3", + .remote_addr = NULL, + .namespace = "ns-veth22" + }, + { + .local_veth = "veth3", + .remote_veth = "veth33", + .next_veth = "veth1", + .remote_addr = IP_DST, + .namespace = "ns-veth33" + } +}; + +static int attach_programs_to_veth_pair(struct skeletons *skeletons, int index) +{ + struct bpf_program *local_prog, *remote_prog; + struct bpf_link **local_link, **remote_link; + struct nstoken *nstoken; + struct bpf_link *link; + int interface; + + switch (index) { + case 0: + local_prog = skeletons->xdp_redirect_maps->progs.xdp_redirect_map_0; + local_link = &skeletons->xdp_redirect_maps->links.xdp_redirect_map_0; + remote_prog = 
skeletons->xdp_dummy->progs.xdp_dummy_prog; + remote_link = &skeletons->xdp_dummy->links.xdp_dummy_prog; + break; + case 1: + local_prog = skeletons->xdp_redirect_maps->progs.xdp_redirect_map_1; + local_link = &skeletons->xdp_redirect_maps->links.xdp_redirect_map_1; + remote_prog = skeletons->xdp_tx->progs.xdp_tx; + remote_link = &skeletons->xdp_tx->links.xdp_tx; + break; + case 2: + local_prog = skeletons->xdp_redirect_maps->progs.xdp_redirect_map_2; + local_link = &skeletons->xdp_redirect_maps->links.xdp_redirect_map_2; + remote_prog = skeletons->xdp_dummy->progs.xdp_dummy_prog; + remote_link = &skeletons->xdp_dummy->links.xdp_dummy_prog; + break; + } + interface = if_nametoindex(config[index].local_veth); + if (!ASSERT_NEQ(interface, 0, "non zero interface index")) + return -1; + link = bpf_program__attach_xdp(local_prog, interface); + if (!ASSERT_OK_PTR(link, "attach xdp program to local veth")) + return -1; + *local_link = link; + nstoken = open_netns(config[index].namespace); + if (!ASSERT_OK_PTR(nstoken, "switch to remote veth namespace")) + return -1; + interface = if_nametoindex(config[index].remote_veth); + if (!ASSERT_NEQ(interface, 0, "non zero interface index")) { + close_netns(nstoken); + return -1; + } + link = bpf_program__attach_xdp(remote_prog, interface); + *remote_link = link; + close_netns(nstoken); + if (!ASSERT_OK_PTR(link, "attach xdp program to remote veth")) + return -1; + + return 0; +} + +static int configure_network(struct skeletons *skeletons) +{ + int interface_id; + int map_fd; + int err; + int i = 0; + + /* First create and configure all interfaces */ + for (i = 0; i < VETH_PAIRS_COUNT; i++) { + SYS(fail, "ip netns add %s", config[i].namespace); + SYS(fail, "ip link add %s type veth peer name %s netns %s", + config[i].local_veth, config[i].remote_veth, config[i].namespace); + SYS(fail, "ip link set dev %s up", config[i].local_veth); + if (config[i].remote_addr) + SYS(fail, "ip -n %s addr add %s/24 dev %s", config[i].namespace, + config[i].remote_addr, config[i].remote_veth); + SYS(fail, "ip -n %s link set dev %s up", config[i].namespace, + config[i].remote_veth); + } + + /* Then configure the redirect map and attach programs to interfaces */ + map_fd = bpf_map__fd(skeletons->xdp_redirect_maps->maps.tx_port); + if (!ASSERT_GE(map_fd, 0, "open redirect map")) + goto fail; + for (i = 0; i < VETH_PAIRS_COUNT; i++) { + interface_id = if_nametoindex(config[i].next_veth); + if (!ASSERT_NEQ(interface_id, 0, "non zero interface index")) + goto fail; + err = bpf_map_update_elem(map_fd, &i, &interface_id, BPF_ANY); + if (!ASSERT_OK(err, "configure interface redirection through map")) + goto fail; + if (attach_programs_to_veth_pair(skeletons, i)) + goto fail; + } + + return 0; + +fail: + return -1; +} + +static void cleanup_network(void) +{ + int i; + + /* Deleting namespaces is enough to automatically remove veth pairs as well + */ + for (i = 0; i < VETH_PAIRS_COUNT; i++) + SYS_NOFAIL("ip netns del %s", config[i].namespace); +} + +static int check_ping(struct skeletons *skeletons) +{ + /* Test: if all interfaces are properly configured, we must be able to ping + * veth33 from veth11 + */ + return SYS_NOFAIL("ip netns exec %s ping -c 1 -W 1 %s > /dev/null", + config[0].namespace, IP_DST); +} + +void test_xdp_veth_redirect(void) +{ + struct skeletons skeletons = {}; + + skeletons.xdp_dummy = xdp_dummy__open_and_load(); + if (!ASSERT_OK_PTR(skeletons.xdp_dummy, "xdp_dummy__open_and_load")) + return; + + skeletons.xdp_tx = xdp_tx__open_and_load(); + if 
(!ASSERT_OK_PTR(skeletons.xdp_tx, "xdp_tx__open_and_load")) + goto destroy_xdp_dummy; + + skeletons.xdp_redirect_maps = xdp_redirect_map__open_and_load(); + if (!ASSERT_OK_PTR(skeletons.xdp_redirect_maps, "xdp_redirect_map__open_and_load")) + goto destroy_xdp_tx; + + if (configure_network(&skeletons)) + goto destroy_xdp_redirect_map; + + ASSERT_OK(check_ping(&skeletons), "ping"); + +destroy_xdp_redirect_map: + xdp_redirect_map__destroy(skeletons.xdp_redirect_maps); +destroy_xdp_tx: + xdp_tx__destroy(skeletons.xdp_tx); +destroy_xdp_dummy: + xdp_dummy__destroy(skeletons.xdp_dummy); + + cleanup_network(); +} diff --git a/tools/testing/selftests/bpf/test_xdp_veth.sh b/tools/testing/selftests/bpf/test_xdp_veth.sh deleted file mode 100755 index 5211ca9a0239..000000000000 --- a/tools/testing/selftests/bpf/test_xdp_veth.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -# -# Create 3 namespaces with 3 veth peers, and -# forward packets in-between using native XDP -# -# XDP_TX -# NS1(veth11) NS2(veth22) NS3(veth33) -# | | | -# | | | -# (veth1, (veth2, (veth3, -# id:111) id:122) id:133) -# ^ | ^ | ^ | -# | | XDP_REDIRECT | | XDP_REDIRECT | | -# | ------------------ ------------------ | -# ----------------------------------------- -# XDP_REDIRECT - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -TESTNAME=xdp_veth -BPF_FS=$(awk '$3 == "bpf" {print $2; exit}' /proc/mounts) -BPF_DIR=$BPF_FS/test_$TESTNAME -readonly NS1="ns1-$(mktemp -u XXXXXX)" -readonly NS2="ns2-$(mktemp -u XXXXXX)" -readonly NS3="ns3-$(mktemp -u XXXXXX)" - -_cleanup() -{ - set +e - ip link del veth1 2> /dev/null - ip link del veth2 2> /dev/null - ip link del veth3 2> /dev/null - ip netns del ${NS1} 2> /dev/null - ip netns del ${NS2} 2> /dev/null - ip netns del ${NS3} 2> /dev/null - rm -rf $BPF_DIR 2> /dev/null -} - -cleanup_skip() -{ - echo "selftests: $TESTNAME [SKIP]" - _cleanup - - exit $ksft_skip -} - -cleanup() -{ - if [ "$?" = 0 ]; then - echo "selftests: $TESTNAME [PASS]" - else - echo "selftests: $TESTNAME [FAILED]" - fi - _cleanup -} - -if [ $(id -u) -ne 0 ]; then - echo "selftests: $TESTNAME [SKIP] Need root privileges" - exit $ksft_skip -fi - -if ! ip link set dev lo xdp off > /dev/null 2>&1; then - echo "selftests: $TESTNAME [SKIP] Could not run test without the ip xdp support" - exit $ksft_skip -fi - -if [ -z "$BPF_FS" ]; then - echo "selftests: $TESTNAME [SKIP] Could not run test without bpffs mounted" - exit $ksft_skip -fi - -if ! 
bpftool version > /dev/null 2>&1; then - echo "selftests: $TESTNAME [SKIP] Could not run test without bpftool" - exit $ksft_skip -fi - -set -e - -trap cleanup_skip EXIT - -ip netns add ${NS1} -ip netns add ${NS2} -ip netns add ${NS3} - -ip link add veth1 index 111 type veth peer name veth11 netns ${NS1} -ip link add veth2 index 122 type veth peer name veth22 netns ${NS2} -ip link add veth3 index 133 type veth peer name veth33 netns ${NS3} - -ip link set veth1 up -ip link set veth2 up -ip link set veth3 up - -ip -n ${NS1} addr add 10.1.1.11/24 dev veth11 -ip -n ${NS3} addr add 10.1.1.33/24 dev veth33 - -ip -n ${NS1} link set dev veth11 up -ip -n ${NS2} link set dev veth22 up -ip -n ${NS3} link set dev veth33 up - -mkdir $BPF_DIR -bpftool prog loadall \ - xdp_redirect_map.bpf.o $BPF_DIR/progs type xdp \ - pinmaps $BPF_DIR/maps -bpftool map update pinned $BPF_DIR/maps/tx_port key 0 0 0 0 value 122 0 0 0 -bpftool map update pinned $BPF_DIR/maps/tx_port key 1 0 0 0 value 133 0 0 0 -bpftool map update pinned $BPF_DIR/maps/tx_port key 2 0 0 0 value 111 0 0 0 -ip link set dev veth1 xdp pinned $BPF_DIR/progs/xdp_redirect_map_0 -ip link set dev veth2 xdp pinned $BPF_DIR/progs/xdp_redirect_map_1 -ip link set dev veth3 xdp pinned $BPF_DIR/progs/xdp_redirect_map_2 - -ip -n ${NS1} link set dev veth11 xdp obj xdp_dummy.bpf.o sec xdp -ip -n ${NS2} link set dev veth22 xdp obj xdp_tx.bpf.o sec xdp -ip -n ${NS3} link set dev veth33 xdp obj xdp_dummy.bpf.o sec xdp - -trap cleanup EXIT - -ip netns exec ${NS1} ping -c 1 -W 1 10.1.1.33 - -exit 0 -- cgit v1.2.3 From ba71ffb660e4d41fe589f2459fb888ea61fdb310 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Wed, 24 Jul 2024 22:22:14 -0500 Subject: selftests/bpf: Load struct_ops map in global_maps_resize test In prog_tests/test_global_maps_resize.c, we test various use cases for resizing global maps. Commit 7244100e0389 ("libbpf: Don't take direct pointers into BTF data from st_ops") updated libbpf to not store pointers to volatile BTF data, which for some users, was causing a UAF when resizing a datasec array. Let's ensure we have coverage for resizing datasec arrays with struct_ops progs by also including a struct_ops map and struct_ops prog in the test_global_map_resize skeleton. The map is automatically loaded, so we don't need to do anything other than add it to the BPF prog being tested to get the coverage. 
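As a side note, the resizing under test is driven from userspace through libbpf's map-resize API. A minimal sketch of that flow, assuming an object file and datasec name purely for illustration (neither is taken from this patch):

```c
#include <bpf/libbpf.h>

/* Hypothetical driver: resize a global datasec array before load */
int resize_and_load(void)
{
	struct bpf_object *obj;
	struct bpf_map *datasec;
	int err = -1;

	obj = bpf_object__open_file("test_global_map_resize.bpf.o", NULL);
	if (!obj)
		return -1;

	/* ".data.array" is an assumed datasec name backing the global array */
	datasec = bpf_object__find_map_by_name(obj, ".data.array");
	if (!datasec)
		goto out;

	/* Doubling the value size grows the array; libbpf patches the BTF */
	err = bpf_map__set_value_size(datasec, 2 * bpf_map__value_size(datasec));
	if (err)
		goto out;

	/* struct_ops maps in the object are loaded here, giving the coverage */
	err = bpf_object__load(obj);
out:
	bpf_object__close(obj);
	return err;
}
```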
Signed-off-by: David Vernet Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240725032214.50676-1-void@manifault.com --- .../testing/selftests/bpf/progs/test_global_map_resize.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/test_global_map_resize.c b/tools/testing/selftests/bpf/progs/test_global_map_resize.c index 1fbb73d3e5d5..714b29c7f8b2 100644 --- a/tools/testing/selftests/bpf/progs/test_global_map_resize.c +++ b/tools/testing/selftests/bpf/progs/test_global_map_resize.c @@ -3,6 +3,7 @@ #include "vmlinux.h" #include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> char _license[] SEC("license") = "GPL"; @@ -60,3 +61,18 @@ int data_array_sum(void *ctx) return 0; } + +SEC("struct_ops/test_1") +int BPF_PROG(test_1) +{ + return 0; +} + +struct bpf_testmod_ops { + int (*test_1)(void); +}; + +SEC(".struct_ops.link") +struct bpf_testmod_ops st_ops_resize = { + .test_1 = (void *)test_1 +}; -- cgit v1.2.3 From 781f0bbbdade00f91490a3f64212e8cc8d75905c Mon Sep 17 00:00:00 2001 From: Zhu Jun Date: Wed, 24 Jul 2024 04:11:20 -0700 Subject: tools/bpf: Fix the wrong format specifier The format specifier of "unsigned int" in printf() should be "%u", not "%d". Signed-off-by: Zhu Jun Signed-off-by: Andrii Nakryiko Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20240724111120.11625-1-zhujun2@cmss.chinamobile.com --- tools/bpf/bpftool/xlated_dumper.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/xlated_dumper.c b/tools/bpf/bpftool/xlated_dumper.c index 567f56dfd9f1..d0094345fb2b 100644 --- a/tools/bpf/bpftool/xlated_dumper.c +++ b/tools/bpf/bpftool/xlated_dumper.c @@ -349,7 +349,7 @@ void dump_xlated_plain(struct dump_data *dd, void *buf, unsigned int len, double_insn = insn[i].code == (BPF_LD | BPF_IMM | BPF_DW); - printf("% 4d: ", i); + printf("%4u: ", i); print_bpf_insn(&cbs, insn + i, true); if (opcodes) { @@ -415,7 +415,7 @@ void dump_xlated_for_graph(struct dump_data *dd, void *buf_start, void *buf_end, } } - printf("%d: ", insn_off); + printf("%u: ", insn_off); print_bpf_insn(&cbs, cur, true); if (opcodes) { -- cgit v1.2.3 From c0247800ee7da5bc067b2916cf2722f755072307 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 29 Jul 2024 02:24:17 -0700 Subject: selftests/bpf: Use portable POSIX basename() Use the POSIX version of basename() to allow compilation against non-gnu libc (e.g. musl). Include <libgen.h> ahead of <string.h> to enable using functions from the latter while preferring POSIX over GNU basename(). In veristat.c, rely on strdupa() to avoid basename() altering the passed "const char" argument. This is not needed in xskxceiver.c since the arg is mutable and the program exits immediately after usage. Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/0fd3c9f3c605e6cba33504213c9df287817ade04.1722244708.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/veristat.c | 8 +++++--- tools/testing/selftests/bpf/xskxceiver.c | 1 + 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index b2854238d4a0..11ec1190d582 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -2,6 +2,7 @@ /* Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
*/ #define _GNU_SOURCE #include +#include <libgen.h> #include #include #include @@ -988,8 +989,8 @@ skip_freplace_fixup: static int process_prog(const char *filename, struct bpf_object *obj, struct bpf_program *prog) { + const char *base_filename = basename(strdupa(filename)); const char *prog_name = bpf_program__name(prog); - const char *base_filename = basename(filename); char *buf; int buf_sz, log_level; struct verif_stats *stats; @@ -1056,13 +1057,14 @@ static int process_prog(const char *filename, struct bpf_object *obj, struct bpf static int process_obj(const char *filename) { + const char *base_filename = basename(strdupa(filename)); struct bpf_object *obj = NULL, *tobj; struct bpf_program *prog, *tprog, *lprog; libbpf_print_fn_t old_libbpf_print_fn; LIBBPF_OPTS(bpf_object_open_opts, opts); int err = 0, prog_cnt = 0; - if (!should_process_file_prog(basename(filename), NULL)) { + if (!should_process_file_prog(base_filename, NULL)) { if (env.verbose) printf("Skipping '%s' due to filters...\n", filename); env.files_skipped++; @@ -1076,7 +1078,7 @@ static int process_obj(const char *filename) } if (!env.quiet && env.out_fmt == RESFMT_TABLE) - printf("Processing '%s'...\n", basename(filename)); + printf("Processing '%s'...\n", base_filename); old_libbpf_print_fn = libbpf_set_print(libbpf_print_fn); obj = bpf_object__open_file(filename, &opts); diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c index 8144fd145237..92af633faea8 100644 --- a/tools/testing/selftests/bpf/xskxceiver.c +++ b/tools/testing/selftests/bpf/xskxceiver.c @@ -90,6 +90,7 @@ #include #include #include +#include <libgen.h> #include #include #include -- cgit v1.2.3 From 03bfcda1fbc37ef34aa21d2b9e09138335afc6ee Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 29 Jul 2024 02:24:18 -0700 Subject: selftests/bpf: Fix arg parsing in veristat, test_progs Current code parses arguments with strtok_r() using a construct like char *state = NULL; while ((next = strtok_r(state ? NULL : input, ",", &state))) { ... } where logic assumes the 'state' var can distinguish between first and subsequent strtok_r() calls, and adjusts parameters accordingly. However, 'state' is strictly internal context for strtok_r() and no such assumptions are supported in the man page. Moreover, the exact behaviour of 'state' depends on the libc implementation, making the above code fragile. Indeed, invoking "./test_progs -t <test_name>" on mips64el/musl will hang, with the above code in an infinite loop. Similarly, we see strange behaviour running 'veristat' on mips64el/musl: $ ./veristat -e file,prog,verdict,insns -C two-ok add-failure Can't specify more than 9 stats Rewrite code using a counter to distinguish between strtok_r() calls.
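To make the pattern concrete, here is a small self-contained sketch of the counter-based loop the patch switches to; the token list is made up for the demo:

```c
#include <stdio.h>
#include <string.h>

int main(void)
{
	char input[] = "file,prog,verdict,insns";	/* made-up token list */
	char *state = NULL, *next;
	int cnt = 0;

	/* Pass the buffer only on the first call, NULL afterwards; 'state'
	 * stays opaque to the caller, as POSIX intends.
	 */
	while ((next = strtok_r(cnt++ ? NULL : input, ",", &state)))
		printf("token: %s\n", next);

	return 0;
}
```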
Fixes: 61ddff373ffa ("selftests/bpf: Improve by-name subtest selection logic in prog_tests") Fixes: 394169b079b5 ("selftests/bpf: add comparison mode to veristat") Fixes: c8bc5e050976 ("selftests/bpf: Add veristat tool for mass-verifying BPF object files") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/392d8bf5559f85fa37926c1494e62312ef252c3d.1722244708.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/testing_helpers.c | 4 ++-- tools/testing/selftests/bpf/veristat.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c index ac7c66f4fc7b..c217e12bd9da 100644 --- a/tools/testing/selftests/bpf/testing_helpers.c +++ b/tools/testing/selftests/bpf/testing_helpers.c @@ -221,13 +221,13 @@ int parse_test_list(const char *s, bool is_glob_pattern) { char *input, *state = NULL, *test_spec; - int err = 0; + int err = 0, cnt = 0; input = strdup(s); if (!input) return -ENOMEM; - while ((test_spec = strtok_r(state ? NULL : input, ",", &state))) { + while ((test_spec = strtok_r(cnt++ ? NULL : input, ",", &state))) { err = insert_test(set, test_spec, is_glob_pattern); if (err) break; diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index 11ec1190d582..1ec5c4c47235 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -785,13 +785,13 @@ static int parse_stat(const char *stat_name, struct stat_specs *specs) static int parse_stats(const char *stats_str, struct stat_specs *specs) { char *input, *state = NULL, *next; - int err; + int err, cnt = 0; input = strdup(stats_str); if (!input) return -ENOMEM; - while ((next = strtok_r(state ? NULL : input, ",", &state))) { + while ((next = strtok_r(cnt++ ? NULL : input, ",", &state))) { err = parse_stat(next, specs); if (err) { free(input); @@ -1495,7 +1495,7 @@ static int parse_stats_csv(const char *filename, struct stat_specs *specs, while (fgets(line, sizeof(line), f)) { char *input = line, *state = NULL, *next; struct verif_stats *st = NULL; - int col = 0; + int col = 0, cnt = 0; if (!header) { void *tmp; @@ -1513,7 +1513,7 @@ static int parse_stats_csv(const char *filename, struct stat_specs *specs, *stat_cntp += 1; } - while ((next = strtok_r(state ? NULL : input, ",\n", &state))) { + while ((next = strtok_r(cnt++ ? NULL : input, ",\n", &state))) { if (header) { /* for the first line, set up spec stats */ err = parse_stat(next, specs); -- cgit v1.2.3 From cacf2a5a78cd1f5f616eae043ebc6f024104b721 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 29 Jul 2024 02:24:19 -0700 Subject: selftests/bpf: Fix error compiling test_lru_map.c Although the post-increment in macro 'CPU_SET(next++, &cpuset)' seems safe, the sequencing can raise compile errors, so move the increment outside the macro. 
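A sketch of the hazard and the fix, assuming nothing beyond standard glibc/musl headers: CPU_SET() is a macro that is free to expand its cpu argument more than once, so side effects in that argument are unsafe.

```c
#define _GNU_SOURCE
#include <sched.h>

static void pick_cpu(cpu_set_t *cpuset, int *next)
{
	CPU_ZERO(cpuset);
	/* Fragile: CPU_SET(*next++, cpuset) may evaluate the increment
	 * twice, since the macro can repeat its arguments.
	 */
	CPU_SET(*next, cpuset);	/* safe: no side effect in the argument */
	(*next)++;		/* increment moved to its own statement */
}
```

Moving the increment out of the macro argument keeps the behaviour identical while removing the sequence-point ambiguity.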
This avoids an error seen using gcc 12.3.0 for mips64el/musl-libc: In file included from test_lru_map.c:11: test_lru_map.c: In function 'sched_next_online': test_lru_map.c:129:29: error: operation on 'next' may be undefined [-Werror=sequence-point] 129 | CPU_SET(next++, &cpuset); | ^ cc1: all warnings being treated as errors Fixes: 3fbfadce6012 ("bpf: Fix test_lru_sanity5() in test_lru_map.c") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/22993dfb11ccf27925a626b32672fd3324cb76c4.1722244708.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/test_lru_map.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_lru_map.c b/tools/testing/selftests/bpf/test_lru_map.c index 4d0650cfb5cd..fda7589c5023 100644 --- a/tools/testing/selftests/bpf/test_lru_map.c +++ b/tools/testing/selftests/bpf/test_lru_map.c @@ -126,7 +126,8 @@ static int sched_next_online(int pid, int *next_to_try) while (next < nr_cpus) { CPU_ZERO(&cpuset); - CPU_SET(next++, &cpuset); + CPU_SET(next, &cpuset); + next++; if (!sched_setaffinity(pid, sizeof(cpuset), &cpuset)) { ret = 0; break; -- cgit v1.2.3 From aa95073fd290b5b3e45f067fa22bb25e59e1ff7c Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 29 Jul 2024 02:24:20 -0700 Subject: selftests/bpf: Fix C++ compile error from missing _Bool type While building, bpftool makes a skeleton from test_core_extern.c, which itself includes <stdbool.h> and uses the 'bool' type. However, the skeleton test_core_extern.skel.h generated *does not* include <stdbool.h> or use the 'bool' type, instead using the C-only '_Bool' type. Compiling test_cpp.cpp with g++ 12.3 for mips64el/musl-libc then fails with error: In file included from test_cpp.cpp:9: test_core_extern.skel.h:45:17: error: '_Bool' does not name a type 45 | _Bool CONFIG_BOOL; | ^~~~~ This was likely missed previously because glibc uses a GNU extension for <stdbool.h> with C++ (#define _Bool bool), not supported by musl libc. Normally, a C fragment would include <stdbool.h> and use the 'bool' type, and thus cleanly work after import by C++. The ideal fix would be for 'bpftool gen skeleton' to output the correct type/include supporting C++, but in the meantime add a conditional define as above.
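A compact C++ illustration of the guard in action; the struct is a stand-in for the generated skeleton header, not code from this patch:

```cpp
// demo.cpp: map the C99 keyword to C++'s bool before the C header is seen
#ifndef _Bool
#define _Bool bool
#endif

// Stand-in for a 'bpftool gen skeleton' header that uses _Bool directly
struct skel_rodata {
	_Bool CONFIG_BOOL;
};

int main()
{
	skel_rodata data{};
	data.CONFIG_BOOL = true;
	return data.CONFIG_BOOL ? 0 : 1;
}
```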
Fixes: 7c8dce4b1661 ("bpftool: Make skeleton C code compilable with C++ compiler") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/6fc1dd28b8bda49e51e4f610bdc9d22f4455632d.1722244708.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/test_cpp.cpp | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_cpp.cpp b/tools/testing/selftests/bpf/test_cpp.cpp index dde0bb16e782..abc2a56ab261 100644 --- a/tools/testing/selftests/bpf/test_cpp.cpp +++ b/tools/testing/selftests/bpf/test_cpp.cpp @@ -6,6 +6,10 @@ #include #include #include + +#ifndef _Bool +#define _Bool bool +#endif #include "test_core_extern.skel.h" #include "struct_ops_module.skel.h" -- cgit v1.2.3 From 16b795cc59528cf280abc79af3c70bda42f715b9 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 29 Jul 2024 02:24:21 -0700 Subject: selftests/bpf: Fix redefinition errors compiling lwt_reroute.c Compiling lwt_reroute.c with GCC 12.3 for mips64el/musl-libc yields errors: In file included from .../include/arpa/inet.h:9, from ./test_progs.h:18, from tools/testing/selftests/bpf/prog_tests/lwt_helpers.h:11, from tools/testing/selftests/bpf/prog_tests/lwt_reroute.c:52: .../include/netinet/in.h:23:8: error: redefinition of 'struct in6_addr' 23 | struct in6_addr { | ^~~~~~~~ In file included from .../include/linux/icmp.h:24, from tools/testing/selftests/bpf/prog_tests/lwt_helpers.h:9: .../include/linux/in6.h:33:8: note: originally defined here 33 | struct in6_addr { | ^~~~~~~~ .../include/netinet/in.h:34:8: error: redefinition of 'struct sockaddr_in6' 34 | struct sockaddr_in6 { | ^~~~~~~~~~~~ .../include/linux/in6.h:50:8: note: originally defined here 50 | struct sockaddr_in6 { | ^~~~~~~~~~~~ .../include/netinet/in.h:42:8: error: redefinition of 'struct ipv6_mreq' 42 | struct ipv6_mreq { | ^~~~~~~~~ .../include/linux/in6.h:60:8: note: originally defined here 60 | struct ipv6_mreq { | ^~~~~~~~~ These errors occur because <linux/in6.h> is included before <netinet/in.h>, bypassing the Linux uapi/libc compat mechanism's partial musl support. As described in [1] and [2], fix these errors by including <netinet/in.h> in lwt_reroute.c before any uapi headers. [1]: commit c0bace798436 ("uapi libc compat: add fallback for unsupported libcs") [2]: https://git.musl-libc.org/cgit/musl/commit/?id=04983f227238 Fixes: 6c77997bc639 ("selftests/bpf: Add lwt_xmit tests for BPF_REROUTE") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/bd2908aec0755ba8b75f5dc41848b00585f5c73e.1722244708.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/lwt_reroute.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/lwt_reroute.c b/tools/testing/selftests/bpf/prog_tests/lwt_reroute.c index 03825d2b45a8..6c50c0f63f43 100644 --- a/tools/testing/selftests/bpf/prog_tests/lwt_reroute.c +++ b/tools/testing/selftests/bpf/prog_tests/lwt_reroute.c @@ -49,6 +49,7 @@ * is not crashed, it is considered successful. */ #define NETNS "ns_lwt_reroute" +#include <netinet/in.h> #include "lwt_helpers.h" #include "network_helpers.h" #include -- cgit v1.2.3 From c9a83e76b5a96801a2c7ea0a79ca77c356d8b38d Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 29 Jul 2024 02:24:22 -0700 Subject: selftests/bpf: Fix compile if backtrace support missing in libc Include GNU <execinfo.h> header only with glibc and provide weak, stubbed backtrace functions as a fallback in test_progs.c.
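The weak-linkage pattern relied on here, in a minimal standalone form (names illustrative; __weak in the selftests is a macro for the attribute spelled out below). Weak definitions act as a fallback: a real libc implementation, when present, takes precedence at link time.

```c
#include <stdio.h>

/* Fallback stub: records no stack frames */
__attribute__((weak)) int backtrace(void **buffer, int size)
{
	(void)buffer;
	(void)size;
	return 0;
}

/* Fallback stub: keeps the output format stable */
__attribute__((weak)) void backtrace_symbols_fd(void *const *buffer, int size, int fd)
{
	(void)buffer;
	(void)size;
	dprintf(fd, "\n");
}

int main(void)
{
	void *bt[16];
	int sz = backtrace(bt, 16);	/* 0 when only the stub is linked */

	backtrace_symbols_fd(bt, sz, 2 /* stderr */);
	return 0;
}
```

With the stubs in place the binary links and runs on libcs that lack backtrace support.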
This allows for non-GNU replacements while avoiding compile errors (e.g. with musl libc) like: test_progs.c:13:10: fatal error: execinfo.h: No such file or directory 13 | #include <execinfo.h> /* backtrace */ | ^~~~~~~~~~~~ test_progs.c: In function 'crash_handler': test_progs.c:1034:14: error: implicit declaration of function 'backtrace' [-Werror=implicit-function-declaration] 1034 | sz = backtrace(bt, ARRAY_SIZE(bt)); | ^~~~~~~~~ test_progs.c:1045:9: error: implicit declaration of function 'backtrace_symbols_fd' [-Werror=implicit-function-declaration] 1045 | backtrace_symbols_fd(bt, sz, STDERR_FILENO); | ^~~~~~~~~~~~~~~~~~~~ Fixes: 9fb156bb82a3 ("selftests/bpf: Print backtrace on SIGSEGV in test_progs") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/aa6dc8e23710cb457b278039d0081de7e7b4847d.1722244708.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/test_progs.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 60c5ec0f6abf..d5d0cb4eb197 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -10,7 +10,6 @@ #include #include #include -#include <execinfo.h> /* backtrace */ #include <sys/sysinfo.h> /* get_nprocs */ #include #include @@ -19,6 +18,21 @@ #include #include "json_writer.h" +#ifdef __GLIBC__ +#include <execinfo.h> /* backtrace */ +#endif + +/* Default backtrace funcs if missing at link */ +__weak int backtrace(void **buffer, int size) +{ + return 0; +} + +__weak void backtrace_symbols_fd(void *const *buffer, int size, int fd) +{ + dprintf(fd, "\n"); +} + static bool verbose(void) { return env.verbosity > VERBOSE_NONE; -- cgit v1.2.3 From 06eeca1217a8655975f340e0ead6a396df74a2a4 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 29 Jul 2024 02:24:23 -0700 Subject: selftests/bpf: Fix using stdout, stderr as struct field names Typically stdin, stdout, stderr are treated as reserved identifiers under ISO/ANSI C and libc implementations further define these as macros, both in glibc <stdio.h> and musl <stdio.h>. However, while glibc defines: ... /* Standard streams. */ extern FILE *stdin; /* Standard input stream. */ extern FILE *stdout; /* Standard output stream. */ extern FILE *stderr; /* Standard error output stream. */ /* C89/C99 say they're macros. Make them happy. */ #define stdin stdin #define stdout stdout #define stderr stderr ... musl instead uses (legally): ... extern FILE *const stdin; extern FILE *const stdout; extern FILE *const stderr; #define stdin (stdin) #define stdout (stdout) #define stderr (stderr) ... The latter results in compile errors when the names are reused as fields of 'struct test_env' and elsewhere in test_progs.[ch] and reg_bounds.c.
Rename the fields to stdout_saved and stderr_saved to avoid many errors seen building against musl, e.g.: In file included from test_progs.h:6, from test_progs.c:5: test_progs.c: In function 'print_test_result': test_progs.c:237:21: error: expected identifier before '(' token 237 | fprintf(env.stdout, "#%-*d %s:", TEST_NUM_WIDTH, test->test_num, test->test_name); | ^~~~~~ test_progs.c:237:9: error: too few arguments to function 'fprintf' 237 | fprintf(env.stdout, "#%-*d %s:", TEST_NUM_WIDTH, test->test_num, test->test_name); | ^~~~~~~ Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/ZqR2DuHdBXPX%2Fyx8@kodidev-ubuntu/ Link: https://lore.kernel.org/bpf/684ea17548e237f39dfb3f7a3d33450069015b21.1722244708.git.tony.ambardar@gmail.com --- .../testing/selftests/bpf/prog_tests/reg_bounds.c | 2 +- tools/testing/selftests/bpf/test_progs.c | 66 +++++++++++----------- tools/testing/selftests/bpf/test_progs.h | 8 +-- 3 files changed, 38 insertions(+), 38 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c index 0da4225749bd..467027236d30 100644 --- a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c +++ b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c @@ -1487,7 +1487,7 @@ static int verify_case_opt(struct ctx *ctx, enum num_t init_t, enum num_t cond_t u64 elapsed_ns = get_time_ns() - ctx->start_ns; double remain_ns = elapsed_ns / progress * (1 - progress); - fprintf(env.stderr, "PROGRESS (%s): %d/%d (%.2lf%%), " + fprintf(env.stderr_saved, "PROGRESS (%s): %d/%d (%.2lf%%), " "elapsed %llu mins (%.2lf hrs), " "ETA %.0lf mins (%.2lf hrs)\n", ctx->progress_ctx, diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index d5d0cb4eb197..60fafa2f1ed7 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -51,15 +51,15 @@ static void stdio_hijack_init(char **log_buf, size_t *log_cnt) stdout = open_memstream(log_buf, log_cnt); if (!stdout) { - stdout = env.stdout; + stdout = env.stdout_saved; perror("open_memstream"); return; } if (env.subtest_state) - env.subtest_state->stdout = stdout; + env.subtest_state->stdout_saved = stdout; else - env.test_state->stdout = stdout; + env.test_state->stdout_saved = stdout; stderr = stdout; #endif @@ -73,8 +73,8 @@ static void stdio_hijack(char **log_buf, size_t *log_cnt) return; } - env.stdout = stdout; - env.stderr = stderr; + env.stdout_saved = stdout; + env.stderr_saved = stderr; stdio_hijack_init(log_buf, log_cnt); #endif @@ -91,13 +91,13 @@ static void stdio_restore_cleanup(void) fflush(stdout); if (env.subtest_state) { - fclose(env.subtest_state->stdout); - env.subtest_state->stdout = NULL; - stdout = env.test_state->stdout; - stderr = env.test_state->stdout; + fclose(env.subtest_state->stdout_saved); + env.subtest_state->stdout_saved = NULL; + stdout = env.test_state->stdout_saved; + stderr = env.test_state->stdout_saved; } else { - fclose(env.test_state->stdout); - env.test_state->stdout = NULL; + fclose(env.test_state->stdout_saved); + env.test_state->stdout_saved = NULL; } #endif } @@ -110,13 +110,13 @@ static void stdio_restore(void) return; } - if (stdout == env.stdout) + if (stdout == env.stdout_saved) return; stdio_restore_cleanup(); - stdout = env.stdout; - stderr = env.stderr; + stdout = env.stdout_saved; + stderr = env.stderr_saved; #endif } @@ -244,25 +244,25 @@ static void print_test_result(const struct 
prog_test_def *test, const struct tes int skipped_cnt = test_state->skip_cnt; int subtests_cnt = test_state->subtest_num; - fprintf(env.stdout, "#%-*d %s:", TEST_NUM_WIDTH, test->test_num, test->test_name); + fprintf(env.stdout_saved, "#%-*d %s:", TEST_NUM_WIDTH, test->test_num, test->test_name); if (test_state->error_cnt) - fprintf(env.stdout, "FAIL"); + fprintf(env.stdout_saved, "FAIL"); else if (!skipped_cnt) - fprintf(env.stdout, "OK"); + fprintf(env.stdout_saved, "OK"); else if (skipped_cnt == subtests_cnt || !subtests_cnt) - fprintf(env.stdout, "SKIP"); + fprintf(env.stdout_saved, "SKIP"); else - fprintf(env.stdout, "OK (SKIP: %d/%d)", skipped_cnt, subtests_cnt); + fprintf(env.stdout_saved, "OK (SKIP: %d/%d)", skipped_cnt, subtests_cnt); - fprintf(env.stdout, "\n"); + fprintf(env.stdout_saved, "\n"); } static void print_test_log(char *log_buf, size_t log_cnt) { log_buf[log_cnt] = '\0'; - fprintf(env.stdout, "%s", log_buf); + fprintf(env.stdout_saved, "%s", log_buf); if (log_buf[log_cnt - 1] != '\n') - fprintf(env.stdout, "\n"); + fprintf(env.stdout_saved, "\n"); } static void print_subtest_name(int test_num, int subtest_num, @@ -273,14 +273,14 @@ static void print_subtest_name(int test_num, int subtest_num, snprintf(test_num_str, sizeof(test_num_str), "%d/%d", test_num, subtest_num); - fprintf(env.stdout, "#%-*s %s/%s", + fprintf(env.stdout_saved, "#%-*s %s/%s", TEST_NUM_WIDTH, test_num_str, test_name, subtest_name); if (result) - fprintf(env.stdout, ":%s", result); + fprintf(env.stdout_saved, ":%s", result); - fprintf(env.stdout, "\n"); + fprintf(env.stdout_saved, "\n"); } static void jsonw_write_log_message(json_writer_t *w, char *log_buf, size_t log_cnt) @@ -465,7 +465,7 @@ bool test__start_subtest(const char *subtest_name) memset(subtest_state, 0, sub_state_size); if (!subtest_name || !subtest_name[0]) { - fprintf(env.stderr, + fprintf(env.stderr_saved, "Subtest #%d didn't provide sub-test name!\n", state->subtest_num); return false; @@ -473,7 +473,7 @@ bool test__start_subtest(const char *subtest_name) subtest_state->name = strdup(subtest_name); if (!subtest_state->name) { - fprintf(env.stderr, + fprintf(env.stderr_saved, "Subtest #%d: failed to copy subtest name!\n", state->subtest_num); return false; @@ -1043,7 +1043,7 @@ void crash_handler(int signum) sz = backtrace(bt, ARRAY_SIZE(bt)); - if (env.stdout) + if (env.stdout_saved) stdio_restore(); if (env.test) { env.test_state->error_cnt++; @@ -1359,7 +1359,7 @@ static void calculate_summary_and_print_errors(struct test_env *env) if (env->json) { w = jsonw_new(env->json); if (!w) - fprintf(env->stderr, "Failed to create new JSON stream."); + fprintf(env->stderr_saved, "Failed to create new JSON stream."); } if (w) { @@ -1708,8 +1708,8 @@ int main(int argc, char **argv) return -1; } - env.stdout = stdout; - env.stderr = stderr; + env.stdout_saved = stdout; + env.stderr_saved = stderr; env.has_testmod = true; if (!env.list_test_names) { @@ -1717,7 +1717,7 @@ int main(int argc, char **argv) unload_bpf_testmod(verbose()); if (load_bpf_testmod(verbose())) { - fprintf(env.stderr, "WARNING! Selftests relying on bpf_testmod.ko will be skipped.\n"); + fprintf(env.stderr_saved, "WARNING! 
Selftests relying on bpf_testmod.ko will be skipped.\n"); env.has_testmod = false; } } @@ -1795,7 +1795,7 @@ int main(int argc, char **argv) } if (env.list_test_names) { - fprintf(env.stdout, "%s\n", test->test_name); + fprintf(env.stdout_saved, "%s\n", test->test_name); env.succ_cnt++; continue; } diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index b1e949fb16cf..cb9d6d46826b 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -75,7 +75,7 @@ struct subtest_state { bool skipped; bool filtered; - FILE *stdout; + FILE *stdout_saved; }; struct test_state { @@ -92,7 +92,7 @@ struct test_state { size_t log_cnt; char *log_buf; - FILE *stdout; + FILE *stdout_saved; }; struct test_env { @@ -111,8 +111,8 @@ struct test_env { struct test_state *test_state; /* current running test state */ struct subtest_state *subtest_state; /* current running subtest state */ - FILE *stdout; - FILE *stderr; + FILE *stdout_saved; + FILE *stderr_saved; int nr_cpus; FILE *json; -- cgit v1.2.3 From 21c5f4f55da759c7444a1ef13e90b6e6f674eeeb Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 29 Jul 2024 02:24:24 -0700 Subject: selftests/bpf: Fix error compiling tc_redirect.c with musl libc Linux 5.1 implemented 64-bit time types and related syscalls to address the Y2038 problem generally across archs. Userspace handling of Y2038 varies with the libc however. While musl libc uses 64-bit time across all 32-bit and 64-bit platforms, GNU glibc uses 64-bit time on 64-bit platforms but defaults to 32-bit time on 32-bit platforms unless they "opt-in" to 64-bit time or explicitly use 64-bit syscalls and time structures. One specific area is the standard setsockopt() call, SO_TIMESTAMPNS option used for timestamping, and the related output 'struct timespec'. GNU glibc defaults as above, also exposing the SO_TIMESTAMPNS_NEW flag to explicitly use a 64-bit call and 'struct __kernel_timespec'. Since these are not exposed or needed with musl libc, their use in tc_redirect.c leads to compile errors building for mips64el/musl: tc_redirect.c: In function 'rcv_tstamp': tc_redirect.c:425:32: error: 'SO_TIMESTAMPNS_NEW' undeclared (first use in this function); did you mean 'SO_TIMESTAMPNS'? 425 | cmsg->cmsg_type == SO_TIMESTAMPNS_NEW) | ^~~~~~~~~~~~~~~~~~ | SO_TIMESTAMPNS tc_redirect.c:425:32: note: each undeclared identifier is reported only once for each function it appears in tc_redirect.c: In function 'test_inet_dtime': tc_redirect.c:491:49: error: 'SO_TIMESTAMPNS_NEW' undeclared (first use in this function); did you mean 'SO_TIMESTAMPNS'? 491 | err = setsockopt(listen_fd, SOL_SOCKET, SO_TIMESTAMPNS_NEW, | ^~~~~~~~~~~~~~~~~~ | SO_TIMESTAMPNS However, using SO_TIMESTAMPNS_NEW isn't strictly needed, nor is Y2038 being explicitly tested. The timestamp checks in tc_redirect.c are simple: the packet receive timestamp is non-zero and processed/handled in less than 5 seconds. Switch to using the standard setsockopt() call and SO_TIMESTAMPNS option to ensure compatibility across glibc and musl libc. In the worst-case, there is a 5-second window 14 years from now where tc_redirect tests may fail on 32-bit systems. However, we should reasonably expect glibc to adopt a 64-bit mandate rather than the current "opt-in" policy before the Y2038 roll-over. 
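For reference, a sketch of the portable pattern the patch settles on: request receive timestamps with the plain option and read back an ordinary struct timespec from the control message. The function and its fd argument are illustrative, not code from the patch.

```c
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <time.h>

static int rx_tstamp_demo(int fd, char *buf, size_t len, struct timespec *ts)
{
	char ctl[CMSG_SPACE(sizeof(*ts))];
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = ctl,
		.msg_controllen = sizeof(ctl),
	};
	struct cmsghdr *cmsg;
	int one = 1;

	/* Plain option: compatible with glibc and musl alike */
	if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPNS, &one, sizeof(one)))
		return -1;
	if (recvmsg(fd, &msg, 0) < 0)
		return -1;

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_SOCKET &&
		    cmsg->cmsg_type == SO_TIMESTAMPNS) {
			memcpy(ts, CMSG_DATA(cmsg), sizeof(*ts));
			return 0;
		}
	}
	return -1;	/* no timestamp attached */
}
```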
Fixes: ce6f6cffaeaa ("selftests/bpf: Wait for the netstamp_needed_key static key to be turned on") Fixes: c803475fd8dd ("bpf: selftests: test skb->tstamp in redirect_neigh") Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/031d656c058b4e55ceae56ef49c4e1729b5090f3.1722244708.git.tony.ambardar@gmail.com --- tools/testing/selftests/bpf/prog_tests/tc_redirect.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c index 327d51f59142..53b8ffc943dc 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c @@ -471,7 +471,7 @@ static int set_forwarding(bool enable) static int __rcv_tstamp(int fd, const char *expected, size_t s, __u64 *tstamp) { - struct __kernel_timespec pkt_ts = {}; + struct timespec pkt_ts = {}; char ctl[CMSG_SPACE(sizeof(pkt_ts))]; struct timespec now_ts; struct msghdr msg = {}; @@ -495,7 +495,7 @@ static int __rcv_tstamp(int fd, const char *expected, size_t s, __u64 *tstamp) cmsg = CMSG_FIRSTHDR(&msg); if (cmsg && cmsg->cmsg_level == SOL_SOCKET && - cmsg->cmsg_type == SO_TIMESTAMPNS_NEW) + cmsg->cmsg_type == SO_TIMESTAMPNS) memcpy(&pkt_ts, CMSG_DATA(cmsg), sizeof(pkt_ts)); pkt_ns = pkt_ts.tv_sec * NSEC_PER_SEC + pkt_ts.tv_nsec; @@ -537,9 +537,9 @@ static int wait_netstamp_needed_key(void) if (!ASSERT_GE(srv_fd, 0, "start_server")) goto done; - err = setsockopt(srv_fd, SOL_SOCKET, SO_TIMESTAMPNS_NEW, + err = setsockopt(srv_fd, SOL_SOCKET, SO_TIMESTAMPNS, &opt, sizeof(opt)); - if (!ASSERT_OK(err, "setsockopt(SO_TIMESTAMPNS_NEW)")) + if (!ASSERT_OK(err, "setsockopt(SO_TIMESTAMPNS)")) goto done; cli_fd = connect_to_fd(srv_fd, TIMEOUT_MILLIS); @@ -621,9 +621,9 @@ static void test_inet_dtime(int family, int type, const char *addr, __u16 port) return; /* Ensure the kernel puts the (rcv) timestamp for all skb */ - err = setsockopt(listen_fd, SOL_SOCKET, SO_TIMESTAMPNS_NEW, + err = setsockopt(listen_fd, SOL_SOCKET, SO_TIMESTAMPNS, &opt, sizeof(opt)); - if (!ASSERT_OK(err, "setsockopt(SO_TIMESTAMPNS_NEW)")) + if (!ASSERT_OK(err, "setsockopt(SO_TIMESTAMPNS)")) goto done; if (type == SOCK_STREAM) { -- cgit v1.2.3 From 92cc2456e9775dc4333fb4aa430763ae4ac2f2d9 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 30 Jul 2024 16:18:05 -0700 Subject: selftests/bpf: fix RELEASE=1 compilation for sock_addr.c When building selftests with RELEASE=1 using GCC compiler, it complaints about uninitialized err. Fix the problem. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240730231805.1933923-1-andrii@kernel.org Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/prog_tests/sock_addr.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index b880c564a204..a6ee7f8d4f79 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -2642,6 +2642,7 @@ void test_sock_addr(void) break; default: ASSERT_TRUE(false, "Unknown sock addr test type"); + err = -EINVAL; break; } -- cgit v1.2.3 From ac93ca125b5409df56c5139648bbe10fd1ea989b Mon Sep 17 00:00:00 2001 From: Zhu Jun Date: Tue, 23 Jul 2024 19:46:36 -0700 Subject: tools: gpio: Fix the wrong format specifier The unsigned int should use "%u" instead of "%d". 
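The practical difference, as a tiny standalone example:

```c
#include <stdio.h>

int main(void)
{
	unsigned int line = 3000000000u;	/* exceeds INT_MAX */

	printf("%u\n", line);	/* correct: prints 3000000000 */
	printf("%d\n", line);	/* mismatched specifier: typically prints -1294967296 */
	return 0;
}
```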
Signed-off-by: Zhu Jun Link: https://lore.kernel.org/r/20240724024636.3634-1-zhujun2@cmss.chinamobile.com Signed-off-by: Bartosz Golaszewski --- tools/gpio/gpio-hammer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/gpio/gpio-hammer.c b/tools/gpio/gpio-hammer.c index 54fdf59dd320..ba0866eb3581 100644 --- a/tools/gpio/gpio-hammer.c +++ b/tools/gpio/gpio-hammer.c @@ -54,7 +54,7 @@ int hammer_device(const char *device_name, unsigned int *lines, int num_lines, fprintf(stdout, "Hammer lines ["); for (i = 0; i < num_lines; i++) { - fprintf(stdout, "%d", lines[i]); + fprintf(stdout, "%u", lines[i]); if (i != (num_lines - 1)) fprintf(stdout, ", "); } @@ -89,7 +89,7 @@ int hammer_device(const char *device_name, unsigned int *lines, int num_lines, fprintf(stdout, "["); for (i = 0; i < num_lines; i++) { - fprintf(stdout, "%d: %d", lines[i], + fprintf(stdout, "%u: %d", lines[i], gpiotools_test_bit(values.bits, i)); if (i != (num_lines - 1)) fprintf(stdout, ", "); -- cgit v1.2.3 From 30b968b002a92870325a5c9d1ce78eba0ce386e7 Mon Sep 17 00:00:00 2001 From: Yu-Ting Tseng Date: Tue, 9 Jul 2024 00:00:49 -0700 Subject: binder: frozen notification binder_features flag Add a flag to binder_features to indicate that the freeze notification feature is available. Signed-off-by: Yu-Ting Tseng Acked-by: Carlos Llamas Link: https://lore.kernel.org/r/20240709070047.4055369-6-yutingtseng@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/android/binderfs.c | 8 ++++++++ tools/testing/selftests/filesystems/binderfs/binderfs_test.c | 1 + 2 files changed, 9 insertions(+) (limited to 'tools') diff --git a/drivers/android/binderfs.c b/drivers/android/binderfs.c index 3001d754ac36..ad1fa7abc323 100644 --- a/drivers/android/binderfs.c +++ b/drivers/android/binderfs.c @@ -58,6 +58,7 @@ enum binderfs_stats_mode { struct binder_features { bool oneway_spam_detection; bool extended_error; + bool freeze_notification; }; static const struct constant_table binderfs_param_stats[] = { @@ -74,6 +75,7 @@ static const struct fs_parameter_spec binderfs_fs_parameters[] = { static struct binder_features binder_features = { .oneway_spam_detection = true, .extended_error = true, + .freeze_notification = true, }; static inline struct binderfs_info *BINDERFS_SB(const struct super_block *sb) @@ -608,6 +610,12 @@ static int init_binder_features(struct super_block *sb) if (IS_ERR(dentry)) return PTR_ERR(dentry); + dentry = binderfs_create_file(dir, "freeze_notification", + &binder_features_fops, + &binder_features.freeze_notification); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + return 0; } diff --git a/tools/testing/selftests/filesystems/binderfs/binderfs_test.c b/tools/testing/selftests/filesystems/binderfs/binderfs_test.c index 5f362c0fd890..319567f0fae1 100644 --- a/tools/testing/selftests/filesystems/binderfs/binderfs_test.c +++ b/tools/testing/selftests/filesystems/binderfs/binderfs_test.c @@ -65,6 +65,7 @@ static int __do_binderfs_test(struct __test_metadata *_metadata) static const char * const binder_features[] = { "oneway_spam_detection", "extended_error", + "freeze_notification", }; change_mountns(_metadata); -- cgit v1.2.3 From 45a0c928e7aa42caf2380b134bcd326b40a9df84 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Tue, 25 Jun 2024 02:13:39 +0800 Subject: perf trace: BTF-based enum pretty printing for syscall args MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In this patch, BTF is used to turn enum value to the corresponding name. 
There is only one system call that uses enum value as its argument, that is `landlock_add_rule()`. The vmlinux btf is loaded lazily, when user decided to trace the `landlock_add_rule` syscall. But if one decide to run `perf trace` without any arguments, the behaviour is to trace `landlock_add_rule`, so vmlinux btf will be loaded by default. The laziest behaviour is to load vmlinux btf when a `landlock_add_rule` syscall hits. But I think you could lose some samples when loading vmlinux btf at run time, for it can delay the handling of other samples. I might need your precious opinions on this... before: ``` perf $ ./perf trace -e landlock_add_rule 0.000 ( 0.008 ms): ldlck-test/438194 landlock_add_rule(rule_type: 2) = -1 EBADFD (File descriptor in bad state) 0.010 ( 0.001 ms): ldlck-test/438194 landlock_add_rule(rule_type: 1) = -1 EBADFD (File descriptor in bad state) ``` after: ``` perf $ ./perf trace -e landlock_add_rule 0.000 ( 0.029 ms): ldlck-test/438194 landlock_add_rule(rule_type: LANDLOCK_RULE_NET_PORT) = -1 EBADFD (File descriptor in bad state) 0.036 ( 0.004 ms): ldlck-test/438194 landlock_add_rule(rule_type: LANDLOCK_RULE_PATH_BENEATH) = -1 EBADFD (File descriptor in bad state) ``` Committer notes: Made it build with NO_LIBBPF=1, simplified btf_enum_fprintf(), see [1] for the discussion. Signed-off-by: Howard Chu Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Günther Noack Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Mickaël Salaün Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/lkml/20240613022757.3589783-1-howardchu95@gmail.com Link: https://lore.kernel.org/lkml/ZnXAhFflUl_LV1QY@x1 # [1] Link: https://lore.kernel.org/r/20240624181345.124764-3-howardchu95@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 110 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 106 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 8449f2beb54d..1391564911d9 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -19,6 +19,7 @@ #ifdef HAVE_LIBBPF_SUPPORT #include #include +#include #ifdef HAVE_BPF_SKEL #include "bpf_skel/augmented_raw_syscalls.skel.h" #endif @@ -110,6 +111,10 @@ struct syscall_arg_fmt { const char *name; u16 nr_entries; // for arrays bool show_zero; + bool is_enum; +#ifdef HAVE_LIBBPF_SUPPORT + const struct btf_type *type; +#endif }; struct syscall_fmt { @@ -139,6 +144,9 @@ struct trace { } syscalls; #ifdef HAVE_BPF_SKEL struct augmented_raw_syscalls_bpf *skel; +#endif +#ifdef HAVE_LIBBPF_SUPPORT + struct btf *btf; #endif struct record_opts opts; struct evlist *evlist; @@ -204,6 +212,20 @@ struct trace { } oe; }; +static void trace__load_vmlinux_btf(struct trace *trace __maybe_unused) +{ +#ifdef HAVE_LIBBPF_SUPPORT + if (trace->btf != NULL) + return; + + trace->btf = btf__load_vmlinux_btf(); + if (verbose > 0) { + fprintf(trace->output, trace->btf ? "vmlinux BTF loaded\n" : + "Failed to load vmlinux BTF\n"); + } +#endif +} + struct tp_field { int offset; union { @@ -887,6 +909,64 @@ static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size, #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags +#ifdef HAVE_LIBBPF_SUPPORT +static int syscall_arg_fmt__cache_btf_enum(struct syscall_arg_fmt *arg_fmt, struct btf *btf, char *type) +{ + int id; + + // Already cached? 
+ if (arg_fmt->type != NULL) + return 0; + + type = strstr(type, "enum "); + if (type == NULL) + return -1; + + type += 5; // skip "enum " to get the enumeration name + + id = btf__find_by_name(btf, type); + if (id < 0) + return -1; + + arg_fmt->type = btf__type_by_id(btf, id); + return arg_fmt->type == NULL ? -1 : 0; +} + +static size_t btf_enum_scnprintf(const struct btf_type *type, struct btf *btf, char *bf, size_t size, int val) +{ + struct btf_enum *be = btf_enum(type); + const int nr_entries = btf_vlen(type); + + for (int i = 0; i < nr_entries; ++i, ++be) { + if (be->val == val) { + return scnprintf(bf, size, "%s", + btf__name_by_offset(btf, be->name_off)); + } + } + + return 0; +} + +static size_t trace__btf_enum_scnprintf(struct trace *trace, struct syscall_arg_fmt *arg_fmt, char *bf, + size_t size, int val, char *type) +{ + if (trace->btf == NULL) + return 0; + + if (syscall_arg_fmt__cache_btf_enum(arg_fmt, trace->btf, type) < 0) + return 0; + + return btf_enum_scnprintf(arg_fmt->type, trace->btf, bf, size, val); +} +#else // HAVE_LIBBPF_SUPPORT +static size_t trace__btf_enum_scnprintf(struct trace *trace __maybe_unused, struct syscall_arg_fmt *arg_fmt __maybe_unused, + char *bf __maybe_unused, size_t size __maybe_unused, int val __maybe_unused, + char *type __maybe_unused) +{ + return 0; +} +#endif // HAVE_LIBBPF_SUPPORT + #define STRARRAY(name, array) \ { .scnprintf = SCA_STRARRAY, \ .strtoul = STUL_STRARRAY, \ @@ -1238,6 +1318,7 @@ struct syscall { bool is_exit; bool is_open; bool nonexistent; + bool use_btf; struct tep_format_field *args; const char *name; const struct syscall_fmt *fmt; @@ -1744,7 +1825,8 @@ static const struct syscall_arg_fmt *syscall_arg_fmt__find_by_name(const char *n } static struct tep_format_field * -syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field *field) +syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field *field, + bool *use_btf) { struct tep_format_field *last_field = NULL; int len; @@ -1756,6 +1838,7 @@ syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field continue; len = strlen(field->name); + arg->is_enum = false; if (strcmp(field->type, "const char *") == 0 && ((len >= 4 && strcmp(field->name + len - 4, "name") == 0) || @@ -1782,6 +1865,8 @@ syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field * 7 unsigned long */ arg->scnprintf = SCA_FD; + } else if (strstr(field->type, "enum") && use_btf != NULL) { + *use_btf = arg->is_enum = true; } else { const struct syscall_arg_fmt *fmt = syscall_arg_fmt__find_by_name(field->name); @@ -1798,7 +1883,8 @@ syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field static int syscall__set_arg_fmts(struct syscall *sc) { - struct tep_format_field *last_field = syscall_arg_fmt__init_array(sc->arg_fmt, sc->args); + struct tep_format_field *last_field = syscall_arg_fmt__init_array(sc->arg_fmt, sc->args, + &sc->use_btf); if (last_field) sc->args_size = last_field->offset + last_field->size; @@ -1811,6 +1897,7 @@ static int trace__read_syscall_info(struct trace *trace, int id) char tp_name[128]; struct syscall *sc; const char *name = syscalltbl__name(trace->sctbl, id); + int err; #ifdef HAVE_SYSCALL_TABLE_SUPPORT if (trace->syscalls.table == NULL) { @@ -1883,7 +1970,13 @@ static int trace__read_syscall_info(struct trace *trace, int id) sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit"); sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat"); - return 
syscall__set_arg_fmts(sc); + err = syscall__set_arg_fmts(sc); + + /* after calling syscall__set_arg_fmts() we'll know whether use_btf is true */ + if (sc->use_btf) + trace__load_vmlinux_btf(trace); + + return err; } static int evsel__init_tp_arg_scnprintf(struct evsel *evsel) @@ -1891,7 +1984,7 @@ static int evsel__init_tp_arg_scnprintf(struct evsel *evsel) struct syscall_arg_fmt *fmt = evsel__syscall_arg_fmt(evsel); if (fmt != NULL) { - syscall_arg_fmt__init_array(fmt, evsel->tp_format->format.fields); + syscall_arg_fmt__init_array(fmt, evsel->tp_format->format.fields, NULL); return 0; } @@ -2103,6 +2196,15 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size, if (trace->show_arg_names) printed += scnprintf(bf + printed, size - printed, "%s: ", field->name); + if (sc->arg_fmt[arg.idx].is_enum) { + size_t p = trace__btf_enum_scnprintf(trace, &sc->arg_fmt[arg.idx], bf + printed, + size - printed, val, field->type); + if (p) { + printed += p; + continue; + } + } + printed += syscall_arg_fmt__scnprintf_val(&sc->arg_fmt[arg.idx], bf + printed, size - printed, &arg, val); } -- cgit v1.2.3 From 607bbdb49ccb646be707a0f2ac1d78f5a7c3de7c Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Tue, 25 Jun 2024 02:13:40 +0800 Subject: perf trace: Augment non-syscall tracepoints with enum arguments with BTF Before: perf $ ./perf trace -e timer:hrtimer_start --max-events=1 0.000 :0/0 timer:hrtimer_start(hrtimer: 0xffff974466c25f18, function: 0xffffffff89da5be0, expires: 377432432256753, softexpires: 377432432256753, mode: 10) After: perf $ ./perf trace -e timer:hrtimer_start --max-events=1 0.000 :0/0 timer:hrtimer_start(hrtimer: 0xffff9498a6ca5f18, function: 0xffffffffa77a5be0, expires: 4382442895089, softexpires: 4382442895089, mode: HRTIMER_MODE_ABS_PINNED_HARD) in which HRTIMER_MODE_ABS_PINNED_HARD is: perf $ pahole hrtimer_mode enum hrtimer_mode { HRTIMER_MODE_ABS = 0, HRTIMER_MODE_REL = 1, HRTIMER_MODE_PINNED = 2, HRTIMER_MODE_SOFT = 4, HRTIMER_MODE_HARD = 8, HRTIMER_MODE_ABS_PINNED = 2, HRTIMER_MODE_REL_PINNED = 3, HRTIMER_MODE_ABS_SOFT = 4, HRTIMER_MODE_REL_SOFT = 5, HRTIMER_MODE_ABS_PINNED_SOFT = 6, HRTIMER_MODE_REL_PINNED_SOFT = 7, HRTIMER_MODE_ABS_HARD = 8, HRTIMER_MODE_REL_HARD = 9, HRTIMER_MODE_ABS_PINNED_HARD = 10, HRTIMER_MODE_REL_PINNED_HARD = 11, }; Can also be tested by ./perf trace -e pagemap:mm_lru_insertion,timer:hrtimer_start,timer:hrtimer_init,skb:kfree_skb --max-events=10 (Chose these 4 events because they happen quite frequently.) However some enum arguments may not be contained in vmlinux BTF. 
To see what enum arguments are supported, use: vmlinux_dir $ bpftool btf dump file /sys/kernel/btf/vmlinux > vmlinux vmlinux_dir $ while read l; do grep "ENUM '$l'" vmlinux; done < <(grep field:enum /sys/kernel/tracing/events/*/*/format | awk '{print $3}' | sort | uniq) | awk '{print $3}' | sed "s/'\(.*\)'/\1/g" dev_pm_qos_req_type error_detector hrtimer_mode i2c_slave_event ieee80211_bss_type lru_list migrate_mode nl80211_auth_type nl80211_band nl80211_iftype numa_vmaskip_reason pm_qos_req_action pwm_polarity skb_drop_reason thermal_trip_type xen_lazy_mode xen_mc_extend_args xen_mc_flush_reason zone_type And what tracepoints have these enum types as their arguments: vmlinux_dir $ while read l; do grep "ENUM '$l'" vmlinux; done < <(grep field:enum /sys/kernel/tracing/events/*/*/format | awk '{print $3}' | sort | uniq) | awk '{print $3}' | sed "s/'\(.*\)'/\1/g" > good_enums vmlinux_dir $ cat good_enums dev_pm_qos_req_type error_detector hrtimer_mode i2c_slave_event ieee80211_bss_type lru_list migrate_mode nl80211_auth_type nl80211_band nl80211_iftype numa_vmaskip_reason pm_qos_req_action pwm_polarity skb_drop_reason thermal_trip_type xen_lazy_mode xen_mc_extend_args xen_mc_flush_reason zone_type vmlinux_dir $ grep -f good_enums -l /sys/kernel/tracing/events/*/*/format /sys/kernel/tracing/events/cfg80211/cfg80211_chandef_dfs_required/format /sys/kernel/tracing/events/cfg80211/cfg80211_ch_switch_notify/format /sys/kernel/tracing/events/cfg80211/cfg80211_ch_switch_started_notify/format /sys/kernel/tracing/events/cfg80211/cfg80211_get_bss/format /sys/kernel/tracing/events/cfg80211/cfg80211_ibss_joined/format /sys/kernel/tracing/events/cfg80211/cfg80211_inform_bss_frame/format /sys/kernel/tracing/events/cfg80211/cfg80211_radar_event/format /sys/kernel/tracing/events/cfg80211/cfg80211_ready_on_channel_expired/format /sys/kernel/tracing/events/cfg80211/cfg80211_ready_on_channel/format /sys/kernel/tracing/events/cfg80211/cfg80211_reg_can_beacon/format /sys/kernel/tracing/events/cfg80211/cfg80211_return_bss/format /sys/kernel/tracing/events/cfg80211/cfg80211_tx_mgmt_expired/format /sys/kernel/tracing/events/cfg80211/rdev_add_virtual_intf/format /sys/kernel/tracing/events/cfg80211/rdev_auth/format /sys/kernel/tracing/events/cfg80211/rdev_change_virtual_intf/format /sys/kernel/tracing/events/cfg80211/rdev_channel_switch/format /sys/kernel/tracing/events/cfg80211/rdev_connect/format /sys/kernel/tracing/events/cfg80211/rdev_inform_bss/format /sys/kernel/tracing/events/cfg80211/rdev_libertas_set_mesh_channel/format /sys/kernel/tracing/events/cfg80211/rdev_mgmt_tx/format /sys/kernel/tracing/events/cfg80211/rdev_remain_on_channel/format /sys/kernel/tracing/events/cfg80211/rdev_return_chandef/format /sys/kernel/tracing/events/cfg80211/rdev_return_int_survey_info/format /sys/kernel/tracing/events/cfg80211/rdev_set_ap_chanwidth/format /sys/kernel/tracing/events/cfg80211/rdev_set_monitor_channel/format /sys/kernel/tracing/events/cfg80211/rdev_set_radar_background/format /sys/kernel/tracing/events/cfg80211/rdev_start_ap/format /sys/kernel/tracing/events/cfg80211/rdev_start_radar_detection/format /sys/kernel/tracing/events/cfg80211/rdev_tdls_channel_switch/format /sys/kernel/tracing/events/compaction/mm_compaction_defer_compaction/format /sys/kernel/tracing/events/compaction/mm_compaction_deferred/format /sys/kernel/tracing/events/compaction/mm_compaction_defer_reset/format /sys/kernel/tracing/events/compaction/mm_compaction_finished/format 
/sys/kernel/tracing/events/compaction/mm_compaction_kcompactd_wake/format /sys/kernel/tracing/events/compaction/mm_compaction_suitable/format /sys/kernel/tracing/events/compaction/mm_compaction_wakeup_kcompactd/format /sys/kernel/tracing/events/error_report/error_report_end/format /sys/kernel/tracing/events/i2c_slave/i2c_slave/format /sys/kernel/tracing/events/migrate/mm_migrate_pages/format /sys/kernel/tracing/events/migrate/mm_migrate_pages_start/format /sys/kernel/tracing/events/pagemap/mm_lru_insertion/format /sys/kernel/tracing/events/power/dev_pm_qos_add_request/format /sys/kernel/tracing/events/power/dev_pm_qos_remove_request/format /sys/kernel/tracing/events/power/dev_pm_qos_update_request/format /sys/kernel/tracing/events/power/pm_qos_update_flags/format /sys/kernel/tracing/events/power/pm_qos_update_target/format /sys/kernel/tracing/events/pwm/pwm_apply/format /sys/kernel/tracing/events/pwm/pwm_get/format /sys/kernel/tracing/events/sched/sched_skip_vma_numa/format /sys/kernel/tracing/events/skb/kfree_skb/format /sys/kernel/tracing/events/thermal/thermal_zone_trip/format /sys/kernel/tracing/events/timer/hrtimer_init/format /sys/kernel/tracing/events/timer/hrtimer_start/format /sys/kernel/tracing/events/xen/xen_mc_batch/format /sys/kernel/tracing/events/xen/xen_mc_extend_args/format /sys/kernel/tracing/events/xen/xen_mc_flush_reason/format /sys/kernel/tracing/events/xen/xen_mc_issue/format Committer testing: root@x1:~# perf trace -e timer:hrtimer_start --max-events=2 0.000 :0/0 timer:hrtimer_start(hrtimer: 0xffff8d4eff225050, function: 0xffffffff9e22ddd0, expires: 241152380000000, softexpires: 241152380000000, mode: HRTIMER_MODE_ABS) 0.028 :0/0 timer:hrtimer_start(hrtimer: 0xffff8d4eff225050, function: 0xffffffff9e22ddd0, expires: 241153654000000, softexpires: 241153654000000, mode: HRTIMER_MODE_ABS_PINNED_HARD) root@x1:~# Suggested-by: Arnaldo Carvalho de Melo Reviewed-by: Arnaldo Carvalho de Melo Signed-off-by: Howard Chu Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/lkml/20240615032743.112750-1-howardchu95@gmail.com Link: https://lore.kernel.org/r/20240624181345.124764-4-howardchu95@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 1391564911d9..5618feb7d01a 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1979,12 +1979,12 @@ static int trace__read_syscall_info(struct trace *trace, int id) return err; } -static int evsel__init_tp_arg_scnprintf(struct evsel *evsel) +static int evsel__init_tp_arg_scnprintf(struct evsel *evsel, bool *use_btf) { struct syscall_arg_fmt *fmt = evsel__syscall_arg_fmt(evsel); if (fmt != NULL) { - syscall_arg_fmt__init_array(fmt, evsel->tp_format->format.fields, NULL); + syscall_arg_fmt__init_array(fmt, evsel->tp_format->format.fields, use_btf); return 0; } @@ -2188,7 +2188,8 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size, * property isn't set. */ if (val == 0 && !trace->show_zeros && - !(sc->arg_fmt && sc->arg_fmt[arg.idx].show_zero)) + !(sc->arg_fmt && sc->arg_fmt[arg.idx].show_zero) && + !(sc->arg_fmt && sc->arg_fmt[arg.idx].is_enum)) continue; printed += scnprintf(bf + printed, size - printed, "%s", printed ? 
", " : ""); @@ -2893,7 +2894,7 @@ static size_t trace__fprintf_tp_fields(struct trace *trace, struct evsel *evsel, val = syscall_arg_fmt__mask_val(arg, &syscall_arg, val); /* Suppress this argument if its value is zero and show_zero property isn't set. */ - if (val == 0 && !trace->show_zeros && !arg->show_zero) + if (val == 0 && !trace->show_zeros && !arg->show_zero && !arg->is_enum) continue; printed += scnprintf(bf + printed, size - printed, "%s", printed ? ", " : ""); @@ -2901,6 +2902,15 @@ static size_t trace__fprintf_tp_fields(struct trace *trace, struct evsel *evsel, if (trace->show_arg_names) printed += scnprintf(bf + printed, size - printed, "%s: ", field->name); + if (arg->is_enum) { + size_t p = trace__btf_enum_scnprintf(trace, arg, bf + printed, + size - printed, val, field->type); + if (p) { + printed += p; + continue; + } + } + printed += syscall_arg_fmt__scnprintf_val(arg, bf + printed, size - printed, &syscall_arg, val); } @@ -4553,7 +4563,7 @@ static void evsel__set_syscall_arg_fmt(struct evsel *evsel, const char *name) } } -static int evlist__set_syscall_tp_fields(struct evlist *evlist) +static int evlist__set_syscall_tp_fields(struct evlist *evlist, bool *use_btf) { struct evsel *evsel; @@ -4562,7 +4572,7 @@ static int evlist__set_syscall_tp_fields(struct evlist *evlist) continue; if (strcmp(evsel->tp_format->system, "syscalls")) { - evsel__init_tp_arg_scnprintf(evsel); + evsel__init_tp_arg_scnprintf(evsel, use_btf); continue; } @@ -5040,11 +5050,16 @@ skip_augmentation: } if (trace.evlist->core.nr_entries > 0) { + bool use_btf = false; + evlist__set_default_evsel_handler(trace.evlist, trace__event_handler); - if (evlist__set_syscall_tp_fields(trace.evlist)) { + if (evlist__set_syscall_tp_fields(trace.evlist, &use_btf)) { perror("failed to set syscalls:* tracepoint fields"); goto out; } + + if (use_btf) + trace__load_vmlinux_btf(&trace); } if (trace.sort_events) { -- cgit v1.2.3 From 95586588868a04ea2bbfa144cf473bea9bb86110 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Tue, 25 Jun 2024 02:13:41 +0800 Subject: perf trace: Filter enum arguments with enum names Before: perf $ ./perf trace -e timer:hrtimer_start --filter='mode!=HRTIMER_MODE_ABS_PINNED_HARD' --max-events=1 No resolver (strtoul) for "mode" in "timer:hrtimer_start", can't set filter "(mode!=HRTIMER_MODE_ABS_PINNED_HARD) && (common_pid != 281988)" After: perf $ ./perf trace -e timer:hrtimer_start --filter='mode!=HRTIMER_MODE_ABS_PINNED_HARD' --max-events=1 0.000 :0/0 timer:hrtimer_start(hrtimer: 0xffff9498a6ca5f18, function: 0xffffffffa77a5be0, expires: 12351248764875, softexpires: 12351248764875, mode: HRTIMER_MODE_ABS) && and ||: perf $ ./perf trace -e timer:hrtimer_start --filter='mode != HRTIMER_MODE_ABS_PINNED_HARD && mode != HRTIMER_MODE_ABS' --max-events=1 0.000 Hyprland/534 timer:hrtimer_start(hrtimer: 0xffff9497801a84d0, function: 0xffffffffc04cdbe0, expires: 12639434638458, softexpires: 12639433638458, mode: HRTIMER_MODE_REL) perf $ ./perf trace -e timer:hrtimer_start --filter='mode == HRTIMER_MODE_REL || mode == HRTIMER_MODE_PINNED' --max-events=1 0.000 ldlck-test/60639 timer:hrtimer_start(hrtimer: 0xffffb16404ee7bf8, function: 0xffffffffa7790420, expires: 12772614418016, softexpires: 12772614368016, mode: HRTIMER_MODE_REL) Switching it up, using both enum name and integer value(--filter='mode == HRTIMER_MODE_ABS_PINNED_HARD || mode == 0'): perf $ ./perf trace -e timer:hrtimer_start --filter='mode == HRTIMER_MODE_ABS_PINNED_HARD || mode == 0' --max-events=3 0.000 :0/0 timer:hrtimer_start(hrtimer: 
0xffff9498a6ca5f18, function: 0xffffffffa77a5be0, expires: 12601748739825, softexpires: 12601748739825, mode: HRTIMER_MODE_ABS_PINNED_HARD) 0.036 :0/0 timer:hrtimer_start(hrtimer: 0xffff9498a6ca5f18, function: 0xffffffffa77a5be0, expires: 12518758748124, softexpires: 12518758748124, mode: HRTIMER_MODE_ABS_PINNED_HARD) 0.172 tmux: server/41881 timer:hrtimer_start(hrtimer: 0xffffb164081e7838, function: 0xffffffffa7790420, expires: 12518768255836, softexpires: 12518768205836, mode: HRTIMER_MODE_ABS) P.S. perf $ pahole hrtimer_mode enum hrtimer_mode { HRTIMER_MODE_ABS = 0, HRTIMER_MODE_REL = 1, HRTIMER_MODE_PINNED = 2, HRTIMER_MODE_SOFT = 4, HRTIMER_MODE_HARD = 8, HRTIMER_MODE_ABS_PINNED = 2, HRTIMER_MODE_REL_PINNED = 3, HRTIMER_MODE_ABS_SOFT = 4, HRTIMER_MODE_REL_SOFT = 5, HRTIMER_MODE_ABS_PINNED_SOFT = 6, HRTIMER_MODE_REL_PINNED_SOFT = 7, HRTIMER_MODE_ABS_HARD = 8, HRTIMER_MODE_REL_HARD = 9, HRTIMER_MODE_ABS_PINNED_HARD = 10, HRTIMER_MODE_REL_PINNED_HARD = 11, }; Committer testing: root@x1:~# perf trace -e timer:hrtimer_start --filter='mode != HRTIMER_MODE_ABS' --max-events=2 0.000 :0/0 timer:hrtimer_start(hrtimer: 0xffff8d4eff2a5050, function: 0xffffffff9e22ddd0, expires: 241502326000000, softexpires: 241502326000000, mode: HRTIMER_MODE_ABS_PINNED_HARD) 18446744073709.488 :0/0 timer:hrtimer_start(hrtimer: 0xffff8d4eff425050, function: 0xffffffff9e22ddd0, expires: 241501814000000, softexpires: 241501814000000, mode: HRTIMER_MODE_ABS_PINNED_HARD) root@x1:~# perf trace -e timer:hrtimer_start --filter='mode != HRTIMER_MODE_ABS && mode != HRTIMER_MODE_ABS_PINNED_HARD' --max-events=2 0.000 podman/510644 timer:hrtimer_start(hrtimer: 0xffffa2024f5f7dd0, function: 0xffffffff9e2170c0, expires: 241530497418194, softexpires: 241530497368194, mode: HRTIMER_MODE_REL) 40.251 gnome-shell/2484 timer:hrtimer_start(hrtimer: 0xffff8d48bda17650, function: 0xffffffffc0661550, expires: 241550528619247, softexpires: 241550527619247, mode: HRTIMER_MODE_REL) root@x1:~# perf trace -v -e timer:hrtimer_start --filter='mode != HRTIMER_MODE_ABS && mode != HRTIMER_MODE_ABS_PINNED_HARD && mode != HRTIMER_MODE_REL' --max-events=2 Using CPUID GenuineIntel-6-BA-3 vmlinux BTF loaded 0 0xa 0x1 New filter for timer:hrtimer_start: (mode != 0 && mode != 0xa && mode != 0x1) && (common_pid != 524049 && common_pid != 4041) mmap size 528384B ^Croot@x1:~# Suggested-by: Arnaldo Carvalho de Melo Signed-off-by: Howard Chu Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/lkml/ZnCcliuecJABD5FN@x1 Link: https://lore.kernel.org/r/20240624181345.124764-5-howardchu95@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 62 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 56 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 5618feb7d01a..e664001d5ed7 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -932,6 +932,37 @@ static int syscall_arg_fmt__cache_btf_enum(struct syscall_arg_fmt *arg_fmt, stru return arg_fmt->type == NULL ? 
-1 : 0; } +static bool syscall_arg__strtoul_btf_enum(char *bf, size_t size, struct syscall_arg *arg, u64 *val) +{ + const struct btf_type *bt; + char *type = arg->parm; + struct btf_enum *be; + struct btf *btf; + + trace__load_vmlinux_btf(arg->trace); + + btf = arg->trace->btf; + if (btf == NULL) + return false; + + if (syscall_arg_fmt__cache_btf_enum(arg->fmt, btf, type) < 0) + return false; + + bt = arg->fmt->type; + be = btf_enum(bt); + for (int i = 0; i < btf_vlen(bt); ++i, ++be) { + const char *name = btf__name_by_offset(btf, be->name_off); + int max_len = max(size, strlen(name)); + + if (strncmp(name, bf, max_len) == 0) { + *val = be->val; + return true; + } + } + + return false; +} + static size_t btf_enum_scnprintf(const struct btf_type *type, struct btf *btf, char *bf, size_t size, int val) { struct btf_enum *be = btf_enum(type); @@ -965,8 +996,16 @@ static size_t trace__btf_enum_scnprintf(struct trace *trace __maybe_unused, stru { return 0; } + +static bool syscall_arg__strtoul_btf_enum(char *bf __maybe_unused, size_t size __maybe_unused, + struct syscall_arg *arg __maybe_unused, u64 *val __maybe_unused) +{ + return false; +} #endif // HAVE_LIBBPF_SUPPORT +#define STUL_BTF_ENUM syscall_arg__strtoul_btf_enum + #define STRARRAY(name, array) \ { .scnprintf = SCA_STRARRAY, \ .strtoul = STUL_STRARRAY, \ @@ -1867,6 +1906,7 @@ syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field arg->scnprintf = SCA_FD; } else if (strstr(field->type, "enum") && use_btf != NULL) { *use_btf = arg->is_enum = true; + arg->strtoul = STUL_BTF_ENUM; } else { const struct syscall_arg_fmt *fmt = syscall_arg_fmt__find_by_name(field->name); @@ -3792,7 +3832,8 @@ static int ordered_events__deliver_event(struct ordered_events *oe, return __trace__deliver_event(trace, event->event); } -static struct syscall_arg_fmt *evsel__find_syscall_arg_fmt_by_name(struct evsel *evsel, char *arg) +static struct syscall_arg_fmt *evsel__find_syscall_arg_fmt_by_name(struct evsel *evsel, char *arg, + char **type) { struct tep_format_field *field; struct syscall_arg_fmt *fmt = __evsel__syscall_arg_fmt(evsel); @@ -3801,13 +3842,15 @@ static struct syscall_arg_fmt *evsel__find_syscall_arg_fmt_by_name(struct evsel return NULL; for (field = evsel->tp_format->format.fields; field; field = field->next, ++fmt) - if (strcmp(field->name, arg) == 0) + if (strcmp(field->name, arg) == 0) { + *type = field->type; return fmt; + } return NULL; } -static int trace__expand_filter(struct trace *trace __maybe_unused, struct evsel *evsel) +static int trace__expand_filter(struct trace *trace, struct evsel *evsel) { char *tok, *left = evsel->filter, *new_filter = evsel->filter; @@ -3840,14 +3883,14 @@ static int trace__expand_filter(struct trace *trace __maybe_unused, struct evsel struct syscall_arg_fmt *fmt; int left_size = tok - left, right_size = right_end - right; - char arg[128]; + char arg[128], *type; while (isspace(left[left_size - 1])) --left_size; scnprintf(arg, sizeof(arg), "%.*s", left_size, left); - fmt = evsel__find_syscall_arg_fmt_by_name(evsel, arg); + fmt = evsel__find_syscall_arg_fmt_by_name(evsel, arg, &type); if (fmt == NULL) { pr_err("\"%s\" not found in \"%s\", can't set filter \"%s\"\n", arg, evsel->name, evsel->filter); @@ -3860,9 +3903,16 @@ static int trace__expand_filter(struct trace *trace __maybe_unused, struct evsel if (fmt->strtoul) { u64 val; struct syscall_arg syscall_arg = { - .parm = fmt->parm, + .trace = trace, + .fmt = fmt, }; + if (fmt->is_enum) { + syscall_arg.parm = type; + } else { + 
syscall_arg.parm = fmt->parm; + } + if (fmt->strtoul(right, right_size, &syscall_arg, &val)) { char *n, expansion[19]; int expansion_lenght = scnprintf(expansion, sizeof(expansion), "%#" PRIx64, val); -- cgit v1.2.3 From ba6a9018502eecb94e67001deeb48406fa71f916 Mon Sep 17 00:00:00 2001 From: "Alexis Lothoré (eBPF Foundation)" Date: Wed, 31 Jul 2024 08:37:25 +0200 Subject: selftests/bpf: do not disable /dev/null device access in cgroup dev test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_dev_cgroup currently loads a small bpf program allowing any access on urandom and zero devices, disabling access to any other device. It makes migrating this test to test_progs impossible, since this one manipulates extensively /dev/null. Allow /dev/null manipulation in dev_cgroup program to make its usage in test_progs framework possible. Update test_dev_cgroup.c as well to match this change while it has not been removed. Reviewed-by: Alan Maguire Acked-by: Stanislav Fomichev Signed-off-by: Alexis Lothoré (eBPF Foundation) Link: https://lore.kernel.org/r/20240731-convert_dev_cgroup-v4-1-849425d90de6@bootlin.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/progs/dev_cgroup.c | 4 ++-- tools/testing/selftests/bpf/test_dev_cgroup.c | 18 +++++++++--------- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/dev_cgroup.c b/tools/testing/selftests/bpf/progs/dev_cgroup.c index 79b54a4fa244..c1dfbd2b56fc 100644 --- a/tools/testing/selftests/bpf/progs/dev_cgroup.c +++ b/tools/testing/selftests/bpf/progs/dev_cgroup.c @@ -41,14 +41,14 @@ int bpf_prog1(struct bpf_cgroup_dev_ctx *ctx) bpf_trace_printk(fmt, sizeof(fmt), ctx->major, ctx->minor); #endif - /* Allow access to /dev/zero and /dev/random. + /* Allow access to /dev/null and /dev/urandom. * Forbid everything else. */ if (ctx->major != 1 || type != BPF_DEVCG_DEV_CHAR) return 0; switch (ctx->minor) { - case 5: /* 1:5 /dev/zero */ + case 3: /* 1:3 /dev/null */ case 9: /* 1:9 /dev/urandom */ return 1; } diff --git a/tools/testing/selftests/bpf/test_dev_cgroup.c b/tools/testing/selftests/bpf/test_dev_cgroup.c index adeaf63cb6fa..33f544f0005a 100644 --- a/tools/testing/selftests/bpf/test_dev_cgroup.c +++ b/tools/testing/selftests/bpf/test_dev_cgroup.c @@ -54,25 +54,25 @@ int main(int argc, char **argv) goto err; } - /* All operations with /dev/zero and and /dev/urandom are allowed, + /* All operations with /dev/null and /dev/urandom are allowed, * everything else is forbidden. 
*/ - assert(system("rm -f /tmp/test_dev_cgroup_null") == 0); - assert(system("mknod /tmp/test_dev_cgroup_null c 1 3")); - assert(system("rm -f /tmp/test_dev_cgroup_null") == 0); - - /* /dev/zero is whitelisted */ assert(system("rm -f /tmp/test_dev_cgroup_zero") == 0); - assert(system("mknod /tmp/test_dev_cgroup_zero c 1 5") == 0); + assert(system("mknod /tmp/test_dev_cgroup_zero c 1 5")); assert(system("rm -f /tmp/test_dev_cgroup_zero") == 0); - assert(system("dd if=/dev/urandom of=/dev/zero count=64") == 0); + /* /dev/null is whitelisted */ + assert(system("rm -f /tmp/test_dev_cgroup_null") == 0); + assert(system("mknod /tmp/test_dev_cgroup_null c 1 3") == 0); + assert(system("rm -f /tmp/test_dev_cgroup_null") == 0); + + assert(system("dd if=/dev/urandom of=/dev/null count=64") == 0); /* src is allowed, target is forbidden */ assert(system("dd if=/dev/urandom of=/dev/full count=64")); /* src is forbidden, target is allowed */ - assert(system("dd if=/dev/random of=/dev/zero count=64")); + assert(system("dd if=/dev/random of=/dev/null count=64")); error = 0; printf("test_dev_cgroup:PASS\n"); -- cgit v1.2.3 From d83d8230e415ecf727f6b88c0ae55193f149d650 Mon Sep 17 00:00:00 2001 From: "Alexis Lothoré (eBPF Foundation)" Date: Wed, 31 Jul 2024 08:37:26 +0200 Subject: selftests/bpf: convert test_dev_cgroup to test_progs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_dev_cgroup is defined as a standalone test program, and so is not executed in CI. Convert it to test_progs framework so it is tested automatically in CI, and remove the old test. In order to be able to run it in test_progs, /dev/null must remain usable, so change the new test to test operations on devices 1:3 as valid, and operations on devices 1:5 (/dev/zero) as invalid. 
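For context on what the converted test asserts, here is a minimal standalone sketch (an illustration only, not the actual test, which appears in full in the diff below) of probing a device-cgroup policy from userspace: a denial by the attached BPF program surfaces as mknod() failing with EPERM.

    #include <errno.h>
    #include <stdio.h>
    #include <sys/stat.h>
    #include <sys/sysmacros.h>
    #include <unistd.h>

    /* Probe whether the current cgroup's device policy allows creating
     * a char device node (run as root, otherwise mknod() is denied for
     * unrelated reasons). */
    static int probe_mknod(const char *path, int maj, int min)
    {
        int ret = mknod(path, S_IFCHR | 0600, makedev(maj, min));

        if (ret == 0)
            unlink(path);
        return ret; /* 0: allowed, -1 with EPERM: denied by the policy */
    }

    int main(void)
    {
        printf("1:3 /dev/null: %s\n", probe_mknod("/tmp/p_null", 1, 3) ? "denied" : "allowed");
        printf("1:5 /dev/zero: %s\n", probe_mknod("/tmp/p_zero", 1, 5) ? "denied" : "allowed");
        return 0;
    }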
Reviewed-by: Alan Maguire Acked-by: Stanislav Fomichev Signed-off-by: Alexis Lothoré (eBPF Foundation) Link: https://lore.kernel.org/r/20240731-convert_dev_cgroup-v4-2-849425d90de6@bootlin.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/.gitignore | 1 - tools/testing/selftests/bpf/Makefile | 2 - .../testing/selftests/bpf/prog_tests/cgroup_dev.c | 121 +++++++++++++++++++++ tools/testing/selftests/bpf/test_dev_cgroup.c | 85 --------------- 4 files changed, 121 insertions(+), 88 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/cgroup_dev.c delete mode 100644 tools/testing/selftests/bpf/test_dev_cgroup.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 4e4aae8aa7ec..8f14d8faeb0b 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -9,7 +9,6 @@ test_lpm_map test_tag FEATURE-DUMP.libbpf fixdep -test_dev_cgroup /test_progs /test_progs-no_alu32 /test_progs-bpf_gcc diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 774c6270e377..f54185e96a95 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -67,7 +67,6 @@ endif # Order correspond to 'make run_tests' order TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ - test_dev_cgroup \ test_sock test_sockmap get_cgroup_id_user \ test_cgroup_storage \ test_tcpnotify_user test_sysctl \ @@ -292,7 +291,6 @@ JSON_WRITER := $(OUTPUT)/json_writer.o CAP_HELPERS := $(OUTPUT)/cap_helpers.o NETWORK_HELPERS := $(OUTPUT)/network_helpers.o -$(OUTPUT)/test_dev_cgroup: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_skb_cgroup_id_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sock: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sockmap: $(CGROUP_HELPERS) $(TESTING_HELPERS) diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_dev.c b/tools/testing/selftests/bpf/prog_tests/cgroup_dev.c new file mode 100644 index 000000000000..8661e145ba84 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_dev.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include "test_progs.h" +#include "cgroup_helpers.h" +#include "dev_cgroup.skel.h" + +#define TEST_CGROUP "/test-bpf-based-device-cgroup/" +#define TEST_BUFFER_SIZE 64 + +static void test_mknod(const char *path, mode_t mode, int dev_major, + int dev_minor, int expected_ret, int expected_errno) +{ + int ret; + + unlink(path); + ret = mknod(path, mode, makedev(dev_major, dev_minor)); + ASSERT_EQ(ret, expected_ret, "mknod"); + if (expected_ret) + ASSERT_EQ(errno, expected_errno, "mknod errno"); + else + unlink(path); +} + +static void test_read(const char *path, char *buf, int buf_size, + int expected_ret, int expected_errno) +{ + int ret, fd; + + fd = open(path, O_RDONLY); + + /* A bare open on unauthorized device should fail */ + if (expected_ret < 0) { + ASSERT_EQ(fd, expected_ret, "open ret for read"); + ASSERT_EQ(errno, expected_errno, "open errno for read"); + if (fd >= 0) + close(fd); + return; + } + + if (!ASSERT_OK_FD(fd, "open ret for read")) + return; + + ret = read(fd, buf, buf_size); + ASSERT_EQ(ret, expected_ret, "read"); + + close(fd); +} + +static void test_write(const char *path, char *buf, int buf_size, + int expected_ret, int expected_errno) +{ + int ret, fd; + + fd = open(path, O_WRONLY); + + /* A bare open on unauthorized device should fail */ + if 
(expected_ret < 0) { + ASSERT_EQ(fd, expected_ret, "open ret for write"); + ASSERT_EQ(errno, expected_errno, "open errno for write"); + if (fd >= 0) + close(fd); + return; + } + + if (!ASSERT_OK_FD(fd, "open ret for write")) + return; + + ret = write(fd, buf, buf_size); + ASSERT_EQ(ret, expected_ret, "write"); + + close(fd); +} + +void test_cgroup_dev(void) +{ + char buf[TEST_BUFFER_SIZE] = "some random test data"; + struct dev_cgroup *skel; + int cgroup_fd; + + cgroup_fd = cgroup_setup_and_join(TEST_CGROUP); + if (!ASSERT_OK_FD(cgroup_fd, "cgroup switch")) + return; + + skel = dev_cgroup__open_and_load(); + if (!ASSERT_OK_PTR(skel, "load program")) + goto cleanup_cgroup; + + skel->links.bpf_prog1 = + bpf_program__attach_cgroup(skel->progs.bpf_prog1, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links.bpf_prog1, "attach_program")) + goto cleanup_progs; + + if (test__start_subtest("allow-mknod")) + test_mknod("/dev/test_dev_cgroup_null", S_IFCHR, 1, 3, 0, 0); + + if (test__start_subtest("allow-read")) + test_read("/dev/urandom", buf, TEST_BUFFER_SIZE, + TEST_BUFFER_SIZE, 0); + + if (test__start_subtest("allow-write")) + test_write("/dev/null", buf, TEST_BUFFER_SIZE, + TEST_BUFFER_SIZE, 0); + + if (test__start_subtest("deny-mknod")) + test_mknod("/dev/test_dev_cgroup_zero", S_IFCHR, 1, 5, -1, + EPERM); + + if (test__start_subtest("deny-read")) + test_read("/dev/random", buf, TEST_BUFFER_SIZE, -1, EPERM); + + if (test__start_subtest("deny-write")) + test_write("/dev/zero", buf, TEST_BUFFER_SIZE, -1, EPERM); + +cleanup_progs: + dev_cgroup__destroy(skel); +cleanup_cgroup: + cleanup_cgroup_environment(); +} diff --git a/tools/testing/selftests/bpf/test_dev_cgroup.c b/tools/testing/selftests/bpf/test_dev_cgroup.c deleted file mode 100644 index 33f544f0005a..000000000000 --- a/tools/testing/selftests/bpf/test_dev_cgroup.c +++ /dev/null @@ -1,85 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* Copyright (c) 2017 Facebook - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "cgroup_helpers.h" -#include "testing_helpers.h" - -#define DEV_CGROUP_PROG "./dev_cgroup.bpf.o" - -#define TEST_CGROUP "/test-bpf-based-device-cgroup/" - -int main(int argc, char **argv) -{ - struct bpf_object *obj; - int error = EXIT_FAILURE; - int prog_fd, cgroup_fd; - __u32 prog_cnt; - - /* Use libbpf 1.0 API mode */ - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - - if (bpf_prog_test_load(DEV_CGROUP_PROG, BPF_PROG_TYPE_CGROUP_DEVICE, - &obj, &prog_fd)) { - printf("Failed to load DEV_CGROUP program\n"); - goto out; - } - - cgroup_fd = cgroup_setup_and_join(TEST_CGROUP); - if (cgroup_fd < 0) { - printf("Failed to create test cgroup\n"); - goto out; - } - - /* Attach bpf program */ - if (bpf_prog_attach(prog_fd, cgroup_fd, BPF_CGROUP_DEVICE, 0)) { - printf("Failed to attach DEV_CGROUP program"); - goto err; - } - - if (bpf_prog_query(cgroup_fd, BPF_CGROUP_DEVICE, 0, NULL, NULL, - &prog_cnt)) { - printf("Failed to query attached programs"); - goto err; - } - - /* All operations with /dev/null and /dev/urandom are allowed, - * everything else is forbidden. 
- */ - assert(system("rm -f /tmp/test_dev_cgroup_zero") == 0); - assert(system("mknod /tmp/test_dev_cgroup_zero c 1 5")); - assert(system("rm -f /tmp/test_dev_cgroup_zero") == 0); - - /* /dev/null is whitelisted */ - assert(system("rm -f /tmp/test_dev_cgroup_null") == 0); - assert(system("mknod /tmp/test_dev_cgroup_null c 1 3") == 0); - assert(system("rm -f /tmp/test_dev_cgroup_null") == 0); - - assert(system("dd if=/dev/urandom of=/dev/null count=64") == 0); - - /* src is allowed, target is forbidden */ - assert(system("dd if=/dev/urandom of=/dev/full count=64")); - - /* src is forbidden, target is allowed */ - assert(system("dd if=/dev/random of=/dev/null count=64")); - - error = 0; - printf("test_dev_cgroup:PASS\n"); - -err: - cleanup_cgroup_environment(); - -out: - return error; -} -- cgit v1.2.3 From 84cdbff4a93501b7199f0d2f1150961ee95c8582 Mon Sep 17 00:00:00 2001 From: "Alexis Lothoré (eBPF Foundation)" Date: Wed, 31 Jul 2024 08:37:27 +0200 Subject: selftests/bpf: add wrong type test to cgroup dev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Current cgroup_dev test mostly tests that device operation is accepted or refused base on passed major/minor (and so, any operation performed during test involves only char device) Add a small subtest ensuring that the device type passed to bpf program allows it to take decisions as well. Reviewed-by: Alan Maguire Acked-by: Stanislav Fomichev Signed-off-by: Alexis Lothoré (eBPF Foundation) Link: https://lore.kernel.org/r/20240731-convert_dev_cgroup-v4-3-849425d90de6@bootlin.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/prog_tests/cgroup_dev.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_dev.c b/tools/testing/selftests/bpf/prog_tests/cgroup_dev.c index 8661e145ba84..5ab7547e38c0 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_dev.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_dev.c @@ -114,6 +114,10 @@ void test_cgroup_dev(void) if (test__start_subtest("deny-write")) test_write("/dev/zero", buf, TEST_BUFFER_SIZE, -1, EPERM); + if (test__start_subtest("deny-mknod-wrong-type")) + test_mknod("/dev/test_dev_cgroup_block", S_IFBLK, 1, 3, -1, + EPERM); + cleanup_progs: dev_cgroup__destroy(skel); cleanup_cgroup: -- cgit v1.2.3 From 45eb1bf4d726caff8f1dce7ac8f950012d5f64b1 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Wed, 10 Jul 2024 13:15:34 +0500 Subject: selftests: tpm2: redirect python unittest logs to stdout The python unittest module writes all its output to stderr, even when the run is clean. Redirect its output logs to stdout. 
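The whole fix reduces to the shell idiom below: unittest emits its verbose log on stderr even when every test passes, so folding stderr into stdout keeps the error stream quiet for clean runs.

    # Without redirection the verbose log lands on stderr even on success:
    python3 -m unittest -v tpm2_tests.SmokeTest
    # Folding stderr into stdout keeps stderr quiet for clean runs:
    python3 -m unittest -v tpm2_tests.SmokeTest 2>&1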
Cc: Jarkko Sakkinen Signed-off-by: Muhammad Usama Anjum Signed-off-by: Shuah Khan --- tools/testing/selftests/tpm2/test_async.sh | 2 +- tools/testing/selftests/tpm2/test_smoke.sh | 2 +- tools/testing/selftests/tpm2/test_space.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/tpm2/test_async.sh b/tools/testing/selftests/tpm2/test_async.sh index 43bf5bd772fd..cf5a9c826097 100755 --- a/tools/testing/selftests/tpm2/test_async.sh +++ b/tools/testing/selftests/tpm2/test_async.sh @@ -7,4 +7,4 @@ ksft_skip=4 [ -e /dev/tpm0 ] || exit $ksft_skip [ -e /dev/tpmrm0 ] || exit $ksft_skip -python3 -m unittest -v tpm2_tests.AsyncTest +python3 -m unittest -v tpm2_tests.AsyncTest 2>&1 diff --git a/tools/testing/selftests/tpm2/test_smoke.sh b/tools/testing/selftests/tpm2/test_smoke.sh index 58af963e5b55..20fa70f970a9 100755 --- a/tools/testing/selftests/tpm2/test_smoke.sh +++ b/tools/testing/selftests/tpm2/test_smoke.sh @@ -6,4 +6,4 @@ ksft_skip=4 [ -e /dev/tpm0 ] || exit $ksft_skip -python3 -m unittest -v tpm2_tests.SmokeTest +python3 -m unittest -v tpm2_tests.SmokeTest 2>&1 diff --git a/tools/testing/selftests/tpm2/test_space.sh b/tools/testing/selftests/tpm2/test_space.sh index 04c47b13fe8a..93894cbc89a8 100755 --- a/tools/testing/selftests/tpm2/test_space.sh +++ b/tools/testing/selftests/tpm2/test_space.sh @@ -6,4 +6,4 @@ ksft_skip=4 [ -e /dev/tpmrm0 ] || exit $ksft_skip -python3 -m unittest -v tpm2_tests.SpaceTest +python3 -m unittest -v tpm2_tests.SpaceTest 2>&1 -- cgit v1.2.3 From 37ee7d1995701c1819e1cc984bdd111fe23c7f5f Mon Sep 17 00:00:00 2001 From: Chang Yu Date: Tue, 23 Jul 2024 21:21:28 -0700 Subject: selftests/exec: Fix grammar in an error message. Replace "not ... nor" in the error message with "neither ... nor". Signed-off-by: Chang Yu Signed-off-by: Shuah Khan --- tools/testing/selftests/exec/execveat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/exec/execveat.c b/tools/testing/selftests/exec/execveat.c index 6418ded40bdd..071e03532cba 100644 --- a/tools/testing/selftests/exec/execveat.c +++ b/tools/testing/selftests/exec/execveat.c @@ -117,7 +117,7 @@ static int check_execveat_invoked_rc(int fd, const char *path, int flags, } if ((WEXITSTATUS(status) != expected_rc) && (WEXITSTATUS(status) != expected_rc2)) { - ksft_print_msg("child %d exited with %d not %d nor %d\n", + ksft_print_msg("child %d exited with %d neither %d nor %d\n", child, WEXITSTATUS(status), expected_rc, expected_rc2); ksft_test_result_fail("%s\n", test_name); -- cgit v1.2.3 From 0b631ed3ce922b34dad4938215f84e5321f7e524 Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Tue, 16 Jul 2024 00:56:34 +0530 Subject: kselftest: cpufreq: Add RTC wakeup alarm Add an RTC wakeup alarm so that devices resume after a specific time interval. This improvement will help enable the test in CI systems and eliminates the need for manual intervention to resume the devices after suspend/hibernation.
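The mechanism is rtcwake(8): it programs the RTC wake alarm and then enters the requested sleep state, so the box resumes unattended. A minimal sketch, assuming the platform RTC supports wake alarms:

    # Suspend to RAM and have the RTC wake the system 15 seconds later.
    if ! command -v rtcwake >/dev/null 2>&1; then
        echo "rtcwake not installed" >&2
        exit 1
    fi
    rtcwake -m mem -s 15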
Signed-off-by: Shreeya Patel Acked-by: Viresh Kumar Signed-off-by: Shuah Khan --- tools/testing/selftests/cpufreq/cpufreq.sh | 15 +++++++++++++++ tools/testing/selftests/cpufreq/main.sh | 13 ++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/cpufreq/cpufreq.sh b/tools/testing/selftests/cpufreq/cpufreq.sh index a8b1dbc0a3a5..e350c521b467 100755 --- a/tools/testing/selftests/cpufreq/cpufreq.sh +++ b/tools/testing/selftests/cpufreq/cpufreq.sh @@ -231,6 +231,21 @@ do_suspend() for i in `seq 1 $2`; do printf "Starting $1\n" + + if [ "$3" = "rtc" ]; then + if ! command -v rtcwake &> /dev/null; then + printf "rtcwake could not be found, please install it.\n" + return 1 + fi + + rtcwake -m $filename -s 15 + + if [ $? -ne 0 ]; then + printf "Failed to suspend using RTC wake alarm\n" + return 1 + fi + fi + echo $filename > $SYSFS/power/state printf "Came out of $1\n" diff --git a/tools/testing/selftests/cpufreq/main.sh b/tools/testing/selftests/cpufreq/main.sh index a0eb84cf7167..f12ff7416e41 100755 --- a/tools/testing/selftests/cpufreq/main.sh +++ b/tools/testing/selftests/cpufreq/main.sh @@ -24,6 +24,8 @@ helpme() [-t Date: Wed, 31 Jul 2024 00:14:37 -0500 Subject: scx/selftests: Verify we can call create_dsq from prog_run We already have some testcases verifying that we can call BPF_PROG_TYPE_SYSCALL progs and invoke scx_bpf_exit(). Let's extend that to also call scx_bpf_create_dsq() so we get coverage for that as well. Signed-off-by: David Vernet Signed-off-by: Tejun Heo --- tools/testing/selftests/sched_ext/prog_run.bpf.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/sched_ext/prog_run.bpf.c b/tools/testing/selftests/sched_ext/prog_run.bpf.c index fd2c8f12af16..6a4d7c48e3f2 100644 --- a/tools/testing/selftests/sched_ext/prog_run.bpf.c +++ b/tools/testing/selftests/sched_ext/prog_run.bpf.c @@ -16,6 +16,7 @@ char _license[] SEC("license") = "GPL"; SEC("syscall") int BPF_PROG(prog_run_syscall) { + scx_bpf_create_dsq(0, -1); scx_bpf_exit(0xdeadbeef, "Exited from PROG_RUN"); return 0; } -- cgit v1.2.3 From 3656e566cf03ab0f959b2bd6f8274ee9799641e6 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Tue, 25 Jun 2024 02:13:42 +0800 Subject: perf test: Add landlock workload We'll use it to add a regression test for the BTF augmentation of enum arguments for tracepoints in 'perf trace': root@x1:~# perf trace -e landlock_add_rule perf test -w landlock 0.000 ( 0.009 ms): perf/747160 landlock_add_rule(ruleset_fd: 11, rule_type: LANDLOCK_RULE_PATH_BENEATH, rule_attr: 0x7ffd8e258594, flags: 45) = -1 EINVAL (Invalid argument) 0.011 ( 0.002 ms): perf/747160 landlock_add_rule(ruleset_fd: 11, rule_type: LANDLOCK_RULE_NET_PORT, rule_attr: 0x7ffd8e2585a0, flags: 45) = -1 EINVAL (Invalid argument) root@x1:~# Committer notes: It was agreed on the discussion (see Link below) to shorten then name of the workload from 'landlock_add_rule' to 'landlock', and I moved it to a separate patch. Also, to address a build failure from Namhyung, I stopped loading linux/landlock.h and instead added the used defines, enums and types to make this build in older systems. All we want is to emit the syscall and intercept it. 
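Reduced to its essence, the workload is just a raw syscall(2) invocation with deliberately bogus arguments. A sketch of that idea (hard-coding the syscall number as a fallback, as the file in the diff below also does):

    #include <errno.h>
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    #ifndef __NR_landlock_add_rule
    #define __NR_landlock_add_rule 445
    #endif

    int main(void)
    {
        /* The return value is irrelevant: EINVAL is expected, we only
         * need the syscall emitted so 'perf trace' can intercept it. */
        if (syscall(__NR_landlock_add_rule, -1, 0, NULL, 0) < 0 && errno == ENOSYS)
            fprintf(stderr, "landlock not supported by this kernel\n");
        return 0;
    }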
Suggested-by: Arnaldo Carvalho de Melo Signed-off-by: Howard Chu Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/CAH0uvohaypdTV6Z7O5QSK+va_qnhZ6BP6oSJ89s1c1E0CjgxDA@mail.gmail.com Link: https://lore.kernel.org/r/20240624181345.124764-1-howardchu95@gmail.com Link: https://lore.kernel.org/r/20240624181345.124764-6-howardchu95@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/builtin-test.c | 1 + tools/perf/tests/tests.h | 1 + tools/perf/tests/workloads/Build | 1 + tools/perf/tests/workloads/landlock.c | 66 +++++++++++++++++++++++++++++++++++ 4 files changed, 69 insertions(+) create mode 100644 tools/perf/tests/workloads/landlock.c (limited to 'tools') diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index c3d84b67ca8e..470a9709427d 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -152,6 +152,7 @@ static struct test_workload *workloads[] = { &workload__sqrtloop, &workload__brstack, &workload__datasym, + &workload__landlock, }; static int num_subtests(const struct test_suite *t) diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h index 3aa7701ee0e9..6ea2be86b7bf 100644 --- a/tools/perf/tests/tests.h +++ b/tools/perf/tests/tests.h @@ -205,6 +205,7 @@ DECLARE_WORKLOAD(leafloop); DECLARE_WORKLOAD(sqrtloop); DECLARE_WORKLOAD(brstack); DECLARE_WORKLOAD(datasym); +DECLARE_WORKLOAD(landlock); extern const char *dso_to_test; extern const char *test_objdump_path; diff --git a/tools/perf/tests/workloads/Build b/tools/perf/tests/workloads/Build index 48bf0d3b0f3d..5af17206f04d 100644 --- a/tools/perf/tests/workloads/Build +++ b/tools/perf/tests/workloads/Build @@ -6,6 +6,7 @@ perf-test-y += leafloop.o perf-test-y += sqrtloop.o perf-test-y += brstack.o perf-test-y += datasym.o +perf-test-y += landlock.o CFLAGS_sqrtloop.o = -g -O0 -fno-inline -U_FORTIFY_SOURCE CFLAGS_leafloop.o = -g -O0 -fno-inline -fno-omit-frame-pointer -U_FORTIFY_SOURCE diff --git a/tools/perf/tests/workloads/landlock.c b/tools/perf/tests/workloads/landlock.c new file mode 100644 index 000000000000..e2b5ef647c09 --- /dev/null +++ b/tools/perf/tests/workloads/landlock.c @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include +#include +#include "../tests.h" + +/* This workload was initially added to test enum augmentation with BTF in perf + * trace because its the only syscall that has an enum argument. Since it is + * a recent addition to the Linux kernel (at the time of the introduction of this + * 'perf test' workload) we just add the required types and defines here instead + * of including linux/landlock, that isn't available in older systems. + * + * We are not interested in the the result of the syscall, just in intercepting + * its arguments. 
+ */ + +#ifndef __NR_landlock_add_rule +#define __NR_landlock_add_rule 445 +#endif + +#ifndef LANDLOCK_ACCESS_FS_READ_FILE +#define LANDLOCK_ACCESS_FS_READ_FILE (1ULL << 2) + +#define LANDLOCK_RULE_PATH_BENEATH 1 + +struct landlock_path_beneath_attr { + __u64 allowed_access; + __s32 parent_fd; +}; +#endif + +#ifndef LANDLOCK_ACCESS_NET_CONNECT_TCP +#define LANDLOCK_ACCESS_NET_CONNECT_TCP (1ULL << 1) + +#define LANDLOCK_RULE_NET_PORT 2 + +struct landlock_net_port_attr { + __u64 allowed_access; + __u64 port; +}; +#endif + +static int landlock(int argc __maybe_unused, const char **argv __maybe_unused) +{ + int fd = 11, flags = 45; + + struct landlock_path_beneath_attr path_beneath_attr = { + .allowed_access = LANDLOCK_ACCESS_FS_READ_FILE, + .parent_fd = 14, + }; + + struct landlock_net_port_attr net_port_attr = { + .port = 19, + .allowed_access = LANDLOCK_ACCESS_NET_CONNECT_TCP, + }; + + syscall(__NR_landlock_add_rule, fd, LANDLOCK_RULE_PATH_BENEATH, + &path_beneath_attr, flags); + + syscall(__NR_landlock_add_rule, fd, LANDLOCK_RULE_NET_PORT, + &net_port_attr, flags); + + return 0; +} + +DEFINE_WORKLOAD(landlock); -- cgit v1.2.3 From d66763fed30f0bd8cc8fb2c8c144c69fcb560bda Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Tue, 25 Jun 2024 02:13:43 +0800 Subject: perf test trace_btf_enum: Add regression test for the BTF augmentation of enums in 'perf trace' Trace landlock_add_rule syscall to see if the output is desirable. Trace the non-syscall tracepoint 'timer:hrtimer_init' and 'timer:hrtimer_start', see if the 'mode' argument is augmented, the 'mode' enum argument has the prefix of 'HRTIMER_MODE_' in its name. Committer testing: root@x1:~# perf test enum 124: perf trace enum augmentation tests : Ok root@x1:~# perf test -v enum 124: perf trace enum augmentation tests : Ok root@x1:~# perf trace -e landlock_add_rule perf test -v enum 0.000 ( 0.010 ms): perf/749827 landlock_add_rule(ruleset_fd: 11, rule_type: LANDLOCK_RULE_PATH_BENEATH, rule_attr: 0x7ffd324171d4, flags: 45) = -1 EINVAL (Invalid argument) 0.012 ( 0.002 ms): perf/749827 landlock_add_rule(ruleset_fd: 11, rule_type: LANDLOCK_RULE_NET_PORT, rule_attr: 0x7ffd324171e0, flags: 45) = -1 EINVAL (Invalid argument) 457.821 ( 0.007 ms): perf/749830 landlock_add_rule(ruleset_fd: 11, rule_type: LANDLOCK_RULE_PATH_BENEATH, rule_attr: 0x7ffd4acd31e4, flags: 45) = -1 EINVAL (Invalid argument) 457.832 ( 0.003 ms): perf/749830 landlock_add_rule(ruleset_fd: 11, rule_type: LANDLOCK_RULE_NET_PORT, rule_attr: 0x7ffd4acd31f0, flags: 45) = -1 EINVAL (Invalid argument) 124: perf trace enum augmentation tests : Ok root@x1:~# Suggested-by: Arnaldo Carvalho de Melo Signed-off-by: Howard Chu Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/20240619082042.4173621-6-howardchu95@gmail.com Link: https://lore.kernel.org/r/20240624181345.124764-7-howardchu95@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/trace_btf_enum.sh | 61 ++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100755 tools/perf/tests/shell/trace_btf_enum.sh (limited to 'tools') diff --git a/tools/perf/tests/shell/trace_btf_enum.sh b/tools/perf/tests/shell/trace_btf_enum.sh new file mode 100755 index 000000000000..7d407b52bea5 --- /dev/null +++ b/tools/perf/tests/shell/trace_btf_enum.sh @@ -0,0 +1,61 @@ +#!/bin/sh +# perf trace enum augmentation tests +# SPDX-License-Identifier: GPL-2.0 + +err=0 +set -e + +syscall="landlock_add_rule" 
+non_syscall="timer:hrtimer_init,timer:hrtimer_start" + +TESTPROG="perf test -w landlock" + +. "$(dirname $0)"/lib/probe.sh +skip_if_no_perf_trace || exit 2 + +check_vmlinux() { + echo "Checking if vmlinux exists" + if ! ls /sys/kernel/btf/vmlinux 1>/dev/null 2>&1 + then + echo "trace+enum test [Skipped missing vmlinux BTF support]" + err=2 + fi +} + +trace_landlock() { + echo "Tracing syscall ${syscall}" + + # test flight just to see if landlock_add_rule and libbpf are available + $TESTPROG + + if perf trace -e $syscall $TESTPROG 2>&1 | \ + grep -q -E ".*landlock_add_rule\(ruleset_fd: 11, rule_type: (LANDLOCK_RULE_PATH_BENEATH|LANDLOCK_RULE_NET_PORT), rule_attr: 0x[a-f0-9]+, flags: 45\) = -1.*" + then + err=0 + else + err=1 + fi +} + +trace_non_syscall() { + echo "Tracing non-syscall tracepoint ${non-syscall}" + if perf trace -e $non_syscall --max-events=1 2>&1 | \ + grep -q -E '.*timer:hrtimer_.*\(.*mode: HRTIMER_MODE_.*\)$' + then + err=0 + else + err=1 + fi +} + +check_vmlinux + +if [ $err = 0 ]; then + trace_landlock +fi + +if [ $err = 0 ]; then + trace_non_syscall +fi + +exit $err -- cgit v1.2.3 From 62284329b194606f73252935cc422cf6156e811a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 25 Jun 2024 02:13:44 +0800 Subject: perf trace: Introduce trace__btf_scnprintf() To have a central place that will look at the BTF type and call the right scnprintf routine or return zero, meaning BTF pretty printing isn't available or not implemented for a specific type. Signed-off-by: Howard Chu Tested-by: Howard Chu Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240624181345.124764-8-howardchu95@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 49 +++++++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 22 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index e664001d5ed7..d9104fc4f61f 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -981,18 +981,28 @@ static size_t btf_enum_scnprintf(const struct btf_type *type, struct btf *btf, c static size_t trace__btf_enum_scnprintf(struct trace *trace, struct syscall_arg_fmt *arg_fmt, char *bf, size_t size, int val, char *type) { - if (trace->btf == NULL) - return 0; - if (syscall_arg_fmt__cache_btf_enum(arg_fmt, trace->btf, type) < 0) return 0; return btf_enum_scnprintf(arg_fmt->type, trace->btf, bf, size, val); } + +static size_t trace__btf_scnprintf(struct trace *trace, struct syscall_arg_fmt *arg_fmt, char *bf, + size_t size, int val, char *type) +{ + if (trace->btf == NULL) + return 0; + + if (arg_fmt->is_enum) + return trace__btf_enum_scnprintf(trace, arg_fmt, bf, size, val, type); + + return 0; +} + #else // HAVE_LIBBPF_SUPPORT -static size_t trace__btf_enum_scnprintf(struct trace *trace __maybe_unused, struct syscall_arg_fmt *arg_fmt __maybe_unused, - char *bf __maybe_unused, size_t size __maybe_unused, int val __maybe_unused, - char *type __maybe_unused) +static size_t trace__btf_scnprintf(struct trace *trace __maybe_unused, struct syscall_arg_fmt *arg_fmt __maybe_unused, + char *bf __maybe_unused, size_t size __maybe_unused, int val __maybe_unused, + char *type __maybe_unused) { return 0; } @@ -2183,7 +2193,7 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size, unsigned char *args, void *augmented_args, int augmented_args_size, struct trace *trace, struct thread *thread) { - size_t printed = 0; + 
size_t printed = 0, btf_printed; unsigned long val; u8 bit = 1; struct syscall_arg arg = { @@ -2237,13 +2247,11 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size, if (trace->show_arg_names) printed += scnprintf(bf + printed, size - printed, "%s: ", field->name); - if (sc->arg_fmt[arg.idx].is_enum) { - size_t p = trace__btf_enum_scnprintf(trace, &sc->arg_fmt[arg.idx], bf + printed, - size - printed, val, field->type); - if (p) { - printed += p; - continue; - } + btf_printed = trace__btf_scnprintf(trace, &sc->arg_fmt[arg.idx], bf + printed, + size - printed, val, field->type); + if (btf_printed) { + printed += btf_printed; + continue; } printed += syscall_arg_fmt__scnprintf_val(&sc->arg_fmt[arg.idx], @@ -2892,7 +2900,7 @@ static size_t trace__fprintf_tp_fields(struct trace *trace, struct evsel *evsel, size_t size = sizeof(bf); struct tep_format_field *field = evsel->tp_format->format.fields; struct syscall_arg_fmt *arg = __evsel__syscall_arg_fmt(evsel); - size_t printed = 0; + size_t printed = 0, btf_printed; unsigned long val; u8 bit = 1; struct syscall_arg syscall_arg = { @@ -2942,13 +2950,10 @@ static size_t trace__fprintf_tp_fields(struct trace *trace, struct evsel *evsel, if (trace->show_arg_names) printed += scnprintf(bf + printed, size - printed, "%s: ", field->name); - if (arg->is_enum) { - size_t p = trace__btf_enum_scnprintf(trace, arg, bf + printed, - size - printed, val, field->type); - if (p) { - printed += p; - continue; - } + btf_printed = trace__btf_scnprintf(trace, arg, bf + printed, size - printed, val, field->type); + if (btf_printed) { + printed += btf_printed; + continue; } printed += syscall_arg_fmt__scnprintf_val(arg, bf + printed, size - printed, &syscall_arg, val); -- cgit v1.2.3 From c3d747134cec49aa95aad22d14deffa43b2be37d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 25 Jun 2024 02:13:45 +0800 Subject: perf trace: Remove arg_fmt->is_enum, we can get that from the BTF type This is to pave the way for other BTF types, i.e. we try to find BTF type then use things like btf_is_enum(btf_type) that we cached to find the right strtoul and scnprintf routines. For now only enum is supported, all the other types simple return zero for scnprintf which makes it have the same behaviour as when BTF isn't available, i.e. fallback to no pretty printing. Ditto for strtoul. 
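The resulting shape, sketched with compressed names (not the exact perf code; btf_enum_scnprintf() is perf's existing enum pretty-printer): resolve and cache the BTF type once, then branch on its kind, so supporting further types later only adds branches.

    #include <bpf/btf.h>

    /* Sketch of the dispatch: bail out when no BTF type was resolved so
     * the caller falls back to plain numeric printing, otherwise branch
     * on the BTF kind. */
    static size_t btf_scnprintf(struct btf *btf, const struct btf_type *t,
                                char *bf, size_t size, int val)
    {
        if (t == NULL)
            return 0;
        if (btf_is_enum(t))
            return btf_enum_scnprintf(t, btf, bf, size, val);
        /* struct, union, ... pretty-printers slot in here later */
        return 0;
    }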
root@x1:~# perf test -v enum 124: perf trace enum augmentation tests : Ok root@x1:~# perf test -v enum 124: perf trace enum augmentation tests : Ok root@x1:~# perf test -v enum 124: perf trace enum augmentation tests : Ok root@x1:~# perf test -v enum 124: perf trace enum augmentation tests : Ok root@x1:~# perf test -v enum 124: perf trace enum augmentation tests : Ok root@x1:~# Signed-off-by: Howard Chu Tested-by: Howard Chu Cc: Adrian Hunter Cc: Howard Chu Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240624181345.124764-9-howardchu95@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 101 +++++++++++++++++++++------------------ tools/perf/trace/beauty/beauty.h | 1 + 2 files changed, 56 insertions(+), 46 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index d9104fc4f61f..488c2cedc110 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -111,7 +111,6 @@ struct syscall_arg_fmt { const char *name; u16 nr_entries; // for arrays bool show_zero; - bool is_enum; #ifdef HAVE_LIBBPF_SUPPORT const struct btf_type *type; #endif @@ -910,33 +909,46 @@ static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size, #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags #ifdef HAVE_LIBBPF_SUPPORT -static int syscall_arg_fmt__cache_btf_enum(struct syscall_arg_fmt *arg_fmt, struct btf *btf, char *type) +static void syscall_arg_fmt__cache_btf_enum(struct syscall_arg_fmt *arg_fmt, struct btf *btf, char *type) { int id; - // Already cached? - if (arg_fmt->type != NULL) - return 0; - type = strstr(type, "enum "); if (type == NULL) - return -1; + return; type += 5; // skip "enum " to get the enumeration name id = btf__find_by_name(btf, type); if (id < 0) - return -1; + return; arg_fmt->type = btf__type_by_id(btf, id); - return arg_fmt->type == NULL ? 
-1 : 0; } static bool syscall_arg__strtoul_btf_enum(char *bf, size_t size, struct syscall_arg *arg, u64 *val) +{ + const struct btf_type *bt = arg->fmt->type; + struct btf *btf = arg->trace->btf; + struct btf_enum *be = btf_enum(bt); + + for (int i = 0; i < btf_vlen(bt); ++i, ++be) { + const char *name = btf__name_by_offset(btf, be->name_off); + int max_len = max(size, strlen(name)); + + if (strncmp(name, bf, max_len) == 0) { + *val = be->val; + return true; + } + } + + return false; +} + +static bool syscall_arg__strtoul_btf_type(char *bf, size_t size, struct syscall_arg *arg, u64 *val) { const struct btf_type *bt; - char *type = arg->parm; - struct btf_enum *be; + char *type = arg->type_name; struct btf *btf; trace__load_vmlinux_btf(arg->trace); @@ -945,20 +957,19 @@ static bool syscall_arg__strtoul_btf_enum(char *bf, size_t size, struct syscall_ if (btf == NULL) return false; - if (syscall_arg_fmt__cache_btf_enum(arg->fmt, btf, type) < 0) - return false; + if (arg->fmt->type == NULL) { + // See if this is an enum + syscall_arg_fmt__cache_btf_enum(arg->fmt, btf, type); + } + // Now let's see if we have a BTF type resolved bt = arg->fmt->type; - be = btf_enum(bt); - for (int i = 0; i < btf_vlen(bt); ++i, ++be) { - const char *name = btf__name_by_offset(btf, be->name_off); - int max_len = max(size, strlen(name)); + if (bt == NULL) + return false; - if (strncmp(name, bf, max_len) == 0) { - *val = be->val; - return true; - } - } + // If it is an enum: + if (btf_is_enum(arg->fmt->type)) + return syscall_arg__strtoul_btf_enum(bf, size, arg, val); return false; } @@ -978,23 +989,23 @@ static size_t btf_enum_scnprintf(const struct btf_type *type, struct btf *btf, c return 0; } -static size_t trace__btf_enum_scnprintf(struct trace *trace, struct syscall_arg_fmt *arg_fmt, char *bf, - size_t size, int val, char *type) -{ - if (syscall_arg_fmt__cache_btf_enum(arg_fmt, trace->btf, type) < 0) - return 0; - - return btf_enum_scnprintf(arg_fmt->type, trace->btf, bf, size, val); -} - static size_t trace__btf_scnprintf(struct trace *trace, struct syscall_arg_fmt *arg_fmt, char *bf, size_t size, int val, char *type) { if (trace->btf == NULL) return 0; - if (arg_fmt->is_enum) - return trace__btf_enum_scnprintf(trace, arg_fmt, bf, size, val, type); + if (arg_fmt->type == NULL) { + // Check if this is an enum and if we have the BTF type for it. + syscall_arg_fmt__cache_btf_enum(arg_fmt, trace->btf, type); + } + + // Did we manage to find a BTF type for the syscall/tracepoint argument? 
+ if (arg_fmt->type == NULL) + return 0; + + if (btf_is_enum(arg_fmt->type)) + return btf_enum_scnprintf(arg_fmt->type, trace->btf, bf, size, val); return 0; } @@ -1007,14 +1018,14 @@ static size_t trace__btf_scnprintf(struct trace *trace __maybe_unused, struct sy return 0; } -static bool syscall_arg__strtoul_btf_enum(char *bf __maybe_unused, size_t size __maybe_unused, +static bool syscall_arg__strtoul_btf_type(char *bf __maybe_unused, size_t size __maybe_unused, struct syscall_arg *arg __maybe_unused, u64 *val __maybe_unused) { return false; } #endif // HAVE_LIBBPF_SUPPORT -#define STUL_BTF_ENUM syscall_arg__strtoul_btf_enum +#define STUL_BTF_TYPE syscall_arg__strtoul_btf_type #define STRARRAY(name, array) \ { .scnprintf = SCA_STRARRAY, \ @@ -1887,7 +1898,6 @@ syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field continue; len = strlen(field->name); - arg->is_enum = false; if (strcmp(field->type, "const char *") == 0 && ((len >= 4 && strcmp(field->name + len - 4, "name") == 0) || @@ -1915,8 +1925,8 @@ syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field */ arg->scnprintf = SCA_FD; } else if (strstr(field->type, "enum") && use_btf != NULL) { - *use_btf = arg->is_enum = true; - arg->strtoul = STUL_BTF_ENUM; + *use_btf = true; + arg->strtoul = STUL_BTF_TYPE; } else { const struct syscall_arg_fmt *fmt = syscall_arg_fmt__find_by_name(field->name); @@ -2236,10 +2246,13 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size, /* * Suppress this argument if its value is zero and show_zero * property isn't set. + * + * If it has a BTF type, then override the zero suppression knob + * as the common case is for zero in an enum to have an associated entry. */ if (val == 0 && !trace->show_zeros && !(sc->arg_fmt && sc->arg_fmt[arg.idx].show_zero) && - !(sc->arg_fmt && sc->arg_fmt[arg.idx].is_enum)) + !(sc->arg_fmt && sc->arg_fmt[arg.idx].strtoul == STUL_BTF_TYPE)) continue; printed += scnprintf(bf + printed, size - printed, "%s", printed ? ", " : ""); @@ -2942,7 +2955,7 @@ static size_t trace__fprintf_tp_fields(struct trace *trace, struct evsel *evsel, val = syscall_arg_fmt__mask_val(arg, &syscall_arg, val); /* Suppress this argument if its value is zero and show_zero property isn't set. */ - if (val == 0 && !trace->show_zeros && !arg->show_zero && !arg->is_enum) + if (val == 0 && !trace->show_zeros && !arg->show_zero && arg->strtoul != STUL_BTF_TYPE) continue; printed += scnprintf(bf + printed, size - printed, "%s", printed ? 
", " : ""); @@ -3910,14 +3923,10 @@ static int trace__expand_filter(struct trace *trace, struct evsel *evsel) struct syscall_arg syscall_arg = { .trace = trace, .fmt = fmt, + .type_name = type, + .parm = fmt->parm, }; - if (fmt->is_enum) { - syscall_arg.parm = type; - } else { - syscall_arg.parm = fmt->parm; - } - if (fmt->strtoul(right, right_size, &syscall_arg, &val)) { char *n, expansion[19]; int expansion_lenght = scnprintf(expansion, sizeof(expansion), "%#" PRIx64, val); diff --git a/tools/perf/trace/beauty/beauty.h b/tools/perf/trace/beauty/beauty.h index 78d10d92d351..3ed11e18ee2d 100644 --- a/tools/perf/trace/beauty/beauty.h +++ b/tools/perf/trace/beauty/beauty.h @@ -113,6 +113,7 @@ struct syscall_arg { struct thread *thread; struct trace *trace; void *parm; + char *type_name; u16 len; u8 idx; u8 mask; -- cgit v1.2.3 From e293f4b1e57fcc4d7d34b7a7a44ebec8ba8a1b7d Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 17 Jul 2024 09:57:17 -0700 Subject: perf test: Avoid python leak sanitizer test failures Leak sanitizer will report memory leaks from python and the leak sanitizer output causes tests to fail. For example: ``` $ perf test 98 -v 98: perf script tests: --- start --- test child forked, pid 1272962 DB test [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.046 MB /tmp/perf-test-script.x0EktdCel8/perf.data (8 samples) ] call_path_table((1, 0, 0, 0) call_path_table((2, 1, 0, 140339508617447) call_path_table((3, 2, 2, 0) call_path_table((4, 3, 3, 0) call_path_table((5, 4, 4, 0) call_path_table((6, 5, 5, 0) call_path_table((7, 6, 6, 0) call_path_table((8, 7, 7, 0) call_path_table((9, 8, 8, 0) call_path_table((10, 9, 9, 0) call_path_table((11, 10, 10, 0) call_path_table((12, 11, 11, 0) call_path_table((13, 12, 1, 0) sample_table((1, 1, 1, 1, 1, 1, 1, 8, -2058824120, 588306954119000, -1, 0, 0, 0, 0, 1, 0, 0, 128933429281, 0, 0, 13, 0, 0, 0, -1, -1)) sample_table((2, 1, 1, 1, 1, 1, 1, 8, -2058824120, 588306954137053, -1, 0, 0, 0, 0, 1, 0, 0, 128933429281, 0, 0, 13, 0, 0, 0, -1, -1)) sample_table((3, 1, 1, 1, 1, 1, 1, 8, -2058824120, 588306954140089, -1, 0, 0, 0, 0, 9, 0, 0, 128933429281, 0, 0, 13, 0, 0, 0, -1, -1)) sample_table((4, 1, 1, 1, 1, 1, 1, 8, -2058824120, 588306954142376, -1, 0, 0, 0, 0, 155, 0, 0, 128933429281, 0, 0, 13, 0, 0, 0, -1, -1)) sample_table((5, 1, 1, 1, 1, 1, 1, 8, -2058824120, 588306954144045, -1, 0, 0, 0, 0, 2493, 0, 0, 128933429281, 0, 0, 13, 0, 0, 0, -1, -1)) sample_table((6, 1, 1, 1, 1, 1, 12, 77, -2046828595, 588306954145722, -1, 0, 0, 0, 0, 47555, 0, 0, 128933429281, 0, 0, 13, 0, 0, 0, -1, -1)) call_path_table((14, 9, 14, 0) call_path_table((15, 14, 15, 0) call_path_table((16, 15, 0, -1040969624) call_path_table((17, 16, 16, 0) call_path_table((18, 17, 17, 0) call_path_table((19, 18, 18, 0) call_path_table((20, 19, 19, 0) call_path_table((21, 20, 13, 0) sample_table((7, 1, 1, 1, 2, 1, 13, 46, -2053700898, 588306954157436, -1, 0, 0, 0, 0, 964078, 0, 0, 128933429281, 0, 0, 21, 0, 0, 0, -1, -1)) call_path_table((22, 1, 21, 0) call_path_table((23, 22, 22, 0) call_path_table((24, 23, 23, 0) call_path_table((25, 24, 24, 0) call_path_table((26, 25, 25, 0) call_path_table((27, 26, 26, 0) call_path_table((28, 27, 27, 0) call_path_table((29, 28, 28, 0) call_path_table((30, 29, 29, 0) call_path_table((31, 30, 30, 0) call_path_table((32, 31, 31, 0) call_path_table((33, 32, 32, 0) call_path_table((34, 33, 33, 0) call_path_table((35, 34, 20, 0) sample_table((8, 1, 1, 1, 2, 1, 20, 49, -2046878127, 588306954378624, -1, 0, 0, 0, 0, 
2534317, 0, 0, 128933429281, 0, 0, 35, 0, 0, 0, -1, -1)) ================================================================= ==1272975==ERROR: LeakSanitizer: detected memory leaks Direct leak of 13628 byte(s) in 6 object(s) allocated from: #0 0x56354f60c092 in malloc (/tmp/perf/perf+0x29c092) #1 0x7ff25c7d02e7 in _PyObject_Malloc /build/python3.11/../Objects/obmalloc.c:2003:11 #2 0x7ff25c7d02e7 in _PyObject_Malloc /build/python3.11/../Objects/obmalloc.c:1996:1 SUMMARY: AddressSanitizer: 13628 byte(s) leaked in 6 allocation(s). --- Cleaning up --- ---- end(-1) ---- 98: perf script tests : FAILED! ``` Disable leak sanitizer when running specific perf+python tests to avoid this. This causes the tests to pass when run with leak sanitizer. Reviewed-by: Aditya Gupta Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Richter Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/script.sh | 3 +++ tools/perf/tests/shell/test_task_analyzer.sh | 3 +++ 2 files changed, 6 insertions(+) (limited to 'tools') diff --git a/tools/perf/tests/shell/script.sh b/tools/perf/tests/shell/script.sh index c1a603653662..d3e2958d2242 100755 --- a/tools/perf/tests/shell/script.sh +++ b/tools/perf/tests/shell/script.sh @@ -61,7 +61,10 @@ _end_of_file_ esac perf record $cmd_flags -o "${perfdatafile}" true + # Disable lsan to avoid warnings about python memory leaks. + export ASAN_OPTIONS=detect_leaks=0 perf script -i "${perfdatafile}" -s "${db_test}" + export ASAN_OPTIONS= echo "DB test [Success]" } diff --git a/tools/perf/tests/shell/test_task_analyzer.sh b/tools/perf/tests/shell/test_task_analyzer.sh index 92d15154ba79..cb02bf23e6a5 100755 --- a/tools/perf/tests/shell/test_task_analyzer.sh +++ b/tools/perf/tests/shell/test_task_analyzer.sh @@ -11,6 +11,9 @@ if [ -e "$perfdir/scripts/python/Perf-Trace-Util" ]; then export PERF_EXEC_PATH=$perfdir fi +# Disable lsan to avoid warnings about python memory leaks. +export ASAN_OPTIONS=detect_leaks=0 + cleanup() { rm -f perf.data rm -f perf.data.old -- cgit v1.2.3 From 1d303deedb1057c6ca36fc0d1c0d7bf58a5b7322 Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Thu, 18 Jul 2024 14:13:44 +0530 Subject: perf annotate: Move the data structures related to register type to header file Data type profiling uses instruction tracking by checking each instruction and updating the register type state in some data structures. This is useful to find the data type in cases when the register state gets transferred from one reg to another. Example, in x86, "mov" instruction and in powerpc, "mr" instruction. Currently these structures are defined in annotate-data.c and instruction tracking is implemented only for x86. Move these data structures to "annotate-data.h" header file so that other arch implementations can use it in arch specific files as well. 
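As a hypothetical usage sketch (the helper name here is invented; the real per-arch consumers arrive in the next patch): with the structs exported, an arch file can manipulate the shared register-state table directly, e.g. to propagate a type across a register-to-register move such as powerpc's "mr".

    #include "util/annotate-data.h"

    /* Hypothetical arch-side helper: after "mr dst, src", dst carries
     * whatever type state src had. has_reg_type() is the bounds check
     * exported by this patch. */
    static void copy_reg_type(struct type_state *state, int src, int dst)
    {
        if (!has_reg_type(state, src) || !has_reg_type(state, dst))
            return;

        state->regs[dst] = state->regs[src]; /* type, kind and ok bits */
    }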
Reviewed-by: Kajol Jain Reviewed-by: Namhyung Kim Signed-off-by: Athira Rajeev Tested-by: Kajol Jain Cc: Adrian Hunter Cc: Akanksha J N Cc: Christophe Leroy Cc: Disha Goel Cc: Hari Bathini Cc: Ian Rogers Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Segher Boessenkool Link: https://lore.kernel.org/lkml/20240718084358.72242-2-atrajeev@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 53 +------------------------------------- tools/perf/util/annotate-data.h | 56 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 52 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 965da6c0b542..a4c7f98a75e3 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -31,15 +31,6 @@ static void delete_var_types(struct die_var_type *var_types); -enum type_state_kind { - TSR_KIND_INVALID = 0, - TSR_KIND_TYPE, - TSR_KIND_PERCPU_BASE, - TSR_KIND_CONST, - TSR_KIND_POINTER, - TSR_KIND_CANARY, -}; - #define pr_debug_dtp(fmt, ...) \ do { \ if (debug_type_profile) \ @@ -140,49 +131,7 @@ static void pr_debug_location(Dwarf_Die *die, u64 pc, int reg) } } -/* - * Type information in a register, valid when @ok is true. - * The @caller_saved registers are invalidated after a function call. - */ -struct type_state_reg { - Dwarf_Die type; - u32 imm_value; - bool ok; - bool caller_saved; - u8 kind; -}; - -/* Type information in a stack location, dynamically allocated */ -struct type_state_stack { - struct list_head list; - Dwarf_Die type; - int offset; - int size; - bool compound; - u8 kind; -}; - -/* FIXME: This should be arch-dependent */ -#define TYPE_STATE_MAX_REGS 16 - -/* - * State table to maintain type info in each register and stack location. - * It'll be updated when new variable is allocated or type info is moved - * to a new location (register or stack). As it'd be used with the - * shortest path of basic blocks, it only maintains a single table. - */ -struct type_state { - /* state of general purpose registers */ - struct type_state_reg regs[TYPE_STATE_MAX_REGS]; - /* state of stack location */ - struct list_head stack_vars; - /* return value register */ - int ret_reg; - /* stack pointer register */ - int stack_reg; -}; - -static bool has_reg_type(struct type_state *state, int reg) +bool has_reg_type(struct type_state *state, int reg) { return (unsigned)reg < ARRAY_SIZE(state->regs); } diff --git a/tools/perf/util/annotate-data.h b/tools/perf/util/annotate-data.h index 0a57d9f5ee78..cdb5cd8960bb 100644 --- a/tools/perf/util/annotate-data.h +++ b/tools/perf/util/annotate-data.h @@ -6,6 +6,11 @@ #include #include #include +#include "annotate.h" + +#ifdef HAVE_DWARF_SUPPORT +#include "debuginfo.h" +#endif struct annotated_op_loc; struct debuginfo; @@ -15,6 +20,15 @@ struct hist_entry; struct map_symbol; struct thread; +enum type_state_kind { + TSR_KIND_INVALID = 0, + TSR_KIND_TYPE, + TSR_KIND_PERCPU_BASE, + TSR_KIND_CONST, + TSR_KIND_POINTER, + TSR_KIND_CANARY, +}; + /** * struct annotated_member - Type of member field * @node: List entry in the parent list @@ -143,6 +157,47 @@ struct annotated_data_stat { extern struct annotated_data_stat ann_data_stat; #ifdef HAVE_DWARF_SUPPORT +/* + * Type information in a register, valid when @ok is true. + * The @caller_saved registers are invalidated after a function call. 
+ */ +struct type_state_reg { + Dwarf_Die type; + u32 imm_value; + bool ok; + bool caller_saved; + u8 kind; +}; + +/* Type information in a stack location, dynamically allocated */ +struct type_state_stack { + struct list_head list; + Dwarf_Die type; + int offset; + int size; + bool compound; + u8 kind; +}; + +/* FIXME: This should be arch-dependent */ +#define TYPE_STATE_MAX_REGS 16 + +/* + * State table to maintain type info in each register and stack location. + * It'll be updated when new variable is allocated or type info is moved + * to a new location (register or stack). As it'd be used with the + * shortest path of basic blocks, it only maintains a single table. + */ +struct type_state { + /* state of general purpose registers */ + struct type_state_reg regs[TYPE_STATE_MAX_REGS]; + /* state of stack location */ + struct list_head stack_vars; + /* return value register */ + int ret_reg; + /* stack pointer register */ + int stack_reg; +}; /* Returns data type at the location (ip, reg, offset) */ struct annotated_data_type *find_data_type(struct data_loc_info *dloc); @@ -160,6 +215,7 @@ void global_var_type__tree_delete(struct rb_root *root); int hist_entry__annotate_data_tty(struct hist_entry *he, struct evsel *evsel); +bool has_reg_type(struct type_state *state, int reg); #else /* HAVE_DWARF_SUPPORT */ static inline struct annotated_data_type * -- cgit v1.2.3 From 782959ac248ac3cbac80f7476d4e0410662ff400 Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Thu, 18 Jul 2024 14:13:45 +0530 Subject: perf annotate: Add "update_insn_state" callback function to handle arch specific instruction tracking Add "update_insn_state" callback to "struct arch" to handle instruction tracking. Currently updating instruction state is handled by static function "update_insn_state_x86" which is defined in "annotate-data.c". Make this as a callback for specific arch and move to archs specific file "arch/x86/annotate/instructions.c" . This will help to add helper function for other platforms in file: "arch//annotate/instructions.c" and make changes/updates easier. Define callback "update_insn_state" as part of "struct arch", also make some of the debug functions non-static so that it can be referenced from other places. 
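In sketch form, the change adds a function-pointer slot to struct arch which each arch's annotate init fills in; the x86 assignment is in the diff below, while the powerpc hookup shown here is hypothetical.

    /* In struct arch (disasm.h), guarded by HAVE_DWARF_SUPPORT: */
    void (*update_insn_state)(struct type_state *state,
                              struct data_loc_info *dloc,
                              Dwarf_Die *cu_die, struct disasm_line *dl);

    /* An arch init picks its tracker, e.g. (hypothetical powerpc case):
     *     arch->update_insn_state = update_insn_state_powerpc;
     * and annotate-data.c dispatches through the pointer: */
    if (arch->update_insn_state)
        arch->update_insn_state(state, dloc, cu_die, dl);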
Reviewed-by: Kajol Jain Reviewed-by: Namhyung Kim Signed-off-by: Athira Rajeev Tested-by: Kajol Jain Cc: Adrian Hunter Cc: Akanksha J N Cc: Christophe Leroy Cc: Disha Goel Cc: Hari Bathini Cc: Ian Rogers Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Segher Boessenkool Link: https://lore.kernel.org/lkml/20240718084358.72242-3-atrajeev@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/annotate/instructions.c | 377 +++++++++++++++++++++++++++ tools/perf/util/annotate-data.c | 391 +--------------------------- tools/perf/util/annotate-data.h | 23 ++ tools/perf/util/disasm.c | 4 + tools/perf/util/disasm.h | 12 + 5 files changed, 424 insertions(+), 383 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/x86/annotate/instructions.c b/tools/perf/arch/x86/annotate/instructions.c index 5cdf457f5cbe..7b7d462c6c6b 100644 --- a/tools/perf/arch/x86/annotate/instructions.c +++ b/tools/perf/arch/x86/annotate/instructions.c @@ -206,3 +206,380 @@ static int x86__annotate_init(struct arch *arch, char *cpuid) arch->initialized = true; return err; } + +#ifdef HAVE_DWARF_SUPPORT +static void update_insn_state_x86(struct type_state *state, + struct data_loc_info *dloc, Dwarf_Die *cu_die, + struct disasm_line *dl) +{ + struct annotated_insn_loc loc; + struct annotated_op_loc *src = &loc.ops[INSN_OP_SOURCE]; + struct annotated_op_loc *dst = &loc.ops[INSN_OP_TARGET]; + struct type_state_reg *tsr; + Dwarf_Die type_die; + u32 insn_offset = dl->al.offset; + int fbreg = dloc->fbreg; + int fboff = 0; + + if (annotate_get_insn_location(dloc->arch, dl, &loc) < 0) + return; + + if (ins__is_call(&dl->ins)) { + struct symbol *func = dl->ops.target.sym; + + if (func == NULL) + return; + + /* __fentry__ will preserve all registers */ + if (!strcmp(func->name, "__fentry__")) + return; + + pr_debug_dtp("call [%x] %s\n", insn_offset, func->name); + + /* Otherwise invalidate caller-saved registers after call */ + for (unsigned i = 0; i < ARRAY_SIZE(state->regs); i++) { + if (state->regs[i].caller_saved) + state->regs[i].ok = false; + } + + /* Update register with the return type (if any) */ + if (die_find_func_rettype(cu_die, func->name, &type_die)) { + tsr = &state->regs[state->ret_reg]; + tsr->type = type_die; + tsr->kind = TSR_KIND_TYPE; + tsr->ok = true; + + pr_debug_dtp("call [%x] return -> reg%d", + insn_offset, state->ret_reg); + pr_debug_type_name(&type_die, tsr->kind); + } + return; + } + + if (!strncmp(dl->ins.name, "add", 3)) { + u64 imm_value = -1ULL; + int offset; + const char *var_name = NULL; + struct map_symbol *ms = dloc->ms; + u64 ip = ms->sym->start + dl->al.offset; + + if (!has_reg_type(state, dst->reg1)) + return; + + tsr = &state->regs[dst->reg1]; + + if (src->imm) + imm_value = src->offset; + else if (has_reg_type(state, src->reg1) && + state->regs[src->reg1].kind == TSR_KIND_CONST) + imm_value = state->regs[src->reg1].imm_value; + else if (src->reg1 == DWARF_REG_PC) { + u64 var_addr = annotate_calc_pcrel(dloc->ms, ip, + src->offset, dl); + + if (get_global_var_info(dloc, var_addr, + &var_name, &offset) && + !strcmp(var_name, "this_cpu_off") && + tsr->kind == TSR_KIND_CONST) { + tsr->kind = TSR_KIND_PERCPU_BASE; + imm_value = tsr->imm_value; + } + } + else + return; + + if (tsr->kind != TSR_KIND_PERCPU_BASE) + return; + + if (get_global_var_type(cu_die, dloc, ip, imm_value, &offset, + &type_die) && offset == 0) { + /* + * This is not a pointer type, but it should be treated + * as a pointer. 
+ */ + tsr->type = type_die; + tsr->kind = TSR_KIND_POINTER; + tsr->ok = true; + + pr_debug_dtp("add [%x] percpu %#"PRIx64" -> reg%d", + insn_offset, imm_value, dst->reg1); + pr_debug_type_name(&tsr->type, tsr->kind); + } + return; + } + + if (strncmp(dl->ins.name, "mov", 3)) + return; + + if (dloc->fb_cfa) { + u64 ip = dloc->ms->sym->start + dl->al.offset; + u64 pc = map__rip_2objdump(dloc->ms->map, ip); + + if (die_get_cfa(dloc->di->dbg, pc, &fbreg, &fboff) < 0) + fbreg = -1; + } + + /* Case 1. register to register or segment:offset to register transfers */ + if (!src->mem_ref && !dst->mem_ref) { + if (!has_reg_type(state, dst->reg1)) + return; + + tsr = &state->regs[dst->reg1]; + if (dso__kernel(map__dso(dloc->ms->map)) && + src->segment == INSN_SEG_X86_GS && src->imm) { + u64 ip = dloc->ms->sym->start + dl->al.offset; + u64 var_addr; + int offset; + + /* + * In kernel, %gs points to a per-cpu region for the + * current CPU. Access with a constant offset should + * be treated as a global variable access. + */ + var_addr = src->offset; + + if (var_addr == 40) { + tsr->kind = TSR_KIND_CANARY; + tsr->ok = true; + + pr_debug_dtp("mov [%x] stack canary -> reg%d\n", + insn_offset, dst->reg1); + return; + } + + if (!get_global_var_type(cu_die, dloc, ip, var_addr, + &offset, &type_die) || + !die_get_member_type(&type_die, offset, &type_die)) { + tsr->ok = false; + return; + } + + tsr->type = type_die; + tsr->kind = TSR_KIND_TYPE; + tsr->ok = true; + + pr_debug_dtp("mov [%x] this-cpu addr=%#"PRIx64" -> reg%d", + insn_offset, var_addr, dst->reg1); + pr_debug_type_name(&tsr->type, tsr->kind); + return; + } + + if (src->imm) { + tsr->kind = TSR_KIND_CONST; + tsr->imm_value = src->offset; + tsr->ok = true; + + pr_debug_dtp("mov [%x] imm=%#x -> reg%d\n", + insn_offset, tsr->imm_value, dst->reg1); + return; + } + + if (!has_reg_type(state, src->reg1) || + !state->regs[src->reg1].ok) { + tsr->ok = false; + return; + } + + tsr->type = state->regs[src->reg1].type; + tsr->kind = state->regs[src->reg1].kind; + tsr->ok = true; + + pr_debug_dtp("mov [%x] reg%d -> reg%d", + insn_offset, src->reg1, dst->reg1); + pr_debug_type_name(&tsr->type, tsr->kind); + } + /* Case 2. 
memory to register transers */ + if (src->mem_ref && !dst->mem_ref) { + int sreg = src->reg1; + + if (!has_reg_type(state, dst->reg1)) + return; + + tsr = &state->regs[dst->reg1]; + +retry: + /* Check stack variables with offset */ + if (sreg == fbreg) { + struct type_state_stack *stack; + int offset = src->offset - fboff; + + stack = find_stack_state(state, offset); + if (stack == NULL) { + tsr->ok = false; + return; + } else if (!stack->compound) { + tsr->type = stack->type; + tsr->kind = stack->kind; + tsr->ok = true; + } else if (die_get_member_type(&stack->type, + offset - stack->offset, + &type_die)) { + tsr->type = type_die; + tsr->kind = TSR_KIND_TYPE; + tsr->ok = true; + } else { + tsr->ok = false; + return; + } + + pr_debug_dtp("mov [%x] -%#x(stack) -> reg%d", + insn_offset, -offset, dst->reg1); + pr_debug_type_name(&tsr->type, tsr->kind); + } + /* And then dereference the pointer if it has one */ + else if (has_reg_type(state, sreg) && state->regs[sreg].ok && + state->regs[sreg].kind == TSR_KIND_TYPE && + die_deref_ptr_type(&state->regs[sreg].type, + src->offset, &type_die)) { + tsr->type = type_die; + tsr->kind = TSR_KIND_TYPE; + tsr->ok = true; + + pr_debug_dtp("mov [%x] %#x(reg%d) -> reg%d", + insn_offset, src->offset, sreg, dst->reg1); + pr_debug_type_name(&tsr->type, tsr->kind); + } + /* Or check if it's a global variable */ + else if (sreg == DWARF_REG_PC) { + struct map_symbol *ms = dloc->ms; + u64 ip = ms->sym->start + dl->al.offset; + u64 addr; + int offset; + + addr = annotate_calc_pcrel(ms, ip, src->offset, dl); + + if (!get_global_var_type(cu_die, dloc, ip, addr, &offset, + &type_die) || + !die_get_member_type(&type_die, offset, &type_die)) { + tsr->ok = false; + return; + } + + tsr->type = type_die; + tsr->kind = TSR_KIND_TYPE; + tsr->ok = true; + + pr_debug_dtp("mov [%x] global addr=%"PRIx64" -> reg%d", + insn_offset, addr, dst->reg1); + pr_debug_type_name(&type_die, tsr->kind); + } + /* And check percpu access with base register */ + else if (has_reg_type(state, sreg) && + state->regs[sreg].kind == TSR_KIND_PERCPU_BASE) { + u64 ip = dloc->ms->sym->start + dl->al.offset; + u64 var_addr = src->offset; + int offset; + + if (src->multi_regs) { + int reg2 = (sreg == src->reg1) ? src->reg2 : src->reg1; + + if (has_reg_type(state, reg2) && state->regs[reg2].ok && + state->regs[reg2].kind == TSR_KIND_CONST) + var_addr += state->regs[reg2].imm_value; + } + + /* + * In kernel, %gs points to a per-cpu region for the + * current CPU. Access with a constant offset should + * be treated as a global variable access. 
+ */ + if (get_global_var_type(cu_die, dloc, ip, var_addr, + &offset, &type_die) && + die_get_member_type(&type_die, offset, &type_die)) { + tsr->type = type_die; + tsr->kind = TSR_KIND_TYPE; + tsr->ok = true; + + if (src->multi_regs) { + pr_debug_dtp("mov [%x] percpu %#x(reg%d,reg%d) -> reg%d", + insn_offset, src->offset, src->reg1, + src->reg2, dst->reg1); + } else { + pr_debug_dtp("mov [%x] percpu %#x(reg%d) -> reg%d", + insn_offset, src->offset, sreg, dst->reg1); + } + pr_debug_type_name(&tsr->type, tsr->kind); + } else { + tsr->ok = false; + } + } + /* And then dereference the calculated pointer if it has one */ + else if (has_reg_type(state, sreg) && state->regs[sreg].ok && + state->regs[sreg].kind == TSR_KIND_POINTER && + die_get_member_type(&state->regs[sreg].type, + src->offset, &type_die)) { + tsr->type = type_die; + tsr->kind = TSR_KIND_TYPE; + tsr->ok = true; + + pr_debug_dtp("mov [%x] pointer %#x(reg%d) -> reg%d", + insn_offset, src->offset, sreg, dst->reg1); + pr_debug_type_name(&tsr->type, tsr->kind); + } + /* Or try another register if any */ + else if (src->multi_regs && sreg == src->reg1 && + src->reg1 != src->reg2) { + sreg = src->reg2; + goto retry; + } + else { + int offset; + const char *var_name = NULL; + + /* it might be per-cpu variable (in kernel) access */ + if (src->offset < 0) { + if (get_global_var_info(dloc, (s64)src->offset, + &var_name, &offset) && + !strcmp(var_name, "__per_cpu_offset")) { + tsr->kind = TSR_KIND_PERCPU_BASE; + + pr_debug_dtp("mov [%x] percpu base reg%d\n", + insn_offset, dst->reg1); + } + } + + tsr->ok = false; + } + } + /* Case 3. register to memory transfers */ + if (!src->mem_ref && dst->mem_ref) { + if (!has_reg_type(state, src->reg1) || + !state->regs[src->reg1].ok) + return; + + /* Check stack variables with offset */ + if (dst->reg1 == fbreg) { + struct type_state_stack *stack; + int offset = dst->offset - fboff; + + tsr = &state->regs[src->reg1]; + + stack = find_stack_state(state, offset); + if (stack) { + /* + * The source register is likely to hold a type + * of member if it's a compound type. Do not + * update the stack variable type since we can + * get the member type later by using the + * die_get_member_type(). + */ + if (!stack->compound) + set_stack_state(stack, offset, tsr->kind, + &tsr->type); + } else { + findnew_stack_state(state, offset, tsr->kind, + &tsr->type); + } + + pr_debug_dtp("mov [%x] reg%d -> -%#x(stack)", + insn_offset, src->reg1, -offset); + pr_debug_type_name(&tsr->type, tsr->kind); + } + /* + * Ignore other transfers since it'd set a value in a struct + * and won't change the type. + */ + } + /* Case 4. 
memory to memory transfers (not handled for now) */ +} +#endif diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index a4c7f98a75e3..7a48c3d72b89 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -39,7 +39,7 @@ do { \ pr_debug3(fmt, ##__VA_ARGS__); \ } while (0) -static void pr_debug_type_name(Dwarf_Die *die, enum type_state_kind kind) +void pr_debug_type_name(Dwarf_Die *die, enum type_state_kind kind) { struct strbuf sb; char *str; @@ -390,7 +390,7 @@ static int check_variable(struct data_loc_info *dloc, Dwarf_Die *var_die, return 0; } -static struct type_state_stack *find_stack_state(struct type_state *state, +struct type_state_stack *find_stack_state(struct type_state *state, int offset) { struct type_state_stack *stack; @@ -406,7 +406,7 @@ static struct type_state_stack *find_stack_state(struct type_state *state, return NULL; } -static void set_stack_state(struct type_state_stack *stack, int offset, u8 kind, +void set_stack_state(struct type_state_stack *stack, int offset, u8 kind, Dwarf_Die *type_die) { int tag; @@ -433,7 +433,7 @@ static void set_stack_state(struct type_state_stack *stack, int offset, u8 kind, } } -static struct type_state_stack *findnew_stack_state(struct type_state *state, +struct type_state_stack *findnew_stack_state(struct type_state *state, int offset, u8 kind, Dwarf_Die *type_die) { @@ -537,7 +537,7 @@ void global_var_type__tree_delete(struct rb_root *root) } } -static bool get_global_var_info(struct data_loc_info *dloc, u64 addr, +bool get_global_var_info(struct data_loc_info *dloc, u64 addr, const char **var_name, int *var_offset) { struct addr_location al; @@ -611,7 +611,7 @@ static void global_var__collect(struct data_loc_info *dloc) } } -static bool get_global_var_type(Dwarf_Die *cu_die, struct data_loc_info *dloc, +bool get_global_var_type(Dwarf_Die *cu_die, struct data_loc_info *dloc, u64 ip, u64 var_addr, int *var_offset, Dwarf_Die *type_die) { @@ -722,381 +722,6 @@ static void update_var_state(struct type_state *state, struct data_loc_info *dlo } } -static void update_insn_state_x86(struct type_state *state, - struct data_loc_info *dloc, Dwarf_Die *cu_die, - struct disasm_line *dl) -{ - struct annotated_insn_loc loc; - struct annotated_op_loc *src = &loc.ops[INSN_OP_SOURCE]; - struct annotated_op_loc *dst = &loc.ops[INSN_OP_TARGET]; - struct type_state_reg *tsr; - Dwarf_Die type_die; - u32 insn_offset = dl->al.offset; - int fbreg = dloc->fbreg; - int fboff = 0; - - if (annotate_get_insn_location(dloc->arch, dl, &loc) < 0) - return; - - if (ins__is_call(&dl->ins)) { - struct symbol *func = dl->ops.target.sym; - - if (func == NULL) - return; - - /* __fentry__ will preserve all registers */ - if (!strcmp(func->name, "__fentry__")) - return; - - pr_debug_dtp("call [%x] %s\n", insn_offset, func->name); - - /* Otherwise invalidate caller-saved registers after call */ - for (unsigned i = 0; i < ARRAY_SIZE(state->regs); i++) { - if (state->regs[i].caller_saved) - state->regs[i].ok = false; - } - - /* Update register with the return type (if any) */ - if (die_find_func_rettype(cu_die, func->name, &type_die)) { - tsr = &state->regs[state->ret_reg]; - tsr->type = type_die; - tsr->kind = TSR_KIND_TYPE; - tsr->ok = true; - - pr_debug_dtp("call [%x] return -> reg%d", - insn_offset, state->ret_reg); - pr_debug_type_name(&type_die, tsr->kind); - } - return; - } - - if (!strncmp(dl->ins.name, "add", 3)) { - u64 imm_value = -1ULL; - int offset; - const char *var_name = NULL; - struct map_symbol *ms = 
dloc->ms; - u64 ip = ms->sym->start + dl->al.offset; - - if (!has_reg_type(state, dst->reg1)) - return; - - tsr = &state->regs[dst->reg1]; - - if (src->imm) - imm_value = src->offset; - else if (has_reg_type(state, src->reg1) && - state->regs[src->reg1].kind == TSR_KIND_CONST) - imm_value = state->regs[src->reg1].imm_value; - else if (src->reg1 == DWARF_REG_PC) { - u64 var_addr = annotate_calc_pcrel(dloc->ms, ip, - src->offset, dl); - - if (get_global_var_info(dloc, var_addr, - &var_name, &offset) && - !strcmp(var_name, "this_cpu_off") && - tsr->kind == TSR_KIND_CONST) { - tsr->kind = TSR_KIND_PERCPU_BASE; - imm_value = tsr->imm_value; - } - } - else - return; - - if (tsr->kind != TSR_KIND_PERCPU_BASE) - return; - - if (get_global_var_type(cu_die, dloc, ip, imm_value, &offset, - &type_die) && offset == 0) { - /* - * This is not a pointer type, but it should be treated - * as a pointer. - */ - tsr->type = type_die; - tsr->kind = TSR_KIND_POINTER; - tsr->ok = true; - - pr_debug_dtp("add [%x] percpu %#"PRIx64" -> reg%d", - insn_offset, imm_value, dst->reg1); - pr_debug_type_name(&tsr->type, tsr->kind); - } - return; - } - - if (strncmp(dl->ins.name, "mov", 3)) - return; - - if (dloc->fb_cfa) { - u64 ip = dloc->ms->sym->start + dl->al.offset; - u64 pc = map__rip_2objdump(dloc->ms->map, ip); - - if (die_get_cfa(dloc->di->dbg, pc, &fbreg, &fboff) < 0) - fbreg = -1; - } - - /* Case 1. register to register or segment:offset to register transfers */ - if (!src->mem_ref && !dst->mem_ref) { - if (!has_reg_type(state, dst->reg1)) - return; - - tsr = &state->regs[dst->reg1]; - if (dso__kernel(map__dso(dloc->ms->map)) && - src->segment == INSN_SEG_X86_GS && src->imm) { - u64 ip = dloc->ms->sym->start + dl->al.offset; - u64 var_addr; - int offset; - - /* - * In kernel, %gs points to a per-cpu region for the - * current CPU. Access with a constant offset should - * be treated as a global variable access. - */ - var_addr = src->offset; - - if (var_addr == 40) { - tsr->kind = TSR_KIND_CANARY; - tsr->ok = true; - - pr_debug_dtp("mov [%x] stack canary -> reg%d\n", - insn_offset, dst->reg1); - return; - } - - if (!get_global_var_type(cu_die, dloc, ip, var_addr, - &offset, &type_die) || - !die_get_member_type(&type_die, offset, &type_die)) { - tsr->ok = false; - return; - } - - tsr->type = type_die; - tsr->kind = TSR_KIND_TYPE; - tsr->ok = true; - - pr_debug_dtp("mov [%x] this-cpu addr=%#"PRIx64" -> reg%d", - insn_offset, var_addr, dst->reg1); - pr_debug_type_name(&tsr->type, tsr->kind); - return; - } - - if (src->imm) { - tsr->kind = TSR_KIND_CONST; - tsr->imm_value = src->offset; - tsr->ok = true; - - pr_debug_dtp("mov [%x] imm=%#x -> reg%d\n", - insn_offset, tsr->imm_value, dst->reg1); - return; - } - - if (!has_reg_type(state, src->reg1) || - !state->regs[src->reg1].ok) { - tsr->ok = false; - return; - } - - tsr->type = state->regs[src->reg1].type; - tsr->kind = state->regs[src->reg1].kind; - tsr->ok = true; - - pr_debug_dtp("mov [%x] reg%d -> reg%d", - insn_offset, src->reg1, dst->reg1); - pr_debug_type_name(&tsr->type, tsr->kind); - } - /* Case 2. 
memory to register transers */ - if (src->mem_ref && !dst->mem_ref) { - int sreg = src->reg1; - - if (!has_reg_type(state, dst->reg1)) - return; - - tsr = &state->regs[dst->reg1]; - -retry: - /* Check stack variables with offset */ - if (sreg == fbreg) { - struct type_state_stack *stack; - int offset = src->offset - fboff; - - stack = find_stack_state(state, offset); - if (stack == NULL) { - tsr->ok = false; - return; - } else if (!stack->compound) { - tsr->type = stack->type; - tsr->kind = stack->kind; - tsr->ok = true; - } else if (die_get_member_type(&stack->type, - offset - stack->offset, - &type_die)) { - tsr->type = type_die; - tsr->kind = TSR_KIND_TYPE; - tsr->ok = true; - } else { - tsr->ok = false; - return; - } - - pr_debug_dtp("mov [%x] -%#x(stack) -> reg%d", - insn_offset, -offset, dst->reg1); - pr_debug_type_name(&tsr->type, tsr->kind); - } - /* And then dereference the pointer if it has one */ - else if (has_reg_type(state, sreg) && state->regs[sreg].ok && - state->regs[sreg].kind == TSR_KIND_TYPE && - die_deref_ptr_type(&state->regs[sreg].type, - src->offset, &type_die)) { - tsr->type = type_die; - tsr->kind = TSR_KIND_TYPE; - tsr->ok = true; - - pr_debug_dtp("mov [%x] %#x(reg%d) -> reg%d", - insn_offset, src->offset, sreg, dst->reg1); - pr_debug_type_name(&tsr->type, tsr->kind); - } - /* Or check if it's a global variable */ - else if (sreg == DWARF_REG_PC) { - struct map_symbol *ms = dloc->ms; - u64 ip = ms->sym->start + dl->al.offset; - u64 addr; - int offset; - - addr = annotate_calc_pcrel(ms, ip, src->offset, dl); - - if (!get_global_var_type(cu_die, dloc, ip, addr, &offset, - &type_die) || - !die_get_member_type(&type_die, offset, &type_die)) { - tsr->ok = false; - return; - } - - tsr->type = type_die; - tsr->kind = TSR_KIND_TYPE; - tsr->ok = true; - - pr_debug_dtp("mov [%x] global addr=%"PRIx64" -> reg%d", - insn_offset, addr, dst->reg1); - pr_debug_type_name(&type_die, tsr->kind); - } - /* And check percpu access with base register */ - else if (has_reg_type(state, sreg) && - state->regs[sreg].kind == TSR_KIND_PERCPU_BASE) { - u64 ip = dloc->ms->sym->start + dl->al.offset; - u64 var_addr = src->offset; - int offset; - - if (src->multi_regs) { - int reg2 = (sreg == src->reg1) ? src->reg2 : src->reg1; - - if (has_reg_type(state, reg2) && state->regs[reg2].ok && - state->regs[reg2].kind == TSR_KIND_CONST) - var_addr += state->regs[reg2].imm_value; - } - - /* - * In kernel, %gs points to a per-cpu region for the - * current CPU. Access with a constant offset should - * be treated as a global variable access. 
- */ - if (get_global_var_type(cu_die, dloc, ip, var_addr, - &offset, &type_die) && - die_get_member_type(&type_die, offset, &type_die)) { - tsr->type = type_die; - tsr->kind = TSR_KIND_TYPE; - tsr->ok = true; - - if (src->multi_regs) { - pr_debug_dtp("mov [%x] percpu %#x(reg%d,reg%d) -> reg%d", - insn_offset, src->offset, src->reg1, - src->reg2, dst->reg1); - } else { - pr_debug_dtp("mov [%x] percpu %#x(reg%d) -> reg%d", - insn_offset, src->offset, sreg, dst->reg1); - } - pr_debug_type_name(&tsr->type, tsr->kind); - } else { - tsr->ok = false; - } - } - /* And then dereference the calculated pointer if it has one */ - else if (has_reg_type(state, sreg) && state->regs[sreg].ok && - state->regs[sreg].kind == TSR_KIND_POINTER && - die_get_member_type(&state->regs[sreg].type, - src->offset, &type_die)) { - tsr->type = type_die; - tsr->kind = TSR_KIND_TYPE; - tsr->ok = true; - - pr_debug_dtp("mov [%x] pointer %#x(reg%d) -> reg%d", - insn_offset, src->offset, sreg, dst->reg1); - pr_debug_type_name(&tsr->type, tsr->kind); - } - /* Or try another register if any */ - else if (src->multi_regs && sreg == src->reg1 && - src->reg1 != src->reg2) { - sreg = src->reg2; - goto retry; - } - else { - int offset; - const char *var_name = NULL; - - /* it might be per-cpu variable (in kernel) access */ - if (src->offset < 0) { - if (get_global_var_info(dloc, (s64)src->offset, - &var_name, &offset) && - !strcmp(var_name, "__per_cpu_offset")) { - tsr->kind = TSR_KIND_PERCPU_BASE; - - pr_debug_dtp("mov [%x] percpu base reg%d\n", - insn_offset, dst->reg1); - } - } - - tsr->ok = false; - } - } - /* Case 3. register to memory transfers */ - if (!src->mem_ref && dst->mem_ref) { - if (!has_reg_type(state, src->reg1) || - !state->regs[src->reg1].ok) - return; - - /* Check stack variables with offset */ - if (dst->reg1 == fbreg) { - struct type_state_stack *stack; - int offset = dst->offset - fboff; - - tsr = &state->regs[src->reg1]; - - stack = find_stack_state(state, offset); - if (stack) { - /* - * The source register is likely to hold a type - * of member if it's a compound type. Do not - * update the stack variable type since we can - * get the member type later by using the - * die_get_member_type(). - */ - if (!stack->compound) - set_stack_state(stack, offset, tsr->kind, - &tsr->type); - } else { - findnew_stack_state(state, offset, tsr->kind, - &tsr->type); - } - - pr_debug_dtp("mov [%x] reg%d -> -%#x(stack)", - insn_offset, src->reg1, -offset); - pr_debug_type_name(&tsr->type, tsr->kind); - } - /* - * Ignore other transfers since it'd set a value in a struct - * and won't change the type. - */ - } - /* Case 4. memory to memory transfers (not handled for now) */ -} - /** * update_insn_state - Update type state for an instruction * @state: type state table @@ -1115,8 +740,8 @@ retry: static void update_insn_state(struct type_state *state, struct data_loc_info *dloc, Dwarf_Die *cu_die, struct disasm_line *dl) { - if (arch__is(dloc->arch, "x86")) - update_insn_state_x86(state, dloc, cu_die, dl); + if (dloc->arch->update_insn_state) + dloc->arch->update_insn_state(state, dloc, cu_die, dl); } /* diff --git a/tools/perf/util/annotate-data.h b/tools/perf/util/annotate-data.h index cdb5cd8960bb..6fe8ee8b8410 100644 --- a/tools/perf/util/annotate-data.h +++ b/tools/perf/util/annotate-data.h @@ -6,6 +6,7 @@ #include #include #include +#include "dwarf-regs.h" #include "annotate.h" #ifdef HAVE_DWARF_SUPPORT @@ -20,6 +21,14 @@ struct hist_entry; struct map_symbol; struct thread; +#define pr_debug_dtp(fmt, ...) 
\ +do { \ + if (debug_type_profile) \ + pr_info(fmt, ##__VA_ARGS__); \ + else \ + pr_debug3(fmt, ##__VA_ARGS__); \ +} while (0) + enum type_state_kind { TSR_KIND_INVALID = 0, TSR_KIND_TYPE, @@ -216,6 +225,20 @@ void global_var_type__tree_delete(struct rb_root *root); int hist_entry__annotate_data_tty(struct hist_entry *he, struct evsel *evsel); bool has_reg_type(struct type_state *state, int reg); +struct type_state_stack *findnew_stack_state(struct type_state *state, + int offset, u8 kind, + Dwarf_Die *type_die); +void set_stack_state(struct type_state_stack *stack, int offset, u8 kind, + Dwarf_Die *type_die); +struct type_state_stack *find_stack_state(struct type_state *state, + int offset); +bool get_global_var_type(Dwarf_Die *cu_die, struct data_loc_info *dloc, + u64 ip, u64 var_addr, int *var_offset, + Dwarf_Die *type_die); +bool get_global_var_info(struct data_loc_info *dloc, u64 addr, + const char **var_name, int *var_offset); +void pr_debug_type_name(Dwarf_Die *die, enum type_state_kind kind); + #else /* HAVE_DWARF_SUPPORT */ static inline struct annotated_data_type * diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index e10558b79504..931cd92dcc40 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -12,6 +12,7 @@ #include #include "annotate.h" +#include "annotate-data.h" #include "build-id.h" #include "debug.h" #include "disasm.h" @@ -145,6 +146,9 @@ static struct arch architectures[] = { .memory_ref_char = '(', .imm_char = '$', }, +#ifdef HAVE_DWARF_SUPPORT + .update_insn_state = update_insn_state_x86, +#endif }, { .name = "powerpc", diff --git a/tools/perf/util/disasm.h b/tools/perf/util/disasm.h index 3d381a043520..c835759c8e2b 100644 --- a/tools/perf/util/disasm.h +++ b/tools/perf/util/disasm.h @@ -4,11 +4,18 @@ #include "map_symbol.h" +#ifdef HAVE_DWARF_SUPPORT +#include "dwarf-aux.h" +#endif + struct annotation_options; struct disasm_line; struct ins; struct evsel; struct symbol; +struct data_loc_info; +struct type_state; +struct disasm_line; struct arch { const char *name; @@ -32,6 +39,11 @@ struct arch { char memory_ref_char; char imm_char; } objdump; +#ifdef HAVE_DWARF_SUPPORT + void (*update_insn_state)(struct type_state *state, + struct data_loc_info *dloc, Dwarf_Die *cu_die, + struct disasm_line *dl); +#endif }; struct ins { -- cgit v1.2.3 From b1d8d968a7983e9756de34b8bcaa24cc339828ed Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Thu, 18 Jul 2024 14:13:46 +0530 Subject: perf annotate: Update TYPE_STATE_MAX_REGS to include max of regs in powerpc TYPE_STATE_MAX_REGS is arch-dependent. Currently this is defined to be 16. While checking if reg is valid using has_reg_type, max value is checked using TYPE_STATE_MAX_REGS value. Define this conditionally for powerpc. 
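A simplified, self-contained sketch of how this bound interacts with the register validity check (struct type_state is reduced to the relevant array here):

	#include <stdbool.h>

	#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

	#ifdef __powerpc__
	#define TYPE_STATE_MAX_REGS 32	/* covers powerpc GPRs r0..r31 */
	#else
	#define TYPE_STATE_MAX_REGS 16
	#endif

	struct type_state { int regs[TYPE_STATE_MAX_REGS]; /* ... */ };

	static bool has_reg_type(struct type_state *state, int reg)
	{
		/* with only 16 entries, powerpc regs r16..r31 would be rejected */
		return (unsigned)reg < ARRAY_SIZE(state->regs);
	}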
Reviewed-by: Kajol Jain Reviewed-by: Namhyung Kim Signed-off-by: Athira Rajeev Tested-by: Kajol Jain Cc: Adrian Hunter Cc: Akanksha J N Cc: Christophe Leroy Cc: Disha Goel Cc: Hari Bathini Cc: Ian Rogers Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Segher Boessenkool Link: https://lore.kernel.org/lkml/20240718084358.72242-4-atrajeev@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.h b/tools/perf/util/annotate-data.h index 6fe8ee8b8410..992b7ce4bd11 100644 --- a/tools/perf/util/annotate-data.h +++ b/tools/perf/util/annotate-data.h @@ -189,7 +189,11 @@ struct type_state_stack { }; /* FIXME: This should be arch-dependent */ +#ifdef __powerpc__ +#define TYPE_STATE_MAX_REGS 32 +#else #define TYPE_STATE_MAX_REGS 16 +#endif /* * State table to maintain type info in each register and stack location. -- cgit v1.2.3 From 06dd4c5a561c48c66745352bae0b2c04bbe455be Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Thu, 18 Jul 2024 14:13:47 +0530 Subject: perf annotate: Add disasm_line__parse() to parse raw instruction for powerpc Currently, the perf tool infrastructure uses the disasm_line__parse function to parse disassembled line. Example snippet from objdump: objdump --start-address=
<start-addr> --stop-address=<stop-addr>
-d --no-show-raw-insn -C c0000000010224b4: lwz r10,0(r9) This line "lwz r10,0(r9)" is parsed to extract instruction name, registers names and offset. In powerpc, the approach for data type profiling uses raw instruction instead of result from objdump to identify the instruction category and extract the source/target registers. Example: 38 01 81 e8 ld r4,312(r1) Here "38 01 81 e8" is the raw instruction representation. Add function "disasm_line__parse_powerpc" to handle parsing of raw instruction. Also update "struct disasm_line" to save the binary code/ With the change, function captures: line -> "38 01 81 e8 ld r4,312(r1)" raw instruction "38 01 81 e8" Raw instruction is used later to extract the reg/offset fields. Macros are added to extract opcode and register fields. "struct disasm_line" is updated to carry union of "bytes" and "raw_insn" of 32 bit to carry raw code (raw). Function "disasm_line__parse_powerpc fills the raw instruction hex value and can use macros to get opcode. There is no changes in existing code paths, which parses the disassembled code. The size of raw instruction depends on architecture. In case of powerpc, the parsing the disasm line needs to handle cases for reading binary code directly from DSO as well as parsing the objdump result. Hence adding the logic into separate function instead of updating "disasm_line__parse". The architecture using the instruction name and present approach is not altered. Since this approach targets powerpc, the macro implementation is added for powerpc as of now. Since the disasm_line__parse is used in other cases (perf annotate) and not only data tye profiling, the powerpc callback includes changes to work with binary code as well as mnemonic representation. Also in case if the DSO read fails and libcapstone is not supported, the approach fallback to use objdump as option. Hence as option, patch has changes to ensure objdump option also works well. 
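A worked decode of the "38 01 81 e8" example using the opcode/register macros this patch adds to arch/powerpc/util/dwarf-regs.c (a standalone sketch for illustration, not part of the patch itself):

	#include <stdio.h>
	#include <stdint.h>

	#define PPC_OP(op) (((op) >> 26) & 0x3F)
	#define PPC_RT(t)  (((t) >> 21) & 0x1f)
	#define PPC_RA(a)  (((a) >> 16) & 0x1f)
	#define PPC_DS(DS) ((DS) & 0xfffc)
	#define OP_LD 58

	int main(void)
	{
		uint32_t insn = 0xe8810138;	/* "38 01 81 e8" byte-swapped */

		if (PPC_OP(insn) == OP_LD)	/* primary opcode 58: ld RT,DS(RA) */
			printf("ld r%u,%u(r%u)\n",
			       PPC_RT(insn), PPC_DS(insn), PPC_RA(insn));
		return 0;
	}

This prints "ld r4,312(r1)", matching the disassembly in the example above.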
Reviewed-by: Kajol Jain Reviewed-by: Namhyung Kim Signed-off-by: Athira Rajeev Tested-by: Kajol Jain Cc: Adrian Hunter Cc: Akanksha J N Cc: Christophe Leroy Cc: Disha Goel Cc: Hari Bathini Cc: Ian Rogers Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Segher Boessenkool Link: https://lore.kernel.org/lkml/20240718084358.72242-5-atrajeev@linux.vnet.ibm.com [ Add check for strndup() result ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/linux/string.h | 2 + tools/lib/string.c | 13 +++++++ tools/perf/arch/powerpc/annotate/instructions.c | 1 + tools/perf/arch/powerpc/util/dwarf-regs.c | 9 +++++ tools/perf/util/annotate.h | 5 ++- tools/perf/util/disasm.c | 51 ++++++++++++++++++++++++- 6 files changed, 79 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/include/linux/string.h b/tools/include/linux/string.h index db5c99318c79..0acb1fc14e19 100644 --- a/tools/include/linux/string.h +++ b/tools/include/linux/string.h @@ -46,5 +46,7 @@ extern char * __must_check skip_spaces(const char *); extern char *strim(char *); +extern void remove_spaces(char *s); + extern void *memchr_inv(const void *start, int c, size_t bytes); #endif /* _TOOLS_LINUX_STRING_H_ */ diff --git a/tools/lib/string.c b/tools/lib/string.c index 8b6892f959ab..3126d2cff716 100644 --- a/tools/lib/string.c +++ b/tools/lib/string.c @@ -153,6 +153,19 @@ char *strim(char *s) return skip_spaces(s); } +/* + * remove_spaces - Removes whitespaces from @s + */ +void remove_spaces(char *s) +{ + char *d = s; + + do { + while (*d == ' ') + ++d; + } while ((*s++ = *d++)); +} + /** * strreplace - Replace all occurrences of character in string. * @s: The string to operate on. diff --git a/tools/perf/arch/powerpc/annotate/instructions.c b/tools/perf/arch/powerpc/annotate/instructions.c index a3f423c27cae..d57fd023ef9c 100644 --- a/tools/perf/arch/powerpc/annotate/instructions.c +++ b/tools/perf/arch/powerpc/annotate/instructions.c @@ -55,6 +55,7 @@ static int powerpc__annotate_init(struct arch *arch, char *cpuid __maybe_unused) arch->initialized = true; arch->associate_instruction_ops = powerpc__associate_instruction_ops; arch->objdump.comment_char = '#'; + annotate_opts.show_asm_raw = true; } return 0; diff --git a/tools/perf/arch/powerpc/util/dwarf-regs.c b/tools/perf/arch/powerpc/util/dwarf-regs.c index 0c4f4caf53ac..430623ca5612 100644 --- a/tools/perf/arch/powerpc/util/dwarf-regs.c +++ b/tools/perf/arch/powerpc/util/dwarf-regs.c @@ -98,3 +98,12 @@ int regs_query_register_offset(const char *name) return roff->ptregs_offset; return -EINVAL; } + +#define PPC_OP(op) (((op) >> 26) & 0x3F) +#define PPC_RA(a) (((a) >> 16) & 0x1f) +#define PPC_RT(t) (((t) >> 21) & 0x1f) +#define PPC_RB(b) (((b) >> 11) & 0x1f) +#define PPC_D(D) ((D) & 0xfffe) +#define PPC_DS(DS) ((DS) & 0xfffc) +#define OP_LD 58 +#define OP_STD 62 diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index d5c821c22f79..9ba772f46270 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -113,7 +113,10 @@ struct annotation_line { struct disasm_line { struct ins ins; struct ins_operands ops; - + union { + u8 bytes[4]; + u32 raw_insn; + } raw; /* This needs to be at the end. 
*/ struct annotation_line al; }; diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 931cd92dcc40..6d4055b9b966 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -44,6 +44,7 @@ static int call__scnprintf(struct ins *ins, char *bf, size_t size, static void ins__sort(struct arch *arch); static int disasm_line__parse(char *line, const char **namep, char **rawp); +static int disasm_line__parse_powerpc(struct disasm_line *dl); static __attribute__((constructor)) void symbol__init_regexpr(void) { @@ -845,6 +846,51 @@ out: return -1; } +/* + * Parses the result captured from symbol__disassemble_* + * Example, line read from DSO file in powerpc: + * line: 38 01 81 e8 + * opcode: fetched from arch specific get_opcode_insn + * rawp_insn: e8810138 + * + * rawp_insn is used later to extract the reg/offset fields + */ +#define PPC_OP(op) (((op) >> 26) & 0x3F) +#define RAW_BYTES 11 + +static int disasm_line__parse_powerpc(struct disasm_line *dl) +{ + char *line = dl->al.line; + const char **namep = &dl->ins.name; + char **rawp = &dl->ops.raw; + char *tmp_raw_insn, *name_raw_insn = skip_spaces(line); + char *name = skip_spaces(name_raw_insn + RAW_BYTES); + int objdump = 0; + + if (strlen(line) > RAW_BYTES) + objdump = 1; + + if (name_raw_insn[0] == '\0') + return -1; + + if (objdump) { + disasm_line__parse(name, namep, rawp); + } else + *namep = ""; + + tmp_raw_insn = strndup(name_raw_insn, 11); + if (tmp_raw_insn == NULL) + return -1; + + remove_spaces(tmp_raw_insn); + + sscanf(tmp_raw_insn, "%x", &dl->raw.raw_insn); + if (objdump) + dl->raw.raw_insn = be32_to_cpu(dl->raw.raw_insn); + + return 0; +} + static void annotation_line__init(struct annotation_line *al, struct annotate_args *args, int nr) @@ -898,7 +944,10 @@ struct disasm_line *disasm_line__new(struct annotate_args *args) goto out_delete; if (args->offset != -1) { - if (disasm_line__parse(dl->al.line, &dl->ins.name, &dl->ops.raw) < 0) + if (arch__is(args->arch, "powerpc")) { + if (disasm_line__parse_powerpc(dl) < 0) + goto out_free_line; + } else if (disasm_line__parse(dl->al.line, &dl->ins.name, &dl->ops.raw) < 0) goto out_free_line; disasm_line__init_ins(dl, args->arch, &args->ms); -- cgit v1.2.3 From 0b971e6bf1c305843a034ca762718b9428a5dd7f Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Thu, 18 Jul 2024 14:13:48 +0530 Subject: perf annotate: Add support to capture and parse raw instruction in powerpc using dso__data_read_offset utility MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support to capture and parse raw instruction in powerpc. Currently, the perf tool infrastructure uses two ways to disassemble and understand the instruction. One is objdump and other option is via libcapstone. Currently, the perf tool infrastructure uses "--no-show-raw-insn" option with "objdump" while disassemble. Example from powerpc with this option for an instruction address is: Snippet from: objdump --start-address=
<start-addr> --stop-address=<stop-addr>
-d --no-show-raw-insn -C c0000000010224b4: lwz r10,0(r9) This line "lwz r10,0(r9)" is parsed to extract instruction name, registers names and offset. Also to find whether there is a memory reference in the operands, "memory_ref_char" field of objdump is used. For x86, "(" is used as memory_ref_char to tackle instructions of the form "mov (%rax), %rcx". In case of powerpc, not all instructions using "(" are the only memory instructions. Example, above instruction can also be of extended form (X form) "lwzx r10,0,r19". Inorder to easy identify the instruction category and extract the source/target registers, patch adds support to use raw instruction for powerpc. Approach used is to read the raw instruction directly from the DSO file using "dso__data_read_offset" utility which is already implemented in perf infrastructure in "util/dso.c". Example: 38 01 81 e8 ld r4,312(r1) Here "38 01 81 e8" is the raw instruction representation. In powerpc, this translates to instruction form: "ld RT,DS(RA)" and binary code as: | 58 | RT | RA | DS | | ------------------------------------- 0 6 11 16 30 31 Function "symbol__disassemble_dso" is updated to read raw instruction directly from DSO using dso__data_read_offset utility. In case of above example, this captures: line: 38 01 81 e8 The above works well when 'perf report' is invoked with only sort keys for data type ie type and typeoff. Because there is no instruction level annotation needed if only data type information is requested for. For annotating sample, along with type and typeoff sort key, "sym" sort key is also needed. And by default invoking just "perf report" uses sort key "sym" that displays the symbol information. With approach changes in powerpc which first reads DSO for raw instruction, "perf annotate" and "perf report" + a key breaks since it doesn't do the instruction level disassembly. Snippet of result from 'perf report': Samples: 1K of event 'mem-loads', 4000 Hz, Event count (approx.): 937238 do_work /usr/bin/pmlogger [Percent: local period] Percent│ ea230010 │ 3a550010 │ 3a600000 │ 38f60001 │ 39490008 │ 42400438 51.44 │ 81290008 │ 7d485378 Here, raw instruction is displayed in the output instead of human readable annotated form. One way to get the appropriate data is to specify "--objdump path", by which code annotation will be done. But the default behaviour will be changed. To fix this breakage, check if "sym" sort key is set. If so fallback and use the libcapstone/objdump way of disassmbling the sample. With the changes and "perf report" Samples: 1K of event 'mem-loads', 4000 Hz, Event count (approx.): 937238 do_work /usr/bin/pmlogger [Percent: local period] Percent│ ld r17,16(r3) │ addi r18,r21,16 │ li r19,0 │ 8b0: rldicl r10,r10,63,33 │ addi r10,r10,1 │ mtctr r10 │ ↓ b 8e4 │ 8c0: addi r7,r22,1 │ addi r10,r9,8 │ ↓ bdz d00 51.44 │ lwz r9,8(r9) │ mr r8,r10 │ cmpw r20,r9 Committer notes: Just add the extern for 'sort_order' in disasm.c so that we don't end up breaking the build due to this type colision with capstone and libbpf: In file included from /usr/include/capstone/capstone.h:325, from /git/perf-6.10.0/tools/perf/util/print_insn.h:23, from builtin-script.c:38: /usr/include/capstone/bpf.h:94:14: error: 'bpf_insn' defined as wrong kind of tag 94 | typedef enum bpf_insn { I reported this to the bpf mailing list, see one of the links below. 
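Condensed, the fallback decision added to symbol__disassemble() is (a sketch of the hunk further below, with explanatory comments added):

	if (arch__is(args->arch, "powerpc")) {
		extern const char *sort_order;

		/* type/typeoff sorting only needs the raw instruction;
		 * "sym" needs human readable disassembly, so in that
		 * case fall through to the capstone/objdump paths */
		if (sort_order && !strstr(sort_order, "sym")) {
			err = symbol__disassemble_raw(symfs_filename, sym, args);
			if (err == 0)
				goto out_remove_tmp;
		}
	}

So "perf report -s type,typeoff" takes the raw-read path, while a plain "perf report", whose default sort keys include "sym", still gets annotated disassembly.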
Reviewed-by: Kajol Jain Reviewed-by: Namhyung Kim Signed-off-by: Athira Rajeev Tested-by: Kajol Jain Cc: Adrian Hunter Cc: Akanksha J N Cc: Christophe Leroy Cc: Disha Goel Cc: Hari Bathini Cc: Ian Rogers Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Segher Boessenkool Link: https://lore.kernel.org/lkml/20240718084358.72242-6-atrajeev@linux.vnet.ibm.com Link: https://lore.kernel.org/bpf/ZqOltPk9VQGgJZAA@x1/T/#u Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/disasm.c | 102 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 6d4055b9b966..bab15cce612f 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -1637,6 +1637,91 @@ err: } #endif +static int symbol__disassemble_raw(char *filename, struct symbol *sym, + struct annotate_args *args) +{ + struct annotation *notes = symbol__annotation(sym); + struct map *map = args->ms.map; + struct dso *dso = map__dso(map); + u64 start = map__rip_2objdump(map, sym->start); + u64 end = map__rip_2objdump(map, sym->end); + u64 len = end - start; + u64 offset; + int i, count; + u8 *buf = NULL; + char disasm_buf[512]; + struct disasm_line *dl; + u32 *line; + + /* Return if objdump is specified explicitly */ + if (args->options->objdump_path) + return -1; + + pr_debug("Reading raw instruction from : %s using dso__data_read_offset\n", filename); + + buf = malloc(len); + if (buf == NULL) + goto err; + + count = dso__data_read_offset(dso, NULL, sym->start, buf, len); + + line = (u32 *)buf; + + if ((u64)count != len) + goto err; + + /* add the function address and name */ + scnprintf(disasm_buf, sizeof(disasm_buf), "%#"PRIx64" <%s>:", + start, sym->name); + + args->offset = -1; + args->line = disasm_buf; + args->line_nr = 0; + args->fileloc = NULL; + args->ms.sym = sym; + + dl = disasm_line__new(args); + if (dl == NULL) + goto err; + + annotation_line__add(&dl->al, ¬es->src->source); + + /* Each raw instruction is 4 byte */ + count = len/4; + + for (i = 0, offset = 0; i < count; i++) { + args->offset = offset; + sprintf(args->line, "%x", line[i]); + dl = disasm_line__new(args); + if (dl == NULL) + goto err; + + annotation_line__add(&dl->al, ¬es->src->source); + offset += 4; + } + + /* It failed in the middle */ + if (offset != len) { + struct list_head *list = ¬es->src->source; + + /* Discard all lines and fallback to objdump */ + while (!list_empty(list)) { + dl = list_first_entry(list, struct disasm_line, al.node); + + list_del_init(&dl->al.node); + disasm_line__free(dl); + } + count = -1; + } + +out: + free(buf); + return count < 0 ? count : 0; + +err: + count = -1; + goto out; +} /* * Possibly create a new version of line with tabs expanded. Returns the * existing or new line, storage is updated if a new line is allocated. If @@ -1761,6 +1846,23 @@ int symbol__disassemble(struct symbol *sym, struct annotate_args *args) strcpy(symfs_filename, tmp); } + /* + * For powerpc data type profiling, use the dso__data_read_offset + * to read raw instruction directly and interpret the binary code + * to understand instructions and register fields. For sort keys as + * type and typeoff, disassemble to mnemonic notation is + * not required in case of powerpc. 
+ */ + if (arch__is(args->arch, "powerpc")) { + extern const char *sort_order; + + if (sort_order && !strstr(sort_order, "sym")) { + err = symbol__disassemble_raw(symfs_filename, sym, args); + if (err == 0) + goto out_remove_tmp; + } + } + #ifdef HAVE_LIBCAPSTONE_SUPPORT err = symbol__disassemble_capstone(symfs_filename, sym, args); if (err == 0) -- cgit v1.2.3 From 1b4406d2a88cf65d615b0bcb42525d7a4723eb70 Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Thu, 18 Jul 2024 14:13:49 +0530 Subject: perf annotate: Update parameters for reg extract functions to use raw instruction on powerpc Use the raw instruction code and macros to identify memory instructions, extract register fields and also offset. The implementation addresses the D-form, X-form, DS-form instructions. Adds "mem_ref" field to check whether source/target has memory reference. Add function "get_powerpc_regs" which will set these fields: reg1, reg2, offset depending of where it is source or target ops. Update "parse" callback for "struct ins_ops" to also pass "struct disasm_line" as argument. This is needed in parse functions where opcode is used to determine whether to set multi_regs and other fields Reviewed-by: Kajol Jain Reviewed-by: Namhyung Kim Signed-off-by: Athira Rajeev Tested-by: Kajol Jain Cc: Adrian Hunter Cc: Akanksha J N Cc: Christophe Leroy Cc: Disha Goel Cc: Hari Bathini Cc: Ian Rogers Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Segher Boessenkool Link: https://lore.kernel.org/lkml/20240718084358.72242-7-atrajeev@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm64/annotate/instructions.c | 3 +- tools/perf/arch/loongarch/annotate/instructions.c | 6 ++-- tools/perf/arch/powerpc/util/dwarf-regs.c | 44 +++++++++++++++++++++++ tools/perf/arch/s390/annotate/instructions.c | 5 +-- tools/perf/util/annotate.c | 19 ++++++++-- tools/perf/util/disasm.c | 19 ++++++---- tools/perf/util/disasm.h | 5 ++- tools/perf/util/include/dwarf-regs.h | 11 ++++++ 8 files changed, 96 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/arm64/annotate/instructions.c b/tools/perf/arch/arm64/annotate/instructions.c index 4af0c3a0f86e..f86d9f4798bd 100644 --- a/tools/perf/arch/arm64/annotate/instructions.c +++ b/tools/perf/arch/arm64/annotate/instructions.c @@ -11,7 +11,8 @@ struct arm64_annotate { static int arm64_mov__parse(struct arch *arch __maybe_unused, struct ins_operands *ops, - struct map_symbol *ms __maybe_unused) + struct map_symbol *ms __maybe_unused, + struct disasm_line *dl __maybe_unused) { char *s = strchr(ops->raw, ','), *target, *endptr; diff --git a/tools/perf/arch/loongarch/annotate/instructions.c b/tools/perf/arch/loongarch/annotate/instructions.c index 21cc7e4149f7..ab43b1ab51e3 100644 --- a/tools/perf/arch/loongarch/annotate/instructions.c +++ b/tools/perf/arch/loongarch/annotate/instructions.c @@ -5,7 +5,8 @@ * Copyright (C) 2020-2023 Loongson Technology Corporation Limited */ -static int loongarch_call__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms) +static int loongarch_call__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms, + struct disasm_line *dl __maybe_unused) { char *c, *endptr, *tok, *name; struct map *map = ms->map; @@ -51,7 +52,8 @@ static struct ins_ops loongarch_call_ops = { .scnprintf = call__scnprintf, }; -static int loongarch_jump__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms) +static int loongarch_jump__parse(struct arch *arch, struct ins_operands *ops, struct 
map_symbol *ms, + struct disasm_line *dl __maybe_unused) { struct map *map = ms->map; struct symbol *sym = ms->sym; diff --git a/tools/perf/arch/powerpc/util/dwarf-regs.c b/tools/perf/arch/powerpc/util/dwarf-regs.c index 430623ca5612..104c7ae5c433 100644 --- a/tools/perf/arch/powerpc/util/dwarf-regs.c +++ b/tools/perf/arch/powerpc/util/dwarf-regs.c @@ -107,3 +107,47 @@ int regs_query_register_offset(const char *name) #define PPC_DS(DS) ((DS) & 0xfffc) #define OP_LD 58 #define OP_STD 62 + +static int get_source_reg(u32 raw_insn) +{ + return PPC_RA(raw_insn); +} + +static int get_target_reg(u32 raw_insn) +{ + return PPC_RT(raw_insn); +} + +static int get_offset_opcode(u32 raw_insn) +{ + int opcode = PPC_OP(raw_insn); + + /* DS- form */ + if ((opcode == OP_LD) || (opcode == OP_STD)) + return PPC_DS(raw_insn); + else + return PPC_D(raw_insn); +} + +/* + * Fills the required fields for op_loc depending on if it + * is a source or target. + * D form: ins RT,D(RA) -> src_reg1 = RA, offset = D, dst_reg1 = RT + * DS form: ins RT,DS(RA) -> src_reg1 = RA, offset = DS, dst_reg1 = RT + * X form: ins RT,RA,RB -> src_reg1 = RA, src_reg2 = RB, dst_reg1 = RT + */ +void get_powerpc_regs(u32 raw_insn, int is_source, + struct annotated_op_loc *op_loc) +{ + if (is_source) + op_loc->reg1 = get_source_reg(raw_insn); + else + op_loc->reg1 = get_target_reg(raw_insn); + + if (op_loc->multi_regs) + op_loc->reg2 = PPC_RB(raw_insn); + + /* TODO: Implement offset handling for X Form */ + if ((op_loc->mem_ref) && (PPC_OP(raw_insn) != 31)) + op_loc->offset = get_offset_opcode(raw_insn); +} diff --git a/tools/perf/arch/s390/annotate/instructions.c b/tools/perf/arch/s390/annotate/instructions.c index da5aa3e1f04c..eeac25cca699 100644 --- a/tools/perf/arch/s390/annotate/instructions.c +++ b/tools/perf/arch/s390/annotate/instructions.c @@ -2,7 +2,7 @@ #include static int s390_call__parse(struct arch *arch, struct ins_operands *ops, - struct map_symbol *ms) + struct map_symbol *ms, struct disasm_line *dl __maybe_unused) { char *endptr, *tok, *name; struct map *map = ms->map; @@ -52,7 +52,8 @@ static struct ins_ops s390_call_ops = { static int s390_mov__parse(struct arch *arch __maybe_unused, struct ins_operands *ops, - struct map_symbol *ms __maybe_unused) + struct map_symbol *ms __maybe_unused, + struct disasm_line *dl __maybe_unused) { char *s = strchr(ops->raw, ','), *target, *endptr; diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 1451caf25e77..ce99db291c5e 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -2123,20 +2123,33 @@ int annotate_get_insn_location(struct arch *arch, struct disasm_line *dl, for_each_insn_op_loc(loc, i, op_loc) { const char *insn_str = ops->source.raw; bool multi_regs = ops->source.multi_regs; + bool mem_ref = ops->source.mem_ref; if (i == INSN_OP_TARGET) { insn_str = ops->target.raw; multi_regs = ops->target.multi_regs; + mem_ref = ops->target.mem_ref; } /* Invalidate the register by default */ op_loc->reg1 = -1; op_loc->reg2 = -1; - if (insn_str == NULL) - continue; + if (insn_str == NULL) { + if (!arch__is(arch, "powerpc")) + continue; + } - if (strchr(insn_str, arch->objdump.memory_ref_char)) { + /* + * For powerpc, call get_powerpc_regs function which extracts the + * required fields for op_loc, ie reg1, reg2, offset from the + * raw instruction. 
+ */ + if (arch__is(arch, "powerpc")) { + op_loc->mem_ref = mem_ref; + op_loc->multi_regs = multi_regs; + get_powerpc_regs(dl->raw.raw_insn, !i, op_loc); + } else if (strchr(insn_str, arch->objdump.memory_ref_char)) { op_loc->mem_ref = true; op_loc->multi_regs = multi_regs; extract_reg_offset(arch, insn_str, op_loc); diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index bab15cce612f..3178dc9bedca 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -255,7 +255,8 @@ bool ins__is_fused(struct arch *arch, const char *ins1, const char *ins2) return arch->ins_is_fused(arch, ins1, ins2); } -static int call__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms) +static int call__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms, + struct disasm_line *dl __maybe_unused) { char *endptr, *tok, *name; struct map *map = ms->map; @@ -350,7 +351,8 @@ static inline const char *validate_comma(const char *c, struct ins_operands *ops return c; } -static int jump__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms) +static int jump__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms, + struct disasm_line *dl __maybe_unused) { struct map *map = ms->map; struct symbol *sym = ms->sym; @@ -509,7 +511,8 @@ static int comment__symbol(char *raw, char *comment, u64 *addrp, char **namep) return 0; } -static int lock__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms) +static int lock__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms, + struct disasm_line *dl __maybe_unused) { ops->locked.ops = zalloc(sizeof(*ops->locked.ops)); if (ops->locked.ops == NULL) @@ -524,7 +527,7 @@ static int lock__parse(struct arch *arch, struct ins_operands *ops, struct map_s goto out_free_ops; if (ops->locked.ins.ops->parse && - ops->locked.ins.ops->parse(arch, ops->locked.ops, ms) < 0) + ops->locked.ins.ops->parse(arch, ops->locked.ops, ms, NULL) < 0) goto out_free_ops; return 0; @@ -595,7 +598,8 @@ static bool check_multi_regs(struct arch *arch, const char *op) return count > 1; } -static int mov__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms __maybe_unused) +static int mov__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms __maybe_unused, + struct disasm_line *dl __maybe_unused) { char *s = strchr(ops->raw, ','), *target, *comment, prev; @@ -673,7 +677,8 @@ static struct ins_ops mov_ops = { .scnprintf = mov__scnprintf, }; -static int dec__parse(struct arch *arch __maybe_unused, struct ins_operands *ops, struct map_symbol *ms __maybe_unused) +static int dec__parse(struct arch *arch __maybe_unused, struct ins_operands *ops, struct map_symbol *ms __maybe_unused, + struct disasm_line *dl __maybe_unused) { char *target, *comment, *s, prev; @@ -814,7 +819,7 @@ static void disasm_line__init_ins(struct disasm_line *dl, struct arch *arch, str if (!dl->ins.ops) return; - if (dl->ins.ops->parse && dl->ins.ops->parse(arch, &dl->ops, ms) < 0) + if (dl->ins.ops->parse && dl->ins.ops->parse(arch, &dl->ops, ms, dl) < 0) dl->ins.ops = NULL; } diff --git a/tools/perf/util/disasm.h b/tools/perf/util/disasm.h index c835759c8e2b..30be0a94ea04 100644 --- a/tools/perf/util/disasm.h +++ b/tools/perf/util/disasm.h @@ -62,6 +62,7 @@ struct ins_operands { bool offset_avail; bool outside; bool multi_regs; + bool mem_ref; } target; union { struct { @@ -69,6 +70,7 @@ struct ins_operands { char *name; u64 addr; bool multi_regs; + bool mem_ref; } 
source; struct { struct ins ins; @@ -83,7 +85,8 @@ struct ins_operands { struct ins_ops { void (*free)(struct ins_operands *ops); - int (*parse)(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms); + int (*parse)(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms, + struct disasm_line *dl); int (*scnprintf)(struct ins *ins, char *bf, size_t size, struct ins_operands *ops, int max_ins_name); }; diff --git a/tools/perf/util/include/dwarf-regs.h b/tools/perf/util/include/dwarf-regs.h index 01fb25a1150a..75b28dcc8317 100644 --- a/tools/perf/util/include/dwarf-regs.h +++ b/tools/perf/util/include/dwarf-regs.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _PERF_DWARF_REGS_H_ #define _PERF_DWARF_REGS_H_ +#include "annotate.h" #define DWARF_REG_PC 0xd3af9c /* random number */ #define DWARF_REG_FB 0xd3affb /* random number */ @@ -31,6 +32,16 @@ static inline int get_dwarf_regnum(const char *name __maybe_unused, } #endif +#if !defined(__powerpc__) || !defined(HAVE_DWARF_SUPPORT) +static inline void get_powerpc_regs(u32 raw_insn __maybe_unused, int is_source __maybe_unused, + struct annotated_op_loc *op_loc __maybe_unused) +{ + return; +} +#else +void get_powerpc_regs(u32 raw_insn, int is_source, struct annotated_op_loc *op_loc); +#endif + #ifdef HAVE_ARCH_REGS_QUERY_REGISTER_OFFSET /* * Arch should support fetching the offset of a register in pt_regs -- cgit v1.2.3 From 1acdad68183ab870fca1e63b64f32601be612611 Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Thu, 18 Jul 2024 14:13:50 +0530 Subject: perf annotate: Add parse function for memory instructions in powerpc Use the raw instruction code and macros to identify memory instructions, extract register fields and also offset. The implementation addresses the D-form, X-form, DS-form instructions. Two main functions are added. New parse function "load_store__parse" as instruction ops parser for memory instructions. Unlike other parsers (like mov__parse), this one fills in the "multi_regs" field for source/target and new added "mem_ref" field. No other fields are set because, here there is no need to parse the disassembled code and arch specific macros will take care of extracting offset and regs which is easier and will be precise. In powerpc, all instructions with a primary opcode from 32 to 63 are memory instructions. Update "ins__find" function to have "raw_insn" also as a parameter. 
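The opcode test is a single bit check on the 6-bit primary opcode field: values 32..63 are exactly those with bit 0x20 set. A standalone sketch ("is_ppc_mem_opcode" is an illustrative stand-in for the check_ppc_insn() test added below):

	#include <stdio.h>
	#include <stdint.h>

	#define PPC_OP(op) (((op) >> 26) & 0x3F)

	/* opcodes 32..63 (binary 1xxxxx) have bit 0x20 set */
	static int is_ppc_mem_opcode(uint32_t raw_insn)
	{
		return !!(PPC_OP(raw_insn) & 0x20);
	}

	int main(void)
	{
		printf("%d\n", is_ppc_mem_opcode(0xe8810138));	/* ld, opcode 58 -> 1 */
		printf("%d\n", is_ppc_mem_opcode(0x7d40982a));	/* ldx, opcode 31 -> 0 */
		return 0;
	}

Note that X-form memory instructions such as ldx carry primary opcode 31, so they fail this test; the next patch in the series handles those through an extended-opcode table.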
Reviewed-by: Kajol Jain Reviewed-by: Namhyung Kim Signed-off-by: Athira Rajeev Tested-by: Kajol Jain Cc: Adrian Hunter Cc: Akanksha J N Cc: Christophe Leroy Cc: Disha Goel Cc: Hari Bathini Cc: Ian Rogers Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Segher Boessenkool Link: https://lore.kernel.org/lkml/20240718084358.72242-8-atrajeev@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/powerpc/annotate/instructions.c | 16 ++++++++ tools/perf/util/disasm.c | 54 ++++++++++++++++++++++--- tools/perf/util/disasm.h | 2 +- 3 files changed, 66 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/powerpc/annotate/instructions.c b/tools/perf/arch/powerpc/annotate/instructions.c index d57fd023ef9c..b084423d8477 100644 --- a/tools/perf/arch/powerpc/annotate/instructions.c +++ b/tools/perf/arch/powerpc/annotate/instructions.c @@ -49,6 +49,22 @@ static struct ins_ops *powerpc__associate_instruction_ops(struct arch *arch, con return ops; } +#define PPC_OP(op) (((op) >> 26) & 0x3F) + +static struct ins_ops *check_ppc_insn(u32 raw_insn) +{ + int opcode = PPC_OP(raw_insn); + + /* + * Instructions with opcode 32 to 63 are memory + * instructions in powerpc + */ + if ((opcode & 0x20)) + return &load_store_ops; + + return NULL; +} + static int powerpc__annotate_init(struct arch *arch, char *cpuid __maybe_unused) { if (!arch->initialized) { diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 3178dc9bedca..0afa1847f1c5 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -36,6 +36,7 @@ static struct ins_ops mov_ops; static struct ins_ops nop_ops; static struct ins_ops lock_ops; static struct ins_ops ret_ops; +static struct ins_ops load_store_ops; static int jump__scnprintf(struct ins *ins, char *bf, size_t size, struct ins_operands *ops, int max_ins_name); @@ -521,7 +522,7 @@ static int lock__parse(struct arch *arch, struct ins_operands *ops, struct map_s if (disasm_line__parse(ops->raw, &ops->locked.ins.name, &ops->locked.ops->raw) < 0) goto out_free_ops; - ops->locked.ins.ops = ins__find(arch, ops->locked.ins.name); + ops->locked.ins.ops = ins__find(arch, ops->locked.ins.name, 0); if (ops->locked.ins.ops == NULL) goto out_free_ops; @@ -677,6 +678,37 @@ static struct ins_ops mov_ops = { .scnprintf = mov__scnprintf, }; +static int load_store__scnprintf(struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name) +{ + return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, + ops->raw); +} + +/* + * Sets the fields: multi_regs and "mem_ref". + * "mem_ref" is set for ops->source which is later used to + * fill the objdump->memory_ref-char field. 
This ops is currently + * used by powerpc and since binary instruction code is used to + * extract opcode, regs and offset, no other parsing is needed here + */ +static int load_store__parse(struct arch *arch __maybe_unused, struct ins_operands *ops, + struct map_symbol *ms __maybe_unused, struct disasm_line *dl __maybe_unused) +{ + ops->source.mem_ref = true; + ops->source.multi_regs = false; + + ops->target.mem_ref = false; + ops->target.multi_regs = false; + + return 0; +} + +static struct ins_ops load_store_ops = { + .parse = load_store__parse, + .scnprintf = load_store__scnprintf, +}; + static int dec__parse(struct arch *arch __maybe_unused, struct ins_operands *ops, struct map_symbol *ms __maybe_unused, struct disasm_line *dl __maybe_unused) { @@ -768,11 +800,23 @@ static void ins__sort(struct arch *arch) qsort(arch->instructions, nmemb, sizeof(struct ins), ins__cmp); } -static struct ins_ops *__ins__find(struct arch *arch, const char *name) +static struct ins_ops *__ins__find(struct arch *arch, const char *name, u32 raw_insn) { struct ins *ins; const int nmemb = arch->nr_instructions; + if (arch__is(arch, "powerpc")) { + /* + * For powerpc, identify the instruction ops + * from the opcode using raw_insn. + */ + struct ins_ops *ops; + + ops = check_ppc_insn(raw_insn); + if (ops) + return ops; + } + if (!arch->sorted_instructions) { ins__sort(arch); arch->sorted_instructions = true; @@ -802,9 +846,9 @@ static struct ins_ops *__ins__find(struct arch *arch, const char *name) return ins ? ins->ops : NULL; } -struct ins_ops *ins__find(struct arch *arch, const char *name) +struct ins_ops *ins__find(struct arch *arch, const char *name, u32 raw_insn) { - struct ins_ops *ops = __ins__find(arch, name); + struct ins_ops *ops = __ins__find(arch, name, raw_insn); if (!ops && arch->associate_instruction_ops) ops = arch->associate_instruction_ops(arch, name); @@ -814,7 +858,7 @@ struct ins_ops *ins__find(struct arch *arch, const char *name) static void disasm_line__init_ins(struct disasm_line *dl, struct arch *arch, struct map_symbol *ms) { - dl->ins.ops = ins__find(arch, dl->ins.name); + dl->ins.ops = ins__find(arch, dl->ins.name, dl->raw.raw_insn); if (!dl->ins.ops) return; diff --git a/tools/perf/util/disasm.h b/tools/perf/util/disasm.h index 30be0a94ea04..c1bb1e484bfb 100644 --- a/tools/perf/util/disasm.h +++ b/tools/perf/util/disasm.h @@ -105,7 +105,7 @@ struct annotate_args { struct arch *arch__find(const char *name); bool arch__is(struct arch *arch, const char *name); -struct ins_ops *ins__find(struct arch *arch, const char *name); +struct ins_ops *ins__find(struct arch *arch, const char *name, u32 raw_insn); int ins__scnprintf(struct ins *ins, char *bf, size_t size, struct ins_operands *ops, int max_ins_name); -- cgit v1.2.3 From ace7d681d82d8ac6cc1c9646b19ab4cca7be0d90 Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Thu, 18 Jul 2024 14:13:51 +0530 Subject: perf annotate: Add support to identify memory instructions of opcode 31 in powerpc There are memory instructions in powerpc with opcode as 31. Example: "ldx RT,RA,RB" , Its X form is as below: ______________________________________ | 31 | RT | RA | RB | 21 |/| -------------------------------------- 0 6 11 16 21 30 31 The opcode for "ldx" is 31. There are other instructions also with opcode 31 which are memory insn like ldux, stbx, lwzx, lhaux But all instructions with opcode 31 are not memory. Example is add instruction: "add RT,RA,RB" The value in bit 21-30 [ 21 for ldx ] is different for these instructions. 
Patch uses this value to assign instruction ops for these cases. The naming convention and value to identify these are picked from defines in "arch/powerpc/include/asm/ppc-opcode.h" Reviewed-by: Kajol Jain Reviewed-by: Namhyung Kim Signed-off-by: Athira Rajeev Tested-by: Kajol Jain Cc: Adrian Hunter Cc: Akanksha J N Cc: Christophe Leroy Cc: Disha Goel Cc: Hari Bathini Cc: Ian Rogers Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Segher Boessenkool Link: https://lore.kernel.org/lkml/20240718084358.72242-9-atrajeev@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/powerpc/annotate/instructions.c | 107 +++++++++++++++++++++++- tools/perf/util/disasm.c | 3 + 2 files changed, 108 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/powerpc/annotate/instructions.c b/tools/perf/arch/powerpc/annotate/instructions.c index b084423d8477..1ffb64c6bd0d 100644 --- a/tools/perf/arch/powerpc/annotate/instructions.c +++ b/tools/perf/arch/powerpc/annotate/instructions.c @@ -49,18 +49,121 @@ static struct ins_ops *powerpc__associate_instruction_ops(struct arch *arch, con return ops; } -#define PPC_OP(op) (((op) >> 26) & 0x3F) +#define PPC_OP(op) (((op) >> 26) & 0x3F) +#define PPC_21_30(R) (((R) >> 1) & 0x3ff) + +struct insn_offset { + const char *name; + int value; +}; + +/* + * There are memory instructions with opcode 31 which are + * of X Form, Example: + * ldx RT,RA,RB + * ______________________________________ + * | 31 | RT | RA | RB | 21 |/| + * -------------------------------------- + * 0 6 11 16 21 30 31 + * + * But all instructions with opcode 31 are not memory. + * Example: add RT,RA,RB + * + * Use bits 21 to 30 to check memory insns with 31 as opcode. + * In ins_array below, for ldx instruction: + * name => OP_31_XOP_LDX + * value => 21 + */ + +static struct insn_offset ins_array[] = { + { .name = "OP_31_XOP_LXSIWZX", .value = 12, }, + { .name = "OP_31_XOP_LWARX", .value = 20, }, + { .name = "OP_31_XOP_LDX", .value = 21, }, + { .name = "OP_31_XOP_LWZX", .value = 23, }, + { .name = "OP_31_XOP_LDUX", .value = 53, }, + { .name = "OP_31_XOP_LWZUX", .value = 55, }, + { .name = "OP_31_XOP_LXSIWAX", .value = 76, }, + { .name = "OP_31_XOP_LDARX", .value = 84, }, + { .name = "OP_31_XOP_LBZX", .value = 87, }, + { .name = "OP_31_XOP_LVX", .value = 103, }, + { .name = "OP_31_XOP_LBZUX", .value = 119, }, + { .name = "OP_31_XOP_STXSIWX", .value = 140, }, + { .name = "OP_31_XOP_STDX", .value = 149, }, + { .name = "OP_31_XOP_STWX", .value = 151, }, + { .name = "OP_31_XOP_STDUX", .value = 181, }, + { .name = "OP_31_XOP_STWUX", .value = 183, }, + { .name = "OP_31_XOP_STBX", .value = 215, }, + { .name = "OP_31_XOP_STVX", .value = 231, }, + { .name = "OP_31_XOP_STBUX", .value = 247, }, + { .name = "OP_31_XOP_LHZX", .value = 279, }, + { .name = "OP_31_XOP_LHZUX", .value = 311, }, + { .name = "OP_31_XOP_LXVDSX", .value = 332, }, + { .name = "OP_31_XOP_LWAX", .value = 341, }, + { .name = "OP_31_XOP_LHAX", .value = 343, }, + { .name = "OP_31_XOP_LWAUX", .value = 373, }, + { .name = "OP_31_XOP_LHAUX", .value = 375, }, + { .name = "OP_31_XOP_STHX", .value = 407, }, + { .name = "OP_31_XOP_STHUX", .value = 439, }, + { .name = "OP_31_XOP_LXSSPX", .value = 524, }, + { .name = "OP_31_XOP_LDBRX", .value = 532, }, + { .name = "OP_31_XOP_LSWX", .value = 533, }, + { .name = "OP_31_XOP_LWBRX", .value = 534, }, + { .name = "OP_31_XOP_LFSUX", .value = 567, }, + { .name = "OP_31_XOP_LXSDX", .value = 588, }, + { .name = "OP_31_XOP_LSWI", .value = 597, }, + { .name = "OP_31_XOP_LFDX", 
.value = 599, }, + { .name = "OP_31_XOP_LFDUX", .value = 631, }, + { .name = "OP_31_XOP_STXSSPX", .value = 652, }, + { .name = "OP_31_XOP_STDBRX", .value = 660, }, + { .name = "OP_31_XOP_STXWX", .value = 661, }, + { .name = "OP_31_XOP_STWBRX", .value = 662, }, + { .name = "OP_31_XOP_STFSX", .value = 663, }, + { .name = "OP_31_XOP_STFSUX", .value = 695, }, + { .name = "OP_31_XOP_STXSDX", .value = 716, }, + { .name = "OP_31_XOP_STSWI", .value = 725, }, + { .name = "OP_31_XOP_STFDX", .value = 727, }, + { .name = "OP_31_XOP_STFDUX", .value = 759, }, + { .name = "OP_31_XOP_LXVW4X", .value = 780, }, + { .name = "OP_31_XOP_LHBRX", .value = 790, }, + { .name = "OP_31_XOP_LXVD2X", .value = 844, }, + { .name = "OP_31_XOP_LFIWAX", .value = 855, }, + { .name = "OP_31_XOP_LFIWZX", .value = 887, }, + { .name = "OP_31_XOP_STXVW4X", .value = 908, }, + { .name = "OP_31_XOP_STHBRX", .value = 918, }, + { .name = "OP_31_XOP_STXVD2X", .value = 972, }, + { .name = "OP_31_XOP_STFIWX", .value = 983, }, +}; + +static int cmp_offset(const void *a, const void *b) +{ + const struct insn_offset *val1 = a; + const struct insn_offset *val2 = b; + + return (val1->value - val2->value); +} static struct ins_ops *check_ppc_insn(u32 raw_insn) { int opcode = PPC_OP(raw_insn); + int mem_insn_31 = PPC_21_30(raw_insn); + struct insn_offset *ret; + struct insn_offset mem_insns_31_opcode = { + "OP_31_INSN", + mem_insn_31 + }; /* * Instructions with opcode 32 to 63 are memory * instructions in powerpc */ - if ((opcode & 0x20)) + if ((opcode & 0x20)) { return &load_store_ops; + } else if (opcode == 31) { + /* Check for memory instructions with opcode 31 */ + ret = bsearch(&mem_insns_31_opcode, ins_array, ARRAY_SIZE(ins_array), sizeof(ins_array[0]), cmp_offset); + if (ret != NULL) + return &load_store_ops; + } return NULL; } diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 0afa1847f1c5..e36be7b7c625 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -697,6 +697,9 @@ static int load_store__parse(struct arch *arch __maybe_unused, struct ins_operan { ops->source.mem_ref = true; ops->source.multi_regs = false; + /* opcode 31 is of X form */ + if (PPC_OP(dl->raw.raw_insn) == 31) + ops->source.multi_regs = true; ops->target.mem_ref = false; ops->target.multi_regs = false; -- cgit v1.2.3 From cd0b6f67c4ab1aecdfedb277c42880fcffe75ace Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Thu, 18 Jul 2024 14:13:52 +0530 Subject: perf annotate: Add some of the arithmetic instructions to support instruction tracking in powerpc Data-type profiling has the concept of instruction tracking. Example sequence in powerpc: ld r10,264(r3) mr r31,r3 <...> ld r9,312(r31) or differently lwz r10,264(r3) add r31, r3, RB lwz r9, 0(r31) If a sample is hit at "lwz r9, 0(r31)", the data type of r31 depends on the previous instruction sequence. So to track the previous instructions, the patch adds changes to identify some of the arithmetic instructions which have opcode 31. Since memory instructions also have cases with opcode 31, use bits 22:30 to filter the arithmetic instructions here. Also there are instructions with just two operands like "addme", "addze".
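Why the filter uses bits 22:30 rather than 21-30 for arithmetic can be seen with the overflow-enable (OE) bit, which XO-form carries at bit 21. A standalone illustration (not from the patch; the encodings are standard powerpc and the macros mirror the PPC_21_30/PPC_22_30 definitions):

	#include <stdio.h>
	#include <stdint.h>

	#define PPC_21_30(R)	(((R) >> 1) & 0x3ff)
	#define PPC_22_30(R)	(((R) >> 1) & 0x1ff)

	int main(void)
	{
		uint32_t add  = 0x7FE31A14;	/* add  r31,r3,r3 (OE=0) */
		uint32_t addo = 0x7FE31E14;	/* addo r31,r3,r3 (OE=1) */

		/* bits 21-30 include OE, so add and addo look different... */
		printf("bits 21-30: add=%u addo=%u\n", PPC_21_30(add), PPC_21_30(addo));
		/* ...while bits 22:30 mask OE off: one value per XO opcode */
		printf("bits 22:30: add=%u addo=%u\n", PPC_22_30(add), PPC_22_30(addo));
		return 0;	/* prints 266/778, then 266/266 */
	}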
This patch adds a new instruction ops, "arithmetic_ops", to handle these. Reviewed-by: Kajol Jain Reviewed-by: Namhyung Kim Signed-off-by: Athira Rajeev Tested-by: Kajol Jain Cc: Adrian Hunter Cc: Akanksha J N Cc: Christophe Leroy Cc: Disha Goel Cc: Hari Bathini Cc: Ian Rogers Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Segher Boessenkool Link: https://lore.kernel.org/lkml/20240718084358.72242-10-atrajeev@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/powerpc/annotate/instructions.c | 49 ++++++++++++++++++++++++ tools/perf/util/disasm.c | 51 +++++++++++++++++++++++++ 2 files changed, 100 insertions(+) (limited to 'tools') diff --git a/tools/perf/arch/powerpc/annotate/instructions.c b/tools/perf/arch/powerpc/annotate/instructions.c index 1ffb64c6bd0d..aa5ee09fa28f 100644 --- a/tools/perf/arch/powerpc/annotate/instructions.c +++ b/tools/perf/arch/powerpc/annotate/instructions.c @@ -51,6 +51,7 @@ static struct ins_ops *powerpc__associate_instruction_ops(struct arch *arch, con #define PPC_OP(op) (((op) >> 26) & 0x3F) #define PPC_21_30(R) (((R) >> 1) & 0x3ff) +#define PPC_22_30(R) (((R) >> 1) & 0x1ff) struct insn_offset { const char *name; @@ -134,6 +135,44 @@ static struct insn_offset ins_array[] = { { .name = "OP_31_XOP_STFIWX", .value = 983, }, }; +/* + * Arithmetic instructions which are having opcode as 31. + * These instructions are tracked to save the register state + * changes. Example: + * + * lwz r10,264(r3) + * add r31, r3, r3 + * lwz r9, 0(r31) + * + * Here instruction tracking needs to identify the "add" + * instruction and save data type of r3 to r31. If a sample + * is hit at next "lwz r9, 0(r31)", by this instruction tracking, + * data type of r31 can be resolved. + */ +static struct insn_offset arithmetic_ins_op_31[] = { + { .name = "SUB_CARRY_XO_FORM", .value = 8, }, + { .name = "MUL_HDW_XO_FORM1", .value = 9, }, + { .name = "ADD_CARRY_XO_FORM", .value = 10, }, + { .name = "MUL_HW_XO_FORM1", .value = 11, }, + { .name = "SUB_XO_FORM", .value = 40, }, + { .name = "MUL_HDW_XO_FORM", .value = 73, }, + { .name = "MUL_HW_XO_FORM", .value = 75, }, + { .name = "SUB_EXT_XO_FORM", .value = 136, }, + { .name = "ADD_EXT_XO_FORM", .value = 138, }, + { .name = "SUB_ZERO_EXT_XO_FORM", .value = 200, }, + { .name = "ADD_ZERO_EXT_XO_FORM", .value = 202, }, + { .name = "SUB_EXT_XO_FORM2", .value = 232, }, + { .name = "MUL_DW_XO_FORM", .value = 233, }, + { .name = "ADD_EXT_XO_FORM2", .value = 234, }, + { .name = "MUL_W_XO_FORM", .value = 235, }, + { .name = "ADD_XO_FORM", .value = 266, }, + { .name = "DIV_DW_XO_FORM1", .value = 457, }, + { .name = "DIV_W_XO_FORM1", .value = 459, }, + { .name = "DIV_DW_XO_FORM", .value = 489, }, + { .name = "DIV_W_XO_FORM", .value = 491, }, +}; + + static int cmp_offset(const void *a, const void *b) { const struct insn_offset *val1 = a; @@ -163,6 +202,16 @@ static struct ins_ops *check_ppc_insn(u32 raw_insn) ret = bsearch(&mem_insns_31_opcode, ins_array, ARRAY_SIZE(ins_array), sizeof(ins_array[0]), cmp_offset); if (ret != NULL) return &load_store_ops; + else { + mem_insns_31_opcode.value = PPC_22_30(raw_insn); + ret = bsearch(&mem_insns_31_opcode, arithmetic_ins_op_31, ARRAY_SIZE(arithmetic_ins_op_31), + sizeof(arithmetic_ins_op_31[0]), cmp_offset); + if (ret != NULL) + return &arithmetic_ops; + /* Bits 21 to 30 has value 444 for "mr" insn ie, OR X form */ + if (PPC_21_30(raw_insn) == 444) + return &arithmetic_ops; + } } return NULL; diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index e36be7b7c625..65915e1f575a 100644
--- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -37,6 +37,7 @@ static struct ins_ops nop_ops; static struct ins_ops lock_ops; static struct ins_ops ret_ops; static struct ins_ops load_store_ops; +static struct ins_ops arithmetic_ops; static int jump__scnprintf(struct ins *ins, char *bf, size_t size, struct ins_operands *ops, int max_ins_name); @@ -678,6 +679,56 @@ static struct ins_ops mov_ops = { .scnprintf = mov__scnprintf, }; +#define PPC_22_30(R) (((R) >> 1) & 0x1ff) +#define MINUS_EXT_XO_FORM 234 +#define SUB_EXT_XO_FORM 232 +#define ADD_ZERO_EXT_XO_FORM 202 +#define SUB_ZERO_EXT_XO_FORM 200 + +static int arithmetic__scnprintf(struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name) +{ + return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, + ops->raw); +} + +/* + * Sets the fields: multi_regs and "mem_ref". + * "mem_ref" is set for ops->source which is later used to + * fill the objdump->memory_ref-char field. This ops is currently + * used by powerpc and since binary instruction code is used to + * extract opcode, regs and offset, no other parsing is needed here. + * + * Dont set multi regs for 4 cases since it has only one operand + * for source: + * - Add to Minus One Extended XO-form ( Ex: addme, addmeo ) + * - Subtract From Minus One Extended XO-form ( Ex: subfme ) + * - Add to Zero Extended XO-form ( Ex: addze, addzeo ) + * - Subtract From Zero Extended XO-form ( Ex: subfze ) + */ +static int arithmetic__parse(struct arch *arch __maybe_unused, struct ins_operands *ops, + struct map_symbol *ms __maybe_unused, struct disasm_line *dl) +{ + int opcode = PPC_OP(dl->raw.raw_insn); + + ops->source.mem_ref = false; + if (opcode == 31) { + if ((opcode != MINUS_EXT_XO_FORM) && (opcode != SUB_EXT_XO_FORM) \ + && (opcode != ADD_ZERO_EXT_XO_FORM) && (opcode != SUB_ZERO_EXT_XO_FORM)) + ops->source.multi_regs = true; + } + + ops->target.mem_ref = false; + ops->target.multi_regs = false; + + return 0; +} + +static struct ins_ops arithmetic_ops = { + .parse = arithmetic__parse, + .scnprintf = arithmetic__scnprintf, +}; + static int load_store__scnprintf(struct ins *ins, char *bf, size_t size, struct ins_operands *ops, int max_ins_name) { -- cgit v1.2.3 From 539bfea3e09c8e7a773b0fc4f6a4b26d921d63ef Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Thu, 18 Jul 2024 14:13:53 +0530 Subject: perf annotate: Add more instructions for instruction tracking Add few more instructions and use opcode as search key to find if it is supported by the architecture. 
The added ones are: addi, addic, addic., addis, subfic and mulli Reviewed-by: Kajol Jain Reviewed-by: Namhyung Kim Signed-off-by: Athira Rajeev Tested-by: Kajol Jain Cc: Adrian Hunter Cc: Akanksha J N Cc: Christophe Leroy Cc: Disha Goel Cc: Hari Bathini Cc: Ian Rogers Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Segher Boessenkool Link: https://lore.kernel.org/lkml/20240718084358.72242-11-atrajeev@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/powerpc/annotate/instructions.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'tools') diff --git a/tools/perf/arch/powerpc/annotate/instructions.c b/tools/perf/arch/powerpc/annotate/instructions.c index aa5ee09fa28f..aa25a336d8d0 100644 --- a/tools/perf/arch/powerpc/annotate/instructions.c +++ b/tools/perf/arch/powerpc/annotate/instructions.c @@ -172,6 +172,14 @@ static struct insn_offset arithmetic_ins_op_31[] = { { .name = "DIV_W_XO_FORM", .value = 491, }, }; +static struct insn_offset arithmetic_two_ops[] = { + { .name = "mulli", .value = 7, }, + { .name = "subfic", .value = 8, }, + { .name = "addic", .value = 12, }, + { .name = "addic.", .value = 13, }, + { .name = "addi", .value = 14, }, + { .name = "addis", .value = 15, }, +}; static int cmp_offset(const void *a, const void *b) { @@ -212,6 +220,12 @@ static struct ins_ops *check_ppc_insn(u32 raw_insn) if (PPC_21_30(raw_insn) == 444) return &arithmetic_ops; } + } else { + mem_insns_31_opcode.value = opcode; + ret = bsearch(&mem_insns_31_opcode, arithmetic_two_ops, ARRAY_SIZE(arithmetic_two_ops), + sizeof(arithmetic_two_ops[0]), cmp_offset); + if (ret != NULL) + return &arithmetic_ops; } return NULL; -- cgit v1.2.3 From 88444952bdfe27fcc91b1c499a0f77675db592a9 Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Thu, 18 Jul 2024 14:13:54 +0530 Subject: perf annotate: Update instruction tracking for powerpc Add instruction tracking function "update_insn_state_powerpc" for powerpc. Example sequence in powerpc: ld r10,264(r3) mr r31,r3 <...> ld r9,312(r31) Consider that the sample is pointing to: "ld r9,312(r31)". Here the memory reference is hit at "312(r31)" where 312 is the offset and r31 is the source register. The previous instruction sequence shows that the register state of r3 is moved to r31. So to identify the data type for the r31 access, the previous instruction ("mr") needs to be tracked and the type state entry has to be updated. Current instruction tracking support in the perf tools infrastructure is specific to x86. The patch adds this support for powerpc as well.
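To picture what the tracking does, here is a toy model (purely illustrative; these names are invented for the sketch and are not the perf internals, which live in type_state and annotated_op_loc):

	#include <stdio.h>

	/* one entry per GPR: which type the register is known to hold */
	struct toy_reg_state {
		const char *type;	/* e.g. a struct name resolved from DWARF */
		int ok;			/* entry valid? */
	};

	/* "mr rD,rS" copies the known state of rS to rD */
	static void toy_track_mr(struct toy_reg_state regs[32], int dst, int src)
	{
		regs[dst] = regs[src];
	}

	int main(void)
	{
		struct toy_reg_state regs[32] = { { 0 } };

		/* assume earlier analysis already typed r3 from its DWARF location */
		regs[3] = (struct toy_reg_state){ "struct foo *", 1 };
		toy_track_mr(regs, 31, 3);	/* mr r31,r3 */

		/* a later sample on "ld r9,312(r31)" can now be resolved */
		if (regs[31].ok)
			printf("r31 holds %s; look up the member at offset 312\n",
			       regs[31].type);
		return 0;
	}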
Reviewed-by: Kajol Jain Reviewed-by: Namhyung Kim Signed-off-by: Athira Rajeev Tested-by: Kajol Jain Cc: Adrian Hunter Cc: Akanksha J N Cc: Christophe Leroy Cc: Disha Goel Cc: Hari Bathini Cc: Ian Rogers Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Segher Boessenkool Link: https://lore.kernel.org/lkml/20240718084358.72242-12-atrajeev@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/powerpc/annotate/instructions.c | 59 +++++++++++++++++++++++++ tools/perf/util/annotate-data.c | 9 +++- tools/perf/util/disasm.c | 3 ++ 3 files changed, 70 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/arch/powerpc/annotate/instructions.c b/tools/perf/arch/powerpc/annotate/instructions.c index aa25a336d8d0..af1032572bf3 100644 --- a/tools/perf/arch/powerpc/annotate/instructions.c +++ b/tools/perf/arch/powerpc/annotate/instructions.c @@ -231,6 +231,65 @@ static struct ins_ops *check_ppc_insn(u32 raw_insn) return NULL; } +/* + * Instruction tracking function to track register state moves. + * Example sequence: + * ld r10,264(r3) + * mr r31,r3 + * < + * ld r9,312(r31) + * + * Previous instruction sequence shows that register state of r3 + * is moved to r31. update_insn_state_powerpc tracks these state + * changes + */ +#ifdef HAVE_DWARF_SUPPORT +static void update_insn_state_powerpc(struct type_state *state, + struct data_loc_info *dloc, Dwarf_Die * cu_die __maybe_unused, + struct disasm_line *dl) +{ + struct annotated_insn_loc loc; + struct annotated_op_loc *src = &loc.ops[INSN_OP_SOURCE]; + struct annotated_op_loc *dst = &loc.ops[INSN_OP_TARGET]; + struct type_state_reg *tsr; + u32 insn_offset = dl->al.offset; + + if (annotate_get_insn_location(dloc->arch, dl, &loc) < 0) + return; + + /* + * Value 444 for bits 21:30 is for "mr" + * instruction. "mr" is extended OR. So set the + * source and destination reg correctly + */ + if (PPC_21_30(dl->raw.raw_insn) == 444) { + int src_reg = src->reg1; + + src->reg1 = dst->reg1; + dst->reg1 = src_reg; + } + + if (!has_reg_type(state, dst->reg1)) + return; + + tsr = &state->regs[dst->reg1]; + + if (!has_reg_type(state, src->reg1) || + !state->regs[src->reg1].ok) { + tsr->ok = false; + return; + } + + tsr->type = state->regs[src->reg1].type; + tsr->kind = state->regs[src->reg1].kind; + tsr->ok = true; + + pr_debug_dtp("mov [%x] reg%d -> reg%d", + insn_offset, src->reg1, dst->reg1); + pr_debug_type_name(&tsr->type, tsr->kind); +} +#endif /* HAVE_DWARF_SUPPORT */ + static int powerpc__annotate_init(struct arch *arch, char *cpuid __maybe_unused) { if (!arch->initialized) { diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 7a48c3d72b89..734acdd8c4b7 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -1080,6 +1080,13 @@ out: return ret; } +static int arch_supports_insn_tracking(struct data_loc_info *dloc) +{ + if ((arch__is(dloc->arch, "x86")) || (arch__is(dloc->arch, "powerpc"))) + return 1; + return 0; +} + /* * Construct a list of basic blocks for each scope with variables and try to find * the data type by updating a type state table through instructions. 
@@ -1094,7 +1101,7 @@ static int find_data_type_block(struct data_loc_info *dloc, int ret = -1; /* TODO: other architecture support */ - if (!arch__is(dloc->arch, "x86")) + if (!arch_supports_insn_tracking(dloc)) return -1; prev_dst_ip = dst_ip = dloc->ip; diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 65915e1f575a..3d25443e1162 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -156,6 +156,9 @@ static struct arch architectures[] = { { .name = "powerpc", .init = powerpc__annotate_init, +#ifdef HAVE_DWARF_SUPPORT + .update_insn_state = update_insn_state_powerpc, +#endif }, { .name = "riscv64", -- cgit v1.2.3 From 1fe86bc245abd1aded01403675d6610455794b5f Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Thu, 18 Jul 2024 14:13:55 +0530 Subject: perf annotate: Make capstone_init non-static so that it can be used during symbol disassemble symbol__disassemble_capstone in util/disasm.c calls the function open_capstone_handle to open/init capstone. We already have a capstone_init function in "util/print_insn.c". But capstone_init is defined as a static function in util/print_insn.c. Change this and also add the function declaration in print_insn.h. The open_capstone_handle checks the disassembler_style option from annotation_options to decide whether to set CS_OPT_SYNTAX_ATT. Add that logic to capstone_init also, and by default set it to true. Reviewed-by: Kajol Jain Reviewed-by: Namhyung Kim Signed-off-by: Athira Rajeev Tested-by: Kajol Jain Cc: Adrian Hunter Cc: Akanksha J N Cc: Christophe Leroy Cc: Disha Goel Cc: Hari Bathini Cc: Ian Rogers Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Segher Boessenkool Link: https://lore.kernel.org/lkml/20240718084358.72242-13-atrajeev@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/print_insn.c | 12 +++++++++--- tools/perf/util/print_insn.h | 5 +++++ 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/print_insn.c b/tools/perf/util/print_insn.c index a950e9157d2d..a76aae81d7a0 100644 --- a/tools/perf/util/print_insn.c +++ b/tools/perf/util/print_insn.c @@ -32,7 +32,7 @@ size_t sample__fprintf_insn_raw(struct perf_sample *sample, FILE *fp) #ifdef HAVE_LIBCAPSTONE_SUPPORT #include <capstone/capstone.h> -static int capstone_init(struct machine *machine, csh *cs_handle, bool is64) +int capstone_init(struct machine *machine, csh *cs_handle, bool is64, bool disassembler_style) { cs_arch arch; cs_mode mode; @@ -62,7 +62,13 @@ static int capstone_init(struct machine *machine, csh *cs_handle, bool is64) } if (machine__normalized_is(machine, "x86")) { - cs_option(*cs_handle, CS_OPT_SYNTAX, CS_OPT_SYNTAX_ATT); + /* + * In case of using capstone_init while symbol__disassemble + * setting CS_OPT_SYNTAX_ATT depends if disassembler_style opts + * is set via annotation args + */ + if (disassembler_style) + cs_option(*cs_handle, CS_OPT_SYNTAX, CS_OPT_SYNTAX_ATT); /* * Resolving address operands to symbols is implemented * on x86 by investigating instruction details. @@ -122,7 +128,7 @@ ssize_t fprintf_insn_asm(struct machine *machine, struct thread *thread, u8 cpum int ret; /* TODO: Try to initiate capstone only once but need a proper place.
*/ - ret = capstone_init(machine, &cs_handle, is64bit); + ret = capstone_init(machine, &cs_handle, is64bit, true); if (ret < 0) return ret; diff --git a/tools/perf/util/print_insn.h b/tools/perf/util/print_insn.h index 07d11af3fc1c..2c8ee41c4a5d 100644 --- a/tools/perf/util/print_insn.h +++ b/tools/perf/util/print_insn.h @@ -19,4 +19,9 @@ ssize_t fprintf_insn_asm(struct machine *machine, struct thread *thread, u8 cpum bool is64bit, const uint8_t *code, size_t code_size, uint64_t ip, int *lenp, int print_opts, FILE *fp); +#ifdef HAVE_LIBCAPSTONE_SUPPORT +#include <capstone/capstone.h> +int capstone_init(struct machine *machine, csh *cs_handle, bool is64, bool disassembler_style); +#endif + #endif /* PERF_PRINT_INSN_H */ -- cgit v1.2.3 From f1e9347c855de73d8df1b0fa763184a46548d62f Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Thu, 18 Jul 2024 14:13:56 +0530 Subject: perf annotate: Use capstone_init and remove open_capstone_handle from disasm.c capstone_init is made available for all archs to use and updated to enable support for CS_ARCH_PPC as well. The patch removes open_capstone_handle and uses capstone_init in all places. Committer notes: Avoid including capstone/capstone.h from print_insn.h to not break the build in builtin-script.c due to the namespace clash with libbpf: /usr/include/capstone/bpf.h:94:14: error: 'bpf_insn' defined as wrong kind of tag Reviewed-by: Kajol Jain Reviewed-by: Namhyung Kim Signed-off-by: Athira Rajeev Tested-by: Kajol Jain Cc: Adrian Hunter Cc: Akanksha J N Cc: Christophe Leroy Cc: Disha Goel Cc: Hari Bathini Cc: Ian Rogers Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Segher Boessenkool Link: https://lore.kernel.org/lkml/20240718084358.72242-14-atrajeev@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/disasm.c | 2 ++ tools/perf/util/print_insn.c | 2 ++ tools/perf/util/print_insn.h | 5 ----- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 3d25443e1162..7a11edefe1bd 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -1512,6 +1512,8 @@ symbol__disassemble_bpf_image(struct symbol *sym, #ifdef HAVE_LIBCAPSTONE_SUPPORT #include <capstone/capstone.h> +int capstone_init(struct machine *machine, csh *cs_handle, bool is64, bool disassembler_style); + static int open_capstone_handle(struct annotate_args *args, bool is_64bit, csh *handle) { diff --git a/tools/perf/util/print_insn.c b/tools/perf/util/print_insn.c index a76aae81d7a0..a33a7726422d 100644 --- a/tools/perf/util/print_insn.c +++ b/tools/perf/util/print_insn.c @@ -32,6 +32,8 @@ size_t sample__fprintf_insn_raw(struct perf_sample *sample, FILE *fp) #ifdef HAVE_LIBCAPSTONE_SUPPORT #include <capstone/capstone.h> +int capstone_init(struct machine *machine, csh *cs_handle, bool is64, bool disassembler_style); + int capstone_init(struct machine *machine, csh *cs_handle, bool is64, bool disassembler_style) { cs_arch arch; diff --git a/tools/perf/util/print_insn.h b/tools/perf/util/print_insn.h index 2c8ee41c4a5d..07d11af3fc1c 100644 --- a/tools/perf/util/print_insn.h +++ b/tools/perf/util/print_insn.h @@ -19,9 +19,4 @@ ssize_t fprintf_insn_asm(struct machine *machine, struct thread *thread, u8 cpum bool is64bit, const uint8_t *code, size_t code_size, uint64_t ip, int *lenp, int print_opts, FILE *fp); -#ifdef HAVE_LIBCAPSTONE_SUPPORT -#include <capstone/capstone.h> -int capstone_init(struct machine *machine, csh *cs_handle, bool is64, bool disassembler_style); -#endif - #endif /* PERF_PRINT_INSN_H */ -- cgit v1.2.3 From c5d60de1813a9e09ea6d94f82da287d0fa1e1179
Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Thu, 18 Jul 2024 14:13:57 +0530 Subject: perf annotate: Add support to use libcapstone in powerpc Now perf uses the capstone library to disassemble the instructions in x86. capstone is used (if available) to speed up perf annotate. Currently it only supports the x86 architecture. This patch includes changes to enable this in powerpc. For now, this method is used only for data type sort keys, and only the binary code (raw instruction) is read. This is because the powerpc approach to understanding instructions and reg fields uses the raw instruction. The "cs_disasm" is currently not enabled. While attempting to do cs_disasm, the observation was that some of the instructions were not identified (ex: extswsli, maddld) and it had to fall back to objdump. Hence enabling "cs_disasm" is noted in the comment section as a TODO for powerpc. Reviewed-by: Kajol Jain Reviewed-by: Namhyung Kim Signed-off-by: Athira Rajeev Tested-by: Kajol Jain Cc: Adrian Hunter Cc: Akanksha J N Cc: Christophe Leroy Cc: Disha Goel Cc: Hari Bathini Cc: Ian Rogers Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Segher Boessenkool Link: https://lore.kernel.org/lkml/20240718084358.72242-15-atrajeev@linux.vnet.ibm.com [ Use dso__nsinfo(dso) as required to match EXTRA_CFLAGS=-DREFCNT_CHECKING=1 build expectations ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/disasm.c | 143 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 143 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 7a11edefe1bd..6e7a3db9f330 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -1614,6 +1614,144 @@ static void print_capstone_detail(cs_insn *insn, char *buf, size_t len, } } +static int symbol__disassemble_capstone_powerpc(char *filename, struct symbol *sym, + struct annotate_args *args) +{ + struct annotation *notes = symbol__annotation(sym); + struct map *map = args->ms.map; + struct dso *dso = map__dso(map); + struct nscookie nsc; + u64 start = map__rip_2objdump(map, sym->start); + u64 end = map__rip_2objdump(map, sym->end); + u64 len = end - start; + u64 offset; + int i, fd, count; + bool is_64bit = false; + bool needs_cs_close = false; + u8 *buf = NULL; + struct find_file_offset_data data = { + .ip = start, + }; + csh handle; + char disasm_buf[512]; + struct disasm_line *dl; + u32 *line; + bool disassembler_style = false; + + if (args->options->objdump_path) + return -1; + + nsinfo__mountns_enter(dso__nsinfo(dso), &nsc); + fd = open(filename, O_RDONLY); + nsinfo__mountns_exit(&nsc); + if (fd < 0) + return -1; + + if (file__read_maps(fd, /*exe=*/true, find_file_offset, &data, + &is_64bit) == 0) + goto err; + + if (!args->options->disassembler_style || + !strcmp(args->options->disassembler_style, "att")) + disassembler_style = true; + + if (capstone_init(maps__machine(args->ms.maps), &handle, is_64bit, disassembler_style) < 0) + goto err; + + needs_cs_close = true; + + buf = malloc(len); + if (buf == NULL) + goto err; + + count = pread(fd, buf, len, data.offset); + close(fd); + fd = -1; + + if ((u64)count != len) + goto err; + + line = (u32 *)buf; + + /* add the function address and name */ + scnprintf(disasm_buf, sizeof(disasm_buf), "%#"PRIx64" <%s>:", + start, sym->name); + + args->offset = -1; + args->line = disasm_buf; + args->line_nr = 0; + args->fileloc = NULL; + args->ms.sym = sym; + + dl = disasm_line__new(args); + if (dl == NULL) + goto err; + + annotation_line__add(&dl->al, &notes->src->source); + + /* + * TODO:
enable disassm for powerpc * count = cs_disasm(handle, buf, len, start, len, &insn); * + * For now, only binary code is saved in disassembled line + * to be used in "type" and "typeoff" sort keys. Each raw code + * is 32 bit instruction. So use "len/4" to get the number of + * entries. + */ + count = len/4; + + for (i = 0, offset = 0; i < count; i++) { + args->offset = offset; + sprintf(args->line, "%x", line[i]); + + dl = disasm_line__new(args); + if (dl == NULL) + goto err; + + annotation_line__add(&dl->al, &notes->src->source); + + offset += 4; + } + + /* It failed in the middle */ + if (offset != len) { + struct list_head *list = &notes->src->source; + + /* Discard all lines and fallback to objdump */ + while (!list_empty(list)) { + dl = list_first_entry(list, struct disasm_line, al.node); + + list_del_init(&dl->al.node); + disasm_line__free(dl); + } + count = -1; + } + +out: + if (needs_cs_close) + cs_close(&handle); + free(buf); + return count < 0 ? count : 0; + +err: + if (fd >= 0) + close(fd); + if (needs_cs_close) { + struct disasm_line *tmp; + + /* + * It probably failed in the middle of the above loop. + * Release any resources it might add. + */ + list_for_each_entry_safe(dl, tmp, &notes->src->source, al.node) { + list_del(&dl->al.node); + free(dl); + } + } + count = -1; + goto out; +} + static int symbol__disassemble_capstone(char *filename, struct symbol *sym, struct annotate_args *args) { @@ -1968,6 +2106,11 @@ int symbol__disassemble(struct symbol *sym, struct annotate_args *args) err = symbol__disassemble_raw(symfs_filename, sym, args); if (err == 0) goto out_remove_tmp; +#ifdef HAVE_LIBCAPSTONE_SUPPORT + err = symbol__disassemble_capstone_powerpc(symfs_filename, sym, args); + if (err == 0) + goto out_remove_tmp; +#endif } } -- cgit v1.2.3 From 2c9db7475e5de5125b5043fe8f560284b248b0ae Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Thu, 18 Jul 2024 14:13:58 +0530 Subject: perf annotate: Set instruction name to be used with insn-stat when using raw instruction Since the "ins.name" is not set while using the raw instruction, 'perf annotate' with insn-stat gives wrong data:

Result from "./perf annotate --data-type --insn-stat":

  Annotate Instruction stats
  total 615, ok 419 (68.1%), bad 196 (31.9%)

  Name       :  Good   Bad
  -----------------------------------------------------------
             :   419   196

This patch sets "dl->ins.name" in the arch specific function "check_ppc_insn" while initialising "struct disasm_line". Also update the "ins__find" function to pass "struct disasm_line" as a parameter so as to set its name field in the arch specific call.
With the patch changes:

  Annotate Instruction stats
  total 609, ok 446 (73.2%), bad 163 (26.8%)

  Name/opcode          :  Good   Bad
  -----------------------------------------------------------
  58                   :   323    80
  32                   :    49    43
  34                   :    33    11
  OP_31_XOP_LDX        :     8    20
  40                   :    23     0
  OP_31_XOP_LWARX      :     5     1
  OP_31_XOP_LWZX       :     2     3
  OP_31_XOP_LDARX      :     3     0
  33                   :     0     2
  OP_31_XOP_LBZX       :     0     1
  OP_31_XOP_LWAX       :     0     1
  OP_31_XOP_LHZX       :     0     1

Reviewed-by: Kajol Jain Reviewed-by: Namhyung Kim Signed-off-by: Athira Rajeev Tested-by: Kajol Jain Cc: Adrian Hunter Cc: Akanksha J N Cc: Christophe Leroy Cc: Disha Goel Cc: Hari Bathini Cc: Ian Rogers Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Segher Boessenkool Link: https://lore.kernel.org/lkml/20240718084358.72242-16-atrajeev@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/powerpc/annotate/instructions.c | 18 +++++++++++++++--- tools/perf/builtin-annotate.c | 4 ++-- tools/perf/util/annotate.c | 2 +- tools/perf/util/disasm.c | 10 +++++----- tools/perf/util/disasm.h | 2 +- 5 files changed, 24 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/powerpc/annotate/instructions.c b/tools/perf/arch/powerpc/annotate/instructions.c index af1032572bf3..ede9eeade0ab 100644 --- a/tools/perf/arch/powerpc/annotate/instructions.c +++ b/tools/perf/arch/powerpc/annotate/instructions.c @@ -189,8 +189,9 @@ static int cmp_offset(const void *a, const void *b) return (val1->value - val2->value); } -static struct ins_ops *check_ppc_insn(u32 raw_insn) +static struct ins_ops *check_ppc_insn(struct disasm_line *dl) { + int raw_insn = dl->raw.raw_insn; int opcode = PPC_OP(raw_insn); int mem_insn_31 = PPC_21_30(raw_insn); struct insn_offset *ret; @@ -198,19 +199,30 @@ static struct ins_ops *check_ppc_insn(u32 raw_insn) "OP_31_INSN", mem_insn_31 }; + char name_insn[32]; /* * Instructions with opcode 32 to 63 are memory * instructions in powerpc */ if ((opcode & 0x20)) { + /* + * Set name in case of raw instruction to + * opcode to be used in insn-stat + */ + if (!strlen(dl->ins.name)) { + sprintf(name_insn, "%d", opcode); + dl->ins.name = strdup(name_insn); + } return &load_store_ops; } else if (opcode == 31) { /* Check for memory instructions with opcode 31 */ ret = bsearch(&mem_insns_31_opcode, ins_array, ARRAY_SIZE(ins_array), sizeof(ins_array[0]), cmp_offset); - if (ret != NULL) + if (ret) { + if (!strlen(dl->ins.name)) + dl->ins.name = strdup(ret->name); return &load_store_ops; - else { + } else { mem_insns_31_opcode.value = PPC_22_30(raw_insn); ret = bsearch(&mem_insns_31_opcode, arithmetic_ins_op_31, ARRAY_SIZE(arithmetic_ins_op_31), sizeof(arithmetic_ins_op_31[0]), cmp_offset); diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index b10b7f005658..cf60392b1c19 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -396,10 +396,10 @@ static void print_annotate_item_stat(struct list_head *head, const char *title) printf("total %d, ok %d (%.1f%%), bad %d (%.1f%%)\n\n", total, total_good, 100.0 * total_good / (total ?: 1), total_bad, 100.0 * total_bad / (total ?: 1)); - printf(" %-10s: %5s %5s\n", "Name", "Good", "Bad"); + printf(" %-20s: %5s %5s\n", "Name/opcode", "Good", "Bad"); printf("-----------------------------------------------------------\n"); list_for_each_entry(istat, head, list) - printf(" %-10s: %5d %5d\n", istat->name, istat->good, istat->bad); + printf(" %-20s: %5d %5d\n", istat->name, istat->good, istat->bad); printf("\n"); } diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index
ce99db291c5e..a2ee4074f768 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -2229,7 +2229,7 @@ static struct annotated_item_stat *annotate_data_stat(struct list_head *head, return NULL; istat->name = strdup(name); - if (istat->name == NULL) { + if ((istat->name == NULL) || (!strlen(istat->name))) { free(istat); return NULL; } diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 6e7a3db9f330..410e52cd9cfd 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -857,7 +857,7 @@ static void ins__sort(struct arch *arch) qsort(arch->instructions, nmemb, sizeof(struct ins), ins__cmp); } -static struct ins_ops *__ins__find(struct arch *arch, const char *name, u32 raw_insn) +static struct ins_ops *__ins__find(struct arch *arch, const char *name, struct disasm_line *dl) { struct ins *ins; const int nmemb = arch->nr_instructions; @@ -869,7 +869,7 @@ static struct ins_ops *__ins__find(struct arch *arch, const char *name, u32 raw_ */ struct ins_ops *ops; - ops = check_ppc_insn(raw_insn); + ops = check_ppc_insn(dl); if (ops) return ops; } @@ -903,9 +903,9 @@ static struct ins_ops *__ins__find(struct arch *arch, const char *name, u32 raw_ return ins ? ins->ops : NULL; } -struct ins_ops *ins__find(struct arch *arch, const char *name, u32 raw_insn) +struct ins_ops *ins__find(struct arch *arch, const char *name, struct disasm_line *dl) { - struct ins_ops *ops = __ins__find(arch, name, raw_insn); + struct ins_ops *ops = __ins__find(arch, name, dl); if (!ops && arch->associate_instruction_ops) ops = arch->associate_instruction_ops(arch, name); @@ -915,7 +915,7 @@ struct ins_ops *ins__find(struct arch *arch, const char *name, u32 raw_insn) static void disasm_line__init_ins(struct disasm_line *dl, struct arch *arch, struct map_symbol *ms) { - dl->ins.ops = ins__find(arch, dl->ins.name, dl->raw.raw_insn); + dl->ins.ops = ins__find(arch, dl->ins.name, dl); if (!dl->ins.ops) return; diff --git a/tools/perf/util/disasm.h b/tools/perf/util/disasm.h index c1bb1e484bfb..f56beedeb9da 100644 --- a/tools/perf/util/disasm.h +++ b/tools/perf/util/disasm.h @@ -105,7 +105,7 @@ struct annotate_args { struct arch *arch__find(const char *name); bool arch__is(struct arch *arch, const char *name); -struct ins_ops *ins__find(struct arch *arch, const char *name, u32 raw_insn); +struct ins_ops *ins__find(struct arch *arch, const char *name, struct disasm_line *dl); int ins__scnprintf(struct ins *ins, char *bf, size_t size, struct ins_operands *ops, int max_ins_name); -- cgit v1.2.3 From 42d37fc0c819b81f6f6afd108b55d04ba9d32d0f Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Tue, 23 Jul 2024 10:51:54 +0530 Subject: perf vendor events power10: Update JSON/events Update JSON/events for power10 platform with additional events. Also move PM_VECTOR_LD_CMPL event from others.json to frontend.json file. 
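As a usage sketch (hypothetical invocation: the event name comes from the JSON entry above, "./workload" is a placeholder, and this assumes a Power10 system running a perf built with the updated event list):

  $ perf list | grep -i PM_VECTOR_LD_CMPL
  $ perf stat -e PM_VECTOR_LD_CMPL -- ./workload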
Reviewed-by: Ian Rogers Signed-off-by: Kajol Jain Tested-by: Disha Goel Cc: Akanksha J N Cc: Athira Rajeev Cc: Disha Goel Cc: Madhavan Srinivasan Cc: Namhyung Kim Cc: hbathini@linux.ibm.com Cc: linuxppc-dev@lists.ozlabs.org Link: https://lore.kernel.org/r/20240723052154.96202-1-kjain@linux.ibm.com [ Remove alternative to ' char that made the build break in some distros with a unicode parsing python error ] Signed-off-by: Arnaldo Carvalho de Melo --- .../pmu-events/arch/powerpc/power10/frontend.json | 5 ++ .../pmu-events/arch/powerpc/power10/others.json | 100 +++++++++++++++++++-- 2 files changed, 100 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/powerpc/power10/frontend.json b/tools/perf/pmu-events/arch/powerpc/power10/frontend.json index 5977f5e64212..53660c279286 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/frontend.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/frontend.json @@ -74,6 +74,11 @@ "EventName": "PM_ISSUE_KILL", "BriefDescription": "Cycles in which an instruction or group of instructions were cancelled after being issued. This event increments once per occurrence, regardless of how many instructions are included in the issue group." }, + { + "EventCode": "0x44054", + "EventName": "PM_VECTOR_LD_CMPL", + "BriefDescription": "Vector load instruction completed." + }, { "EventCode": "0x44056", "EventName": "PM_VECTOR_ST_CMPL", diff --git a/tools/perf/pmu-events/arch/powerpc/power10/others.json b/tools/perf/pmu-events/arch/powerpc/power10/others.json index fcf8a8ebe7bd..3789304cb363 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/others.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/others.json @@ -94,11 +94,6 @@ "EventName": "PM_L1_ICACHE_RELOADED_ALL", "BriefDescription": "Counts all instruction cache reloads includes demand, prefetch, prefetch turned into demand and demand turned into prefetch." }, - { - "EventCode": "0x44054", - "EventName": "PM_VECTOR_LD_CMPL", - "BriefDescription": "Vector load instruction completed." - }, { "EventCode": "0x4D05E", "EventName": "PM_BR_CMPL", @@ -108,5 +103,100 @@ "EventCode": "0x400F0", "EventName": "PM_LD_DEMAND_MISS_L1_FIN", "BriefDescription": "Load missed L1, counted at finish time." + }, + { + "EventCode": "0x00000038BC", + "EventName": "PM_ISYNC_CMPL", + "BriefDescription": "Isync completion count per thread." + }, + { + "EventCode": "0x000000C088", + "EventName": "PM_LD0_32B_FIN", + "BriefDescription": "256-bit load finished in the LD0 load execution unit." + }, + { + "EventCode": "0x000000C888", + "EventName": "PM_LD1_32B_FIN", + "BriefDescription": "256-bit load finished in the LD1 load execution unit." + }, + { + "EventCode": "0x000000C090", + "EventName": "PM_LD0_UNALIGNED_FIN", + "BriefDescription": "Load instructions in LD0 port that are either unaligned, or treated as unaligned and require an additional recycle through the pipeline using the load gather buffer. This typically adds about 10 cycles to the latency of the instruction. This includes loads that cross the 128 byte boundary, octword loads that are not aligned, and a special forward progress case of a load that does not hit in the L1 and crosses the 32 byte boundary and is launched NTC. Counted at finish time." + }, + { + "EventCode": "0x000000C890", + "EventName": "PM_LD1_UNALIGNED_FIN", + "BriefDescription": "Load instructions in LD1 port that are either unaligned, or treated as unaligned and require an additional recycle through the pipeline using the load gather buffer. 
This typically adds about 10 cycles to the latency of the instruction. This includes loads that cross the 128 byte boundary, octword loads that are not aligned, and a special forward progress case of a load that does not hit in the L1 and crosses the 32 byte boundary and is launched NTC. Counted at finish time." + }, + { + "EventCode": "0x000000C0A4", + "EventName": "PM_ST0_UNALIGNED_FIN", + "BriefDescription": "Store instructions in ST0 port that are either unaligned, or treated as unaligned and require an additional recycle through the pipeline. This typically adds about 10 cycles to the latency of the instruction. This only includes stores that cross the 128 byte boundary. Counted at finish time." + }, + { + "EventCode": "0x000000C8A4", + "EventName": "PM_ST1_UNALIGNED_FIN", + "BriefDescription": "Store instructions in ST1 port that are either unaligned, or treated as unaligned and require an additional recycle through the pipeline. This typically adds about 10 cycles to the latency of the instruction. This only includes stores that cross the 128 byte boundary. Counted at finish time." + }, + { + "EventCode": "0x000000C8B8", + "EventName": "PM_STCX_SUCCESS_CMPL", + "BriefDescription": "STCX instructions that completed successfully. Specifically, counts only when a pass status is returned from the nest." + }, + { + "EventCode": "0x000000D0B4", + "EventName": "PM_DC_PREF_STRIDED_CONF", + "BriefDescription": "A demand load referenced a line in an active strided prefetch stream. The stream could have been allocated through the hardware prefetch mechanism or through software." + }, + { + "EventCode": "0x000000F880", + "EventName": "PM_SNOOP_TLBIE_CYC", + "BriefDescription": "Cycles in which TLBIE snoops are executed in the LSU." + }, + { + "EventCode": "0x000000F084", + "EventName": "PM_SNOOP_TLBIE_CACHE_WALK_CYC", + "BriefDescription": "TLBIE snoop cycles in which the data cache is being walked." + }, + { + "EventCode": "0x000000F884", + "EventName": "PM_SNOOP_TLBIE_WAIT_ST_CYC", + "BriefDescription": "TLBIE snoop cycles in which older stores are still draining." + }, + { + "EventCode": "0x000000F088", + "EventName": "PM_SNOOP_TLBIE_WAIT_LD_CYC", + "BriefDescription": "TLBIE snoop cycles in which older loads are still draining." + }, + { + "EventCode": "0x000000F08C", + "EventName": "PM_SNOOP_TLBIE_WAIT_MMU_CYC", + "BriefDescription": "TLBIE snoop cycles in which the Load-Store unit is waiting for the MMU to finish invalidation." + }, + { + "EventCode": "0x0000004884", + "EventName": "PM_NO_FETCH_IBUF_FULL_CYC", + "BriefDescription": "Cycles in which no instructions are fetched because there is no room in the instruction buffers." + }, + { + "EventCode": "0x00000048B4", + "EventName": "PM_BR_TKN_UNCOND_FIN", + "BriefDescription": "An unconditional branch finished. All unconditional branches are taken." + }, + { + "EventCode": "0x0B0000016080", + "EventName": "PM_L2_TLBIE_SLBIE_START", + "BriefDescription": "NCU Master received a TLBIE/SLBIEG/SLBIAG operation from the core. Event count should be multiplied by 2 since the data is coming from a 2:1 clock domain and the data is time sliced across all 4 threads." + }, + { + "EventCode": "0x0B0000016880", + "EventName": "PM_L2_TLBIE_SLBIE_DELAY", + "BriefDescription": "Cycles when a TLBIE/SLBIEG/SLBIAG command was held in a hottemp condition by the NCU Master. Multiply this count by 1000 to obtain the total number of cycles. This can be divided by PM_L2_TLBIE_SLBIE_SENT to obtain the average time a TLBIE/SLBIEG/SLBIAG command was held. 
Event count should be multiplied by 2 since the data is coming from a 2:1 clock domain and the data is time sliced across all 4 threads." }, + { + "EventCode": "0x0B0000026880", + "EventName": "PM_L2_SNP_TLBIE_SLBIE_DELAY", + "BriefDescription": "Cycles when a TLBIE/SLBIEG/SLBIAG that targets this thread's LPAR was in flight while in a hottemp condition. Multiply this count by 1000 to obtain the total number of cycles. This can be divided by PM_L2_SNP_TLBIE_SLBIE_START to obtain the overall efficiency. Note: 'inflight' means SnpTLB has been sent to core(ie doesn't include when SnpTLB is in NCU waiting to be launched serially behind different SnpTLB). The NCU Snooper gets in a 'hottemp' delay window when it detects it is above its TLBIE/SLBIE threshold for process SnpTLBIE/SLBIE with this core. Event count should be multiplied by 2 since the data is coming from a 2:1 clock domain and the data is time sliced across all 4 threads." } ] -- cgit v1.2.3 From 050f2a03aaadac13fff13f9ab7c5c1f0937ab729 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Tue, 16 Jul 2024 15:34:05 +0800 Subject: perf annotate: Convert comma to semicolon MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace a comma between expression statements by a semicolon. Reviewed-by: Ian Rogers Signed-off-by: Chen Ni Cc: Adrian Hunter Cc: Ahelenia Ziemiańska Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Pekka Enberg Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240716073405.968801-1-nichen@iscas.ac.cn Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/annotate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index ea986430241e..fe991a81256b 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -985,7 +985,7 @@ int symbol__tui_annotate(struct map_symbol *ms, struct evsel *evsel, browser.b.width = notes->src->widths.max_line_len; browser.b.nr_entries = notes->src->nr_entries; - browser.b.entries = &notes->src->source, + browser.b.entries = &notes->src->source; browser.b.width += 18; /* Percentage */ if (annotate_opts.hide_src_code) -- cgit v1.2.3 From e60fc19eab439736009018406a19fad28e60e664 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Tue, 16 Jul 2024 15:43:40 +0800 Subject: perf daemon: Convert comma to semicolon Replace a comma between expression statements by a semicolon.
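As a side note on why this class of typo compiles silently, a tiny standalone C illustration (not from the patch):

	#include <stdio.h>

	int main(void)
	{
		int a, b;

		/*
		 * The comma operator joins both assignments into one valid
		 * expression statement, so the compiler accepts it even
		 * though a semicolon was intended.
		 */
		a = 1,
		b = 2;

		/* Harmless here, but inside an if/for body the comma silently
		 * changes which statements the condition governs. */
		printf("%d %d\n", a, b);
		return 0;
	}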
Reviewed-by: Ian Rogers Signed-off-by: Chen Ni Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240716074340.968909-1-nichen@iscas.ac.cn Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-daemon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/builtin-daemon.c b/tools/perf/builtin-daemon.c index de76bbc50bfb..18f7f417ba11 100644 --- a/tools/perf/builtin-daemon.c +++ b/tools/perf/builtin-daemon.c @@ -1433,7 +1433,7 @@ static int __cmd_signal(struct daemon *daemon, struct option parent_options[], } memset(&cmd, 0, sizeof(cmd)); - cmd.signal.cmd = CMD_SIGNAL, + cmd.signal.cmd = CMD_SIGNAL; cmd.signal.sig = SIGUSR2; strncpy(cmd.signal.name, name, sizeof(cmd.signal.name) - 1); -- cgit v1.2.3 From 496cae1b3306e2ea7bbb8ca7ced082a2046afed9 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Tue, 16 Jul 2024 15:53:47 +0800 Subject: perf inject: Convert comma to semicolon Replace a comma between expression statements by a semicolon. Reviewed-by: Ian Rogers Signed-off-by: Chen Ni Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240716075347.969041-1-nichen@iscas.ac.cn Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-inject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index a212678d47be..7b4a5d56d279 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -2069,7 +2069,7 @@ static int __cmd_inject(struct perf_inject *inject) */ inject->tool.finished_init = host__finished_init; /* Obey finished round ordering */ - inject->tool.finished_round = host__finished_round, + inject->tool.finished_round = host__finished_round; /* Keep track of which CPU a VCPU is runnng on */ inject->tool.context_switch = host__context_switch; /* -- cgit v1.2.3 From 4194744602c8cd971ad7fd95710416ca0568d999 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 22 Jul 2024 16:27:56 +0100 Subject: perf cs-etm: Output 0 instead of 0xdeadbeef when exception packets are flushed Normally exception packets don't directly output a branch sample, but if they're the last record in a buffer then they will. Because they don't have addresses set we'll see the placeholder value CS_ETM_INVAL_ADDR (0xdeadbeef) in the output. Since commit 6035b6804bdf ("perf cs-etm: Support dummy address value for CS_ETM_TRACE_ON packet") we've used 0 as an externally visible "not set" address value. For consistency reasons and to not make exceptions look like an error, change them to use 0 too. This is particularly visible when doing userspace only tracing because trace is disabled when jumping to the kernel, causing the flush and then forcing the last exception packet to be emitted as a branch. With kernel trace included, there is no flush so exception packets don't generate samples until the next range packet and they'll pick up the correct address. 
Before: $ perf record -e cs_etm//u -- stress -i 1 -t 1 $ perf script -F comm,ip,addr,flags stress syscall ffffb7eedbc0 => deadbeefdeadbeef stress syscall ffffb7f14a14 => deadbeefdeadbeef stress syscall ffffb7eedbc0 => deadbeefdeadbeef After: stress syscall ffffb7eedbc0 => 0 stress syscall ffffb7f14a14 => 0 stress syscall ffffb7eedbc0 => 0 Reviewed-by: Mike Leach Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Cc: coresight@lists.linaro.org Cc: gankulkarni@os.amperecomputing.com Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240722152756.59453-2-james.clark@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cs-etm.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 5e9fbcfad7d4..d3e9c64d17d4 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -1267,8 +1267,12 @@ static inline int cs_etm__t32_instr_size(struct cs_etm_queue *etmq, static inline u64 cs_etm__first_executed_instr(struct cs_etm_packet *packet) { - /* Returns 0 for the CS_ETM_DISCONTINUITY packet */ - if (packet->sample_type == CS_ETM_DISCONTINUITY) + /* + * Return 0 for packets that have no addresses so that CS_ETM_INVAL_ADDR doesn't + * appear in samples. + */ + if (packet->sample_type == CS_ETM_DISCONTINUITY || + packet->sample_type == CS_ETM_EXCEPTION) return 0; return packet->start_addr; -- cgit v1.2.3 From ae8e4f4048b839c1cb333d9e3d20e634b430139e Mon Sep 17 00:00:00 2001 From: James Clark Date: Tue, 23 Jul 2024 14:28:58 +0100 Subject: perf scripts python cs-etm: Restore first sample log in verbose mode The linked commit moved the early return on the first sample to before the verbose log, so move the log earlier too. Now the first sample is also logged and not skipped. Fixes: 2d98dbb4c9c5b09c ("perf scripts python arm-cs-trace-disasm.py: Do not ignore disam first sample") Reviewed-by: Leo Yan Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Benjamin Gray Cc: coresight@lists.linaro.org Cc: gankulkarni@os.amperecomputing.com Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ruidong Tian Cc: Suzuki Poulouse Link: https://lore.kernel.org/r/20240723132858.12747-1-james.clark@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/scripts/python/arm-cs-trace-disasm.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/scripts/python/arm-cs-trace-disasm.py b/tools/perf/scripts/python/arm-cs-trace-disasm.py index d973c2baed1c..7aff02d84ffb 100755 --- a/tools/perf/scripts/python/arm-cs-trace-disasm.py +++ b/tools/perf/scripts/python/arm-cs-trace-disasm.py @@ -192,17 +192,16 @@ def process_event(param_dict): ip = sample["ip"] addr = sample["addr"] + if (options.verbose == True): + print("Event type: %s" % name) + print_sample(sample) + # Initialize CPU data if it's empty, and directly return back # if this is the first tracing event for this CPU. 
if (cpu_data.get(str(cpu) + 'addr') == None): cpu_data[str(cpu) + 'addr'] = addr return - - if (options.verbose == True): - print("Event type: %s" % name) - print_sample(sample) - # If cannot find dso so cannot dump assembler, bail out if (dso == '[unknown]'): return -- cgit v1.2.3 From c91928a8d5243f6b7355aab6325fe7545ee65404 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 15 Jul 2024 19:07:04 +0300 Subject: perf tools: Enable evsel__is_aux_event() to work for ARM/ARM64 Set pmu->auxtrace on ARM/ARM64 AUX area PMUs. evsel__is_aux_event() needs the setting to identify AUX area tracing selected events. Currently, the features that use evsel__is_aux_event() are used only by Intel PT, but that may change in the future. Reviewed-by: Andi Kleen Reviewed-by: Leo Yan Signed-off-by: Adrian Hunter Acked-by: Ian Rogers Cc: Alexander Shishkin Cc: Heiko Carstens Cc: Hendrik Brueckner Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Jonathan Cameron Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Yicong Yang Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240715160712.127117-6-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm/util/pmu.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/perf/arch/arm/util/pmu.c b/tools/perf/arch/arm/util/pmu.c index 1c9541d01722..57dc94a6e38c 100644 --- a/tools/perf/arch/arm/util/pmu.c +++ b/tools/perf/arch/arm/util/pmu.c @@ -23,16 +23,19 @@ void perf_pmu__arch_init(struct perf_pmu *pmu) #ifdef HAVE_AUXTRACE_SUPPORT if (!strcmp(pmu->name, CORESIGHT_ETM_PMU_NAME)) { /* add ETM default config here */ + pmu->auxtrace = true; pmu->selectable = true; pmu->perf_event_attr_init_default = cs_etm_get_default_config; #if defined(__aarch64__) } else if (strstarts(pmu->name, ARM_SPE_PMU_NAME)) { + pmu->auxtrace = true; pmu->selectable = true; pmu->is_uncore = false; pmu->perf_event_attr_init_default = arm_spe_pmu_default_config; if (strstarts(pmu->name, "arm_spe_")) pmu->mem_events = perf_mem_events_arm; } else if (strstarts(pmu->name, HISI_PTT_PMU_NAME)) { + pmu->auxtrace = true; pmu->selectable = true; #endif } -- cgit v1.2.3 From feab89bf991c940e2d44e45ededbde8640ac8830 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 15 Jul 2024 19:07:05 +0300 Subject: perf tools: Enable evsel__is_aux_event() to work for S390_CPUMSF evsel__is_aux_event() identifies AUX area tracing selected events. S390_CPUMSF uses a raw event type (PERF_TYPE_RAW - refer s390_cpumsf_evsel_is_auxtrace()) not a PMU type value that could be checked in evsel__is_aux_event(). However it sets needs_auxtrace_mmap (refer auxtrace_record__init()), so check that first. Currently, the features that use evsel__is_aux_event() are used only by Intel PT, but that may change in the future. 
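For illustration only (not part of the patch): once needs_auxtrace_mmap is checked first, the same test covers Intel PT, ARM/ARM64 and S390_CPUMSF events, so a hypothetical caller could walk an evlist uniformly:

	/* sketch: 'evlist' is an already-populated struct evlist */
	struct evsel *evsel;

	evlist__for_each_entry(evlist, evsel) {
		if (evsel__is_aux_event(evsel))
			pr_debug("%s is an AUX area event\n", evsel__name(evsel));
	}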
Reviewed-by: Andi Kleen Reviewed-by: Leo Yan Signed-off-by: Adrian Hunter Acked-by: Ian Rogers Cc: Alexander Shishkin Cc: Heiko Carstens Cc: Hendrik Brueckner Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Jonathan Cameron Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Yicong Yang Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240715160712.127117-7-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pmu.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 986166bc7c78..0b38c51bd6eb 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -1199,8 +1199,12 @@ void perf_pmu__warn_invalid_formats(struct perf_pmu *pmu) bool evsel__is_aux_event(const struct evsel *evsel) { - struct perf_pmu *pmu = evsel__find_pmu(evsel); + struct perf_pmu *pmu; + + if (evsel->needs_auxtrace_mmap) + return true; + pmu = evsel__find_pmu(evsel); return pmu && pmu->auxtrace; } -- cgit v1.2.3 From 156e8dcfeceebce0694fb00542ac68e61a785fe1 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sat, 27 Jul 2024 18:59:19 +0100 Subject: perf test pmu: Remove unused test_pmus Commit aa1551f299ba ("perf test pmu: Refactor format test and exposed test APIs") added the 'test_pmus' list, but didn't use it. (It seems to put them on the other_pmus list?) Remove it. Fixes: aa1551f299ba414c ("perf test pmu: Refactor format test and exposed test APIs") Reviewed-by: Ian Rogers Signed-off-by: Dr. David Alan Gilbert Cc: Ian Rogers Cc: Kan Liang Link: https://lore.kernel.org/lkml/20240727175919.1041468-1-linux@treblig.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/pmu.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/pmu.c b/tools/perf/tests/pmu.c index 40132655ccd1..0b2f04a55d7b 100644 --- a/tools/perf/tests/pmu.c +++ b/tools/perf/tests/pmu.c @@ -18,9 +18,6 @@ #include #include -/* Fake PMUs created in temp directory. */ -static LIST_HEAD(test_pmus); - /* Cleanup test PMU directory. */ static int test_pmu_put(const char *dir, struct perf_pmu *pmu) { -- cgit v1.2.3 From c77800894b5a7d7d37b83862fd732230c5c3e518 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sun, 28 Jul 2024 17:41:24 -0700 Subject: perf ftrace: Add 'tail' option to --graph-opts The 'graph-tail' option is to print function name as a comment at the end. This is useful when a large function is mixed with other functions (possibly from different CPUs). For example, $ sudo perf ftrace -- perf stat true ... 1) | get_unused_fd_flags() { 1) | alloc_fd() { 1) 0.178 us | _raw_spin_lock(); 1) 0.187 us | expand_files(); 1) 0.169 us | _raw_spin_unlock(); 1) 1.211 us | } 1) 1.503 us | } $ sudo perf ftrace --graph-opts tail -- perf stat true ... 
1) | get_unused_fd_flags() { 1) | alloc_fd() { 1) 0.099 us | _raw_spin_lock(); 1) 0.083 us | expand_files(); 1) 0.081 us | _raw_spin_unlock(); 1) 0.601 us | } /* alloc_fd */ 1) 0.751 us | } /* get_unused_fd_flags */ Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Changbin Du Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Steven Rostedt (VMware) Link: https://lore.kernel.org/lkml/20240729004127.238611-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-ftrace.txt | 1 + tools/perf/builtin-ftrace.c | 18 ++++++++++++++++++ tools/perf/util/ftrace.h | 1 + 3 files changed, 20 insertions(+) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-ftrace.txt b/tools/perf/Documentation/perf-ftrace.txt index d780b93fcf87..7ea1645a13cf 100644 --- a/tools/perf/Documentation/perf-ftrace.txt +++ b/tools/perf/Documentation/perf-ftrace.txt @@ -125,6 +125,7 @@ OPTIONS for 'perf ftrace trace' - verbose - Show process names, PIDs, timestamps, etc. - thresh= - Setup trace duration threshold in microseconds. - depth= - Set max depth for function graph tracer to follow. + - tail - Print function name at the end. OPTIONS for 'perf ftrace latency' diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c index eb30c8eca488..33c52ebda2fd 100644 --- a/tools/perf/builtin-ftrace.c +++ b/tools/perf/builtin-ftrace.c @@ -228,6 +228,7 @@ static void reset_tracing_options(struct perf_ftrace *ftrace __maybe_unused) write_tracing_option_file("funcgraph-irqs", "1"); write_tracing_option_file("funcgraph-proc", "0"); write_tracing_option_file("funcgraph-abstime", "0"); + write_tracing_option_file("funcgraph-tail", "0"); write_tracing_option_file("latency-format", "0"); write_tracing_option_file("irq-info", "0"); } @@ -464,6 +465,17 @@ static int set_tracing_funcgraph_verbose(struct perf_ftrace *ftrace) return 0; } +static int set_tracing_funcgraph_tail(struct perf_ftrace *ftrace) +{ + if (!ftrace->graph_tail) + return 0; + + if (write_tracing_option_file("funcgraph-tail", "1") < 0) + return -1; + + return 0; +} + static int set_tracing_thresh(struct perf_ftrace *ftrace) { int ret; @@ -540,6 +552,11 @@ static int set_tracing_options(struct perf_ftrace *ftrace) return -1; } + if (set_tracing_funcgraph_tail(ftrace) < 0) { + pr_err("failed to set tracing option funcgraph-tail\n"); + return -1; + } + return 0; } @@ -1099,6 +1116,7 @@ static int parse_graph_tracer_opts(const struct option *opt, { .name = "verbose", .value_ptr = &ftrace->graph_verbose }, { .name = "thresh", .value_ptr = &ftrace->graph_thresh }, { .name = "depth", .value_ptr = &ftrace->graph_depth }, + { .name = "tail", .value_ptr = &ftrace->graph_tail }, { .name = NULL, } }; diff --git a/tools/perf/util/ftrace.h b/tools/perf/util/ftrace.h index 558efcb98d25..2515e841c64c 100644 --- a/tools/perf/util/ftrace.h +++ b/tools/perf/util/ftrace.h @@ -25,6 +25,7 @@ struct perf_ftrace { int graph_noirqs; int graph_verbose; int graph_thresh; + int graph_tail; }; struct filter_entry { -- cgit v1.2.3 From 608585f43f9e4798db7585d8999c0404103d4b66 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sun, 28 Jul 2024 17:41:25 -0700 Subject: perf ftrace: Factor out check_ftrace_capable() The check is a common part of the ftrace commands, let's move it out. 
Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Changbin Du Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Steven Rostedt (VMware) Link: https://lore.kernel.org/lkml/20240729004127.238611-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-ftrace.c | 44 ++++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 24 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c index 33c52ebda2fd..978608ecd89c 100644 --- a/tools/perf/builtin-ftrace.c +++ b/tools/perf/builtin-ftrace.c @@ -59,6 +59,22 @@ static void ftrace__workload_exec_failed_signal(int signo __maybe_unused, done = true; } +static int check_ftrace_capable(void) +{ + if (!(perf_cap__capable(CAP_PERFMON) || + perf_cap__capable(CAP_SYS_ADMIN))) { + pr_err("ftrace only works for %s!\n", +#ifdef HAVE_LIBCAP_SUPPORT + "users with the CAP_PERFMON or CAP_SYS_ADMIN capability" +#else + "root" +#endif + ); + return -1; + } + return 0; +} + static int __write_tracing_file(const char *name, const char *val, bool append) { char *file; @@ -586,18 +602,6 @@ static int __cmd_ftrace(struct perf_ftrace *ftrace) .events = POLLIN, }; - if (!(perf_cap__capable(CAP_PERFMON) || - perf_cap__capable(CAP_SYS_ADMIN))) { - pr_err("ftrace only works for %s!\n", -#ifdef HAVE_LIBCAP_SUPPORT - "users with the CAP_PERFMON or CAP_SYS_ADMIN capability" -#else - "root" -#endif - ); - return -1; - } - select_tracer(ftrace); if (reset_tracing_files(ftrace) < 0) { @@ -902,18 +906,6 @@ static int __cmd_latency(struct perf_ftrace *ftrace) }; int buckets[NUM_BUCKET] = { }; - if (!(perf_cap__capable(CAP_PERFMON) || - perf_cap__capable(CAP_SYS_ADMIN))) { - pr_err("ftrace only works for %s!\n", -#ifdef HAVE_LIBCAP_SUPPORT - "users with the CAP_PERFMON or CAP_SYS_ADMIN capability" -#else - "root" -#endif - ); - return -1; - } - trace_fd = prepare_func_latency(ftrace); if (trace_fd < 0) goto out; @@ -1220,6 +1212,10 @@ int cmd_ftrace(int argc, const char **argv) signal(SIGCHLD, sig_handler); signal(SIGPIPE, sig_handler); + ret = check_ftrace_capable(); + if (ret < 0) + return -1; + ret = perf_config(perf_ftrace_config, &ftrace); if (ret < 0) return -1; -- cgit v1.2.3 From 0f223813edd051a516ec4b1fc23b1fdc00dd3b6d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sun, 28 Jul 2024 17:41:26 -0700 Subject: perf ftrace: Add 'profile' command The 'perf ftrace profile' command is to get function execution profiles using function-graph tracer so that users can see the total, average, max execution time as well as the number of invocations easily. The following is a profile for the perf_event_open syscall. 
$ sudo perf ftrace profile -G __x64_sys_perf_event_open -- \ perf stat -e cycles -C1 true 2> /dev/null | head # Total (us) Avg (us) Max (us) Count Function 65.611 65.611 65.611 1 __x64_sys_perf_event_open 30.527 30.527 30.527 1 anon_inode_getfile 30.260 30.260 30.260 1 __anon_inode_getfile 29.700 29.700 29.700 1 alloc_file_pseudo 17.578 17.578 17.578 1 d_alloc_pseudo 17.382 17.382 17.382 1 __d_alloc 16.738 16.738 16.738 1 kmem_cache_alloc_lru 15.686 15.686 15.686 1 perf_event_alloc 14.012 7.006 11.264 2 obj_cgroup_charge # Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Changbin Du Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Steven Rostedt (VMware) Link: https://lore.kernel.org/lkml/20240729004127.238611-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-ftrace.txt | 42 +++- tools/perf/builtin-ftrace.c | 318 ++++++++++++++++++++++++++++++- tools/perf/util/ftrace.h | 2 + 3 files changed, 359 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-ftrace.txt b/tools/perf/Documentation/perf-ftrace.txt index 7ea1645a13cf..33f32467f287 100644 --- a/tools/perf/Documentation/perf-ftrace.txt +++ b/tools/perf/Documentation/perf-ftrace.txt @@ -9,7 +9,7 @@ perf-ftrace - simple wrapper for kernel's ftrace functionality SYNOPSIS -------- [verse] -'perf ftrace' {trace|latency} <command> +'perf ftrace' {trace|latency|profile} <command> DESCRIPTION ----------- @@ -23,6 +23,9 @@ kernel's ftrace infrastructure. 'perf ftrace latency' calculates execution latency of a given function (optionally with BPF) and display it as a histogram. + 'perf ftrace profile' shows an execution profile for each function including + total, average, max time and the number of calls. + The following options apply to perf ftrace. COMMON OPTIONS @@ -146,6 +149,43 @@ OPTIONS for 'perf ftrace latency' Use nano-second instead of micro-second as a base unit of the histogram. +OPTIONS for 'perf ftrace profile' +--------------------------------- + +-T:: +--trace-funcs=:: + Set function filter on the given function (or a glob pattern). + Multiple functions can be given by using this option more than once. + The function argument also can be a glob pattern. It will be passed + to 'set_ftrace_filter' in tracefs. + +-N:: +--notrace-funcs=:: + Do not trace functions given by the argument. Like -T option, this + can be used more than once to specify multiple functions (or glob + patterns). It will be passed to 'set_ftrace_notrace' in tracefs. + +-G:: +--graph-funcs=:: + Set graph filter on the given function (or a glob pattern). This is + useful to trace for functions executed from the given function. This + can be used more than once to specify multiple functions. It will be + passed to 'set_graph_function' in tracefs. + +-g:: +--nograph-funcs=:: + Set graph notrace filter on the given function (or a glob pattern). + Like -G option, this is useful for the function_graph tracer only and + disables tracing for function executed from the given function. This + can be used more than once to specify multiple functions. It will be + passed to 'set_graph_notrace' in tracefs. + +-m:: +--buffer-size:: + Set the size of per-cpu tracing buffer, <size> is expected to + be a number with appended unit character - B/K/M/G.
+ + SEE ALSO -------- linkperf:perf-record[1], linkperf:perf-trace[1] diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c index 978608ecd89c..ae9389435d1b 100644 --- a/tools/perf/builtin-ftrace.c +++ b/tools/perf/builtin-ftrace.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -22,15 +23,18 @@ #include "debug.h" #include #include +#include #include #include "evlist.h" #include "target.h" #include "cpumap.h" +#include "hashmap.h" #include "thread_map.h" #include "strfilter.h" #include "util/cap.h" #include "util/config.h" #include "util/ftrace.h" +#include "util/stat.h" #include "util/units.h" #include "util/parse-sublevel-options.h" @@ -959,6 +963,294 @@ out: return (done && !workload_exec_errno) ? 0 : -1; } +static size_t profile_hash(long func, void *ctx __maybe_unused) +{ + return str_hash((char *)func); +} + +static bool profile_equal(long func1, long func2, void *ctx __maybe_unused) +{ + return !strcmp((char *)func1, (char *)func2); +} + +static int prepare_func_profile(struct perf_ftrace *ftrace) +{ + ftrace->tracer = "function_graph"; + ftrace->graph_tail = 1; + + ftrace->profile_hash = hashmap__new(profile_hash, profile_equal, NULL); + if (ftrace->profile_hash == NULL) + return -ENOMEM; + + return 0; +} + +/* This is saved in a hashmap keyed by the function name */ +struct ftrace_profile_data { + struct stats st; +}; + +static int add_func_duration(struct perf_ftrace *ftrace, char *func, double time_ns) +{ + struct ftrace_profile_data *prof = NULL; + + if (!hashmap__find(ftrace->profile_hash, func, &prof)) { + char *key = strdup(func); + + if (key == NULL) + return -ENOMEM; + + prof = zalloc(sizeof(*prof)); + if (prof == NULL) { + free(key); + return -ENOMEM; + } + + init_stats(&prof->st); + hashmap__add(ftrace->profile_hash, key, prof); + } + + update_stats(&prof->st, time_ns); + return 0; +} + +/* + * The ftrace function_graph text output normally looks like below: + * + * CPU DURATION FUNCTION + * + * 0) | syscall_trace_enter.isra.0() { + * 0) | __audit_syscall_entry() { + * 0) | auditd_test_task() { + * 0) 0.271 us | __rcu_read_lock(); + * 0) 0.275 us | __rcu_read_unlock(); + * 0) 1.254 us | } /\* auditd_test_task *\/ + * 0) 0.279 us | ktime_get_coarse_real_ts64(); + * 0) 2.227 us | } /\* __audit_syscall_entry *\/ + * 0) 2.713 us | } /\* syscall_trace_enter.isra.0 *\/ + * + * Parse the line and get the duration and function name. + */ +static int parse_func_duration(struct perf_ftrace *ftrace, char *line, size_t len) +{ + char *p; + char *func; + double duration; + + /* skip CPU */ + p = strchr(line, ')'); + if (p == NULL) + return 0; + + /* get duration */ + p = skip_spaces(p + 1); + + /* no duration? */ + if (p == NULL || *p == '|') + return 0; + + /* skip markers like '*' or '!' for longer than ms */ + if (!isdigit(*p)) + p++; + + duration = strtod(p, &p); + + if (strncmp(p, " us", 3)) { + pr_debug("non-usec time found.. ignoring\n"); + return 0; + } + + /* + * profile stat keeps the max and min values as integer, + * convert to nsec time so that we can have accurate max. 
+ */ + duration *= 1000; + + /* skip to the pipe */ + while (p < line + len && *p != '|') + p++; + + if (*p++ != '|') + return -EINVAL; + + /* get function name */ + func = skip_spaces(p); + + /* skip the closing bracket and the start of comment */ + if (*func == '}') + func += 5; + + /* remove semi-colon or end of comment at the end */ + p = line + len - 1; + while (!isalnum(*p) && *p != ']') { + *p = '\0'; + --p; + } + + return add_func_duration(ftrace, func, duration); +} + +static int cmp_profile_data(const void *a, const void *b) +{ + const struct hashmap_entry *e1 = *(const struct hashmap_entry **)a; + const struct hashmap_entry *e2 = *(const struct hashmap_entry **)b; + struct ftrace_profile_data *p1 = e1->pvalue; + struct ftrace_profile_data *p2 = e2->pvalue; + + /* compare by total time */ + if ((p1->st.n * p1->st.mean) > (p2->st.n * p2->st.mean)) + return -1; + else + return 1; +} + +static void print_profile_result(struct perf_ftrace *ftrace) +{ + struct hashmap_entry *entry, **profile; + size_t i, nr, bkt; + + nr = hashmap__size(ftrace->profile_hash); + if (nr == 0) + return; + + profile = calloc(nr, sizeof(*profile)); + if (profile == NULL) { + pr_err("failed to allocate memory for the result\n"); + return; + } + + i = 0; + hashmap__for_each_entry(ftrace->profile_hash, entry, bkt) + profile[i++] = entry; + + assert(i == nr); + + //cmp_profile_data(profile[0], profile[1]); + qsort(profile, nr, sizeof(*profile), cmp_profile_data); + + printf("# %10s %10s %10s %10s %s\n", + "Total (us)", "Avg (us)", "Max (us)", "Count", "Function"); + + for (i = 0; i < nr; i++) { + const char *name = profile[i]->pkey; + struct ftrace_profile_data *p = profile[i]->pvalue; + + printf("%12.3f %10.3f %6"PRIu64".%03"PRIu64" %10.0f %s\n", + p->st.n * p->st.mean / 1000, p->st.mean / 1000, + p->st.max / 1000, p->st.max % 1000, p->st.n, name); + } + + free(profile); + + hashmap__for_each_entry(ftrace->profile_hash, entry, bkt) { + free((char *)entry->pkey); + free(entry->pvalue); + } + + hashmap__free(ftrace->profile_hash); + ftrace->profile_hash = NULL; +} + +static int __cmd_profile(struct perf_ftrace *ftrace) +{ + char *trace_file; + int trace_fd; + char buf[4096]; + struct io io; + char *line = NULL; + size_t line_len = 0; + + if (prepare_func_profile(ftrace) < 0) { + pr_err("failed to prepare func profiler\n"); + goto out; + } + + if (reset_tracing_files(ftrace) < 0) { + pr_err("failed to reset ftrace\n"); + goto out; + } + + /* reset ftrace buffer */ + if (write_tracing_file("trace", "0") < 0) + goto out; + + if (set_tracing_options(ftrace) < 0) + return -1; + + if (write_tracing_file("current_tracer", ftrace->tracer) < 0) { + pr_err("failed to set current_tracer to %s\n", ftrace->tracer); + goto out_reset; + } + + setup_pager(); + + trace_file = get_tracing_file("trace_pipe"); + if (!trace_file) { + pr_err("failed to open trace_pipe\n"); + goto out_reset; + } + + trace_fd = open(trace_file, O_RDONLY); + + put_tracing_file(trace_file); + + if (trace_fd < 0) { + pr_err("failed to open trace_pipe\n"); + goto out_reset; + } + + fcntl(trace_fd, F_SETFL, O_NONBLOCK); + + if (write_tracing_file("tracing_on", "1") < 0) { + pr_err("can't enable tracing\n"); + goto out_close_fd; + } + + evlist__start_workload(ftrace->evlist); + + io__init(&io, trace_fd, buf, sizeof(buf)); + io.timeout_ms = -1; + + while (!done && !io.eof) { + if (io__getline(&io, &line, &line_len) < 0) + break; + + if (parse_func_duration(ftrace, line, line_len) < 0) + break; + } + + write_tracing_file("tracing_on", "0"); + + if 
(workload_exec_errno) { + const char *emsg = str_error_r(workload_exec_errno, buf, sizeof(buf)); + /* flush stdout first so below error msg appears at the end. */ + fflush(stdout); + pr_err("workload failed: %s\n", emsg); + goto out_free_line; + } + + /* read remaining buffer contents */ + io.timeout_ms = 0; + while (!io.eof) { + if (io__getline(&io, &line, &line_len) < 0) + break; + + if (parse_func_duration(ftrace, line, line_len) < 0) + break; + } + + print_profile_result(ftrace); + +out_free_line: + free(line); +out_close_fd: + close(trace_fd); +out_reset: + reset_tracing_files(ftrace); +out: + return (done && !workload_exec_errno) ? 0 : -1; +} + static int perf_ftrace_config(const char *var, const char *value, void *cb) { struct perf_ftrace *ftrace = cb; @@ -1126,6 +1418,7 @@ enum perf_ftrace_subcommand { PERF_FTRACE_NONE, PERF_FTRACE_TRACE, PERF_FTRACE_LATENCY, + PERF_FTRACE_PROFILE, }; int cmd_ftrace(int argc, const char **argv) @@ -1191,13 +1484,28 @@ int cmd_ftrace(int argc, const char **argv) "Use nano-second histogram"), OPT_PARENT(common_options), }; + const struct option profile_options[] = { + OPT_CALLBACK('T', "trace-funcs", &ftrace.filters, "func", + "Trace given functions using function tracer", + parse_filter_func), + OPT_CALLBACK('N', "notrace-funcs", &ftrace.notrace, "func", + "Do not trace given functions", parse_filter_func), + OPT_CALLBACK('G', "graph-funcs", &ftrace.graph_funcs, "func", + "Trace given functions using function_graph tracer", + parse_filter_func), + OPT_CALLBACK('g', "nograph-funcs", &ftrace.nograph_funcs, "func", + "Set nograph filter on given functions", parse_filter_func), + OPT_CALLBACK('m', "buffer-size", &ftrace.percpu_buffer_size, "size", + "Size of per cpu buffer, needs to use a B, K, M or G suffix.", parse_buffer_size), + OPT_PARENT(common_options), + }; const struct option *options = ftrace_options; const char * const ftrace_usage[] = { "perf ftrace [<options>] [<command>]", "perf ftrace [<options>] -- [<command>] [<options>]", - "perf ftrace {trace|latency} [<options>] [<command>]", - "perf ftrace {trace|latency} [<options>] -- [<command>] [<options>]", + "perf ftrace {trace|latency|profile} [<options>] [<command>]", + "perf ftrace {trace|latency|profile} [<options>] -- [<command>] [<options>]", NULL }; enum perf_ftrace_subcommand subcmd = PERF_FTRACE_NONE; @@ -1226,6 +1534,9 @@ int cmd_ftrace(int argc, const char **argv) } else if (!strcmp(argv[1], "latency")) { subcmd = PERF_FTRACE_LATENCY; options = latency_options; + } else if (!strcmp(argv[1], "profile")) { + subcmd = PERF_FTRACE_PROFILE; + options = profile_options; } if (subcmd != PERF_FTRACE_NONE) { @@ -1261,6 +1572,9 @@ int cmd_ftrace(int argc, const char **argv) } cmd_func = __cmd_latency; break; + case PERF_FTRACE_PROFILE: + cmd_func = __cmd_profile; + break; case PERF_FTRACE_NONE: default: pr_err("Invalid subcommand\n"); diff --git a/tools/perf/util/ftrace.h b/tools/perf/util/ftrace.h index 2515e841c64c..bae649ef50e8 100644 --- a/tools/perf/util/ftrace.h +++ b/tools/perf/util/ftrace.h @@ -6,6 +6,7 @@ #include "target.h" struct evlist; +struct hashmap; struct perf_ftrace { struct evlist *evlist; @@ -15,6 +16,7 @@ struct perf_ftrace { struct list_head notrace; struct list_head graph_funcs; struct list_head nograph_funcs; + struct hashmap *profile_hash; unsigned long percpu_buffer_size; bool inherit; bool use_nsec; -- cgit v1.2.3 From 74ae366c37b71b46be7f2fa45fa4b2c9c6708fbe Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sun, 28 Jul 2024 17:41:27 -0700 Subject: perf ftrace profile: Add -s/--sort option The -s/--sort option sorts the output by the given column.
$ sudo perf ftrace profile -s max sync | head # Total (us) Avg (us) Max (us) Count Function 6301.811 6301.811 6301.811 1 __do_sys_sync 6301.328 6301.328 6301.328 1 ksys_sync 5320.300 1773.433 2858.819 3 iterate_supers 2755.875 17.012 2610.633 162 sync_fs_one_sb 2728.351 682.088 2610.413 4 ext4_sync_fs [ext4] 2603.654 2603.654 2603.654 1 jbd2_log_wait_commit [jbd2] 4750.615 593.827 2597.427 8 schedule 2164.986 26.728 2115.673 81 sync_inodes_one_sb 2143.842 26.467 2115.438 81 sync_inodes_sb Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Changbin Du Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Steven Rostedt (VMware) Link: https://lore.kernel.org/lkml/20240729004127.238611-5-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-ftrace.txt | 5 +++ tools/perf/builtin-ftrace.c | 63 +++++++++++++++++++++++++++++++- 2 files changed, 66 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-ftrace.txt b/tools/perf/Documentation/perf-ftrace.txt index 33f32467f287..eaec8253be68 100644 --- a/tools/perf/Documentation/perf-ftrace.txt +++ b/tools/perf/Documentation/perf-ftrace.txt @@ -185,6 +185,11 @@ OPTIONS for 'perf ftrace profile' Set the size of per-cpu tracing buffer, <size> is expected to be a number with appended unit character - B/K/M/G. +-s:: +--sort=:: + Sort the result by the given field. Available values are: + total, avg, max, count, name. Default is 'total'. + SEE ALSO -------- diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c index ae9389435d1b..a615c405d98f 100644 --- a/tools/perf/builtin-ftrace.c +++ b/tools/perf/builtin-ftrace.c @@ -1090,15 +1090,47 @@ static int parse_func_duration(struct perf_ftrace *ftrace, char *line, size_t le +enum perf_ftrace_profile_sort_key { + PFP_SORT_TOTAL = 0, + PFP_SORT_AVG, + PFP_SORT_MAX, + PFP_SORT_COUNT, + PFP_SORT_NAME, +}; + +static enum perf_ftrace_profile_sort_key profile_sort = PFP_SORT_TOTAL; + static int cmp_profile_data(const void *a, const void *b) { const struct hashmap_entry *e1 = *(const struct hashmap_entry **)a; const struct hashmap_entry *e2 = *(const struct hashmap_entry **)b; struct ftrace_profile_data *p1 = e1->pvalue; struct ftrace_profile_data *p2 = e2->pvalue; + double v1, v2; + + switch (profile_sort) { + case PFP_SORT_NAME: + return strcmp(e1->pkey, e2->pkey); + case PFP_SORT_AVG: + v1 = p1->st.mean; + v2 = p2->st.mean; + break; + case PFP_SORT_MAX: + v1 = p1->st.max; + v2 = p2->st.max; + break; + case PFP_SORT_COUNT: + v1 = p1->st.n; + v2 = p2->st.n; + break; + case PFP_SORT_TOTAL: + default: + v1 = p1->st.n * p1->st.mean; + v2 = p2->st.n * p2->st.mean; + break; + } - /* compare by total time */ - if ((p1->st.n * p1->st.mean) > (p2->st.n * p2->st.mean)) + if (v1 > v2) return -1; else return 1; @@ -1414,6 +1446,30 @@ static int parse_graph_tracer_opts(const struct option *opt, return 0; } +static int parse_sort_key(const struct option *opt, const char *str, int unset) +{ + enum perf_ftrace_profile_sort_key *key = (void *)opt->value; + + if (unset) + return 0; + + if (!strcmp(str, "total")) + *key = PFP_SORT_TOTAL; + else if (!strcmp(str, "avg")) + *key = PFP_SORT_AVG; + else if (!strcmp(str, "max")) + *key = PFP_SORT_MAX; + else if (!strcmp(str, "count")) + *key = PFP_SORT_COUNT; + else if (!strcmp(str, "name")) + *key = PFP_SORT_NAME; + else { + pr_err("Unknown sort key: %s\n", str); + return -1; + }
return 0; +} + enum perf_ftrace_subcommand { PERF_FTRACE_NONE, PERF_FTRACE_TRACE, @@ -1497,6 +1553,9 @@ int cmd_ftrace(int argc, const char **argv) "Set nograph filter on given functions", parse_filter_func), OPT_CALLBACK('m', "buffer-size", &ftrace.percpu_buffer_size, "size", "Size of per cpu buffer, needs to use a B, K, M or G suffix.", parse_buffer_size), + OPT_CALLBACK('s', "sort", &profile_sort, "key", + "Sort result by key: total (default), avg, max, count, name.", + parse_sort_key), OPT_PARENT(common_options), }; const struct option *options = ftrace_options; -- cgit v1.2.3 From 4ed0f392e7dbd2e90a903bdd77d1f6e61b7d3073 Mon Sep 17 00:00:00 2001 From: Weilin Wang Date: Wed, 22 May 2024 13:42:54 -0700 Subject: perf test: make metric validation test return early when there is no metric supported on the test system Add a check to return early from the metric validation test when 'perf list metric' does not output any metric. This can happen when NO_JEVENTS=1 is set or on a system where no metric is supported. Signed-off-by: Weilin Wang Tested-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Caleb Biggers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Samantha Alt Link: https://lore.kernel.org/lkml/20240522204254.1841420-1-weilin.wang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/lib/perf_metric_validation.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/lib/perf_metric_validation.py b/tools/perf/tests/shell/lib/perf_metric_validation.py index a2d235252183..0b94216c9c46 100644 --- a/tools/perf/tests/shell/lib/perf_metric_validation.py +++ b/tools/perf/tests/shell/lib/perf_metric_validation.py @@ -95,7 +95,7 @@ class Validator: indent=4) def get_results(self, idx: int = 0): - return self.results[idx] + return self.results.get(idx) def get_bounds(self, lb, ub, error, alias={}, ridx: int = 0) -> list: """ @@ -173,7 +173,10 @@ class Validator: pcnt = 0 tcnt = 0 rerun = list() - for name, val in self.get_results().items(): + results = self.get_results() + if not results: + return + for name, val in results.items(): if val < 0: negmetric[name] = val rerun.append(name) @@ -532,6 +535,9 @@ class Validator: ''' if not self.collectlist: self.parse_perf_metrics() + if not self.metrics: + print("No metric found for testing") + return 0 self.create_rules() for i in range(0, len(self.workloads)): self.wlidx = i -- cgit v1.2.3 From ccd6fcda257573081b308f3f8f002a15810eba7c Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Thu, 25 Jul 2024 07:46:19 +0100 Subject: perf arm-spe: Extract evsel setting up The evsel for the Arm SPE PMU needs to be set up. Extract the setup code into a new function, arm_spe_setup_evsel(), as shown in the diff below.
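For reference, a hypothetical command line that exercises this path (the SPE PMU instance name, e.g. arm_spe_0, and the available format bits vary by system):

	$ perf record -e arm_spe_0/ts_enable=1,pa_enable=1/ -a -- sleep 1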
Signed-off-by: Leo Yan Cc: Mark Rutland Cc: Suzuki K Poulose Cc: Ian Rogers Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Namhyung Kim Cc: James Clark Cc: Mike Leach Cc: John Garry Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm64/util/arm-spe.c | 74 +++++++++++++++++++----------------- 1 file changed, 40 insertions(+), 34 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/arm64/util/arm-spe.c b/tools/perf/arch/arm64/util/arm-spe.c index 0b52e67edb3b..fe7942824113 100644 --- a/tools/perf/arch/arm64/util/arm-spe.c +++ b/tools/perf/arch/arm64/util/arm-spe.c @@ -132,6 +132,45 @@ static __u64 arm_spe_pmu__sample_period(const struct perf_pmu *arm_spe_pmu) return sample_period; } +static void arm_spe_setup_evsel(struct evsel *evsel, struct perf_cpu_map *cpus) +{ + u64 bit; + + evsel->core.attr.freq = 0; + evsel->core.attr.sample_period = arm_spe_pmu__sample_period(evsel->pmu); + evsel->needs_auxtrace_mmap = true; + + /* + * To obtain the auxtrace buffer file descriptor, the auxtrace event + * must come first. + */ + evlist__to_front(evsel->evlist, evsel); + + /* + * In the case of per-cpu mmaps, sample CPU for AUX event; + * also enable the timestamp tracing for samples correlation. + */ + if (!perf_cpu_map__is_any_cpu_or_is_empty(cpus)) { + evsel__set_sample_bit(evsel, CPU); + evsel__set_config_if_unset(evsel->pmu, evsel, "ts_enable", 1); + } + + /* + * Set this only so that perf report knows that SPE generates memory info. It has no effect + * on the opening of the event or the SPE data produced. + */ + evsel__set_sample_bit(evsel, DATA_SRC); + + /* + * The PHYS_ADDR flag does not affect the driver behaviour, it is used to + * inform that the resulting output's SPE samples contain physical addresses + * where applicable. + */ + bit = perf_pmu__format_bits(evsel->pmu, "pa_enable"); + if (evsel->core.attr.config & bit) + evsel__set_sample_bit(evsel, PHYS_ADDR); +} + static int arm_spe_recording_options(struct auxtrace_record *itr, struct evlist *evlist, struct record_opts *opts) @@ -144,7 +183,6 @@ static int arm_spe_recording_options(struct auxtrace_record *itr, bool privileged = perf_event_paranoid_check(-1); struct evsel *tracking_evsel; int err; - u64 bit; sper->evlist = evlist; @@ -154,9 +192,6 @@ static int arm_spe_recording_options(struct auxtrace_record *itr, pr_err("There may be only one " ARM_SPE_PMU_NAME "x event\n"); return -EINVAL; } - evsel->core.attr.freq = 0; - evsel->core.attr.sample_period = arm_spe_pmu__sample_period(arm_spe_pmu); - evsel->needs_auxtrace_mmap = true; arm_spe_evsel = evsel; opts->full_auxtrace = true; } @@ -222,36 +257,7 @@ static int arm_spe_recording_options(struct auxtrace_record *itr, pr_debug2("%sx snapshot size: %zu\n", ARM_SPE_PMU_NAME, opts->auxtrace_snapshot_size); - /* - * To obtain the auxtrace buffer file descriptor, the auxtrace event - * must come first. - */ - evlist__to_front(evlist, arm_spe_evsel); - - /* - * In the case of per-cpu mmaps, sample CPU for AUX event; - * also enable the timestamp tracing for samples correlation. - */ - if (!perf_cpu_map__is_any_cpu_or_is_empty(cpus)) { - evsel__set_sample_bit(arm_spe_evsel, CPU); - evsel__set_config_if_unset(arm_spe_pmu, arm_spe_evsel, - "ts_enable", 1); - } - - /* - * Set this only so that perf report knows that SPE generates memory info. It has no effect - * on the opening of the event or the SPE data produced.
- */ - evsel__set_sample_bit(arm_spe_evsel, DATA_SRC); - - /* - * The PHYS_ADDR flag does not affect the driver behaviour, it is used to - * inform that the resulting output's SPE samples contain physical addresses - * where applicable. - */ - bit = perf_pmu__format_bits(arm_spe_pmu, "pa_enable"); - if (arm_spe_evsel->core.attr.config & bit) - evsel__set_sample_bit(arm_spe_evsel, PHYS_ADDR); + arm_spe_setup_evsel(arm_spe_evsel, cpus); /* Add dummy event to keep tracking */ err = parse_event(evlist, "dummy:u"); -- cgit v1.2.3 From 1635bdca4b02bbe6529e769dd7d56a98960344fc Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Thu, 25 Jul 2024 07:46:20 +0100 Subject: perf arm-spe: Support multiple Arm SPE events As the 'auxtrace' flag is now set for Arm SPE events, evsel__is_aux_event() can be used to check whether an event is an AUX trace event. Use it to replace the old check, which only handled the first Arm SPE event. Signed-off-by: Leo Yan Cc: Mark Rutland Cc: Suzuki K Poulose Cc: Ian Rogers Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Namhyung Kim Cc: James Clark Cc: Mike Leach Cc: John Garry Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm64/util/arm-spe.c | 37 +++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/arm64/util/arm-spe.c b/tools/perf/arch/arm64/util/arm-spe.c index fe7942824113..d59f6ca499f2 100644 --- a/tools/perf/arch/arm64/util/arm-spe.c +++ b/tools/perf/arch/arm64/util/arm-spe.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -177,8 +178,7 @@ static int arm_spe_recording_options(struct auxtrace_record *itr, { struct arm_spe_recording *sper = container_of(itr, struct arm_spe_recording, itr); - struct perf_pmu *arm_spe_pmu = sper->arm_spe_pmu; - struct evsel *evsel, *arm_spe_evsel = NULL; + struct evsel *evsel, *tmp; struct perf_cpu_map *cpus = evlist->core.user_requested_cpus; bool privileged = perf_event_paranoid_check(-1); struct evsel *tracking_evsel; @@ -187,12 +187,12 @@ static int arm_spe_recording_options(struct auxtrace_record *itr, sper->evlist = evlist; evlist__for_each_entry(evlist, evsel) { - if (evsel->core.attr.type == arm_spe_pmu->type) { - if (arm_spe_evsel) { - pr_err("There may be only one " ARM_SPE_PMU_NAME "x event\n"); + if (evsel__is_aux_event(evsel)) { + if (!strstarts(evsel->pmu_name, ARM_SPE_PMU_NAME)) { + pr_err("Found unexpected auxtrace event: %s\n", + evsel->pmu_name); return -EINVAL; } - arm_spe_evsel = evsel; opts->full_auxtrace = true; } } @@ -257,7 +257,10 @@ static int arm_spe_recording_options(struct auxtrace_record *itr, pr_debug2("%sx snapshot size: %zu\n", ARM_SPE_PMU_NAME, opts->auxtrace_snapshot_size); - arm_spe_setup_evsel(arm_spe_evsel, cpus); + evlist__for_each_entry_safe(evlist, tmp, evsel) { + if (evsel__is_aux_event(evsel)) + arm_spe_setup_evsel(evsel, cpus); + } /* Add dummy event to keep tracking */ err = parse_event(evlist, "dummy:u"); @@ -307,12 +310,16 @@ static int arm_spe_snapshot_start(struct auxtrace_record *itr) struct arm_spe_recording *ptr = container_of(itr, struct arm_spe_recording, itr); struct evsel *evsel; + int ret = -EINVAL; evlist__for_each_entry(ptr->evlist, evsel) { - if (evsel->core.attr.type == ptr->arm_spe_pmu->type) - return evsel__disable(evsel); + if (evsel__is_aux_event(evsel)) { + ret = evsel__disable(evsel); + if (ret < 0) + return ret; + } } - return -EINVAL; + return ret; } static int arm_spe_snapshot_finish(struct auxtrace_record *itr)
@@ -320,12 +327,16 @@ static int arm_spe_snapshot_finish(struct auxtrace_record *itr) struct arm_spe_recording *ptr = container_of(itr, struct arm_spe_recording, itr); struct evsel *evsel; + int ret = -EINVAL; evlist__for_each_entry(ptr->evlist, evsel) { - if (evsel->core.attr.type == ptr->arm_spe_pmu->type) - return evsel__enable(evsel); + if (evsel__is_aux_event(evsel)) { + ret = evsel__enable(evsel); + if (ret < 0) + return ret; + } } - return -EINVAL; + return ret; } static int arm_spe_alloc_wrapped_array(struct arm_spe_recording *ptr, int idx) -- cgit v1.2.3 From d261f9ebcf424535fe04e720a1cfa023be409f52 Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Mon, 29 Jul 2024 16:11:02 -0700 Subject: libperf: Add gitignore Ignore files that are generated by libperf and libperf tests. Signed-off-by: Charlie Jenkins Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/lkml/20240729-libperf_gitignore-v1-1-1c70dd98edf9@rivosinc.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/.gitignore | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 tools/lib/perf/.gitignore (limited to 'tools') diff --git a/tools/lib/perf/.gitignore b/tools/lib/perf/.gitignore new file mode 100644 index 000000000000..0f5b4af63f62 --- /dev/null +++ b/tools/lib/perf/.gitignore @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only +libperf.pc +libperf.so.* +tests-shared +tests-static -- cgit v1.2.3 From 839b1832e68a5b7d8c81c9281296b03e5ba7c31a Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Tue, 30 Jul 2024 14:23:00 +0800 Subject: perf tools: Fix wrong message when running "make JOBS=1" There is only one job when running "make JOBS=1", so it should print "sequential build" rather than "parallel build". Before: $ cd tools/perf && make JOBS=1 BUILD: Doing 'make -j1' parallel build After: $ cd tools/perf && make JOBS=1 BUILD: Doing 'make -j1' sequential build Signed-off-by: Tiezhu Yang Tested-by: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/lkml/20240730062301.23244-2-yangtiezhu@loongson.cn Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/Makefile b/tools/perf/Makefile index 75f3f6e0a231..816d5d84816b 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -51,8 +51,14 @@ else override DEBUG = 0 endif +ifeq ($(JOBS),1) + BUILD_TYPE := sequential +else + BUILD_TYPE := parallel +endif + define print_msg - @printf ' BUILD: Doing '\''make \033[33m-j'$(JOBS)'\033[m'\'' parallel build\n' + @printf ' BUILD: Doing '\''make \033[33m-j'$(JOBS)'\033[m'\'' $(BUILD_TYPE) build\n' endef define make -- cgit v1.2.3 From b48543c451c30387b53ee6e202dda8d5303f6268 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Tue, 30 Jul 2024 14:23:01 +0800 Subject: perf list: Give clues if failed to open tracing events directory When executing the command "perf list", I hit "Error: failed to open tracing events directory" twice: the first time because there was no "/sys/kernel/tracing/events" directory (the kernel tracing infrastructure was not enabled with CONFIG_FTRACE), and the second time because the command was run without root privileges. Add the error string to tell the users what happened and what to do, and also call put_tracing_file() to free events_path a little later, after it has been printed in the error message.
At the same time, remove the redundant "/" from the file path built in get_tracing_file(), which otherwise shows up as something like "/sys/kernel/tracing//events". Before: $ ./perf list Error: failed to open tracing events directory After: (1) Without CONFIG_FTRACE $ ./perf list Error: failed to open tracing events directory /sys/kernel/tracing/events: No such file or directory (2) With CONFIG_FTRACE but no root privileges $ ./perf list Error: failed to open tracing events directory /sys/kernel/tracing/events: Permission denied Committer testing: Redirect stdout to null to quickly test the patch: Before: $ perf list > /dev/null Error: failed to open tracing events directory $ After: $ perf list > /dev/null Error: failed to open tracing events directory /sys/kernel/tracing/events: Permission denied $ Signed-off-by: Tiezhu Yang Tested-by: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/lkml/20240730062301.23244-3-yangtiezhu@loongson.cn Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/api/fs/tracing_path.c | 2 +- tools/perf/util/print-events.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/lib/api/fs/tracing_path.c b/tools/lib/api/fs/tracing_path.c index 30745f35d0d2..834fd64c7130 100644 --- a/tools/lib/api/fs/tracing_path.c +++ b/tools/lib/api/fs/tracing_path.c @@ -69,7 +69,7 @@ char *get_tracing_file(const char *name) { char *file; - if (asprintf(&file, "%s/%s", tracing_path_mount(), name) < 0) + if (asprintf(&file, "%s%s", tracing_path_mount(), name) < 0) return NULL; return file; diff --git a/tools/perf/util/print-events.c b/tools/perf/util/print-events.c index 3f38c27f0157..81e0135cddf0 100644 --- a/tools/perf/util/print-events.c +++ b/tools/perf/util/print-events.c @@ -68,11 +68,12 @@ void print_tracepoint_events(const struct print_callbacks *print_cb __maybe_unus struct dirent **sys_namelist = NULL; int sys_items; - put_tracing_file(events_path); if (events_fd < 0) { pr_err("Error: failed to open tracing events directory\n"); + pr_err("%s: %s\n", events_path, strerror(errno)); return; } + put_tracing_file(events_path); sys_items = tracing_events__scandir_alphasort(&sys_namelist); -- cgit v1.2.3 From 0f2c0400b560a3752fac9e583c1576a53a51cde9 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 30 Jul 2024 12:17:43 -0700 Subject: perf jevents: Use name for special find value (PMU_EVENTS__NOT_FOUND) -1000 was used as a special value, added in commit 3d5045492ab2 ("perf pmu-events: Add pmu_events_table__find_event()"), to show that one table lacked a PMU/event without terminating the search of the other tables. Add a new constant PMU_EVENTS__NOT_FOUND for this value and use it.
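For illustration, a hypothetical caller-side sketch of the renamed sentinel (names as in the patch below):

	int ret = pmu_events_table__find_event(table, pmu, "cycles", fn, data);

	if (ret == PMU_EVENTS__NOT_FOUND) {
		/* no searched table contained the event */
	} else if (ret) {
		/* fn() terminated the search with a non-zero value */
	}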
Reviewed-by: John Garry Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jing Zhang Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Oliver Sang Cc: Peter Zijlstra Cc: Philip Li Cc: Sandipan Das Cc: Weilin Wang Cc: Xu Yang Link: https://lore.kernel.org/r/20240730191744.3097329-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/jevents.py | 6 +++--- tools/perf/pmu-events/pmu-events.h | 9 +++++++++ 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/jevents.py b/tools/perf/pmu-events/jevents.py index ac9b7ca41856..731776e29f47 100755 --- a/tools/perf/pmu-events/jevents.py +++ b/tools/perf/pmu-events/jevents.py @@ -906,7 +906,7 @@ static int pmu_events_table__find_event_pmu(const struct pmu_events_table *table do_call: return fn ? fn(&pe, table, data) : 0; } - return -1000; + return PMU_EVENTS__NOT_FOUND; } int pmu_events_table__for_each_event(const struct pmu_events_table *table, @@ -944,10 +944,10 @@ int pmu_events_table__find_event(const struct pmu_events_table *table, continue; ret = pmu_events_table__find_event_pmu(table, table_pmu, name, fn, data); - if (ret != -1000) + if (ret != PMU_EVENTS__NOT_FOUND) return ret; } - return -1000; + return PMU_EVENTS__NOT_FOUND; } size_t pmu_events_table__num_events(const struct pmu_events_table *table, diff --git a/tools/perf/pmu-events/pmu-events.h b/tools/perf/pmu-events/pmu-events.h index f5aa96f1685c..5435ad92180c 100644 --- a/tools/perf/pmu-events/pmu-events.h +++ b/tools/perf/pmu-events/pmu-events.h @@ -70,6 +70,8 @@ struct pmu_metric { struct pmu_events_table; struct pmu_metrics_table; +#define PMU_EVENTS__NOT_FOUND -1000 + typedef int (*pmu_event_iter_fn)(const struct pmu_event *pe, const struct pmu_events_table *table, void *data); @@ -82,6 +84,13 @@ int pmu_events_table__for_each_event(const struct pmu_events_table *table, struct perf_pmu *pmu, pmu_event_iter_fn fn, void *data); +/* + * Search for table and entry matching with pmu__name_match. Each matching event + * has fn called on it. 0 implies to success/continue the search while non-zero + * means to terminate. The special value PMU_EVENTS__NOT_FOUND is used to + * indicate no event was found in one of the tables which doesn't terminate the + * search of all tables. + */ int pmu_events_table__find_event(const struct pmu_events_table *table, struct perf_pmu *pmu, const char *name, -- cgit v1.2.3 From edb08cdd10b57d30631492e534f5bc13a78e739b Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 3 Jul 2024 15:30:28 -0700 Subject: perf bpf-filter: Make filters map a single entry hashmap And the value is now an array. This is to support multiple filter entries in the map later. No functional changes intended. 
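As a rough user-space sketch (identifiers taken from the patch below; not verbatim code), the whole filter array is now written as a single value at key 0:

	int key = 0;
	struct perf_bpf_filter_entry *entry = calloc(MAX_FILTERS, sizeof(*entry));

	if (entry == NULL)
		return -1;

	entry[0].op = PBF_OP_DONE;	/* empty filter list: accept every sample */
	bpf_map_update_elem(bpf_map__fd(skel->maps.filters), &key, entry, BPF_ANY);
	free(entry);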
Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: KP Singh Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Cc: Stephane Eranian Cc: bpf@vger.kernel.org Link: https://lore.kernel.org/r/20240703223035.2024586-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf-filter.c | 81 +++++++++++++++++++--------- tools/perf/util/bpf_skel/sample-filter.h | 1 + tools/perf/util/bpf_skel/sample_filter.bpf.c | 39 +++++++------- 3 files changed, 78 insertions(+), 43 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf-filter.c b/tools/perf/util/bpf-filter.c index 04f98b6bb291..2510832d83f9 100644 --- a/tools/perf/util/bpf-filter.c +++ b/tools/perf/util/bpf-filter.c @@ -93,71 +93,102 @@ static int check_sample_flags(struct evsel *evsel, struct perf_bpf_filter_expr * int perf_bpf_filter__prepare(struct evsel *evsel) { - int i, x, y, fd; + int i, x, y, fd, ret; struct sample_filter_bpf *skel; struct bpf_program *prog; struct bpf_link *link; struct perf_bpf_filter_expr *expr; + struct perf_bpf_filter_entry *entry; + + entry = calloc(MAX_FILTERS, sizeof(*entry)); + if (entry == NULL) + return -1; skel = sample_filter_bpf__open_and_load(); if (!skel) { pr_err("Failed to load perf sample-filter BPF skeleton\n"); - return -1; + ret = -EPERM; + goto err; } i = 0; fd = bpf_map__fd(skel->maps.filters); list_for_each_entry(expr, &evsel->bpf_filters, list) { - struct perf_bpf_filter_entry entry = { - .op = expr->op, - .part = expr->part, - .term = expr->term, - .value = expr->val, - }; + if (check_sample_flags(evsel, expr) < 0) { + ret = -EINVAL; + goto err; + } - if (check_sample_flags(evsel, expr) < 0) - return -1; + if (i == MAX_FILTERS) { + ret = -E2BIG; + goto err; + } - bpf_map_update_elem(fd, &i, &entry, BPF_ANY); + entry[i].op = expr->op; + entry[i].part = expr->part; + entry[i].term = expr->term; + entry[i].value = expr->val; i++; if (expr->op == PBF_OP_GROUP_BEGIN) { struct perf_bpf_filter_expr *group; list_for_each_entry(group, &expr->groups, list) { - struct perf_bpf_filter_entry group_entry = { - .op = group->op, - .part = group->part, - .term = group->term, - .value = group->val, - }; - bpf_map_update_elem(fd, &i, &group_entry, BPF_ANY); + if (i == MAX_FILTERS) { + ret = -E2BIG; + goto err; + } + + entry[i].op = group->op; + entry[i].part = group->part; + entry[i].term = group->term; + entry[i].value = group->val; i++; } - memset(&entry, 0, sizeof(entry)); - entry.op = PBF_OP_GROUP_END; - bpf_map_update_elem(fd, &i, &entry, BPF_ANY); + if (i == MAX_FILTERS) { + ret = -E2BIG; + goto err; + } + + entry[i].op = PBF_OP_GROUP_END; i++; } } - if (i > MAX_FILTERS) { - pr_err("Too many filters: %d (max = %d)\n", i, MAX_FILTERS); - return -1; + if (i < MAX_FILTERS) { + /* to terminate the loop early */ + entry[i].op = PBF_OP_DONE; + i++; + } + + /* The filters map has only one entry for now */ + i = 0; + if (bpf_map_update_elem(fd, &i, entry, BPF_ANY) < 0) { + ret = -errno; + pr_err("Failed to update the filter map\n"); + goto err; } + prog = skel->progs.perf_sample_filter; for (x = 0; x < xyarray__max_x(evsel->core.fd); x++) { for (y = 0; y < xyarray__max_y(evsel->core.fd); y++) { link = bpf_program__attach_perf_event(prog, FD(evsel, x, y)); if (IS_ERR(link)) { pr_err("Failed to attach perf sample-filter program\n"); - return PTR_ERR(link); + ret = PTR_ERR(link); + goto err; } } } + free(entry); evsel->bpf_skel = skel; return 0; + +err: + free(entry); + sample_filter_bpf__destroy(skel); + return ret; } int 
perf_bpf_filter__destroy(struct evsel *evsel) diff --git a/tools/perf/util/bpf_skel/sample-filter.h b/tools/perf/util/bpf_skel/sample-filter.h index 350efa121026..bb6a1b91f1df 100644 --- a/tools/perf/util/bpf_skel/sample-filter.h +++ b/tools/perf/util/bpf_skel/sample-filter.h @@ -14,6 +14,7 @@ enum perf_bpf_filter_op { PBF_OP_AND, PBF_OP_GROUP_BEGIN, PBF_OP_GROUP_END, + PBF_OP_DONE, }; enum perf_bpf_filter_term { diff --git a/tools/perf/util/bpf_skel/sample_filter.bpf.c b/tools/perf/util/bpf_skel/sample_filter.bpf.c index f59985101973..0d56e52b922c 100644 --- a/tools/perf/util/bpf_skel/sample_filter.bpf.c +++ b/tools/perf/util/bpf_skel/sample_filter.bpf.c @@ -9,10 +9,10 @@ /* BPF map that will be filled by user space */ struct filters { - __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(type, BPF_MAP_TYPE_HASH); __type(key, int); - __type(value, struct perf_bpf_filter_entry); - __uint(max_entries, MAX_FILTERS); + __type(value, struct perf_bpf_filter_entry[MAX_FILTERS]); + __uint(max_entries, 1); } filters SEC(".maps"); int dropped; @@ -179,39 +179,39 @@ int perf_sample_filter(void *ctx) __u64 sample_data; int in_group = 0; int group_result = 0; - int i; + int i, k; kctx = bpf_cast_to_kern_ctx(ctx); - for (i = 0; i < MAX_FILTERS; i++) { - int key = i; /* needed for verifier :( */ + k = 0; + entry = bpf_map_lookup_elem(&filters, &k); + if (entry == NULL) + goto drop; - entry = bpf_map_lookup_elem(&filters, &key); - if (entry == NULL) - break; - sample_data = perf_get_sample(kctx, entry); + for (i = 0; i < MAX_FILTERS; i++) { + sample_data = perf_get_sample(kctx, &entry[i]); - switch (entry->op) { + switch (entry[i].op) { case PBF_OP_EQ: - CHECK_RESULT(sample_data, ==, entry->value) + CHECK_RESULT(sample_data, ==, entry[i].value) break; case PBF_OP_NEQ: - CHECK_RESULT(sample_data, !=, entry->value) + CHECK_RESULT(sample_data, !=, entry[i].value) break; case PBF_OP_GT: - CHECK_RESULT(sample_data, >, entry->value) + CHECK_RESULT(sample_data, >, entry[i].value) break; case PBF_OP_GE: - CHECK_RESULT(sample_data, >=, entry->value) + CHECK_RESULT(sample_data, >=, entry[i].value) break; case PBF_OP_LT: - CHECK_RESULT(sample_data, <, entry->value) + CHECK_RESULT(sample_data, <, entry[i].value) break; case PBF_OP_LE: - CHECK_RESULT(sample_data, <=, entry->value) + CHECK_RESULT(sample_data, <=, entry[i].value) break; case PBF_OP_AND: - CHECK_RESULT(sample_data, &, entry->value) + CHECK_RESULT(sample_data, &, entry[i].value) break; case PBF_OP_GROUP_BEGIN: in_group = 1; @@ -222,6 +222,9 @@ int perf_sample_filter(void *ctx) goto drop; in_group = 0; break; + case PBF_OP_DONE: + /* no failures so far, accept it */ + return 1; } } /* generate sample data */ -- cgit v1.2.3 From 966854e72f6e8a259609ea3c7fd78215e6606c7b Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 3 Jul 2024 15:30:29 -0700 Subject: perf bpf-filter: Pass 'target' to perf_bpf_filter__prepare() This is needed to prepare target-specific actions in the later patch. We want to reuse the pinned BPF program and map for regular users to profile their own processes. 
Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: KP Singh Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Cc: Stephane Eranian Cc: bpf@vger.kernel.org Link: https://lore.kernel.org/r/20240703223035.2024586-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 2 +- tools/perf/builtin-stat.c | 2 +- tools/perf/builtin-top.c | 2 +- tools/perf/builtin-trace.c | 2 +- tools/perf/util/bpf-filter.c | 2 +- tools/perf/util/bpf-filter.h | 6 ++++-- tools/perf/util/evlist.c | 5 +++-- tools/perf/util/evlist.h | 4 +++- 8 files changed, 15 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index a94516e8c522..c599e620ee89 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1389,7 +1389,7 @@ try_again: "even with a suitable vmlinux or kallsyms file.\n\n"); } - if (evlist__apply_filters(evlist, &pos)) { + if (evlist__apply_filters(evlist, &pos, &opts->target)) { pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n", pos->filter ?: "BPF", evsel__name(pos), errno, str_error_r(errno, msg, sizeof(msg))); diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 661832756a24..1f92445f7480 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -833,7 +833,7 @@ try_again_reset: return -1; } - if (evlist__apply_filters(evsel_list, &counter)) { + if (evlist__apply_filters(evsel_list, &counter, &target)) { pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n", counter->filter, evsel__name(counter), errno, str_error_r(errno, msg, sizeof(msg))); diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index e8cbbf10d361..d1a06a88d693 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1055,7 +1055,7 @@ try_again: } } - if (evlist__apply_filters(evlist, &counter)) { + if (evlist__apply_filters(evlist, &counter, &opts->target)) { pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n", counter->filter ?: "BPF", evsel__name(counter), errno, str_error_r(errno, msg, sizeof(msg))); diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 488c2cedc110..ef951ce1a0dd 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -4135,7 +4135,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv) err = trace__expand_filters(trace, &evsel); if (err) goto out_delete_evlist; - err = evlist__apply_filters(evlist, &evsel); + err = evlist__apply_filters(evlist, &evsel, &trace->opts.target); if (err < 0) goto out_error_apply_filters; diff --git a/tools/perf/util/bpf-filter.c b/tools/perf/util/bpf-filter.c index 2510832d83f9..0b2eca56aa10 100644 --- a/tools/perf/util/bpf-filter.c +++ b/tools/perf/util/bpf-filter.c @@ -91,7 +91,7 @@ static int check_sample_flags(struct evsel *evsel, struct perf_bpf_filter_expr * return -1; } -int perf_bpf_filter__prepare(struct evsel *evsel) +int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target __maybe_unused) { int i, x, y, fd, ret; struct sample_filter_bpf *skel; diff --git a/tools/perf/util/bpf-filter.h b/tools/perf/util/bpf-filter.h index cd6764442c16..605a3d0226e0 100644 --- a/tools/perf/util/bpf-filter.h +++ b/tools/perf/util/bpf-filter.h @@ -16,6 +16,7 @@ struct perf_bpf_filter_expr { }; struct evsel; +struct target; #ifdef HAVE_BPF_SKEL struct perf_bpf_filter_expr *perf_bpf_filter_expr__new(enum perf_bpf_filter_term term, @@ -23,7 +24,7 @@ struct 
perf_bpf_filter_expr *perf_bpf_filter_expr__new(enum perf_bpf_filter_term enum perf_bpf_filter_op op, unsigned long val); int perf_bpf_filter__parse(struct list_head *expr_head, const char *str); -int perf_bpf_filter__prepare(struct evsel *evsel); +int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target); int perf_bpf_filter__destroy(struct evsel *evsel); u64 perf_bpf_filter__lost_count(struct evsel *evsel); @@ -34,7 +35,8 @@ static inline int perf_bpf_filter__parse(struct list_head *expr_head __maybe_unu { return -EOPNOTSUPP; } -static inline int perf_bpf_filter__prepare(struct evsel *evsel __maybe_unused) +static inline int perf_bpf_filter__prepare(struct evsel *evsel __maybe_unused, + struct target *target __maybe_unused) { return -EOPNOTSUPP; } diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 3a719edafc7a..1417f9a23083 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1086,7 +1086,8 @@ out_delete_threads: return -1; } -int evlist__apply_filters(struct evlist *evlist, struct evsel **err_evsel) +int evlist__apply_filters(struct evlist *evlist, struct evsel **err_evsel, + struct target *target) { struct evsel *evsel; int err = 0; @@ -1108,7 +1109,7 @@ int evlist__apply_filters(struct evlist *evlist, struct evsel **err_evsel) * non-tracepoint events can have BPF filters. */ if (!list_empty(&evsel->bpf_filters)) { - err = perf_bpf_filter__prepare(evsel); + err = perf_bpf_filter__prepare(evsel, target); if (err) { *err_evsel = evsel; break; diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index cb91dc9117a2..cccc34da5a02 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -20,6 +20,7 @@ struct pollfd; struct thread_map; struct perf_cpu_map; struct record_opts; +struct target; /* * State machine of bkw_mmap_state: @@ -212,7 +213,8 @@ void evlist__enable_non_dummy(struct evlist *evlist); void evlist__set_selected(struct evlist *evlist, struct evsel *evsel); int evlist__create_maps(struct evlist *evlist, struct target *target); -int evlist__apply_filters(struct evlist *evlist, struct evsel **err_evsel); +int evlist__apply_filters(struct evlist *evlist, struct evsel **err_evsel, + struct target *target); u64 __evlist__combined_sample_type(struct evlist *evlist); u64 evlist__combined_sample_type(struct evlist *evlist); -- cgit v1.2.3 From eb1693b1150d4b99af5241242d6bcd6290cf68a0 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 3 Jul 2024 15:30:30 -0700 Subject: perf bpf-filter: Split per-task filter use case If the target is a list of tasks, it can use a shared hash map for the filter expressions. The key of the filter map is an integer index, used like an array index. A separate pid_hash map is added to look up the index for the filter map by tgid. System-wide mode, including per-cpu and per-user targets, is handled by the single-entry map as before.
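Since pid_hash is keyed by tgid rather than tid, the BPF program derives the key from bpf_get_current_pid_tgid(), which packs the thread group id in the upper 32 bits and the thread id in the lower 32. A hedged sketch of that lookup (the map layout follows the patch; the wrapper function is illustrative only):

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	/* tgid to filter index */
	struct {
		__uint(type, BPF_MAP_TYPE_HASH);
		__type(key, int);
		__type(value, int);
		__uint(max_entries, 1);	/* resized from user space before load */
	} pid_hash SEC(".maps");

	/* Resolve the calling process' index into the shared filters map,
	 * or -1 if no filter was registered for its tgid.
	 */
	static __always_inline int lookup_filter_idx(void)
	{
		int tgid = bpf_get_current_pid_tgid() >> 32;
		int *idx = bpf_map_lookup_elem(&pid_hash, &tgid);

		return idx ? *idx : -1;
	}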
Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: KP Singh Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Cc: Stephane Eranian Cc: bpf@vger.kernel.org Link: https://lore.kernel.org/r/20240703223035.2024586-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf-filter.c | 186 +++++++++++++++++++++------ tools/perf/util/bpf_skel/sample-filter.h | 1 + tools/perf/util/bpf_skel/sample_filter.bpf.c | 21 +++ 3 files changed, 168 insertions(+), 40 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf-filter.c b/tools/perf/util/bpf-filter.c index 0b2eca56aa10..5ec0e0955ec4 100644 --- a/tools/perf/util/bpf-filter.c +++ b/tools/perf/util/bpf-filter.c @@ -3,10 +3,13 @@ #include #include +#include #include +#include #include "util/debug.h" #include "util/evsel.h" +#include "util/target.h" #include "util/bpf-filter.h" #include @@ -91,38 +94,17 @@ static int check_sample_flags(struct evsel *evsel, struct perf_bpf_filter_expr * return -1; } -int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target __maybe_unused) +static int get_filter_entries(struct evsel *evsel, struct perf_bpf_filter_entry *entry) { - int i, x, y, fd, ret; - struct sample_filter_bpf *skel; - struct bpf_program *prog; - struct bpf_link *link; + int i = 0; struct perf_bpf_filter_expr *expr; - struct perf_bpf_filter_entry *entry; - - entry = calloc(MAX_FILTERS, sizeof(*entry)); - if (entry == NULL) - return -1; - - skel = sample_filter_bpf__open_and_load(); - if (!skel) { - pr_err("Failed to load perf sample-filter BPF skeleton\n"); - ret = -EPERM; - goto err; - } - i = 0; - fd = bpf_map__fd(skel->maps.filters); list_for_each_entry(expr, &evsel->bpf_filters, list) { - if (check_sample_flags(evsel, expr) < 0) { - ret = -EINVAL; - goto err; - } + if (check_sample_flags(evsel, expr) < 0) + return -EINVAL; - if (i == MAX_FILTERS) { - ret = -E2BIG; - goto err; - } + if (i == MAX_FILTERS) + return -E2BIG; entry[i].op = expr->op; entry[i].part = expr->part; @@ -134,10 +116,8 @@ int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target __maybe_ struct perf_bpf_filter_expr *group; list_for_each_entry(group, &expr->groups, list) { - if (i == MAX_FILTERS) { - ret = -E2BIG; - goto err; - } + if (i == MAX_FILTERS) + return -E2BIG; entry[i].op = group->op; entry[i].part = group->part; @@ -146,10 +126,8 @@ int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target __maybe_ i++; } - if (i == MAX_FILTERS) { - ret = -E2BIG; - goto err; - } + if (i == MAX_FILTERS) + return -E2BIG; entry[i].op = PBF_OP_GROUP_END; i++; @@ -161,15 +139,143 @@ int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target __maybe_ entry[i].op = PBF_OP_DONE; i++; } + return 0; +} + +static int convert_to_tgid(int tid) +{ + char path[128]; + char *buf, *p, *q; + int tgid; + size_t len; + + scnprintf(path, sizeof(path), "%d/status", tid); + if (procfs__read_str(path, &buf, &len) < 0) + return -1; - /* The filters map has only one entry for now */ - i = 0; - if (bpf_map_update_elem(fd, &i, entry, BPF_ANY) < 0) { - ret = -errno; - pr_err("Failed to update the filter map\n"); + p = strstr(buf, "Tgid:"); + if (p == NULL) { + free(buf); + return -1; + } + + tgid = strtol(p + 6, &q, 0); + free(buf); + if (*q != '\n') + return -1; + + return tgid; +} + +static int update_pid_hash(struct sample_filter_bpf *skel, struct evsel *evsel, + struct perf_bpf_filter_entry *entry) +{ + int filter_idx; + int nr, last; + int fd = bpf_map__fd(skel->maps.filters); + 
struct perf_thread_map *threads; + + /* Find the first available entry in the filters map */ + for (filter_idx = 0; filter_idx < MAX_FILTERS; filter_idx++) { + if (bpf_map_update_elem(fd, &filter_idx, entry, BPF_NOEXIST) == 0) + break; + } + + if (filter_idx == MAX_FILTERS) { + pr_err("Too many users for the filter map\n"); + return -EBUSY; + } + + threads = perf_evsel__threads(&evsel->core); + if (threads == NULL) { + pr_err("Cannot get the thread list of the event\n"); + return -EINVAL; + } + + /* save the index to a hash map */ + fd = bpf_map__fd(skel->maps.pid_hash); + + last = -1; + nr = perf_thread_map__nr(threads); + for (int i = 0; i < nr; i++) { + int pid = perf_thread_map__pid(threads, i); + int tgid; + + /* it actually needs tgid, let's get tgid from /proc. */ + tgid = convert_to_tgid(pid); + if (tgid < 0) { + /* the thread may be dead, ignore. */ + continue; + } + + if (tgid == last) + continue; + last = tgid; + + if (bpf_map_update_elem(fd, &tgid, &filter_idx, BPF_ANY) < 0) { + pr_err("Failed to update the pid hash\n"); + return -errno; + } + pr_debug("pid hash: %d -> %d\n", tgid, filter_idx); + } + return 0; +} + +int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target) +{ + int i, x, y, fd, ret; + struct sample_filter_bpf *skel = NULL; + struct bpf_program *prog; + struct bpf_link *link; + struct perf_bpf_filter_entry *entry; + bool needs_pid_hash = !target__has_cpu(target) && !target->uid_str; + + entry = calloc(MAX_FILTERS, sizeof(*entry)); + if (entry == NULL) + return -1; + + ret = get_filter_entries(evsel, entry); + if (ret < 0) { + pr_err("Failed to process filter entries\n"); + goto err; + } + + skel = sample_filter_bpf__open(); + if (!skel) { + pr_err("Failed to open perf sample-filter BPF skeleton\n"); + ret = -EPERM; goto err; } + if (needs_pid_hash) { + bpf_map__set_max_entries(skel->maps.filters, MAX_FILTERS); + bpf_map__set_max_entries(skel->maps.pid_hash, MAX_PIDS); + skel->rodata->use_pid_hash = 1; + } + + if (sample_filter_bpf__load(skel) < 0) { + pr_err("Failed to load perf sample-filter BPF skeleton\n"); + ret = -EPERM; + goto err; + } + + if (needs_pid_hash) { + /* The filters map is shared among other processes */ + ret = update_pid_hash(skel, evsel, entry); + if (ret < 0) + goto err; + } else { + i = 0; + fd = bpf_map__fd(skel->maps.filters); + + /* The filters map has only one entry in this case */ + if (bpf_map_update_elem(fd, &i, entry, BPF_ANY) < 0) { + ret = -errno; + pr_err("Failed to update the filter map\n"); + goto err; + } + } + prog = skel->progs.perf_sample_filter; for (x = 0; x < xyarray__max_x(evsel->core.fd); x++) { for (y = 0; y < xyarray__max_y(evsel->core.fd); y++) { diff --git a/tools/perf/util/bpf_skel/sample-filter.h b/tools/perf/util/bpf_skel/sample-filter.h index bb6a1b91f1df..e666bfd5fbdd 100644 --- a/tools/perf/util/bpf_skel/sample-filter.h +++ b/tools/perf/util/bpf_skel/sample-filter.h @@ -2,6 +2,7 @@ #define PERF_UTIL_BPF_SKEL_SAMPLE_FILTER_H #define MAX_FILTERS 64 +#define MAX_PIDS (16 * 1024) /* supported filter operations */ enum perf_bpf_filter_op { diff --git a/tools/perf/util/bpf_skel/sample_filter.bpf.c b/tools/perf/util/bpf_skel/sample_filter.bpf.c index 0d56e52b922c..c5273f06fa45 100644 --- a/tools/perf/util/bpf_skel/sample_filter.bpf.c +++ b/tools/perf/util/bpf_skel/sample_filter.bpf.c @@ -15,7 +15,16 @@ struct filters { __uint(max_entries, 1); } filters SEC(".maps"); +/* tgid to filter index */ +struct pid_hash { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, int); + __type(value, int); + 
__uint(max_entries, 1); +} pid_hash SEC(".maps"); + int dropped; +volatile const int use_pid_hash; void *bpf_cast_to_kern_ctx(void *) __ksym; @@ -184,6 +193,18 @@ int perf_sample_filter(void *ctx) kctx = bpf_cast_to_kern_ctx(ctx); k = 0; + + if (use_pid_hash) { + int tgid = bpf_get_current_pid_tgid() >> 32; + int *idx; + + idx = bpf_map_lookup_elem(&pid_hash, &tgid); + if (idx) + k = *idx; + else + goto drop; + } + entry = bpf_map_lookup_elem(&filters, &k); if (entry == NULL) goto drop; -- cgit v1.2.3 From 0715f65e94377016e8e12dae6bb6a6d24644c54d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 3 Jul 2024 15:30:31 -0700 Subject: perf bpf-filter: Support pin/unpin BPF object And use the pinned objects for unprivileged users to profile their own tasks. The BPF objects need to be pinned in the BPF-fs by root first and it'll be handled in the later patch. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: KP Singh Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Cc: Stephane Eranian Cc: bpf@vger.kernel.org Link: https://lore.kernel.org/r/20240703223035.2024586-5-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf-filter.c | 230 ++++++++++++++++++++++++++++++++++++------- tools/perf/util/bpf-filter.h | 13 +++ 2 files changed, 209 insertions(+), 34 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf-filter.c b/tools/perf/util/bpf-filter.c index 5ec0e0955ec4..37ed6c48debf 100644 --- a/tools/perf/util/bpf-filter.c +++ b/tools/perf/util/bpf-filter.c @@ -1,5 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include +#include +#include +#include #include #include @@ -23,6 +26,9 @@ #define __PERF_SAMPLE_TYPE(tt, st, opt) { tt, #st, opt } #define PERF_SAMPLE_TYPE(_st, opt) __PERF_SAMPLE_TYPE(PBF_TERM_##_st, PERF_SAMPLE_##_st, opt) +/* Index in the pinned 'filters' map. Should be released after use. 
*/ +static int pinned_filter_idx = -1; + static const struct perf_sample_info { enum perf_bpf_filter_term type; const char *name; @@ -47,6 +53,8 @@ static const struct perf_sample_info { PERF_SAMPLE_TYPE(DATA_PAGE_SIZE, "--data-page-size"), }; +static int get_pinned_fd(const char *name); + static const struct perf_sample_info *get_sample_info(enum perf_bpf_filter_term type) { size_t i; @@ -167,19 +175,26 @@ static int convert_to_tgid(int tid) return tgid; } -static int update_pid_hash(struct sample_filter_bpf *skel, struct evsel *evsel, - struct perf_bpf_filter_entry *entry) +static int update_pid_hash(struct evsel *evsel, struct perf_bpf_filter_entry *entry) { int filter_idx; - int nr, last; - int fd = bpf_map__fd(skel->maps.filters); + int fd, nr, last; struct perf_thread_map *threads; + fd = get_pinned_fd("filters"); + if (fd < 0) { + pr_debug("cannot get fd for 'filters' map\n"); + return fd; + } + /* Find the first available entry in the filters map */ for (filter_idx = 0; filter_idx < MAX_FILTERS; filter_idx++) { - if (bpf_map_update_elem(fd, &filter_idx, entry, BPF_NOEXIST) == 0) + if (bpf_map_update_elem(fd, &filter_idx, entry, BPF_NOEXIST) == 0) { + pinned_filter_idx = filter_idx; break; + } } + close(fd); if (filter_idx == MAX_FILTERS) { pr_err("Too many users for the filter map\n"); @@ -193,7 +208,9 @@ static int update_pid_hash(struct sample_filter_bpf *skel, struct evsel *evsel, } /* save the index to a hash map */ - fd = bpf_map__fd(skel->maps.pid_hash); + fd = get_pinned_fd("pid_hash"); + if (fd < 0) + return fd; last = -1; nr = perf_thread_map__nr(threads); @@ -214,10 +231,12 @@ static int update_pid_hash(struct sample_filter_bpf *skel, struct evsel *evsel, if (bpf_map_update_elem(fd, &tgid, &filter_idx, BPF_ANY) < 0) { pr_err("Failed to update the pid hash\n"); - return -errno; + close(fd); + return -1; } pr_debug("pid hash: %d -> %d\n", tgid, filter_idx); } + close(fd); return 0; } @@ -240,40 +259,48 @@ int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target) goto err; } - skel = sample_filter_bpf__open(); - if (!skel) { - pr_err("Failed to open perf sample-filter BPF skeleton\n"); - ret = -EPERM; - goto err; - } + if (needs_pid_hash && geteuid() != 0) { + /* The filters map is shared among other processes */ + ret = update_pid_hash(evsel, entry); + if (ret < 0) + goto err; - if (needs_pid_hash) { - bpf_map__set_max_entries(skel->maps.filters, MAX_FILTERS); - bpf_map__set_max_entries(skel->maps.pid_hash, MAX_PIDS); - skel->rodata->use_pid_hash = 1; + fd = get_pinned_fd("perf_sample_filter"); + if (fd < 0) { + ret = fd; + goto err; + } + + for (x = 0; x < xyarray__max_x(evsel->core.fd); x++) { + for (y = 0; y < xyarray__max_y(evsel->core.fd); y++) { + ret = ioctl(FD(evsel, x, y), PERF_EVENT_IOC_SET_BPF, fd); + if (ret < 0) { + pr_err("Failed to attach perf sample-filter\n"); + goto err; + } + } + } + + close(fd); + free(entry); + return 0; } - if (sample_filter_bpf__load(skel) < 0) { + skel = sample_filter_bpf__open_and_load(); + if (!skel) { + ret = -errno; pr_err("Failed to load perf sample-filter BPF skeleton\n"); - ret = -EPERM; goto err; } - if (needs_pid_hash) { - /* The filters map is shared among other processes */ - ret = update_pid_hash(skel, evsel, entry); - if (ret < 0) - goto err; - } else { - i = 0; - fd = bpf_map__fd(skel->maps.filters); - - /* The filters map has only one entry in this case */ - if (bpf_map_update_elem(fd, &i, entry, BPF_ANY) < 0) { - ret = -errno; - pr_err("Failed to update the filter map\n"); - goto err; - } + i = 0; + fd 
= bpf_map__fd(skel->maps.filters); + + /* The filters map has only one entry in this case */ + if (bpf_map_update_elem(fd, &i, entry, BPF_ANY) < 0) { + ret = -errno; + pr_err("Failed to update the filter map\n"); + goto err; } prog = skel->progs.perf_sample_filter; @@ -306,6 +333,15 @@ int perf_bpf_filter__destroy(struct evsel *evsel) free(expr); } sample_filter_bpf__destroy(evsel->bpf_skel); + + if (pinned_filter_idx >= 0) { + int fd = get_pinned_fd("filters"); + + bpf_map_delete_elem(fd, &pinned_filter_idx); + pinned_filter_idx = -1; + close(fd); + } + return 0; } @@ -349,3 +385,129 @@ int perf_bpf_filter__parse(struct list_head *expr_head, const char *str) return ret; } + +int perf_bpf_filter__pin(void) +{ + struct sample_filter_bpf *skel; + char *path = NULL; + int dir_fd, ret = -1; + + skel = sample_filter_bpf__open(); + if (!skel) { + ret = -errno; + pr_err("Failed to open perf sample-filter BPF skeleton\n"); + goto err; + } + + /* pinned program will use pid-hash */ + bpf_map__set_max_entries(skel->maps.filters, MAX_FILTERS); + bpf_map__set_max_entries(skel->maps.pid_hash, MAX_PIDS); + skel->rodata->use_pid_hash = 1; + + if (sample_filter_bpf__load(skel) < 0) { + ret = -errno; + pr_err("Failed to load perf sample-filter BPF skeleton\n"); + goto err; + } + + if (asprintf(&path, "%s/fs/bpf/%s", sysfs__mountpoint(), + PERF_BPF_FILTER_PIN_PATH) < 0) { + ret = -errno; + pr_err("Failed to allocate pathname in the BPF-fs\n"); + goto err; + } + + ret = bpf_object__pin(skel->obj, path); + if (ret < 0) { + pr_err("Failed to pin BPF filter objects\n"); + goto err; + } + + /* setup access permissions for the pinned objects */ + dir_fd = open(path, O_PATH); + if (dir_fd < 0) { + bpf_object__unpin(skel->obj, path); + ret = dir_fd; + goto err; + } + + /* BPF-fs root has the sticky bit */ + if (fchmodat(dir_fd, "..", 01755, 0) < 0) { + pr_debug("chmod for BPF-fs failed\n"); + ret = -errno; + goto err_close; + } + + /* perf_filter directory */ + if (fchmodat(dir_fd, ".", 0755, 0) < 0) { + pr_debug("chmod for perf_filter directory failed?\n"); + ret = -errno; + goto err_close; + } + + /* programs need write permission for some reason */ + if (fchmodat(dir_fd, "perf_sample_filter", 0777, 0) < 0) { + pr_debug("chmod for perf_sample_filter failed\n"); + ret = -errno; + } + /* maps */ + if (fchmodat(dir_fd, "filters", 0666, 0) < 0) { + pr_debug("chmod for filters failed\n"); + ret = -errno; + } + if (fchmodat(dir_fd, "pid_hash", 0666, 0) < 0) { + pr_debug("chmod for pid_hash failed\n"); + ret = -errno; + } + +err_close: + close(dir_fd); + +err: + free(path); + sample_filter_bpf__destroy(skel); + return ret; +} + +int perf_bpf_filter__unpin(void) +{ + struct sample_filter_bpf *skel; + char *path = NULL; + int ret = -1; + + skel = sample_filter_bpf__open_and_load(); + if (!skel) { + ret = -errno; + pr_err("Failed to open perf sample-filter BPF skeleton\n"); + goto err; + } + + if (asprintf(&path, "%s/fs/bpf/%s", sysfs__mountpoint(), + PERF_BPF_FILTER_PIN_PATH) < 0) { + ret = -errno; + pr_err("Failed to allocate pathname in the BPF-fs\n"); + goto err; + } + + ret = bpf_object__unpin(skel->obj, path); + +err: + free(path); + sample_filter_bpf__destroy(skel); + return ret; +} + +static int get_pinned_fd(const char *name) +{ + char *path = NULL; + int fd; + + if (asprintf(&path, "%s/fs/bpf/%s/%s", sysfs__mountpoint(), + PERF_BPF_FILTER_PIN_PATH, name) < 0) + return -1; + + fd = bpf_obj_get(path); + + free(path); + return fd; +} diff --git a/tools/perf/util/bpf-filter.h b/tools/perf/util/bpf-filter.h index 
605a3d0226e0..916ed7770b73 100644 --- a/tools/perf/util/bpf-filter.h +++ b/tools/perf/util/bpf-filter.h @@ -18,6 +18,9 @@ struct perf_bpf_filter_expr { struct evsel; struct target; +/* path in BPF-fs for the pinned program and maps */ +#define PERF_BPF_FILTER_PIN_PATH "perf_filter" + #ifdef HAVE_BPF_SKEL struct perf_bpf_filter_expr *perf_bpf_filter_expr__new(enum perf_bpf_filter_term term, int part, @@ -27,6 +30,8 @@ int perf_bpf_filter__parse(struct list_head *expr_head, const char *str); int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target); int perf_bpf_filter__destroy(struct evsel *evsel); u64 perf_bpf_filter__lost_count(struct evsel *evsel); +int perf_bpf_filter__pin(void); +int perf_bpf_filter__unpin(void); #else /* !HAVE_BPF_SKEL */ @@ -48,5 +53,13 @@ static inline u64 perf_bpf_filter__lost_count(struct evsel *evsel __maybe_unused { return 0; } +static inline int perf_bpf_filter__pin(void) +{ + return -EOPNOTSUPP; +} +static inline int perf_bpf_filter__unpin(void) +{ + return -EOPNOTSUPP; +} #endif /* HAVE_BPF_SKEL*/ #endif /* PERF_UTIL_BPF_FILTER_H */ -- cgit v1.2.3 From 1ec6fd34e0572588b9628e397ff134fd3cf79b5d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 3 Jul 2024 15:30:32 -0700 Subject: perf bpf-filter: Support separate lost counts for each filter As the BPF filter is shared between other processes, it should have its own counter for each invocation. Add a new array map (lost_count) to save the count using the same index as the filter. It should clear the count before running the filter. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: KP Singh Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Cc: Stephane Eranian Cc: bpf@vger.kernel.org Link: https://lore.kernel.org/r/20240703223035.2024586-6-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf-filter.c | 37 ++++++++++++++++++++++++++-- tools/perf/util/bpf_skel/sample_filter.bpf.c | 15 +++++++++-- 2 files changed, 48 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf-filter.c b/tools/perf/util/bpf-filter.c index 37ed6c48debf..c5eb0b7eec19 100644 --- a/tools/perf/util/bpf-filter.c +++ b/tools/perf/util/bpf-filter.c @@ -260,11 +260,23 @@ int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target) } if (needs_pid_hash && geteuid() != 0) { + int zero = 0; + /* The filters map is shared among other processes */ ret = update_pid_hash(evsel, entry); if (ret < 0) goto err; + fd = get_pinned_fd("dropped"); + if (fd < 0) { + ret = fd; + goto err; + } + + /* Reset the lost count */ + bpf_map_update_elem(fd, &pinned_filter_idx, &zero, BPF_ANY); + close(fd); + fd = get_pinned_fd("perf_sample_filter"); if (fd < 0) { ret = fd; @@ -347,9 +359,25 @@ int perf_bpf_filter__destroy(struct evsel *evsel) u64 perf_bpf_filter__lost_count(struct evsel *evsel) { - struct sample_filter_bpf *skel = evsel->bpf_skel; + int count = 0; + + if (list_empty(&evsel->bpf_filters)) + return 0; + + if (pinned_filter_idx >= 0) { + int fd = get_pinned_fd("dropped"); + + bpf_map_lookup_elem(fd, &pinned_filter_idx, &count); + close(fd); + } else if (evsel->bpf_skel) { + struct sample_filter_bpf *skel = evsel->bpf_skel; + int fd = bpf_map__fd(skel->maps.dropped); + int idx = 0; - return skel ? 
skel->bss->dropped : 0; + bpf_map_lookup_elem(fd, &idx, &count); + } + + return count; } struct perf_bpf_filter_expr *perf_bpf_filter_expr__new(enum perf_bpf_filter_term term, @@ -402,6 +430,7 @@ int perf_bpf_filter__pin(void) /* pinned program will use pid-hash */ bpf_map__set_max_entries(skel->maps.filters, MAX_FILTERS); bpf_map__set_max_entries(skel->maps.pid_hash, MAX_PIDS); + bpf_map__set_max_entries(skel->maps.dropped, MAX_FILTERS); skel->rodata->use_pid_hash = 1; if (sample_filter_bpf__load(skel) < 0) { @@ -459,6 +488,10 @@ int perf_bpf_filter__pin(void) pr_debug("chmod for pid_hash failed\n"); ret = -errno; } + if (fchmodat(dir_fd, "dropped", 0666, 0) < 0) { + pr_debug("chmod for dropped failed\n"); + ret = -errno; + } err_close: close(dir_fd); diff --git a/tools/perf/util/bpf_skel/sample_filter.bpf.c b/tools/perf/util/bpf_skel/sample_filter.bpf.c index c5273f06fa45..4c75354b84fd 100644 --- a/tools/perf/util/bpf_skel/sample_filter.bpf.c +++ b/tools/perf/util/bpf_skel/sample_filter.bpf.c @@ -23,7 +23,14 @@ struct pid_hash { __uint(max_entries, 1); } pid_hash SEC(".maps"); -int dropped; +/* lost sample counts, one per filter entry */ +struct lost_count { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, int); + __uint(max_entries, 1); +} dropped SEC(".maps"); + volatile const int use_pid_hash; void *bpf_cast_to_kern_ctx(void *) __ksym; @@ -189,6 +196,7 @@ int perf_sample_filter(void *ctx) int in_group = 0; int group_result = 0; int i, k; + int *losts; kctx = bpf_cast_to_kern_ctx(ctx); @@ -252,7 +260,10 @@ int perf_sample_filter(void *ctx) return 1; drop: - __sync_fetch_and_add(&dropped, 1); + losts = bpf_map_lookup_elem(&dropped, &k); + if (losts != NULL) + __sync_fetch_and_add(losts, 1); + return 0; } -- cgit v1.2.3 From 73bf63a4750ea18d7fbb8f80695dcfd0656d13f4 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 3 Jul 2024 15:30:33 -0700 Subject: perf record: Fix a potential error handling issue The evlist is allocated at the beginning of cmd_record(). Also, freeing the thread masks should be paired with record__init_thread_masks(), which is called right before __cmd_record(). Let's change the order of these functions to release the resources correctly in case of errors. This may be fine as-is since the process exits anyway, but it would become a problem if the code managed system-wide resources that live longer than the process.
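The rule being applied is the usual kernel one: release resources in the reverse order of their acquisition, and keep each release paired with the function that acquired it. A generic sketch of the goto-ladder idiom (the alloc/free names are placeholders, not perf code; each alloc returns 0 on success):

	int alloc_a(void);
	int alloc_b(void);
	int do_work(void);
	void free_a(void);
	void free_b(void);

	int cmd_foo(void)
	{
		int err;

		err = alloc_a();		/* acquired first ... */
		if (err)
			goto out;
		err = alloc_b();		/* ... acquired second */
		if (err)
			goto out_free_a;

		err = do_work();

		free_b();			/* so released first */
	out_free_a:
		free_a();			/* ... and released last */
	out:
		return err;
	}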
Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: KP Singh Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Cc: Stephane Eranian Cc: bpf@vger.kernel.org Link: https://lore.kernel.org/r/20240703223035.2024586-7-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index c599e620ee89..01fb95487a63 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -4242,13 +4242,13 @@ int cmd_record(int argc, const char **argv) err = __cmd_record(&record, argc, argv); out: - evlist__delete(rec->evlist); + record__free_thread_masks(rec, rec->nr_threads); + rec->nr_threads = 0; symbol__exit(); auxtrace_record__free(rec->itr); out_opts: - record__free_thread_masks(rec, rec->nr_threads); - rec->nr_threads = 0; evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close); + evlist__delete(rec->evlist); return err; } -- cgit v1.2.3 From 3dee4b83a6b4c167db0827c2b1e288460af72ac6 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 3 Jul 2024 15:30:34 -0700 Subject: perf record: Add --setup-filter option To allow BPF filters for unprivileged users it needs to pin the BPF objects to BPF-fs first. Let's add a new option to pin and unpin the objects easily. I'm not sure 'perf record' is a right place to do this but I don't have a better idea right now. $ sudo perf record --setup-filter pin The above command would pin BPF program and maps for the filter when the system has BPF-fs (usually at /sys/fs/bpf/). To unpin the objects, users can run the following command (as root). $ sudo perf record --setup-filter unpin Committer testing: root@number:~# perf record --setup-filter pin root@number:~# ls -la /sys/fs/bpf/perf_filter/ total 0 drwxr-xr-x. 2 root root 0 Jul 31 10:43 . drwxr-xr-t. 3 root root 0 Jul 31 10:43 .. -rw-rw-rw-. 1 root root 0 Jul 31 10:43 dropped -rw-rw-rw-. 1 root root 0 Jul 31 10:43 filters -rwxrwxrwx. 1 root root 0 Jul 31 10:43 perf_sample_filter -rw-rw-rw-. 1 root root 0 Jul 31 10:43 pid_hash -rw-------. 1 root root 0 Jul 31 10:43 sample_f_rodata root@number:~# ls -la /sys/fs/bpf/perf_filter/perf_sample_filter -rwxrwxrwx. 1 root root 0 Jul 31 10:43 /sys/fs/bpf/perf_filter/perf_sample_filter root@number:~# Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: KP Singh Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Cc: Stephane Eranian Cc: bpf@vger.kernel.org Link: https://lore.kernel.org/r/20240703223035.2024586-8-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-record.txt | 5 +++++ tools/perf/builtin-record.c | 15 +++++++++++++++ 2 files changed, 20 insertions(+) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index d6532ed97c02..41e36b4dc765 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -828,6 +828,11 @@ filtered through the mask provided by -C option. only, as of now. So the applications built without the frame pointer might see bogus addresses. +--setup-filter=:: + Prepare BPF filter to be used by regular users. The action should be + either "pin" or "unpin". The filter can be used after it's pinned. 
+ + include::intel-hybrid.txt[] SEE ALSO diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 01fb95487a63..72345d1e54b0 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -171,6 +171,7 @@ struct record { bool timestamp_filename; bool timestamp_boundary; bool off_cpu; + const char *filter_action; struct switch_output switch_output; unsigned long long samples; unsigned long output_max_size; /* = 0: unlimited */ @@ -3557,6 +3558,8 @@ static struct option __record_options[] = { "write collected trace data into several data files using parallel threads", record__parse_threads), OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"), + OPT_STRING(0, "setup-filter", &record.filter_action, "pin|unpin", + "BPF filter action"), OPT_END() }; @@ -4086,6 +4089,18 @@ int cmd_record(int argc, const char **argv) pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n"); } + if (rec->filter_action) { + if (!strcmp(rec->filter_action, "pin")) + err = perf_bpf_filter__pin(); + else if (!strcmp(rec->filter_action, "unpin")) + err = perf_bpf_filter__unpin(); + else { + pr_warning("Unknown BPF filter action: %s\n", rec->filter_action); + err = -EINVAL; + } + goto out_opts; + } + /* * Allow aliases to facilitate the lookup of symbols for address * filters. Refer to auxtrace_parse_filters(). -- cgit v1.2.3 From 9cb3549b73c1cd1f8ed0ff32bafab7649e5bf4ea Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 3 Jul 2024 15:30:35 -0700 Subject: perf test: Update sample filtering test Now the BPF filtering test can run as a normal user if the BPF objects have been pinned with 'sudo perf record --setup-filter pin'. Let's update the test case to verify the behavior. The test is skipped if the filter check fails for a normal user, but a message explaining how to set up the filters is printed. First, run the test as a normal user and it fails. $ perf test -vv filtering 95: perf record sample filtering (by BPF) tests: --- start --- test child forked, pid 425677 Checking BPF-filter privilege try 'sudo perf record --setup-filter pin' first. <<<--- here bpf-filter test [Skipped permission] ---- end(-2) ---- 95: perf record sample filtering (by BPF) tests : Skip According to the message, run the perf record command to pin the BPF objects. $ sudo perf record --setup-filter pin And re-run the test as a normal user.
$ perf test -vv filtering 95: perf record sample filtering (by BPF) tests: --- start --- test child forked, pid 424486 Checking BPF-filter privilege Basic bpf-filter test Basic bpf-filter test [Success] Failing bpf-filter test Error: task-clock event does not have PERF_SAMPLE_CPU Failing bpf-filter test [Success] Group bpf-filter test Error: task-clock event does not have PERF_SAMPLE_CPU Error: task-clock event does not have PERF_SAMPLE_CODE_PAGE_SIZE Group bpf-filter test [Success] ---- end(0) ---- 95: perf record sample filtering (by BPF) tests : Ok Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: KP Singh Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Cc: Stephane Eranian Cc: bpf@vger.kernel.org Link: https://lore.kernel.org/r/20240703223035.2024586-9-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/record_bpf_filter.sh | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/record_bpf_filter.sh b/tools/perf/tests/shell/record_bpf_filter.sh index 31c593966e8c..c5882d620db7 100755 --- a/tools/perf/tests/shell/record_bpf_filter.sh +++ b/tools/perf/tests/shell/record_bpf_filter.sh @@ -22,15 +22,16 @@ trap trap_cleanup EXIT TERM INT test_bpf_filter_priv() { echo "Checking BPF-filter privilege" - if [ "$(id -u)" != 0 ] - then - echo "bpf-filter test [Skipped permission]" - err=2 - return - fi if ! perf record -e task-clock --filter 'period > 1' \ -o /dev/null --quiet true 2>&1 then + if [ "$(id -u)" != 0 ] + then + echo "try 'sudo perf record --setup-filter pin' first." + echo "bpf-filter test [Skipped permission]" + err=2 + return + fi echo "bpf-filter test [Skipped missing BPF support]" err=2 return -- cgit v1.2.3 From 4e51e13bd986cab31dfab5c3bff1678171debbd1 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Thu, 25 Jul 2024 16:08:03 +0500 Subject: selftests: user: remove user suite The user test suite has only one test, test_user_copy which loads test_user_copy module for testing. But test_user_copy module has already been converted to kunit (see fixes). Hence remove the entire suite. 
Fixes: cf6219ee889f ("usercopy: Convert test_user_copy to KUnit test") Signed-off-by: Muhammad Usama Anjum Signed-off-by: Shuah Khan --- tools/testing/selftests/Makefile | 1 - tools/testing/selftests/user/Makefile | 9 --------- tools/testing/selftests/user/config | 1 - tools/testing/selftests/user/test_user_copy.sh | 18 ------------------ 4 files changed, 29 deletions(-) delete mode 100644 tools/testing/selftests/user/Makefile delete mode 100644 tools/testing/selftests/user/config delete mode 100755 tools/testing/selftests/user/test_user_copy.sh (limited to 'tools') diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index bc8fe9e8f7f2..af2429431b6b 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -107,7 +107,6 @@ TARGETS += tmpfs TARGETS += tpm2 TARGETS += tty TARGETS += uevent -TARGETS += user TARGETS += user_events TARGETS += vDSO TARGETS += mm diff --git a/tools/testing/selftests/user/Makefile b/tools/testing/selftests/user/Makefile deleted file mode 100644 index 640a40f9b72b..000000000000 --- a/tools/testing/selftests/user/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -# Makefile for user memory selftests - -# No binaries, but make sure arg-less "make" doesn't trigger "run_tests" -all: - -TEST_PROGS := test_user_copy.sh - -include ../lib.mk diff --git a/tools/testing/selftests/user/config b/tools/testing/selftests/user/config deleted file mode 100644 index 784ed8416324..000000000000 --- a/tools/testing/selftests/user/config +++ /dev/null @@ -1 +0,0 @@ -CONFIG_TEST_USER_COPY=m diff --git a/tools/testing/selftests/user/test_user_copy.sh b/tools/testing/selftests/user/test_user_copy.sh deleted file mode 100755 index f9b31a57439b..000000000000 --- a/tools/testing/selftests/user/test_user_copy.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -# Runs copy_to/from_user infrastructure using test_user_copy kernel module - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -if ! /sbin/modprobe -q -n test_user_copy; then - echo "user: module test_user_copy is not found [SKIP]" - exit $ksft_skip -fi -if /sbin/modprobe -q test_user_copy; then - /sbin/modprobe -q -r test_user_copy - echo "user_copy: ok" -else - echo "user_copy: [FAIL]" - exit 1 -fi -- cgit v1.2.3 From 44b045e27c65153cc8f549627d8d6d82f897c03c Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Thu, 25 Jul 2024 17:11:39 +0500 Subject: selftests: lib: remove strscpy test The strscpy test loads test_strscpy module for testing. But test_strscpy was converted to Kunit (see fixes). Hence remove strscpy. 
Fixes: 41eefc46a3a4 ("string: Convert strscpy() self-test to KUnit") Signed-off-by: Muhammad Usama Anjum Signed-off-by: Shuah Khan --- tools/testing/selftests/lib/Makefile | 3 +-- tools/testing/selftests/lib/config | 1 - tools/testing/selftests/lib/strscpy.sh | 3 --- 3 files changed, 1 insertion(+), 6 deletions(-) delete mode 100755 tools/testing/selftests/lib/strscpy.sh (limited to 'tools') diff --git a/tools/testing/selftests/lib/Makefile b/tools/testing/selftests/lib/Makefile index ee71fc99d5b5..c52fe3ad8e98 100644 --- a/tools/testing/selftests/lib/Makefile +++ b/tools/testing/selftests/lib/Makefile @@ -4,6 +4,5 @@ # No binaries, but make sure arg-less "make" doesn't trigger "run_tests" all: -TEST_PROGS := printf.sh bitmap.sh prime_numbers.sh scanf.sh strscpy.sh - +TEST_PROGS := printf.sh bitmap.sh prime_numbers.sh scanf.sh include ../lib.mk diff --git a/tools/testing/selftests/lib/config b/tools/testing/selftests/lib/config index 645839b50b0a..dc15aba8d0a3 100644 --- a/tools/testing/selftests/lib/config +++ b/tools/testing/selftests/lib/config @@ -2,5 +2,4 @@ CONFIG_TEST_PRINTF=m CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_PRIME_NUMBERS=m -CONFIG_TEST_STRSCPY=m CONFIG_TEST_BITOPS=m diff --git a/tools/testing/selftests/lib/strscpy.sh b/tools/testing/selftests/lib/strscpy.sh deleted file mode 100755 index be60ef6e1a7f..000000000000 --- a/tools/testing/selftests/lib/strscpy.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0+ -$(dirname $0)/../kselftest/module.sh "strscpy*" test_strscpy -- cgit v1.2.3 From f0a1ffa6f9771c6a1283afab591781e7c797e2e1 Mon Sep 17 00:00:00 2001 From: Abdulrasaq Lawani Date: Thu, 1 Aug 2024 05:29:52 -0400 Subject: selftest: acct: Add selftest for the acct() syscall The acct() system call enables or disables process accounting. If accounting is turned on, a record for each process is appended to the specified file as the process terminates. An argument of NULL causes accounting to be turned off. This patch adds a test for the acct() syscall.
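One usage note on the API being exercised: acct() returns -1 and sets errno on failure, so checking the return value directly is more robust than inspecting errno after the call, since errno is only meaningful once a call has actually reported an error. A minimal sketch of the enable/disable flow (error messages are illustrative):

	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		/* Turn accounting on; a record is appended to this file
		 * for every process that terminates (requires root).
		 */
		if (acct("process_log") < 0) {
			perror("acct(process_log)");
			return 1;
		}

		/* ... fork and reap child processes here ... */

		/* A NULL argument turns accounting off again. */
		if (acct(NULL) < 0) {
			perror("acct(NULL)");
			return 1;
		}
		return 0;
	}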
Signed-off-by: Abdulrasaq Lawani Signed-off-by: Shuah Khan --- tools/testing/selftests/Makefile | 1 + tools/testing/selftests/acct/.gitignore | 3 ++ tools/testing/selftests/acct/Makefile | 5 ++ tools/testing/selftests/acct/acct_syscall.c | 78 +++++++++++++++++++++++++++++ 4 files changed, 87 insertions(+) create mode 100644 tools/testing/selftests/acct/.gitignore create mode 100644 tools/testing/selftests/acct/Makefile create mode 100644 tools/testing/selftests/acct/acct_syscall.c (limited to 'tools') diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index af2429431b6b..f51fcceb45fd 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 +TARGETS += acct TARGETS += alsa TARGETS += amd-pstate TARGETS += arm64 diff --git a/tools/testing/selftests/acct/.gitignore b/tools/testing/selftests/acct/.gitignore new file mode 100644 index 000000000000..7e78aac19038 --- /dev/null +++ b/tools/testing/selftests/acct/.gitignore @@ -0,0 +1,3 @@ +acct_syscall +config +process_log \ No newline at end of file diff --git a/tools/testing/selftests/acct/Makefile b/tools/testing/selftests/acct/Makefile new file mode 100644 index 000000000000..7e025099cf65 --- /dev/null +++ b/tools/testing/selftests/acct/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 +TEST_GEN_PROGS := acct_syscall +CFLAGS += -Wall + +include ../lib.mk \ No newline at end of file diff --git a/tools/testing/selftests/acct/acct_syscall.c b/tools/testing/selftests/acct/acct_syscall.c new file mode 100644 index 000000000000..e44e8fe1f4a3 --- /dev/null +++ b/tools/testing/selftests/acct/acct_syscall.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* kselftest for acct() system call + * The acct() system call enables or disables process accounting. + */ + +#include +#include +#include +#include + +#include "../kselftest.h" + +int main(void) +{ + char filename[] = "process_log"; + FILE *fp; + pid_t child_pid; + int sz; + + // Setting up kselftest framework + ksft_print_header(); + ksft_set_plan(1); + + // Check if test is run a root + if (geteuid()) { + ksft_test_result_skip("This test needs root to run!\n"); + return 1; + } + + // Create file to log closed processes + fp = fopen(filename, "w"); + + if (!fp) { + ksft_test_result_error("%s.\n", strerror(errno)); + ksft_finished(); + return 1; + } + + acct(filename); + + // Handle error conditions + if (errno) { + ksft_test_result_error("%s.\n", strerror(errno)); + fclose(fp); + ksft_finished(); + return 1; + } + + // Create child process and wait for it to terminate. 
+ + child_pid = fork(); + + if (child_pid < 0) { + ksft_test_result_error("Creating a child process to log failed\n"); + acct(NULL); + return 1; + } else if (child_pid > 0) { + wait(NULL); + fseek(fp, 0L, SEEK_END); + sz = ftell(fp); + + acct(NULL); + + if (sz <= 0) { + ksft_test_result_fail("Terminated child process not logged\n"); + ksft_exit_fail(); + return 1; + } + + ksft_test_result_pass("Successfully logged terminated process.\n"); + fclose(fp); + ksft_exit_pass(); + return 0; + } + + return 1; +} -- cgit v1.2.3 From ea59b70a8418a313d6f2ab48a957de015fc33018 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 31 Jul 2024 11:58:56 -0300 Subject: perf bpf: Move BPF disassembly routines to separate file to avoid clash with capstone bpf headers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a clash of the libbpf and capstone libraries, that ends up with: In file included from /usr/include/capstone/capstone.h:325, from util/disasm.c:1513: /usr/include/capstone/bpf.h:94:14: error: ‘bpf_insn’ defined as wrong kind of tag 94 | typedef enum bpf_insn { So far we're just trying to avoid this by not having both headers included in the same .c or .h file, do it one more time by moving the BPF diassembly routines from util/disasm.c to util/disasm_bpf.c. This is only being hit when building with BUILD_NONDISTRO=1, i.e. building with binutils-devel, that isn't the in the default build due to a licencing clash. We need to reimplement what is now isolated in util/disasm_bpf.c using some other library to have BPF annotation feature that now only is available with BUILD_NONDISTRO=1. Fixes: 6d17edc113de1e21 ("perf annotate: Use libcapstone to disassemble") Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/ZqpUSKPxMwaQKORr@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/Build | 1 + tools/perf/util/disasm.c | 187 +---------------------------------------- tools/perf/util/disasm_bpf.c | 195 +++++++++++++++++++++++++++++++++++++++++++ tools/perf/util/disasm_bpf.h | 12 +++ 4 files changed, 209 insertions(+), 186 deletions(-) create mode 100644 tools/perf/util/disasm_bpf.c create mode 100644 tools/perf/util/disasm_bpf.h (limited to 'tools') diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 0f18fe81ef0b..b24360c04aae 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -13,6 +13,7 @@ perf-util-y += copyfile.o perf-util-y += ctype.o perf-util-y += db-export.o perf-util-y += disasm.o +perf-util-y += disasm_bpf.o perf-util-y += env.o perf-util-y += event.o perf-util-y += evlist.o diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 410e52cd9cfd..85fb0cfedf94 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -16,6 +16,7 @@ #include "build-id.h" #include "debug.h" #include "disasm.h" +#include "disasm_bpf.h" #include "dso.h" #include "env.h" #include "evsel.h" @@ -1323,192 +1324,6 @@ fallback: return 0; } -#if defined(HAVE_LIBBFD_SUPPORT) && defined(HAVE_LIBBPF_SUPPORT) -#define PACKAGE "perf" -#include -#include -#include -#include -#include -#include -#include - -#include "bpf-event.h" -#include "bpf-utils.h" - -static int symbol__disassemble_bpf(struct symbol *sym, - struct annotate_args *args) -{ - struct annotation *notes = symbol__annotation(sym); - struct bpf_prog_linfo *prog_linfo = NULL; - struct bpf_prog_info_node *info_node; - int len = sym->end - sym->start; - disassembler_ftype disassemble; - 
struct map *map = args->ms.map; - struct perf_bpil *info_linear; - struct disassemble_info info; - struct dso *dso = map__dso(map); - int pc = 0, count, sub_id; - struct btf *btf = NULL; - char tpath[PATH_MAX]; - size_t buf_size; - int nr_skip = 0; - char *buf; - bfd *bfdf; - int ret; - FILE *s; - - if (dso__binary_type(dso) != DSO_BINARY_TYPE__BPF_PROG_INFO) - return SYMBOL_ANNOTATE_ERRNO__BPF_INVALID_FILE; - - pr_debug("%s: handling sym %s addr %" PRIx64 " len %" PRIx64 "\n", __func__, - sym->name, sym->start, sym->end - sym->start); - - memset(tpath, 0, sizeof(tpath)); - perf_exe(tpath, sizeof(tpath)); - - bfdf = bfd_openr(tpath, NULL); - if (bfdf == NULL) - abort(); - - if (!bfd_check_format(bfdf, bfd_object)) - abort(); - - s = open_memstream(&buf, &buf_size); - if (!s) { - ret = errno; - goto out; - } - init_disassemble_info_compat(&info, s, - (fprintf_ftype) fprintf, - fprintf_styled); - info.arch = bfd_get_arch(bfdf); - info.mach = bfd_get_mach(bfdf); - - info_node = perf_env__find_bpf_prog_info(dso__bpf_prog(dso)->env, - dso__bpf_prog(dso)->id); - if (!info_node) { - ret = SYMBOL_ANNOTATE_ERRNO__BPF_MISSING_BTF; - goto out; - } - info_linear = info_node->info_linear; - sub_id = dso__bpf_prog(dso)->sub_id; - - info.buffer = (void *)(uintptr_t)(info_linear->info.jited_prog_insns); - info.buffer_length = info_linear->info.jited_prog_len; - - if (info_linear->info.nr_line_info) - prog_linfo = bpf_prog_linfo__new(&info_linear->info); - - if (info_linear->info.btf_id) { - struct btf_node *node; - - node = perf_env__find_btf(dso__bpf_prog(dso)->env, - info_linear->info.btf_id); - if (node) - btf = btf__new((__u8 *)(node->data), - node->data_size); - } - - disassemble_init_for_target(&info); - -#ifdef DISASM_FOUR_ARGS_SIGNATURE - disassemble = disassembler(info.arch, - bfd_big_endian(bfdf), - info.mach, - bfdf); -#else - disassemble = disassembler(bfdf); -#endif - if (disassemble == NULL) - abort(); - - fflush(s); - do { - const struct bpf_line_info *linfo = NULL; - struct disasm_line *dl; - size_t prev_buf_size; - const char *srcline; - u64 addr; - - addr = pc + ((u64 *)(uintptr_t)(info_linear->info.jited_ksyms))[sub_id]; - count = disassemble(pc, &info); - - if (prog_linfo) - linfo = bpf_prog_linfo__lfind_addr_func(prog_linfo, - addr, sub_id, - nr_skip); - - if (linfo && btf) { - srcline = btf__name_by_offset(btf, linfo->line_off); - nr_skip++; - } else - srcline = NULL; - - fprintf(s, "\n"); - prev_buf_size = buf_size; - fflush(s); - - if (!annotate_opts.hide_src_code && srcline) { - args->offset = -1; - args->line = strdup(srcline); - args->line_nr = 0; - args->fileloc = NULL; - args->ms.sym = sym; - dl = disasm_line__new(args); - if (dl) { - annotation_line__add(&dl->al, - ¬es->src->source); - } - } - - args->offset = pc; - args->line = buf + prev_buf_size; - args->line_nr = 0; - args->fileloc = NULL; - args->ms.sym = sym; - dl = disasm_line__new(args); - if (dl) - annotation_line__add(&dl->al, ¬es->src->source); - - pc += count; - } while (count > 0 && pc < len); - - ret = 0; -out: - free(prog_linfo); - btf__free(btf); - fclose(s); - bfd_close(bfdf); - return ret; -} -#else // defined(HAVE_LIBBFD_SUPPORT) && defined(HAVE_LIBBPF_SUPPORT) -static int symbol__disassemble_bpf(struct symbol *sym __maybe_unused, - struct annotate_args *args __maybe_unused) -{ - return SYMBOL_ANNOTATE_ERRNO__NO_LIBOPCODES_FOR_BPF; -} -#endif // defined(HAVE_LIBBFD_SUPPORT) && defined(HAVE_LIBBPF_SUPPORT) - -static int -symbol__disassemble_bpf_image(struct symbol *sym, - struct annotate_args *args) -{ - 
struct annotation *notes = symbol__annotation(sym); - struct disasm_line *dl; - - args->offset = -1; - args->line = strdup("to be implemented"); - args->line_nr = 0; - args->fileloc = NULL; - dl = disasm_line__new(args); - if (dl) - annotation_line__add(&dl->al, ¬es->src->source); - - zfree(&args->line); - return 0; -} - #ifdef HAVE_LIBCAPSTONE_SUPPORT #include diff --git a/tools/perf/util/disasm_bpf.c b/tools/perf/util/disasm_bpf.c new file mode 100644 index 000000000000..1fee71c79b62 --- /dev/null +++ b/tools/perf/util/disasm_bpf.c @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "util/annotate.h" +#include "util/disasm_bpf.h" +#include "util/symbol.h" +#include +#include + +#if defined(HAVE_LIBBFD_SUPPORT) && defined(HAVE_LIBBPF_SUPPORT) +#define PACKAGE "perf" +#include +#include +#include +#include +#include +#include +#include +#include + +#include "util/bpf-event.h" +#include "util/bpf-utils.h" +#include "util/debug.h" +#include "util/dso.h" +#include "util/map.h" +#include "util/env.h" +#include "util/util.h" + +int symbol__disassemble_bpf(struct symbol *sym, struct annotate_args *args) +{ + struct annotation *notes = symbol__annotation(sym); + struct bpf_prog_linfo *prog_linfo = NULL; + struct bpf_prog_info_node *info_node; + int len = sym->end - sym->start; + disassembler_ftype disassemble; + struct map *map = args->ms.map; + struct perf_bpil *info_linear; + struct disassemble_info info; + struct dso *dso = map__dso(map); + int pc = 0, count, sub_id; + struct btf *btf = NULL; + char tpath[PATH_MAX]; + size_t buf_size; + int nr_skip = 0; + char *buf; + bfd *bfdf; + int ret; + FILE *s; + + if (dso__binary_type(dso) != DSO_BINARY_TYPE__BPF_PROG_INFO) + return SYMBOL_ANNOTATE_ERRNO__BPF_INVALID_FILE; + + pr_debug("%s: handling sym %s addr %" PRIx64 " len %" PRIx64 "\n", __func__, + sym->name, sym->start, sym->end - sym->start); + + memset(tpath, 0, sizeof(tpath)); + perf_exe(tpath, sizeof(tpath)); + + bfdf = bfd_openr(tpath, NULL); + if (bfdf == NULL) + abort(); + + if (!bfd_check_format(bfdf, bfd_object)) + abort(); + + s = open_memstream(&buf, &buf_size); + if (!s) { + ret = errno; + goto out; + } + init_disassemble_info_compat(&info, s, + (fprintf_ftype) fprintf, + fprintf_styled); + info.arch = bfd_get_arch(bfdf); + info.mach = bfd_get_mach(bfdf); + + info_node = perf_env__find_bpf_prog_info(dso__bpf_prog(dso)->env, + dso__bpf_prog(dso)->id); + if (!info_node) { + ret = SYMBOL_ANNOTATE_ERRNO__BPF_MISSING_BTF; + goto out; + } + info_linear = info_node->info_linear; + sub_id = dso__bpf_prog(dso)->sub_id; + + info.buffer = (void *)(uintptr_t)(info_linear->info.jited_prog_insns); + info.buffer_length = info_linear->info.jited_prog_len; + + if (info_linear->info.nr_line_info) + prog_linfo = bpf_prog_linfo__new(&info_linear->info); + + if (info_linear->info.btf_id) { + struct btf_node *node; + + node = perf_env__find_btf(dso__bpf_prog(dso)->env, + info_linear->info.btf_id); + if (node) + btf = btf__new((__u8 *)(node->data), + node->data_size); + } + + disassemble_init_for_target(&info); + +#ifdef DISASM_FOUR_ARGS_SIGNATURE + disassemble = disassembler(info.arch, + bfd_big_endian(bfdf), + info.mach, + bfdf); +#else + disassemble = disassembler(bfdf); +#endif + if (disassemble == NULL) + abort(); + + fflush(s); + do { + const struct bpf_line_info *linfo = NULL; + struct disasm_line *dl; + size_t prev_buf_size; + const char *srcline; + u64 addr; + + addr = pc + ((u64 *)(uintptr_t)(info_linear->info.jited_ksyms))[sub_id]; + count = disassemble(pc, &info); + + if 
(prog_linfo) + linfo = bpf_prog_linfo__lfind_addr_func(prog_linfo, + addr, sub_id, + nr_skip); + + if (linfo && btf) { + srcline = btf__name_by_offset(btf, linfo->line_off); + nr_skip++; + } else + srcline = NULL; + + fprintf(s, "\n"); + prev_buf_size = buf_size; + fflush(s); + + if (!annotate_opts.hide_src_code && srcline) { + args->offset = -1; + args->line = strdup(srcline); + args->line_nr = 0; + args->fileloc = NULL; + args->ms.sym = sym; + dl = disasm_line__new(args); + if (dl) { + annotation_line__add(&dl->al, + ¬es->src->source); + } + } + + args->offset = pc; + args->line = buf + prev_buf_size; + args->line_nr = 0; + args->fileloc = NULL; + args->ms.sym = sym; + dl = disasm_line__new(args); + if (dl) + annotation_line__add(&dl->al, ¬es->src->source); + + pc += count; + } while (count > 0 && pc < len); + + ret = 0; +out: + free(prog_linfo); + btf__free(btf); + fclose(s); + bfd_close(bfdf); + return ret; +} +#else // defined(HAVE_LIBBFD_SUPPORT) && defined(HAVE_LIBBPF_SUPPORT) +int symbol__disassemble_bpf(struct symbol *sym __maybe_unused, struct annotate_args *args __maybe_unused) +{ + return SYMBOL_ANNOTATE_ERRNO__NO_LIBOPCODES_FOR_BPF; +} +#endif // defined(HAVE_LIBBFD_SUPPORT) && defined(HAVE_LIBBPF_SUPPORT) + +int symbol__disassemble_bpf_image(struct symbol *sym, struct annotate_args *args) +{ + struct annotation *notes = symbol__annotation(sym); + struct disasm_line *dl; + + args->offset = -1; + args->line = strdup("to be implemented"); + args->line_nr = 0; + args->fileloc = NULL; + dl = disasm_line__new(args); + if (dl) + annotation_line__add(&dl->al, ¬es->src->source); + + zfree(&args->line); + return 0; +} diff --git a/tools/perf/util/disasm_bpf.h b/tools/perf/util/disasm_bpf.h new file mode 100644 index 000000000000..2ecb19545388 --- /dev/null +++ b/tools/perf/util/disasm_bpf.h @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#ifndef __PERF_DISASM_BPF_H +#define __PERF_DISASM_BPF_H + +struct symbol; +struct annotate_args; + +int symbol__disassemble_bpf(struct symbol *sym, struct annotate_args *args); +int symbol__disassemble_bpf_image(struct symbol *sym, struct annotate_args *args); + +#endif /* __PERF_DISASM_BPF_H */ -- cgit v1.2.3 From 0fe881f10ceb7ae2be455d8bdf70fab18c0a5c5f Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 30 Jul 2024 12:17:44 -0700 Subject: perf jevents: Autogenerate empty-pmu-events.c empty-pmu-events.c exists so that builds may occur without python being installed on a system. Manually updating empty-pmu-events.c to be in sync with jevents.py is a pain, let's use jevents.py to generate empty-pmu-events.c. 1) change jevents.py so that an arch and model of none cause generation of a pmu-events.c without any json. Add a SPDX and autogenerated warning to the start of the file. 2) change Build so that if a generated pmu-events.c for arch none and model none doesn't match empty-pmu-events.c the build fails with a cat of the differences. Update Makefile.perf to clean up the files used for this. 3) update empty-pmu-events.c to match the output of jevents.py with arch and mode of none. Committer notes: The firtst paragraph is confusing, so I asked and Ian further clarified: --- The requirement for python hasn't changed. Case 1: no python or NO_JEVENTS=1 Build happens using empty-pmu-events.c that is checked in, no python is required. Case 2: python pmu-events.c is created by jevents.py (requiring python) and then built. 
This change adds a step where the empty-pmu-events.c is created using jevents.py and that file is diffed against the checked in version. This stops the checked in empty-pmu-events.c diverging if changes are made to jevents.py. If the diff causes the build to fail then you just copy the diff empty-pmu-events.c over the checked in one. --- Reviewed-by: John Garry Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jing Zhang Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Oliver Sang Cc: Peter Zijlstra Cc: Philip Li Cc: Sandipan Das Cc: Weilin Wang Cc: Xu Yang Link: https://lore.kernel.org/r/20240730191744.3097329-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.perf | 2 + tools/perf/pmu-events/Build | 12 +- tools/perf/pmu-events/empty-pmu-events.c | 894 +++++++++++++++++++------------ tools/perf/pmu-events/jevents.py | 6 +- 4 files changed, 562 insertions(+), 352 deletions(-) (limited to 'tools') diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index f8148db5fc38..72534bb72d43 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -1277,6 +1277,8 @@ clean:: $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean $(LIBSYMBOL)-clean $( $(OUTPUT)util/intel-pt-decoder/inat-tables.c \ $(OUTPUT)tests/llvm-src-{base,kbuild,prologue,relocation}.c \ $(OUTPUT)pmu-events/pmu-events.c \ + $(OUTPUT)pmu-events/test-empty-pmu-events.c \ + $(OUTPUT)pmu-events/empty-pmu-events.log \ $(OUTPUT)pmu-events/metric_test.log \ $(OUTPUT)$(fadvise_advice_array) \ $(OUTPUT)$(fsconfig_arrays) \ diff --git a/tools/perf/pmu-events/Build b/tools/perf/pmu-events/Build index 1d18bb89402e..c3fa43c49706 100644 --- a/tools/perf/pmu-events/Build +++ b/tools/perf/pmu-events/Build @@ -11,6 +11,8 @@ METRIC_TEST_PY = pmu-events/metric_test.py EMPTY_PMU_EVENTS_C = pmu-events/empty-pmu-events.c PMU_EVENTS_C = $(OUTPUT)pmu-events/pmu-events.c METRIC_TEST_LOG = $(OUTPUT)pmu-events/metric_test.log +TEST_EMPTY_PMU_EVENTS_C = $(OUTPUT)pmu-events/test-empty-pmu-events.c +EMPTY_PMU_EVENTS_TEST_LOG = $(OUTPUT)pmu-events/empty-pmu-events.log ifeq ($(JEVENTS_ARCH),) JEVENTS_ARCH=$(SRCARCH) @@ -31,7 +33,15 @@ $(METRIC_TEST_LOG): $(METRIC_TEST_PY) $(METRIC_PY) $(call rule_mkdir) $(Q)$(call echo-cmd,test)$(PYTHON) $< 2> $@ || (cat $@ && false) -$(PMU_EVENTS_C): $(JSON) $(JSON_TEST) $(JEVENTS_PY) $(METRIC_PY) $(METRIC_TEST_LOG) +$(TEST_EMPTY_PMU_EVENTS_C): $(JSON) $(JSON_TEST) $(JEVENTS_PY) $(METRIC_PY) $(METRIC_TEST_LOG) + $(call rule_mkdir) + $(Q)$(call echo-cmd,gen)$(PYTHON) $(JEVENTS_PY) none none pmu-events/arch $@ + +$(EMPTY_PMU_EVENTS_TEST_LOG): $(EMPTY_PMU_EVENTS_C) $(TEST_EMPTY_PMU_EVENTS_C) + $(call rule_mkdir) + $(Q)$(call echo-cmd,test)diff -u $? 2> $@ || (cat $@ && false) + +$(PMU_EVENTS_C): $(JSON) $(JSON_TEST) $(JEVENTS_PY) $(METRIC_PY) $(METRIC_TEST_LOG) $(EMPTY_PMU_EVENTS_TEST_LOG) $(call rule_mkdir) $(Q)$(call echo-cmd,gen)$(PYTHON) $(JEVENTS_PY) $(JEVENTS_ARCH) $(JEVENTS_MODEL) pmu-events/arch $@ endif diff --git a/tools/perf/pmu-events/empty-pmu-events.c b/tools/perf/pmu-events/empty-pmu-events.c index 13727421d424..c592079982fb 100644 --- a/tools/perf/pmu-events/empty-pmu-events.c +++ b/tools/perf/pmu-events/empty-pmu-events.c @@ -1,196 +1,193 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * An empty pmu-events.c file used when there is no architecture json files in - * arch or when the jevents.py script cannot be run. - * - * The test cpu/soc is provided for testing. 
- */ -#include "pmu-events/pmu-events.h" + +/* SPDX-License-Identifier: GPL-2.0 */ +/* THIS FILE WAS AUTOGENERATED BY jevents.py arch=none model=none ! */ + +#include #include "util/header.h" #include "util/pmu.h" #include #include -static const struct pmu_event pmu_events__test_soc_cpu[] = { - { - .name = "l3_cache_rd", - .event = "event=0x40", - .desc = "L3 cache access, read", - .topic = "cache", - .long_desc = "Attributable Level 3 cache access, read", - }, - { - .name = "segment_reg_loads.any", - .event = "event=0x6,period=200000,umask=0x80", - .desc = "Number of segment register loads", - .topic = "other", - }, - { - .name = "dispatch_blocked.any", - .event = "event=0x9,period=200000,umask=0x20", - .desc = "Memory cluster signals to block micro-op dispatch for any reason", - .topic = "other", - }, - { - .name = "eist_trans", - .event = "event=0x3a,period=200000,umask=0x0", - .desc = "Number of Enhanced Intel SpeedStep(R) Technology (EIST) transitions", - .topic = "other", - }, - { - .name = "uncore_hisi_ddrc.flux_wcmd", - .event = "event=0x2", - .desc = "DDRC write commands. Unit: hisi_sccl,ddrc ", - .topic = "uncore", - .long_desc = "DDRC write commands", - .pmu = "hisi_sccl,ddrc", - }, - { - .name = "unc_cbo_xsnp_response.miss_eviction", - .event = "event=0x22,umask=0x81", - .desc = "A cross-core snoop resulted from L3 Eviction which misses in some processor core. Unit: uncore_cbox ", - .topic = "uncore", - .long_desc = "A cross-core snoop resulted from L3 Eviction which misses in some processor core", - .pmu = "uncore_cbox", - }, - { - .name = "event-hyphen", - .event = "event=0xe0,umask=0x00", - .desc = "UNC_CBO_HYPHEN. Unit: uncore_cbox ", - .topic = "uncore", - .long_desc = "UNC_CBO_HYPHEN", - .pmu = "uncore_cbox", - }, - { - .name = "event-two-hyph", - .event = "event=0xc0,umask=0x00", - .desc = "UNC_CBO_TWO_HYPH. Unit: uncore_cbox ", - .topic = "uncore", - .long_desc = "UNC_CBO_TWO_HYPH", - .pmu = "uncore_cbox", - }, - { - .name = "uncore_hisi_l3c.rd_hit_cpipe", - .event = "event=0x7", - .desc = "Total read hits. Unit: hisi_sccl,l3c ", - .topic = "uncore", - .long_desc = "Total read hits", - .pmu = "hisi_sccl,l3c", - }, - { - .name = "uncore_imc_free_running.cache_miss", - .event = "event=0x12", - .desc = "Total cache misses. Unit: uncore_imc_free_running ", - .topic = "uncore", - .long_desc = "Total cache misses", - .pmu = "uncore_imc_free_running", - }, - { - .name = "uncore_imc.cache_hits", - .event = "event=0x34", - .desc = "Total cache hits. 
Unit: uncore_imc ", - .topic = "uncore", - .long_desc = "Total cache hits", - .pmu = "uncore_imc", - }, - { - .name = "bp_l1_btb_correct", - .event = "event=0x8a", - .desc = "L1 BTB Correction", - .topic = "branch", - }, - { - .name = "bp_l2_btb_correct", - .event = "event=0x8b", - .desc = "L2 BTB Correction", - .topic = "branch", - }, - { - .name = 0, - .event = 0, - .desc = 0, - }, +struct compact_pmu_event { + int offset; }; -static const struct pmu_metric pmu_metrics__test_soc_cpu[] = { - { - .metric_expr = "1 / IPC", - .metric_name = "CPI", - }, - { - .metric_expr = "inst_retired.any / cpu_clk_unhalted.thread", - .metric_name = "IPC", - .metric_group = "group1", - }, - { - .metric_expr = "idq_uops_not_delivered.core / (4 * (( ( cpu_clk_unhalted.thread / 2 ) * " - "( 1 + cpu_clk_unhalted.one_thread_active / cpu_clk_unhalted.ref_xclk ) )))", - .metric_name = "Frontend_Bound_SMT", - }, - { - .metric_expr = "l1d\\-loads\\-misses / inst_retired.any", - .metric_name = "dcache_miss_cpi", - }, - { - .metric_expr = "l1i\\-loads\\-misses / inst_retired.any", - .metric_name = "icache_miss_cycles", - }, - { - .metric_expr = "(dcache_miss_cpi + icache_miss_cycles)", - .metric_name = "cache_miss_cycles", - .metric_group = "group1", - }, - { - .metric_expr = "l2_rqsts.demand_data_rd_hit + l2_rqsts.pf_hit + l2_rqsts.rfo_hit", - .metric_name = "DCache_L2_All_Hits", - }, - { - .metric_expr = "max(l2_rqsts.all_demand_data_rd - l2_rqsts.demand_data_rd_hit, 0) + " - "l2_rqsts.pf_miss + l2_rqsts.rfo_miss", - .metric_name = "DCache_L2_All_Miss", - }, - { - .metric_expr = "DCache_L2_All_Hits + DCache_L2_All_Miss", - .metric_name = "DCache_L2_All", - }, - { - .metric_expr = "d_ratio(DCache_L2_All_Hits, DCache_L2_All)", - .metric_name = "DCache_L2_Hits", - }, - { - .metric_expr = "d_ratio(DCache_L2_All_Miss, DCache_L2_All)", - .metric_name = "DCache_L2_Misses", - }, - { - .metric_expr = "ipc + M2", - .metric_name = "M1", - }, - { - .metric_expr = "ipc + M1", - .metric_name = "M2", - }, - { - .metric_expr = "1/M3", - .metric_name = "M3", - }, - { - .metric_expr = "64 * l1d.replacement / 1000000000 / duration_time", - .metric_name = "L1D_Cache_Fill_BW", - }, - { - .metric_expr = 0, - .metric_name = 0, - }, +struct pmu_table_entry { + const struct compact_pmu_event *entries; + uint32_t num_entries; + struct compact_pmu_event pmu_name; +}; + +static const char *const big_c_string = +/* offset=0 */ "default_core\000" +/* offset=13 */ "bp_l1_btb_correct\000branch\000L1 BTB Correction\000event=0x8a\000\00000\000\000" +/* offset=72 */ "bp_l2_btb_correct\000branch\000L2 BTB Correction\000event=0x8b\000\00000\000\000" +/* offset=131 */ "l3_cache_rd\000cache\000L3 cache access, read\000event=0x40\000\00000\000Attributable Level 3 cache access, read\000" +/* offset=226 */ "segment_reg_loads.any\000other\000Number of segment register loads\000event=6,period=200000,umask=0x80\000\00000\000\000" +/* offset=325 */ "dispatch_blocked.any\000other\000Memory cluster signals to block micro-op dispatch for any reason\000event=9,period=200000,umask=0x20\000\00000\000\000" +/* offset=455 */ "eist_trans\000other\000Number of Enhanced Intel SpeedStep(R) Technology (EIST) transitions\000event=0x3a,period=200000\000\00000\000\000" +/* offset=570 */ "hisi_sccl,ddrc\000" +/* offset=585 */ "uncore_hisi_ddrc.flux_wcmd\000uncore\000DDRC write commands\000event=2\000\00000\000DDRC write commands\000" +/* offset=671 */ "uncore_cbox\000" +/* offset=683 */ "unc_cbo_xsnp_response.miss_eviction\000uncore\000A cross-core snoop resulted from L3 
Eviction which misses in some processor core\000event=0x22,umask=0x81\000\00000\000A cross-core snoop resulted from L3 Eviction which misses in some processor core\000" +/* offset=914 */ "event-hyphen\000uncore\000UNC_CBO_HYPHEN\000event=0xe0\000\00000\000UNC_CBO_HYPHEN\000" +/* offset=979 */ "event-two-hyph\000uncore\000UNC_CBO_TWO_HYPH\000event=0xc0\000\00000\000UNC_CBO_TWO_HYPH\000" +/* offset=1050 */ "hisi_sccl,l3c\000" +/* offset=1064 */ "uncore_hisi_l3c.rd_hit_cpipe\000uncore\000Total read hits\000event=7\000\00000\000Total read hits\000" +/* offset=1144 */ "uncore_imc_free_running\000" +/* offset=1168 */ "uncore_imc_free_running.cache_miss\000uncore\000Total cache misses\000event=0x12\000\00000\000Total cache misses\000" +/* offset=1263 */ "uncore_imc\000" +/* offset=1274 */ "uncore_imc.cache_hits\000uncore\000Total cache hits\000event=0x34\000\00000\000Total cache hits\000" +/* offset=1352 */ "uncore_sys_ddr_pmu\000" +/* offset=1371 */ "sys_ddr_pmu.write_cycles\000uncore\000ddr write-cycles event\000event=0x2b\000v8\00000\000\000" +/* offset=1444 */ "uncore_sys_ccn_pmu\000" +/* offset=1463 */ "sys_ccn_pmu.read_cycles\000uncore\000ccn read-cycles event\000config=0x2c\0000x01\00000\000\000" +/* offset=1537 */ "uncore_sys_cmn_pmu\000" +/* offset=1556 */ "sys_cmn_pmu.hnf_cache_miss\000uncore\000Counts total cache misses in first lookup result (high priority)\000eventid=1,type=5\000(434|436|43c|43a).*\00000\000\000" +/* offset=1696 */ "CPI\000\0001 / IPC\000\000\000\000\000\000\000\00000" +/* offset=1718 */ "IPC\000group1\000inst_retired.any / cpu_clk_unhalted.thread\000\000\000\000\000\000\000\00000" +/* offset=1781 */ "Frontend_Bound_SMT\000\000idq_uops_not_delivered.core / (4 * (cpu_clk_unhalted.thread / 2 * (1 + cpu_clk_unhalted.one_thread_active / cpu_clk_unhalted.ref_xclk)))\000\000\000\000\000\000\000\00000" +/* offset=1947 */ "dcache_miss_cpi\000\000l1d\\-loads\\-misses / inst_retired.any\000\000\000\000\000\000\000\00000" +/* offset=2011 */ "icache_miss_cycles\000\000l1i\\-loads\\-misses / inst_retired.any\000\000\000\000\000\000\000\00000" +/* offset=2078 */ "cache_miss_cycles\000group1\000dcache_miss_cpi + icache_miss_cycles\000\000\000\000\000\000\000\00000" +/* offset=2149 */ "DCache_L2_All_Hits\000\000l2_rqsts.demand_data_rd_hit + l2_rqsts.pf_hit + l2_rqsts.rfo_hit\000\000\000\000\000\000\000\00000" +/* offset=2243 */ "DCache_L2_All_Miss\000\000max(l2_rqsts.all_demand_data_rd - l2_rqsts.demand_data_rd_hit, 0) + l2_rqsts.pf_miss + l2_rqsts.rfo_miss\000\000\000\000\000\000\000\00000" +/* offset=2377 */ "DCache_L2_All\000\000DCache_L2_All_Hits + DCache_L2_All_Miss\000\000\000\000\000\000\000\00000" +/* offset=2441 */ "DCache_L2_Hits\000\000d_ratio(DCache_L2_All_Hits, DCache_L2_All)\000\000\000\000\000\000\000\00000" +/* offset=2509 */ "DCache_L2_Misses\000\000d_ratio(DCache_L2_All_Miss, DCache_L2_All)\000\000\000\000\000\000\000\00000" +/* offset=2579 */ "M1\000\000ipc + M2\000\000\000\000\000\000\000\00000" +/* offset=2601 */ "M2\000\000ipc + M1\000\000\000\000\000\000\000\00000" +/* offset=2623 */ "M3\000\0001 / M3\000\000\000\000\000\000\000\00000" +/* offset=2643 */ "L1D_Cache_Fill_BW\000\00064 * l1d.replacement / 1e9 / duration_time\000\000\000\000\000\000\000\00000" +; + +static const struct compact_pmu_event pmu_events__test_soc_cpu_default_core[] = { +{ 13 }, /* bp_l1_btb_correct\000branch\000L1 BTB Correction\000event=0x8a\000\00000\000\000 */ +{ 72 }, /* bp_l2_btb_correct\000branch\000L2 BTB Correction\000event=0x8b\000\00000\000\000 */ +{ 325 }, /* 
dispatch_blocked.any\000other\000Memory cluster signals to block micro-op dispatch for any reason\000event=9,period=200000,umask=0x20\000\00000\000\000 */ +{ 455 }, /* eist_trans\000other\000Number of Enhanced Intel SpeedStep(R) Technology (EIST) transitions\000event=0x3a,period=200000\000\00000\000\000 */ +{ 131 }, /* l3_cache_rd\000cache\000L3 cache access, read\000event=0x40\000\00000\000Attributable Level 3 cache access, read\000 */ +{ 226 }, /* segment_reg_loads.any\000other\000Number of segment register loads\000event=6,period=200000,umask=0x80\000\00000\000\000 */ +}; +static const struct compact_pmu_event pmu_events__test_soc_cpu_hisi_sccl_ddrc[] = { +{ 585 }, /* uncore_hisi_ddrc.flux_wcmd\000uncore\000DDRC write commands\000event=2\000\00000\000DDRC write commands\000 */ +}; +static const struct compact_pmu_event pmu_events__test_soc_cpu_hisi_sccl_l3c[] = { +{ 1064 }, /* uncore_hisi_l3c.rd_hit_cpipe\000uncore\000Total read hits\000event=7\000\00000\000Total read hits\000 */ +}; +static const struct compact_pmu_event pmu_events__test_soc_cpu_uncore_cbox[] = { +{ 914 }, /* event-hyphen\000uncore\000UNC_CBO_HYPHEN\000event=0xe0\000\00000\000UNC_CBO_HYPHEN\000 */ +{ 979 }, /* event-two-hyph\000uncore\000UNC_CBO_TWO_HYPH\000event=0xc0\000\00000\000UNC_CBO_TWO_HYPH\000 */ +{ 683 }, /* unc_cbo_xsnp_response.miss_eviction\000uncore\000A cross-core snoop resulted from L3 Eviction which misses in some processor core\000event=0x22,umask=0x81\000\00000\000A cross-core snoop resulted from L3 Eviction which misses in some processor core\000 */ +}; +static const struct compact_pmu_event pmu_events__test_soc_cpu_uncore_imc[] = { +{ 1274 }, /* uncore_imc.cache_hits\000uncore\000Total cache hits\000event=0x34\000\00000\000Total cache hits\000 */ +}; +static const struct compact_pmu_event pmu_events__test_soc_cpu_uncore_imc_free_running[] = { +{ 1168 }, /* uncore_imc_free_running.cache_miss\000uncore\000Total cache misses\000event=0x12\000\00000\000Total cache misses\000 */ + +}; + +const struct pmu_table_entry pmu_events__test_soc_cpu[] = { +{ + .entries = pmu_events__test_soc_cpu_default_core, + .num_entries = ARRAY_SIZE(pmu_events__test_soc_cpu_default_core), + .pmu_name = { 0 /* default_core\000 */ }, +}, +{ + .entries = pmu_events__test_soc_cpu_hisi_sccl_ddrc, + .num_entries = ARRAY_SIZE(pmu_events__test_soc_cpu_hisi_sccl_ddrc), + .pmu_name = { 570 /* hisi_sccl,ddrc\000 */ }, +}, +{ + .entries = pmu_events__test_soc_cpu_hisi_sccl_l3c, + .num_entries = ARRAY_SIZE(pmu_events__test_soc_cpu_hisi_sccl_l3c), + .pmu_name = { 1050 /* hisi_sccl,l3c\000 */ }, +}, +{ + .entries = pmu_events__test_soc_cpu_uncore_cbox, + .num_entries = ARRAY_SIZE(pmu_events__test_soc_cpu_uncore_cbox), + .pmu_name = { 671 /* uncore_cbox\000 */ }, +}, +{ + .entries = pmu_events__test_soc_cpu_uncore_imc, + .num_entries = ARRAY_SIZE(pmu_events__test_soc_cpu_uncore_imc), + .pmu_name = { 1263 /* uncore_imc\000 */ }, +}, +{ + .entries = pmu_events__test_soc_cpu_uncore_imc_free_running, + .num_entries = ARRAY_SIZE(pmu_events__test_soc_cpu_uncore_imc_free_running), + .pmu_name = { 1144 /* uncore_imc_free_running\000 */ }, +}, }; +static const struct compact_pmu_event pmu_metrics__test_soc_cpu_default_core[] = { +{ 1696 }, /* CPI\000\0001 / IPC\000\000\000\000\000\000\000\00000 */ +{ 2377 }, /* DCache_L2_All\000\000DCache_L2_All_Hits + DCache_L2_All_Miss\000\000\000\000\000\000\000\00000 */ +{ 2149 }, /* DCache_L2_All_Hits\000\000l2_rqsts.demand_data_rd_hit + l2_rqsts.pf_hit + l2_rqsts.rfo_hit\000\000\000\000\000\000\000\00000 */ +{ 
2243 }, /* DCache_L2_All_Miss\000\000max(l2_rqsts.all_demand_data_rd - l2_rqsts.demand_data_rd_hit, 0) + l2_rqsts.pf_miss + l2_rqsts.rfo_miss\000\000\000\000\000\000\000\00000 */ +{ 2441 }, /* DCache_L2_Hits\000\000d_ratio(DCache_L2_All_Hits, DCache_L2_All)\000\000\000\000\000\000\000\00000 */ +{ 2509 }, /* DCache_L2_Misses\000\000d_ratio(DCache_L2_All_Miss, DCache_L2_All)\000\000\000\000\000\000\000\00000 */ +{ 1781 }, /* Frontend_Bound_SMT\000\000idq_uops_not_delivered.core / (4 * (cpu_clk_unhalted.thread / 2 * (1 + cpu_clk_unhalted.one_thread_active / cpu_clk_unhalted.ref_xclk)))\000\000\000\000\000\000\000\00000 */ +{ 1718 }, /* IPC\000group1\000inst_retired.any / cpu_clk_unhalted.thread\000\000\000\000\000\000\000\00000 */ +{ 2643 }, /* L1D_Cache_Fill_BW\000\00064 * l1d.replacement / 1e9 / duration_time\000\000\000\000\000\000\000\00000 */ +{ 2579 }, /* M1\000\000ipc + M2\000\000\000\000\000\000\000\00000 */ +{ 2601 }, /* M2\000\000ipc + M1\000\000\000\000\000\000\000\00000 */ +{ 2623 }, /* M3\000\0001 / M3\000\000\000\000\000\000\000\00000 */ +{ 2078 }, /* cache_miss_cycles\000group1\000dcache_miss_cpi + icache_miss_cycles\000\000\000\000\000\000\000\00000 */ +{ 1947 }, /* dcache_miss_cpi\000\000l1d\\-loads\\-misses / inst_retired.any\000\000\000\000\000\000\000\00000 */ +{ 2011 }, /* icache_miss_cycles\000\000l1i\\-loads\\-misses / inst_retired.any\000\000\000\000\000\000\000\00000 */ + +}; + +const struct pmu_table_entry pmu_metrics__test_soc_cpu[] = { +{ + .entries = pmu_metrics__test_soc_cpu_default_core, + .num_entries = ARRAY_SIZE(pmu_metrics__test_soc_cpu_default_core), + .pmu_name = { 0 /* default_core\000 */ }, +}, +}; + +static const struct compact_pmu_event pmu_events__test_soc_sys_uncore_sys_ccn_pmu[] = { +{ 1463 }, /* sys_ccn_pmu.read_cycles\000uncore\000ccn read-cycles event\000config=0x2c\0000x01\00000\000\000 */ +}; +static const struct compact_pmu_event pmu_events__test_soc_sys_uncore_sys_cmn_pmu[] = { +{ 1556 }, /* sys_cmn_pmu.hnf_cache_miss\000uncore\000Counts total cache misses in first lookup result (high priority)\000eventid=1,type=5\000(434|436|43c|43a).*\00000\000\000 */ +}; +static const struct compact_pmu_event pmu_events__test_soc_sys_uncore_sys_ddr_pmu[] = { +{ 1371 }, /* sys_ddr_pmu.write_cycles\000uncore\000ddr write-cycles event\000event=0x2b\000v8\00000\000\000 */ + +}; + +const struct pmu_table_entry pmu_events__test_soc_sys[] = { +{ + .entries = pmu_events__test_soc_sys_uncore_sys_ccn_pmu, + .num_entries = ARRAY_SIZE(pmu_events__test_soc_sys_uncore_sys_ccn_pmu), + .pmu_name = { 1444 /* uncore_sys_ccn_pmu\000 */ }, +}, +{ + .entries = pmu_events__test_soc_sys_uncore_sys_cmn_pmu, + .num_entries = ARRAY_SIZE(pmu_events__test_soc_sys_uncore_sys_cmn_pmu), + .pmu_name = { 1537 /* uncore_sys_cmn_pmu\000 */ }, +}, +{ + .entries = pmu_events__test_soc_sys_uncore_sys_ddr_pmu, + .num_entries = ARRAY_SIZE(pmu_events__test_soc_sys_uncore_sys_ddr_pmu), + .pmu_name = { 1352 /* uncore_sys_ddr_pmu\000 */ }, +}, +}; + + /* Struct used to make the PMU event table implementation opaque to callers. */ struct pmu_events_table { - const struct pmu_event *entries; + const struct pmu_table_entry *pmus; + uint32_t num_pmus; }; /* Struct used to make the PMU metric table implementation opaque to callers. */ struct pmu_metrics_table { - const struct pmu_metric *entries; + const struct pmu_table_entry *pmus; + uint32_t num_pmus; }; /* @@ -202,92 +199,191 @@ struct pmu_metrics_table { * The cpuid can contain any character other than the comma. 
*/ struct pmu_events_map { - const char *arch; - const char *cpuid; - const struct pmu_events_table event_table; - const struct pmu_metrics_table metric_table; + const char *arch; + const char *cpuid; + struct pmu_events_table event_table; + struct pmu_metrics_table metric_table; }; /* * Global table mapping each known CPU for the architecture to its * table of PMU events. */ -static const struct pmu_events_map pmu_events_map[] = { - { - .arch = "testarch", - .cpuid = "testcpu", - .event_table = { pmu_events__test_soc_cpu }, - .metric_table = { pmu_metrics__test_soc_cpu }, - }, - { - .arch = 0, - .cpuid = 0, - .event_table = { 0 }, - .metric_table = { 0 }, - }, -}; - -static const struct pmu_event pmu_events__test_soc_sys[] = { - { - .name = "sys_ddr_pmu.write_cycles", - .event = "event=0x2b", - .desc = "ddr write-cycles event. Unit: uncore_sys_ddr_pmu ", - .compat = "v8", - .topic = "uncore", - .pmu = "uncore_sys_ddr_pmu", - }, - { - .name = "sys_ccn_pmu.read_cycles", - .event = "config=0x2c", - .desc = "ccn read-cycles event. Unit: uncore_sys_ccn_pmu ", - .compat = "0x01", - .topic = "uncore", - .pmu = "uncore_sys_ccn_pmu", - }, - { - .name = "sys_cmn_pmu.hnf_cache_miss", - .event = "eventid=0x1,type=0x5", - .desc = "Counts total cache misses in first lookup result (high priority). Unit: uncore_sys_cmn_pmu ", - .compat = "(434|436|43c|43a).*", - .topic = "uncore", - .pmu = "uncore_sys_cmn_pmu", - }, - { - .name = 0, - .event = 0, - .desc = 0, - }, +const struct pmu_events_map pmu_events_map[] = { +{ + .arch = "testarch", + .cpuid = "testcpu", + .event_table = { + .pmus = pmu_events__test_soc_cpu, + .num_pmus = ARRAY_SIZE(pmu_events__test_soc_cpu), + }, + .metric_table = { + .pmus = pmu_metrics__test_soc_cpu, + .num_pmus = ARRAY_SIZE(pmu_metrics__test_soc_cpu), + } +}, +{ + .arch = 0, + .cpuid = 0, + .event_table = { 0, 0 }, + .metric_table = { 0, 0 }, +} }; struct pmu_sys_events { const char *name; - const struct pmu_events_table table; + struct pmu_events_table event_table; + struct pmu_metrics_table metric_table; }; static const struct pmu_sys_events pmu_sys_event_tables[] = { { - .table = { pmu_events__test_soc_sys }, + .event_table = { + .pmus = pmu_events__test_soc_sys, + .num_pmus = ARRAY_SIZE(pmu_events__test_soc_sys) + }, .name = "pmu_events__test_soc_sys", }, { - .table = { 0 } + .event_table = { 0, 0 }, + .metric_table = { 0, 0 }, }, }; -int pmu_events_table__for_each_event(const struct pmu_events_table *table, struct perf_pmu *pmu, - pmu_event_iter_fn fn, void *data) +static void decompress_event(int offset, struct pmu_event *pe) +{ + const char *p = &big_c_string[offset]; + + pe->name = (*p == '\0' ? NULL : p); + while (*p++); + pe->topic = (*p == '\0' ? NULL : p); + while (*p++); + pe->desc = (*p == '\0' ? NULL : p); + while (*p++); + pe->event = (*p == '\0' ? NULL : p); + while (*p++); + pe->compat = (*p == '\0' ? NULL : p); + while (*p++); + pe->deprecated = *p - '0'; + p++; + pe->perpkg = *p - '0'; + p++; + pe->unit = (*p == '\0' ? NULL : p); + while (*p++); + pe->long_desc = (*p == '\0' ? NULL : p); +} + +static void decompress_metric(int offset, struct pmu_metric *pm) { - for (const struct pmu_event *pe = &table->entries[0]; pe->name; pe++) { - int ret; + const char *p = &big_c_string[offset]; + + pm->metric_name = (*p == '\0' ? NULL : p); + while (*p++); + pm->metric_group = (*p == '\0' ? NULL : p); + while (*p++); + pm->metric_expr = (*p == '\0' ? NULL : p); + while (*p++); + pm->metric_threshold = (*p == '\0' ? NULL : p); + while (*p++); + pm->desc = (*p == '\0' ? 
NULL : p); + while (*p++); + pm->long_desc = (*p == '\0' ? NULL : p); + while (*p++); + pm->unit = (*p == '\0' ? NULL : p); + while (*p++); + pm->compat = (*p == '\0' ? NULL : p); + while (*p++); + pm->metricgroup_no_group = (*p == '\0' ? NULL : p); + while (*p++); + pm->default_metricgroup_name = (*p == '\0' ? NULL : p); + while (*p++); + pm->aggr_mode = *p - '0'; + p++; + pm->event_grouping = *p - '0'; +} - if (pmu && !pmu__name_match(pmu, pe->pmu)) +static int pmu_events_table__for_each_event_pmu(const struct pmu_events_table *table, + const struct pmu_table_entry *pmu, + pmu_event_iter_fn fn, + void *data) +{ + int ret; + struct pmu_event pe = { + .pmu = &big_c_string[pmu->pmu_name.offset], + }; + + for (uint32_t i = 0; i < pmu->num_entries; i++) { + decompress_event(pmu->entries[i].offset, &pe); + if (!pe.name) continue; + ret = fn(&pe, table, data); + if (ret) + return ret; + } + return 0; + } + +static int pmu_events_table__find_event_pmu(const struct pmu_events_table *table, + const struct pmu_table_entry *pmu, + const char *name, + pmu_event_iter_fn fn, + void *data) +{ + struct pmu_event pe = { + .pmu = &big_c_string[pmu->pmu_name.offset], + }; + int low = 0, high = pmu->num_entries - 1; - ret = fn(pe, table, data); - if (ret) - return ret; - } - return 0; + while (low <= high) { + int cmp, mid = (low + high) / 2; + + decompress_event(pmu->entries[mid].offset, &pe); + + if (!pe.name && !name) + goto do_call; + + if (!pe.name && name) { + low = mid + 1; + continue; + } + if (pe.name && !name) { + high = mid - 1; + continue; + } + + cmp = strcasecmp(pe.name, name); + if (cmp < 0) { + low = mid + 1; + continue; + } + if (cmp > 0) { + high = mid - 1; + continue; + } + do_call: + return fn ? fn(&pe, table, data) : 0; + } + return PMU_EVENTS__NOT_FOUND; +} + +int pmu_events_table__for_each_event(const struct pmu_events_table *table, + struct perf_pmu *pmu, + pmu_event_iter_fn fn, + void *data) +{ + for (size_t i = 0; i < table->num_pmus; i++) { + const struct pmu_table_entry *table_pmu = &table->pmus[i]; + const char *pmu_name = &big_c_string[table_pmu->pmu_name.offset]; + int ret; + + if (pmu && !pmu__name_match(pmu, pmu_name)) + continue; + + ret = pmu_events_table__for_each_event_pmu(table, table_pmu, fn, data); + if (pmu || ret) + return ret; + } + return 0; } int pmu_events_table__find_event(const struct pmu_events_table *table, @@ -296,14 +392,19 @@ int pmu_events_table__find_event(const struct pmu_events_table *table, pmu_event_iter_fn fn, void *data) { - for (const struct pmu_event *pe = &table->entries[0]; pe->name; pe++) { - if (pmu && !pmu__name_match(pmu, pe->pmu)) + for (size_t i = 0; i < table->num_pmus; i++) { + const struct pmu_table_entry *table_pmu = &table->pmus[i]; + const char *pmu_name = &big_c_string[table_pmu->pmu_name.offset]; + int ret; + + if (!pmu__name_match(pmu, pmu_name)) continue; - if (!strcasecmp(pe->name, name)) - return fn(pe, table, data); - } - return -1000; + ret = pmu_events_table__find_event_pmu(table, table_pmu, name, fn, data); + if (ret != PMU_EVENTS__NOT_FOUND) + return ret; + } + return PMU_EVENTS__NOT_FOUND; } size_t pmu_events_table__num_events(const struct pmu_events_table *table, @@ -311,160 +412,253 @@ size_t pmu_events_table__num_events(const struct pmu_events_table *table, { size_t count = 0; - for (const struct pmu_event *pe = &table->entries[0]; pe->name; pe++) { - if (pmu && !pmu__name_match(pmu, pe->pmu)) - continue; + for (size_t i = 0; i < table->num_pmus; i++) { + const struct pmu_table_entry *table_pmu = &table->pmus[i]; + 
const char *pmu_name = &big_c_string[table_pmu->pmu_name.offset]; - count++; - } + if (pmu__name_match(pmu, pmu_name)) + count += table_pmu->num_entries; + } return count; } -int pmu_metrics_table__for_each_metric(const struct pmu_metrics_table *table, pmu_metric_iter_fn fn, - void *data) +static int pmu_metrics_table__for_each_metric_pmu(const struct pmu_metrics_table *table, + const struct pmu_table_entry *pmu, + pmu_metric_iter_fn fn, + void *data) +{ + int ret; + struct pmu_metric pm = { + .pmu = &big_c_string[pmu->pmu_name.offset], + }; + + for (uint32_t i = 0; i < pmu->num_entries; i++) { + decompress_metric(pmu->entries[i].offset, &pm); + if (!pm.metric_expr) + continue; + ret = fn(&pm, table, data); + if (ret) + return ret; + } + return 0; +} + +int pmu_metrics_table__for_each_metric(const struct pmu_metrics_table *table, + pmu_metric_iter_fn fn, + void *data) { - for (const struct pmu_metric *pm = &table->entries[0]; pm->metric_expr; pm++) { - int ret = fn(pm, table, data); + for (size_t i = 0; i < table->num_pmus; i++) { + int ret = pmu_metrics_table__for_each_metric_pmu(table, &table->pmus[i], + fn, data); + + if (ret) + return ret; + } + return 0; +} - if (ret) - return ret; - } - return 0; +static const struct pmu_events_map *map_for_pmu(struct perf_pmu *pmu) +{ + static struct { + const struct pmu_events_map *map; + struct perf_pmu *pmu; + } last_result; + static struct { + const struct pmu_events_map *map; + char *cpuid; + } last_map_search; + static bool has_last_result, has_last_map_search; + const struct pmu_events_map *map = NULL; + char *cpuid = NULL; + size_t i; + + if (has_last_result && last_result.pmu == pmu) + return last_result.map; + + cpuid = perf_pmu__getcpuid(pmu); + + /* + * On some platforms which uses cpus map, cpuid can be NULL for + * PMUs other than CORE PMUs. + */ + if (!cpuid) + goto out_update_last_result; + + if (has_last_map_search && !strcmp(last_map_search.cpuid, cpuid)) { + map = last_map_search.map; + free(cpuid); + } else { + i = 0; + for (;;) { + map = &pmu_events_map[i++]; + + if (!map->arch) { + map = NULL; + break; + } + + if (!strcmp_cpuid_str(map->cpuid, cpuid)) + break; + } + free(last_map_search.cpuid); + last_map_search.cpuid = cpuid; + last_map_search.map = map; + has_last_map_search = true; + } +out_update_last_result: + last_result.pmu = pmu; + last_result.map = map; + has_last_result = true; + return map; } const struct pmu_events_table *perf_pmu__find_events_table(struct perf_pmu *pmu) { - const struct pmu_events_table *table = NULL; - char *cpuid = perf_pmu__getcpuid(pmu); - int i; + const struct pmu_events_map *map = map_for_pmu(pmu); - /* on some platforms which uses cpus map, cpuid can be NULL for - * PMUs other than CORE PMUs. 
- */ - if (!cpuid) - return NULL; + if (!map) + return NULL; - i = 0; - for (;;) { - const struct pmu_events_map *map = &pmu_events_map[i++]; + if (!pmu) + return &map->event_table; - if (!map->cpuid) - break; + for (size_t i = 0; i < map->event_table.num_pmus; i++) { + const struct pmu_table_entry *table_pmu = &map->event_table.pmus[i]; + const char *pmu_name = &big_c_string[table_pmu->pmu_name.offset]; - if (!strcmp_cpuid_str(map->cpuid, cpuid)) { - table = &map->event_table; - break; - } - } - free(cpuid); - return table; + if (pmu__name_match(pmu, pmu_name)) + return &map->event_table; + } + return NULL; } const struct pmu_metrics_table *perf_pmu__find_metrics_table(struct perf_pmu *pmu) { - const struct pmu_metrics_table *table = NULL; - char *cpuid = perf_pmu__getcpuid(pmu); - int i; + const struct pmu_events_map *map = map_for_pmu(pmu); - /* on some platforms which uses cpus map, cpuid can be NULL for - * PMUs other than CORE PMUs. - */ - if (!cpuid) - return NULL; + if (!map) + return NULL; - i = 0; - for (;;) { - const struct pmu_events_map *map = &pmu_events_map[i++]; + if (!pmu) + return &map->metric_table; - if (!map->cpuid) - break; + for (size_t i = 0; i < map->metric_table.num_pmus; i++) { + const struct pmu_table_entry *table_pmu = &map->metric_table.pmus[i]; + const char *pmu_name = &big_c_string[table_pmu->pmu_name.offset]; - if (!strcmp_cpuid_str(map->cpuid, cpuid)) { - table = &map->metric_table; - break; - } - } - free(cpuid); - return table; + if (pmu__name_match(pmu, pmu_name)) + return &map->metric_table; + } + return NULL; } const struct pmu_events_table *find_core_events_table(const char *arch, const char *cpuid) { - for (const struct pmu_events_map *tables = &pmu_events_map[0]; - tables->arch; - tables++) { - if (!strcmp(tables->arch, arch) && !strcmp_cpuid_str(tables->cpuid, cpuid)) - return &tables->event_table; - } - return NULL; + for (const struct pmu_events_map *tables = &pmu_events_map[0]; + tables->arch; + tables++) { + if (!strcmp(tables->arch, arch) && !strcmp_cpuid_str(tables->cpuid, cpuid)) + return &tables->event_table; + } + return NULL; } const struct pmu_metrics_table *find_core_metrics_table(const char *arch, const char *cpuid) { - for (const struct pmu_events_map *tables = &pmu_events_map[0]; - tables->arch; - tables++) { - if (!strcmp(tables->arch, arch) && !strcmp_cpuid_str(tables->cpuid, cpuid)) - return &tables->metric_table; - } - return NULL; + for (const struct pmu_events_map *tables = &pmu_events_map[0]; + tables->arch; + tables++) { + if (!strcmp(tables->arch, arch) && !strcmp_cpuid_str(tables->cpuid, cpuid)) + return &tables->metric_table; + } + return NULL; } int pmu_for_each_core_event(pmu_event_iter_fn fn, void *data) { - for (const struct pmu_events_map *tables = &pmu_events_map[0]; tables->arch; tables++) { - int ret = pmu_events_table__for_each_event(&tables->event_table, - /*pmu=*/ NULL, fn, data); - - if (ret) - return ret; - } - return 0; + for (const struct pmu_events_map *tables = &pmu_events_map[0]; + tables->arch; + tables++) { + int ret = pmu_events_table__for_each_event(&tables->event_table, + /*pmu=*/ NULL, fn, data); + + if (ret) + return ret; + } + return 0; } int pmu_for_each_core_metric(pmu_metric_iter_fn fn, void *data) { - for (const struct pmu_events_map *tables = &pmu_events_map[0]; - tables->arch; - tables++) { - int ret = pmu_metrics_table__for_each_metric(&tables->metric_table, fn, data); - - if (ret) - return ret; - } - return 0; + for (const struct pmu_events_map *tables = &pmu_events_map[0]; + 
tables->arch; + tables++) { + int ret = pmu_metrics_table__for_each_metric(&tables->metric_table, fn, data); + + if (ret) + return ret; + } + return 0; } const struct pmu_events_table *find_sys_events_table(const char *name) { - for (const struct pmu_sys_events *tables = &pmu_sys_event_tables[0]; - tables->name; - tables++) { - if (!strcmp(tables->name, name)) - return &tables->table; - } - return NULL; + for (const struct pmu_sys_events *tables = &pmu_sys_event_tables[0]; + tables->name; + tables++) { + if (!strcmp(tables->name, name)) + return &tables->event_table; + } + return NULL; } int pmu_for_each_sys_event(pmu_event_iter_fn fn, void *data) { - for (const struct pmu_sys_events *tables = &pmu_sys_event_tables[0]; - tables->name; - tables++) { - int ret = pmu_events_table__for_each_event(&tables->table, /*pmu=*/ NULL, fn, data); - - if (ret) - return ret; - } - return 0; + for (const struct pmu_sys_events *tables = &pmu_sys_event_tables[0]; + tables->name; + tables++) { + int ret = pmu_events_table__for_each_event(&tables->event_table, + /*pmu=*/ NULL, fn, data); + + if (ret) + return ret; + } + return 0; } -int pmu_for_each_sys_metric(pmu_metric_iter_fn fn __maybe_unused, void *data __maybe_unused) +int pmu_for_each_sys_metric(pmu_metric_iter_fn fn, void *data) { - return 0; + for (const struct pmu_sys_events *tables = &pmu_sys_event_tables[0]; + tables->name; + tables++) { + int ret = pmu_metrics_table__for_each_metric(&tables->metric_table, fn, data); + + if (ret) + return ret; + } + return 0; } -const char *describe_metricgroup(const char *group __maybe_unused) +static const int metricgroups[][2] = { + +}; + +const char *describe_metricgroup(const char *group) { - return NULL; + int low = 0, high = (int)ARRAY_SIZE(metricgroups) - 1; + + while (low <= high) { + int mid = (low + high) / 2; + const char *mgroup = &big_c_string[metricgroups[mid][0]]; + int cmp = strcmp(mgroup, group); + + if (cmp == 0) { + return &big_c_string[metricgroups[mid][1]]; + } else if (cmp < 0) { + low = mid + 1; + } else { + high = mid - 1; + } + } + return NULL; } diff --git a/tools/perf/pmu-events/jevents.py b/tools/perf/pmu-events/jevents.py index 731776e29f47..fcf0158438b5 100755 --- a/tools/perf/pmu-events/jevents.py +++ b/tools/perf/pmu-events/jevents.py @@ -1256,6 +1256,10 @@ such as "arm/cortex-a34".''', 'output_file', type=argparse.FileType('w', encoding='utf-8'), nargs='?', default=sys.stdout) _args = ap.parse_args() + _args.output_file.write(f""" +/* SPDX-License-Identifier: GPL-2.0 */ +/* THIS FILE WAS AUTOGENERATED BY jevents.py arch={_args.arch} model={_args.model} ! */ +""") _args.output_file.write(""" #include #include "util/header.h" @@ -1281,7 +1285,7 @@ struct pmu_table_entry { if item.name == _args.arch or _args.arch == 'all' or item.name == 'test': archs.append(item.name) - if len(archs) < 2: + if len(archs) < 2 and _args.arch != 'none': raise IOError(f'Missing architecture directory \'{_args.arch}\'') archs.sort() -- cgit v1.2.3 From 7c5dd51bbb6767095df8a5e9e2b4528b8af18538 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 31 Jul 2024 16:00:05 -0700 Subject: perf python: Remove PYTHON_PERF ifdefs When perf code was compiled one way for the binary and another for the python module, the PYTHON_PERF ifdef was used to remove some code from the python module. Since switching to building the perf code as a series of libraries, with the same libraries being used for the python module, the ifdefs became unused as PYTHON_PERF is never defined. As such remove the ifdefs. 
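For readers unfamiliar with the old setup, the sketch below condenses the pattern being removed; it is an illustration, not an exact hunk (the real hunks follow). Code guarded by #ifndef PYTHON_PERF was meant to be left out of the python module, but since nothing defines PYTHON_PERF after the switch to shared libraries, the guarded branch is always compiled and the guards are dead weight:

	/* Condensed illustration of the now-dead guard (see hunks below): */
	#ifndef PYTHON_PERF	/* never defined since the library switch... */
		if (zstd_init(&map->zstd_data, mp->comp_level)) {
			pr_debug2("failed to init mmap compressor, error %d\n", errno);
			return -1;
		}
	#endif			/* ...so every build takes this path anyway */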
Fixes: 9dabf4003423c8d3 ("perf python: Switch module to linking libraries from building source") Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240731230005.12295-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel_fprintf.c | 2 -- tools/perf/util/mmap.c | 4 ---- 2 files changed, 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/evsel_fprintf.c b/tools/perf/util/evsel_fprintf.c index 8719b3cb5646..c2c0500d5da9 100644 --- a/tools/perf/util/evsel_fprintf.c +++ b/tools/perf/util/evsel_fprintf.c @@ -107,7 +107,6 @@ out: return ++printed; } -#ifndef PYTHON_PERF int sample__fprintf_callchain(struct perf_sample *sample, int left_alignment, unsigned int print_opts, struct callchain_cursor *cursor, struct strlist *bt_stop_list, FILE *fp) @@ -248,4 +247,3 @@ int sample__fprintf_sym(struct perf_sample *sample, struct addr_location *al, return printed; } -#endif /* PYTHON_PERF */ diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index 122ee198a86e..43b02293f1d2 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -230,9 +230,7 @@ void mmap__munmap(struct mmap *map) { bitmap_free(map->affinity_mask.bits); -#ifndef PYTHON_PERF zstd_fini(&map->zstd_data); -#endif perf_mmap__aio_munmap(map); if (map->data != NULL) { @@ -295,12 +293,10 @@ int mmap__mmap(struct mmap *map, struct mmap_params *mp, int fd, struct perf_cpu map->core.flush = mp->flush; -#ifndef PYTHON_PERF if (zstd_init(&map->zstd_data, mp->comp_level)) { pr_debug2("failed to init mmap compressor, error %d\n", errno); return -1; } -#endif if (mp->comp_level && !perf_mmap__aio_enabled(map)) { map->data = mmap(NULL, mmap__mmap_len(map), PROT_READ|PROT_WRITE, -- cgit v1.2.3 From 96465e0179fa8a7059bd48a81a81889dcad7d543 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 31 Jul 2024 16:55:00 -0700 Subject: perf hist: Correct hist_entry->mem_info refcounts The 'struct mem_info' is created by iter_prepare_mem_entry() at the beginning and destroyed by iter_finish_mem_entry() at the end. So if it's used in a new hist_entry, it should be cloned. Simplify (hopefully) the logic by adding some helper functions and by not holding the refcount in the temporary entry. 
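The resulting ownership rule, condensed into a sketch (the helper names are from this patch; the surrounding iterator code is simplified for illustration):

	struct mem_info *mi = mem_info__new();	/* owned by the iterator */

	/* hist_entry__init() keeps its own copy instead of the borrowed one */
	he->mem_info = mem_info__clone(mi);
	if (he->mem_info == NULL)
		goto err_infos;			/* clone failed */

	/* iter_finish_mem_entry() can now drop the iterator's reference
	 * without invalidating what the hist_entry holds */
	mem_info__put(mi);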
Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240731235505.710436-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/hist.c | 14 +++++++------- tools/perf/util/map_symbol.c | 18 ++++++++++++++++++ tools/perf/util/map_symbol.h | 3 +++ tools/perf/util/mem-info.c | 13 +++++++++++++ tools/perf/util/mem-info.h | 1 + 5 files changed, 42 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index f028f113c4fd..f8ee1cd6929d 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -476,6 +476,12 @@ static int hist_entry__init(struct hist_entry *he, he->branch_info->to.ms.map = map__get(he->branch_info->to.ms.map); } + if (he->mem_info) { + he->mem_info = mem_info__clone(template->mem_info); + if (he->mem_info == NULL) + goto err_infos; + } + if (hist_entry__has_callchains(he) && symbol_conf.use_callchain) callchain_init(he->callchain); @@ -620,12 +626,6 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists, if (symbol_conf.cumulate_callchain) he_stat__add_period(he->stat_acc, period); - /* - * This mem info was allocated from sample__resolve_mem - * and will not be used anymore. - */ - mem_info__zput(entry->mem_info); - block_info__delete(entry->block_info); kvm_info__zput(entry->kvm_info); @@ -739,7 +739,7 @@ __hists__add_entry(struct hists *hists, .filtered = symbol__parent_filter(sym_parent) | al->filtered, .hists = hists, .branch_info = bi, - .mem_info = mem_info__get(mi), + .mem_info = mi, .kvm_info = ki, .block_info = block_info, .transaction = sample->transaction, diff --git a/tools/perf/util/map_symbol.c b/tools/perf/util/map_symbol.c index bef5079f2403..6ad2960bc289 100644 --- a/tools/perf/util/map_symbol.c +++ b/tools/perf/util/map_symbol.c @@ -13,3 +13,21 @@ void addr_map_symbol__exit(struct addr_map_symbol *ams) { map_symbol__exit(&ams->ms); } + +void map_symbol__copy(struct map_symbol *dst, struct map_symbol *src) +{ + dst->maps = maps__get(src->maps); + dst->map = map__get(src->map); + dst->sym = src->sym; +} + +void addr_map_symbol__copy(struct addr_map_symbol *dst, struct addr_map_symbol *src) +{ + map_symbol__copy(&dst->ms, &src->ms); + + dst->addr = src->addr; + dst->al_addr = src->al_addr; + dst->al_level = src->al_level; + dst->phys_addr = src->phys_addr; + dst->data_page_size = src->data_page_size; +} diff --git a/tools/perf/util/map_symbol.h b/tools/perf/util/map_symbol.h index 72d5ed938ed6..e370bb32ed47 100644 --- a/tools/perf/util/map_symbol.h +++ b/tools/perf/util/map_symbol.h @@ -26,4 +26,7 @@ struct addr_map_symbol { void map_symbol__exit(struct map_symbol *ms); void addr_map_symbol__exit(struct addr_map_symbol *ams); +void map_symbol__copy(struct map_symbol *dst, struct map_symbol *src); +void addr_map_symbol__copy(struct addr_map_symbol *dst, struct addr_map_symbol *src); + #endif // __PERF_MAP_SYMBOL diff --git a/tools/perf/util/mem-info.c b/tools/perf/util/mem-info.c index 27d67721a695..d3efa9c139f2 100644 --- a/tools/perf/util/mem-info.c +++ b/tools/perf/util/mem-info.c @@ -33,3 +33,16 @@ struct mem_info *mem_info__new(void) return result; } + +struct mem_info *mem_info__clone(struct mem_info *mi) +{ + struct mem_info *result = mem_info__new(); + + if (result) { + addr_map_symbol__copy(mem_info__iaddr(result), mem_info__iaddr(mi)); + addr_map_symbol__copy(mem_info__daddr(result), mem_info__daddr(mi)); + 
mem_info__data_src(result)->val = mem_info__data_src(mi)->val; + } + + return result; +} diff --git a/tools/perf/util/mem-info.h b/tools/perf/util/mem-info.h index 0f68e29f311b..df75e94ed3d0 100644 --- a/tools/perf/util/mem-info.h +++ b/tools/perf/util/mem-info.h @@ -15,6 +15,7 @@ DECLARE_RC_STRUCT(mem_info) { }; struct mem_info *mem_info__new(void); +struct mem_info *mem_info__clone(struct mem_info *mi); struct mem_info *mem_info__get(struct mem_info *mi); void mem_info__put(struct mem_info *mi); -- cgit v1.2.3 From 3da209bb1177462b6fe8e3021a5527a5a49a9336 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 31 Jul 2024 16:55:01 -0700 Subject: perf mem: Free the allocated sort string, fixing a leak The get_sort_order() returns either a new string (from strdup()) or NULL, but the returned string never gets freed. Signed-off-by: Namhyung Kim Fixes: 2e7f545096f954a9 ("perf mem: Factor out a function to generate sort order") Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240731235505.710436-3-namhyung@kernel.org [ Added Fixes tag ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-mem.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index 863fcd735dae..93413cfcd585 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -372,6 +372,7 @@ static int report_events(int argc, const char **argv, struct perf_mem *mem) rep_argv[i] = argv[j]; ret = cmd_report(i, rep_argv); + free(new_sort_order); free(rep_argv); return ret; } -- cgit v1.2.3 From 35b38a71c92fa033aa6c8d681ee827e215254964 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 31 Jul 2024 16:55:02 -0700 Subject: perf mem: Rework command option handling Split the common options and the ones for record or report. Otherwise -U in the record option cannot be used because it clashes with the -U (hide-unresolved) in the common (or report) options. Also rename report_events() to __cmd_report() to follow the convention and to be in sync with the record part. Also set the flag PARSE_OPT_STOP_AT_NON_OPTION for the common option so that it can show the help message in the subcommand like below: $ perf mem record -h Usage: perf mem record [] [] or: perf mem record [] -- [] -C, --cpu list of cpus to profile -e, --event event selector.
use 'perf mem record -e list' to list available events -f, --force don't complain, do it -K, --all-kernel collect only kernel level data -p, --phys-data Record/Report sample physical addresses -t, --type memory operations(load,store) Default load,store -U, --all-user collect only user level data -v, --verbose be more verbose (show counter open errors, etc) --data-page-size Record/Report sample data address page size --ldlat mem-loads latency Cc: Aditya Gupta Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240731235505.710436-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-mem.c | 79 +++++++++++++++++++++++++++--------------------- 1 file changed, 45 insertions(+), 34 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index 93413cfcd585..819edaf6b1df 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -34,6 +34,8 @@ struct perf_mem { bool force; bool phys_addr; bool data_page_size; + bool all_kernel; + bool all_user; int operation; const char *cpu_list; DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS); @@ -62,33 +64,19 @@ static int parse_record_events(const struct option *opt, return 0; } -static const char * const __usage[] = { - "perf mem record [] []", - "perf mem record [] -- []", - NULL -}; - -static const char * const *record_mem_usage = __usage; - -static int __cmd_record(int argc, const char **argv, struct perf_mem *mem) +static int __cmd_record(int argc, const char **argv, struct perf_mem *mem, + const struct option *options) { int rec_argc, i = 0, j; int start, end; const char **rec_argv; int ret; - bool all_user = false, all_kernel = false; struct perf_mem_event *e; struct perf_pmu *pmu; - struct option options[] = { - OPT_CALLBACK('e', "event", &mem, "event", - "event selector. use 'perf mem record -e list' to list available events", - parse_record_events), - OPT_UINTEGER(0, "ldlat", &perf_mem_events__loads_ldlat, "mem-loads latency"), - OPT_INCR('v', "verbose", &verbose, - "be more verbose (show counter open errors, etc)"), - OPT_BOOLEAN('U', "all-user", &all_user, "collect only user level data"), - OPT_BOOLEAN('K', "all-kernel", &all_kernel, "collect only kernel level data"), - OPT_END() + const char * const record_usage[] = { + "perf mem record [] []", + "perf mem record [] -- []", + NULL }; pmu = perf_mem_events_find_pmu(); @@ -102,7 +90,7 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem) return -1; } - argc = parse_options(argc, argv, options, record_mem_usage, + argc = parse_options(argc, argv, options, record_usage, PARSE_OPT_KEEP_UNKNOWN); /* Max number of arguments multiplied by number of PMUs that can support them. */ @@ -158,10 +146,10 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem) goto out; end = i; - if (all_user) + if (mem->all_user) rec_argv[i++] = "--all-user"; - if (all_kernel) + if (mem->all_kernel) rec_argv[i++] = "--all-kernel"; if (mem->cpu_list) { @@ -319,6 +307,7 @@ out_delete: perf_session__delete(session); return ret; } + static char *get_sort_order(struct perf_mem *mem) { bool has_extra_options = (mem->phys_addr | mem->data_page_size) ? 
true : false; @@ -346,11 +335,19 @@ static char *get_sort_order(struct perf_mem *mem) return strdup(sort); } -static int report_events(int argc, const char **argv, struct perf_mem *mem) +static int __cmd_report(int argc, const char **argv, struct perf_mem *mem, + const struct option *options) { const char **rep_argv; int ret, i = 0, j, rep_argc; char *new_sort_order; + const char * const report_usage[] = { + "perf mem report []", + NULL + }; + + argc = parse_options(argc, argv, options, report_usage, + PARSE_OPT_KEEP_UNKNOWN); if (mem->dump_raw) return report_raw_events(mem); @@ -368,7 +365,7 @@ static int report_events(int argc, const char **argv, struct perf_mem *mem) if (new_sort_order) rep_argv[i++] = new_sort_order; - for (j = 1; j < argc; j++, i++) + for (j = 0; j < argc; j++, i++) rep_argv[i] = argv[j]; ret = cmd_report(i, rep_argv); @@ -475,22 +472,36 @@ int cmd_mem(int argc, const char **argv) OPT_CALLBACK('t', "type", &mem.operation, "type", "memory operations(load,store) Default load,store", parse_mem_ops), + OPT_STRING('C', "cpu", &mem.cpu_list, "cpu", + "list of cpus to profile"), + OPT_BOOLEAN('f', "force", &mem.force, "don't complain, do it"), + OPT_INCR('v', "verbose", &verbose, + "be more verbose (show counter open errors, etc)"), + OPT_BOOLEAN('p', "phys-data", &mem.phys_addr, "Record/Report sample physical addresses"), + OPT_BOOLEAN(0, "data-page-size", &mem.data_page_size, "Record/Report sample data address page size"), + OPT_END() + }; + const struct option record_options[] = { + OPT_CALLBACK('e', "event", &mem, "event", + "event selector. use 'perf mem record -e list' to list available events", + parse_record_events), + OPT_UINTEGER(0, "ldlat", &perf_mem_events__loads_ldlat, "mem-loads latency"), + OPT_BOOLEAN('U', "all-user", &mem.all_user, "collect only user level data"), + OPT_BOOLEAN('K', "all-kernel", &mem.all_kernel, "collect only kernel level data"), + OPT_PARENT(mem_options) + }; + const struct option report_options[] = { OPT_BOOLEAN('D', "dump-raw-samples", &mem.dump_raw, "dump raw samples in ASCII"), OPT_BOOLEAN('U', "hide-unresolved", &mem.hide_unresolved, "Only display entries resolved to a symbol"), OPT_STRING('i', "input", &input_name, "file", "input file name"), - OPT_STRING('C', "cpu", &mem.cpu_list, "cpu", - "list of cpus to profile"), OPT_STRING_NOEMPTY('x', "field-separator", &symbol_conf.field_sep, "separator", "separator for columns, no spaces will be added" " between columns '.' 
is reserved."), - OPT_BOOLEAN('f', "force", &mem.force, "don't complain, do it"), - OPT_BOOLEAN('p', "phys-data", &mem.phys_addr, "Record/Report sample physical addresses"), - OPT_BOOLEAN(0, "data-page-size", &mem.data_page_size, "Record/Report sample data address page size"), - OPT_END() + OPT_PARENT(mem_options) }; const char *const mem_subcommands[] = { "record", "report", NULL }; const char *mem_usage[] = { @@ -499,7 +510,7 @@ int cmd_mem(int argc, const char **argv) }; argc = parse_options_subcommand(argc, argv, mem_options, mem_subcommands, - mem_usage, PARSE_OPT_KEEP_UNKNOWN); + mem_usage, PARSE_OPT_STOP_AT_NON_OPTION); if (!argc || !(strncmp(argv[0], "rec", 3) || mem.operation)) usage_with_options(mem_usage, mem_options); @@ -512,9 +523,9 @@ int cmd_mem(int argc, const char **argv) } if (strlen(argv[0]) > 2 && strstarts("record", argv[0])) - return __cmd_record(argc, argv, &mem); + return __cmd_record(argc, argv, &mem, record_options); else if (strlen(argv[0]) > 2 && strstarts("report", argv[0])) - return report_events(argc, argv, &mem); + return __cmd_report(argc, argv, &mem, report_options); else usage_with_options(mem_usage, mem_options); -- cgit v1.2.3 From 871893d748cce346097d907b9937604af3dafcf1 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 31 Jul 2024 16:55:03 -0700 Subject: perf tools: Add mode argument to sort_help() Some sort keys are meaningful only in a specific mode - like branch stack and memory (data-src). Add the mode to skip unnecessary ones. This will be used for 'perf mem report' later. While at it, change the prefix for the -F/--fields option to remove the duplicate part. Before: $ perf report -F Error: switch `F' requires a value Usage: perf report [] -F, --fields output field(s): overhead period sample overhead overhead_sys overhead_us overhead_guest_sys overhead_guest_us overhead_children sample period weight1 weight2 weight3 ins_lat retire_lat ... After: $ perf report -F Error: switch `F' requires a value Usage: perf report [] -F, --fields output field(s): overhead overhead_sys overhead_us overhead_guest_sys overhead_guest_us overhead_children sample period weight1 weight2 weight3 ins_lat retire_lat ... 
Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240731235505.710436-5-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 4 ++-- tools/perf/util/sort.c | 12 +++++++----- tools/perf/util/sort.h | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 6edc0d4ce6fb..930052961c1a 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -1301,8 +1301,8 @@ int cmd_report(int argc, const char **argv) .socket_filter = -1, .skip_empty = true, }; - char *sort_order_help = sort_help("sort by key(s):"); - char *field_order_help = sort_help("output field(s): overhead period sample "); + char *sort_order_help = sort_help("sort by key(s):", SORT_MODE__NORMAL); + char *field_order_help = sort_help("output field(s):", SORT_MODE__NORMAL); const char *disassembler_style = NULL, *objdump_path = NULL, *addr2line_path = NULL; const struct option options[] = { OPT_STRING('i', "input", &input_name, "file", diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index ab7c7ff35f9b..c4046d5d1749 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -3960,7 +3960,7 @@ static void add_hpp_sort_string(struct strbuf *sb, struct hpp_dimension *s, int add_key(sb, s[i].name, llen); } -char *sort_help(const char *prefix) +char *sort_help(const char *prefix, enum sort_mode mode) { struct strbuf sb; char *s; @@ -3972,10 +3972,12 @@ char *sort_help(const char *prefix) ARRAY_SIZE(hpp_sort_dimensions), &len); add_sort_string(&sb, common_sort_dimensions, ARRAY_SIZE(common_sort_dimensions), &len); - add_sort_string(&sb, bstack_sort_dimensions, - ARRAY_SIZE(bstack_sort_dimensions), &len); - add_sort_string(&sb, memory_sort_dimensions, - ARRAY_SIZE(memory_sort_dimensions), &len); + if (mode == SORT_MODE__NORMAL || mode == SORT_MODE__BRANCH) + add_sort_string(&sb, bstack_sort_dimensions, + ARRAY_SIZE(bstack_sort_dimensions), &len); + if (mode == SORT_MODE__NORMAL || mode == SORT_MODE__MEMORY) + add_sort_string(&sb, memory_sort_dimensions, + ARRAY_SIZE(memory_sort_dimensions), &len); s = strbuf_detach(&sb, NULL); strbuf_release(&sb); return s; diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index 0bd0ee3ae76b..6357bc32c5ca 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -130,7 +130,7 @@ void reset_output_field(void); void sort__setup_elide(FILE *fp); void perf_hpp__set_elide(int idx, bool elide); -char *sort_help(const char *prefix); +char *sort_help(const char *prefix, enum sort_mode mode); int report_parse_ignore_callees_opt(const struct option *opt, const char *arg, int unset); -- cgit v1.2.3 From 2d99a991337f70a7d11135d6a539bed8c060cb0f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 31 Jul 2024 16:55:04 -0700 Subject: perf mem: Add -s/--sort option So that users can set the sort key manually as they want. 
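A condensed sketch of the precedence this gives the new key inside get_sort_order(), with the condition order as in the hunk further below (default sort strings shortened for illustration):

	if (mem->sort_key)	/* user-supplied -s/--sort wins */
		scnprintf(sort, sizeof(sort), "--sort=%s", mem->sort_key);
	else if (!(mem->operation & MEM_OPERATION_LOAD))
		strcpy(sort, "--sort=mem,sym,dso,symbol_daddr,dso_daddr,tlb,locked");
	/* else: keep the default load sort order, plus any extra keys */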
$ perf mem report -s Error: switch `s' requires a value Usage: perf mem report [] -s, --sort sort by key(s): overhead overhead_sys overhead_us overhead_guest_sys overhead_guest_us overhead_children sample period weight1 weight2 weight3 ins_lat retire_lat p_stage_cyc pid comm dso symbol parent cpu socket srcline srcfile local_weight weight transaction trace symbol_size dso_size cgroup cgroup_id ipc_null time code_page_size local_ins_lat ins_lat local_p_stage_cyc p_stage_cyc addr local_retire_lat retire_lat simd type typeoff symoff symbol_daddr dso_daddr locked tlb mem snoop dcacheline symbol_iaddr phys_daddr data_page_size blocked Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240731235505.710436-6-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-mem.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index 819edaf6b1df..24a4f0084f49 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -19,6 +19,7 @@ #include "util/symbol.h" #include "util/pmus.h" #include "util/sample.h" +#include "util/sort.h" #include "util/string2.h" #include "util/util.h" #include @@ -28,7 +29,8 @@ struct perf_mem { struct perf_tool tool; - char const *input_name; + const char *input_name; + const char *sort_key; bool hide_unresolved; bool dump_raw; bool force; @@ -313,11 +315,13 @@ static char *get_sort_order(struct perf_mem *mem) bool has_extra_options = (mem->phys_addr | mem->data_page_size) ? true : false; char sort[128]; + if (mem->sort_key) + scnprintf(sort, sizeof(sort), "--sort=%s", mem->sort_key); /* * there is no weight (cost) associated with stores, so don't print * the column */ - if (!(mem->operation & MEM_OPERATION_LOAD)) { + else if (!(mem->operation & MEM_OPERATION_LOAD)) { strcpy(sort, "--sort=mem,sym,dso,symbol_daddr," "dso_daddr,tlb,locked"); } else if (has_extra_options) { @@ -468,6 +472,7 @@ int cmd_mem(int argc, const char **argv) */ .operation = MEM_OPERATION_LOAD | MEM_OPERATION_STORE, }; + char *sort_order_help = sort_help("sort by key(s):", SORT_MODE__MEMORY); const struct option mem_options[] = { OPT_CALLBACK('t', "type", &mem.operation, "type", "memory operations(load,store) Default load,store", @@ -501,6 +506,8 @@ int cmd_mem(int argc, const char **argv) "separator", "separator for columns, no spaces will be added" " between columns '.' is reserved."), + OPT_STRING('s', "sort", &mem.sort_key, "key[,key2...]", + sort_order_help), OPT_PARENT(mem_options) }; const char *const mem_subcommands[] = { "record", "report", NULL }; -- cgit v1.2.3 From 7320ad972510415515d00838451c51f2db3974a7 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 31 Jul 2024 16:55:05 -0700 Subject: perf mem: Add -T/--data-type option to report subcommand This is just a shortcut to have 'type' in the sort key and use more compact output format like below. $ perf mem report -T ... # # Overhead Samples Memory access Snoop TLB access Data Type # ........ ............ ....................................... ............ ...................... ......... 
# 14.84% 22 L1 hit None L1 or L2 hit (unknown) 7.68% 8 LFB/MAB hit None L1 or L2 hit (unknown) 7.17% 3 RAM hit Hit L2 miss (unknown) 6.29% 12 L1 hit None L1 or L2 hit (stack operation) 4.85% 5 RAM hit Hit L1 or L2 hit (unknown) 3.97% 5 LFB/MAB hit None L1 or L2 hit struct psi_group_cpu 3.18% 3 LFB/MAB hit None L1 or L2 hit (stack operation) 2.58% 3 L1 hit None L1 or L2 hit unsigned int 2.36% 2 L1 hit None L1 or L2 hit struct 2.31% 2 L1 hit None L1 or L2 hit struct psi_group_cpu ... Users also can use their own sort keys and -T option makes sure it has the 'type' sort key at the end. $ perf mem report -T -s mem Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240731235505.710436-7-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-mem.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'tools') diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index 24a4f0084f49..efa700d14c98 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -38,6 +38,7 @@ struct perf_mem { bool data_page_size; bool all_kernel; bool all_user; + bool data_type; int operation; const char *cpu_list; DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS); @@ -317,6 +318,8 @@ static char *get_sort_order(struct perf_mem *mem) if (mem->sort_key) scnprintf(sort, sizeof(sort), "--sort=%s", mem->sort_key); + else if (mem->data_type) + strcpy(sort, "--sort=mem,snoop,tlb,type"); /* * there is no weight (cost) associated with stores, so don't print * the column @@ -336,6 +339,10 @@ static char *get_sort_order(struct perf_mem *mem) if (mem->data_page_size) strcat(sort, ",data_page_size"); + /* make sure it has 'type' sort key even -s option is used */ + if (mem->data_type && !strstr(sort, "type")) + strcat(sort, ",type"); + return strdup(sort); } @@ -508,6 +515,8 @@ int cmd_mem(int argc, const char **argv) " between columns '.' is reserved."), OPT_STRING('s', "sort", &mem.sort_key, "key[,key2...]", sort_order_help), + OPT_BOOLEAN('T', "type-profile", &mem.data_type, + "Show data-type profile result"), OPT_PARENT(mem_options) }; const char *const mem_subcommands[] = { "record", "report", NULL }; -- cgit v1.2.3 From 39e470057f785eab7b6638feb3f24cbad6816aff Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 18 Jul 2024 15:47:41 +0200 Subject: tools/x86/kcpuid: Remove unused variable Global variable "num_leafs" is set in multiple places but is never read anywhere. Remove it. Signed-off-by: Ahmed S. 
Darwish Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240718134755.378115-2-darwi@linutronix.de --- tools/arch/x86/kcpuid/kcpuid.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'tools') diff --git a/tools/arch/x86/kcpuid/kcpuid.c b/tools/arch/x86/kcpuid/kcpuid.c index 24b7d017ec2c..e1973d8b322e 100644 --- a/tools/arch/x86/kcpuid/kcpuid.c +++ b/tools/arch/x86/kcpuid/kcpuid.c @@ -76,7 +76,6 @@ struct cpuid_range { */ struct cpuid_range *leafs_basic, *leafs_ext; -static int num_leafs; static bool is_amd; static bool show_details; static bool show_raw; @@ -246,7 +245,6 @@ struct cpuid_range *setup_cpuid_range(u32 input_eax) allzero = cpuid_store(range, f, subleaf, eax, ebx, ecx, edx); if (allzero) continue; - num_leafs++; if (!has_subleafs(f)) continue; @@ -272,7 +270,6 @@ struct cpuid_range *setup_cpuid_range(u32 input_eax) eax, ebx, ecx, edx); if (allzero) continue; - num_leafs++; } } -- cgit v1.2.3 From a52e735f282c963155090d2d60726324ccd0e4bc Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 18 Jul 2024 15:47:42 +0200 Subject: tools/x86/kcpuid: Properly align long-description columns When kcpuid is invoked with "--all --details", the detailed description column is not properly aligned for all bitfield rows: CPUID_0x4_ECX[0x0]: cache_level : 0x1 - Cache Level ... cache_self_init - Cache Self Initialization This is due to differences in output handling between boolean single-bit "bitflags" and multi-bit bitfields. For the former, the bitfield's value is not printed, as it is implied to be true by just printing the bitflag's name on its respective line. If long descriptions were requested through the --all parameter, properly align the bitflag's description columns through extra tabs. With that, the sample output above becomes: CPUID_0x4_ECX[0x0]: cache_level : 0x1 - Cache Level ... cache_self_init - Cache Self Initialization Signed-off-by: Ahmed S. Darwish Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240718134755.378115-3-darwi@linutronix.de --- tools/arch/x86/kcpuid/kcpuid.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/arch/x86/kcpuid/kcpuid.c b/tools/arch/x86/kcpuid/kcpuid.c index e1973d8b322e..08f64d9ecb40 100644 --- a/tools/arch/x86/kcpuid/kcpuid.c +++ b/tools/arch/x86/kcpuid/kcpuid.c @@ -449,8 +449,9 @@ static void decode_bits(u32 value, struct reg_desc *rdesc, enum cpuid_reg reg) if (start == end) { /* single bit flag */ if (value & (1 << start)) - printf("\t%-20s %s%s\n", + printf("\t%-20s %s%s%s\n", bdesc->simp, + show_flags_only ? "" : "\t\t\t", show_details ? "-" : "", show_details ? bdesc->detail : "" ); -- cgit v1.2.3 From 5dd7ca42475bb15fd93d58d42e925512776f14a4 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 18 Jul 2024 15:47:43 +0200 Subject: tools/x86/kcpuid: Set max possible subleaves count to 64 cpuid.csv will be extended in further commits with all-publicly-known CPUID leaves and bitfields. One of the new leaves is 0xd for extended CPU state enumeration. Depending on XCR0 dword bits, it can export up to 64 subleaves. Set kcpuid.c MAX_SUBLEAF_NUM to 64.
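To see why the cap matters, here is a stand-alone sketch that walks leaf 0xd subleaves up to the new limit, skipping all-zero subleaves the way kcpuid's setup_cpuid_range() does. It assumes GCC/Clang's <cpuid.h> on x86 and is illustrative only, not kcpuid code.

#include <stdio.h>
#include <cpuid.h>	/* GCC/Clang x86 helper; provides __get_cpuid_count() */

#define MAX_SUBLEAF_NUM 64	/* the new kcpuid cap */

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	for (unsigned int sub = 0; sub < MAX_SUBLEAF_NUM; sub++) {
		if (!__get_cpuid_count(0xd, sub, &eax, &ebx, &ecx, &edx))
			break;		/* leaf not supported at all */
		if (!(eax | ebx | ecx | edx))
			continue;	/* kcpuid skips all-zero subleaves too */
		printf("0xd subleaf %2u: %08x %08x %08x %08x\n",
		       sub, eax, ebx, ecx, edx);
	}
	return 0;
}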
Signed-off-by: Ahmed S. Darwish Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240718134755.378115-4-darwi@linutronix.de --- tools/arch/x86/kcpuid/kcpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/arch/x86/kcpuid/kcpuid.c b/tools/arch/x86/kcpuid/kcpuid.c index 08f64d9ecb40..a87cddc19554 100644 --- a/tools/arch/x86/kcpuid/kcpuid.c +++ b/tools/arch/x86/kcpuid/kcpuid.c @@ -203,7 +203,7 @@ static void raw_dump_range(struct cpuid_range *range) } } -#define MAX_SUBLEAF_NUM 32 +#define MAX_SUBLEAF_NUM 64 struct cpuid_range *setup_cpuid_range(u32 input_eax) { u32 max_func, idx_func; -- cgit v1.2.3 From cf96ab1a966b87b09fdd9e8cc8357d2d00776a3a Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 18 Jul 2024 15:47:44 +0200 Subject: tools/x86/kcpuid: Protect against faulty "max subleaf" values Protect against the kcpuid code parsing faulty max subleaf numbers through a min() expression, thus ensuring that max_subleaf will always be ≤ MAX_SUBLEAF_NUM. Use "u32" for the subleaf numbers since kcpuid is compiled with -Wextra, which includes signed/unsigned comparison warnings. Signed-off-by: Ahmed S. Darwish Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240718134755.378115-5-darwi@linutronix.de --- tools/arch/x86/kcpuid/kcpuid.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/arch/x86/kcpuid/kcpuid.c b/tools/arch/x86/kcpuid/kcpuid.c index a87cddc19554..581d28c861ee 100644 --- a/tools/arch/x86/kcpuid/kcpuid.c +++ b/tools/arch/x86/kcpuid/kcpuid.c @@ -7,7 +7,8 @@ #include #include -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +#define min(a, b) (((a) < (b)) ? (a) : (b)) typedef unsigned int u32; typedef unsigned long long u64; @@ -206,12 +207,9 @@ static void raw_dump_range(struct cpuid_range *range) #define MAX_SUBLEAF_NUM 64 struct cpuid_range *setup_cpuid_range(u32 input_eax) { - u32 max_func, idx_func; - int subleaf; + u32 max_func, idx_func, subleaf, max_subleaf; + u32 eax, ebx, ecx, edx, f = input_eax; struct cpuid_range *range; - u32 eax, ebx, ecx, edx; - u32 f = input_eax; - int max_subleaf; bool allzero; eax = input_eax; @@ -256,7 +254,7 @@ struct cpuid_range *setup_cpuid_range(u32 input_eax) * others have to be tried (0xf) */ if (f == 0x7 || f == 0x14 || f == 0x17 || f == 0x18) - max_subleaf = (eax & 0xff) + 1; + max_subleaf = min((eax & 0xff) + 1, max_subleaf); if (f == 0xb) max_subleaf = 2; -- cgit v1.2.3 From 9ecbc60a5ede928abdd2b152d828ae0ea8a1e3ed Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 18 Jul 2024 15:47:45 +0200 Subject: tools/x86/kcpuid: Strip bitfield names leading/trailing whitespace While parsing and saving bitfield names from the CSV file, an extra leading space is copied verbatim. That extra space is not a big issue now, but further commits will add a new CSV file with much more padding for the bitfield's name column. Strip leading/trailing whitespaces while saving bitfield names.
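A minimal sketch of the general trim this describes, using only standard C; the patch itself gets away with a single strtok() on " \t" since bitfield names contain no inner whitespace.

#include <ctype.h>
#include <stdio.h>
#include <string.h>

/* Strip leading/trailing whitespace in place, returning the new start. */
static char *trim(char *s)
{
	char *end;

	while (isspace((unsigned char)*s))
		s++;
	if (!*s)
		return s;
	end = s + strlen(s) - 1;
	while (end > s && isspace((unsigned char)*end))
		*end-- = '\0';
	return s;
}

int main(void)
{
	char name[] = "   cache_level \t";

	printf("'%s'\n", trim(name));	/* prints: 'cache_level' */
	return 0;
}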
Signed-off-by: Ahmed S. Darwish Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240718134755.378115-6-darwi@linutronix.de --- tools/arch/x86/kcpuid/kcpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/arch/x86/kcpuid/kcpuid.c b/tools/arch/x86/kcpuid/kcpuid.c index 581d28c861ee..c4f0ace5c9ff 100644 --- a/tools/arch/x86/kcpuid/kcpuid.c +++ b/tools/arch/x86/kcpuid/kcpuid.c @@ -379,7 +379,7 @@ static int parse_line(char *line) if (start) bdesc->start = strtoul(start, NULL, 0); - strcpy(bdesc->simp, tokens[4]); + strcpy(bdesc->simp, strtok(tokens[4], " \t")); strcpy(bdesc->detail, tokens[5]); return 0; -- cgit v1.2.3 From b0a59d14966dbfe0d544b2595dcb29728d41daf3 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 18 Jul 2024 15:47:46 +0200 Subject: tools/x86/kcpuid: Recognize all leaves with subleaves cpuid.csv will be extended in further commits with all-publicly-known CPUID leaves and bitfields. Thus, modify has_subleafs() to identify all known leaves with subleaves. Remove the redundant "is_amd" check since all x86 vendors already report the maximum supported extended leaf in the leaf 0x80000000 EAX register. The extra mentioned leaves are: - Leaf 0x12, Intel Software Guard Extensions (SGX) enumeration - Leaf 0x14, Intel processor trace (PT) enumeration - Leaf 0x17, Intel SoC vendor attributes enumeration - Leaf 0x1b, Intel PCONFIG (Platform configuration) enumeration - Leaf 0x1d, Intel AMX (Advanced Matrix Extensions) tile information - Leaf 0x1f, Intel v2 extended topology enumeration - Leaf 0x23, Intel ArchPerfmonExt (Architectural PMU ext) enumeration - Leaf 0x80000020, AMD Platform QoS extended features enumeration - Leaf 0x80000026, AMD v2 extended topology enumeration Set the 'max_subleaf' variable for all the newly marked leaves with extra subleaves. Ideally, this should be fetched from the CSV file instead, but the current kcpuid code architecture has two runs: one run to serially invoke the cpuid instructions and save all the output in-memory, and one run to parse this in-memory output through the CSV specification.
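To make that two-run flow concrete, a heavily simplified stand-alone sketch (assuming GCC/Clang's <cpuid.h> on x86; kcpuid's real per-leaf structures and CSV-driven decoding are much richer):

#include <stdio.h>
#include <cpuid.h>	/* GCC/Clang x86 helper; provides __get_cpuid_count() */

struct raw_subleaf {
	unsigned int subleaf;
	unsigned int eax, ebx, ecx, edx;
};

/* Run 1: serially invoke CPUID and cache the raw register values. */
static int collect(unsigned int leaf, struct raw_subleaf *out, int max)
{
	int n = 0;

	for (int sub = 0; sub < max; sub++) {
		unsigned int a, b, c, d;

		if (!__get_cpuid_count(leaf, sub, &a, &b, &c, &d))
			break;	/* CPU does not report this leaf */
		out[n++] = (struct raw_subleaf){ sub, a, b, c, d };
	}
	return n;
}

/* Run 2: decode the cached output; kcpuid drives this from the CSV. */
static void decode(const struct raw_subleaf *raw, int n)
{
	for (int i = 0; i < n; i++)
		printf("subleaf %u: eax=%08x\n", raw[i].subleaf, raw[i].eax);
}

int main(void)
{
	struct raw_subleaf raw[8];

	decode(raw, collect(0x4, raw, 8));
	return 0;
}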
Signed-off-by: Ahmed S. Darwish Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240718134755.378115-7-darwi@linutronix.de --- tools/arch/x86/kcpuid/kcpuid.c | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) (limited to 'tools') diff --git a/tools/arch/x86/kcpuid/kcpuid.c b/tools/arch/x86/kcpuid/kcpuid.c index c4f0ace5c9ff..725a7a27fcac 100644 --- a/tools/arch/x86/kcpuid/kcpuid.c +++ b/tools/arch/x86/kcpuid/kcpuid.c @@ -98,27 +98,17 @@ static inline void cpuid(u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) static inline bool has_subleafs(u32 f) { - if (f == 0x7 || f == 0xd) - return true; - - if (is_amd) { - if (f == 0x8000001d) + u32 with_subleaves[] = { + 0x4, 0x7, 0xb, 0xd, 0xf, 0x10, 0x12, + 0x14, 0x17, 0x18, 0x1b, 0x1d, 0x1f, 0x23, + 0x8000001d, 0x80000020, 0x80000026, + }; + + for (unsigned i = 0; i < ARRAY_SIZE(with_subleaves); i++) + if (f == with_subleaves[i]) return true; - return false; - } - switch (f) { - case 0x4: - case 0xb: - case 0xf: - case 0x10: - case 0x14: - case 0x18: - case 0x1f: - return true; - default: - return false; - } + return false; } static void leaf_print_raw(struct subleaf *leaf) @@ -253,11 +243,18 @@ struct cpuid_range *setup_cpuid_range(u32 input_eax) * Some can provide the exact number of subleafs, * others have to be tried (0xf) */ - if (f == 0x7 || f == 0x14 || f == 0x17 || f == 0x18) + if (f == 0x7 || f == 0x14 || f == 0x17 || f == 0x18 || f == 0x1d) max_subleaf = min((eax & 0xff) + 1, max_subleaf); - if (f == 0xb) max_subleaf = 2; + if (f == 0x1f) + max_subleaf = 6; + if (f == 0x23) + max_subleaf = 4; + if (f == 0x80000020) + max_subleaf = 4; + if (f == 0x80000026) + max_subleaf = 5; for (subleaf = 1; subleaf < max_subleaf; subleaf++) { eax = f; -- cgit v1.2.3 From 58921443e9b04bbb1866070ed14ea39578302cea Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 18 Jul 2024 15:47:47 +0200 Subject: tools/x86/kcpuid: Parse subleaf ranges if provided It's a common pattern in cpuid leaves to have the same bitfield format repeated across a number of subleaves. Typically, this is used for enumerating hierarchical structures like cache and TLB levels, CPU topology levels, etc. Modify kcpuid.c to handle subleaf ranges in the CSV file subleaves column. For example, make it able to parse lines in the form: # LEAF, SUBLEAVES, reg, bits, short_name , ... 0xb, 1:0, eax, 4:0, x2apic_id_shift , ... 0xb, 1:0, ebx, 15:0, domain_lcpus_count , ... 0xb, 1:0, ecx, 7:0, domain_nr , ... This way, full output can be printed to the user.
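A minimal sketch of parsing one such SUBLEAVES token ("3", or "1:0" meaning end:start, matching the hunk that follows), without kcpuid's bounds checks:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Parse a SUBLEAVES token such as "3" or "1:0" (end:start). */
static void parse_subleaf_range(char *tok, unsigned int *start, unsigned int *end)
{
	char *end_s = strtok(tok, ":");
	char *start_s = strtok(NULL, ":");

	*end = strtoul(end_s, NULL, 0);
	*start = start_s ? strtoul(start_s, NULL, 0) : *end;
}

int main(void)
{
	char tok[] = "1:0";
	unsigned int start, end;

	parse_subleaf_range(tok, &start, &end);
	printf("subleaves %u..%u\n", start, end);	/* prints: subleaves 0..1 */
	return 0;
}

Signed-off-by: Ahmed S.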
Darwish Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240718134755.378115-8-darwi@linutronix.de --- tools/arch/x86/kcpuid/kcpuid.c | 50 +++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/arch/x86/kcpuid/kcpuid.c b/tools/arch/x86/kcpuid/kcpuid.c index 725a7a27fcac..1b25c0a95d3f 100644 --- a/tools/arch/x86/kcpuid/kcpuid.c +++ b/tools/arch/x86/kcpuid/kcpuid.c @@ -305,6 +305,8 @@ static int parse_line(char *line) struct bits_desc *bdesc; int reg_index; char *start, *end; + u32 subleaf_start, subleaf_end; + unsigned bit_start, bit_end; /* Skip comments and NULL line */ if (line[0] == '#' || line[0] == '\n') @@ -343,13 +345,25 @@ static int parse_line(char *line) return 0; /* subleaf */ - sub = strtoul(tokens[1], NULL, 0); - if ((int)sub > func->nr) - return -1; + buf = tokens[1]; + end = strtok(buf, ":"); + start = strtok(NULL, ":"); + subleaf_end = strtoul(end, NULL, 0); + + /* A subleaf range is given? */ + if (start) { + subleaf_start = strtoul(start, NULL, 0); + subleaf_end = min(subleaf_end, (u32)(func->nr - 1)); + if (subleaf_start > subleaf_end) + return 0; + } else { + subleaf_start = subleaf_end; + if (subleaf_start > (u32)(func->nr - 1)) + return 0; + } - leaf = &func->leafs[sub]; + /* register */ buf = tokens[2]; - if (strcasestr(buf, "EAX")) reg_index = R_EAX; else if (strcasestr(buf, "EBX")) @@ -361,23 +375,23 @@ static int parse_line(char *line) else goto err_exit; - reg = &leaf->info[reg_index]; - bdesc = ®->descs[reg->nr++]; - /* bit flag or bits field */ buf = tokens[3]; - end = strtok(buf, ":"); - bdesc->end = strtoul(end, NULL, 0); - bdesc->start = bdesc->end; - - /* start != NULL means it is bit fields */ start = strtok(NULL, ":"); - if (start) - bdesc->start = strtoul(start, NULL, 0); - - strcpy(bdesc->simp, strtok(tokens[4], " \t")); - strcpy(bdesc->detail, tokens[5]); + bit_end = strtoul(end, NULL, 0); + bit_start = (start) ? strtoul(start, NULL, 0) : bit_end; + + for (sub = subleaf_start; sub <= subleaf_end; sub++) { + leaf = &func->leafs[sub]; + reg = &leaf->info[reg_index]; + bdesc = ®->descs[reg->nr++]; + + bdesc->end = bit_end; + bdesc->start = bit_start; + strcpy(bdesc->simp, strtok(tokens[4], " \t")); + strcpy(bdesc->detail, tokens[5]); + } return 0; err_exit: -- cgit v1.2.3 From cbbd847d107fb750e62670d0f205a7f58b36f893 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 18 Jul 2024 15:47:48 +0200 Subject: tools/x86/kcpuid: Introduce a complete cpuid bitfields CSV file For parsing the cpuid bitfields, kcpuid uses an incomplete CSV file with 300+ bitfields. Use an auto-generated CSV file from the x86-cpuid.org project instead. It provides complete bitfields coverage: 830+ bitfields, all with proper descriptions. The auto-generated file has the following blurb automatically added: # SPDX-License-Identifier: CC0-1.0 # Generator: x86-cpuid-db v1.0 The generator tag includes the project's workspace "git describe" version string. It is intended for projects like KernelCI, to aid in verifying that the auto-generated files have not been tampered with. The file also has the blurb: # Auto-generated file. # Please submit all updates and bugfixes to https://x86-cpuid.org It's thus kindly requested that the Linux kernel's x86 tree maintainers enforce sending all updates to x86-cpuid.org's upstream database first, thus benefiting the whole ecosystem. Signed-off-by: Ahmed S. 
Darwish Signed-off-by: Thomas Gleixner Link: https://gitlab.com/x86-cpuid.org/x86-cpuid-db/-/blob/v1.0/LICENSE.rst Link: https://gitlab.com/x86-cpuid.org/x86-cpuid-db Link: https://lore.kernel.org/all/20240718134755.378115-9-darwi@linutronix.de --- tools/arch/x86/kcpuid/cpuid.csv | 1430 +++++++++++++++++++++++++++------------ 1 file changed, 1016 insertions(+), 414 deletions(-) (limited to 'tools') diff --git a/tools/arch/x86/kcpuid/cpuid.csv b/tools/arch/x86/kcpuid/cpuid.csv index e0c25b75327e..d751eb8585d0 100644 --- a/tools/arch/x86/kcpuid/cpuid.csv +++ b/tools/arch/x86/kcpuid/cpuid.csv @@ -1,451 +1,1053 @@ -# The basic row format is: -# LEAF, SUBLEAF, register_name, bits, short_name, long_description - -# Leaf 00H - 0, 0, EAX, 31:0, max_basic_leafs, Max input value for supported subleafs - -# Leaf 01H - 1, 0, EAX, 3:0, stepping, Stepping ID - 1, 0, EAX, 7:4, model, Model - 1, 0, EAX, 11:8, family, Family ID - 1, 0, EAX, 13:12, processor, Processor Type - 1, 0, EAX, 19:16, model_ext, Extended Model ID - 1, 0, EAX, 27:20, family_ext, Extended Family ID - - 1, 0, EBX, 7:0, brand, Brand Index - 1, 0, EBX, 15:8, clflush_size, CLFLUSH line size (value * 8) in bytes - 1, 0, EBX, 23:16, max_cpu_id, Maxim number of addressable logic cpu in this package - 1, 0, EBX, 31:24, apic_id, Initial APIC ID - - 1, 0, ECX, 0, sse3, Streaming SIMD Extensions 3(SSE3) - 1, 0, ECX, 1, pclmulqdq, PCLMULQDQ instruction supported - 1, 0, ECX, 2, dtes64, DS area uses 64-bit layout - 1, 0, ECX, 3, mwait, MONITOR/MWAIT supported - 1, 0, ECX, 4, ds_cpl, CPL Qualified Debug Store which allows for branch message storage qualified by CPL - 1, 0, ECX, 5, vmx, Virtual Machine Extensions supported - 1, 0, ECX, 6, smx, Safer Mode Extension supported - 1, 0, ECX, 7, eist, Enhanced Intel SpeedStep Technology - 1, 0, ECX, 8, tm2, Thermal Monitor 2 - 1, 0, ECX, 9, ssse3, Supplemental Streaming SIMD Extensions 3 (SSSE3) - 1, 0, ECX, 10, l1_ctx_id, L1 data cache could be set to either adaptive mode or shared mode (check IA32_MISC_ENABLE bit 24 definition) - 1, 0, ECX, 11, sdbg, IA32_DEBUG_INTERFACE MSR for silicon debug supported - 1, 0, ECX, 12, fma, FMA extensions using YMM state supported - 1, 0, ECX, 13, cmpxchg16b, 'CMPXCHG16B - Compare and Exchange Bytes' supported - 1, 0, ECX, 14, xtpr_update, xTPR Update Control supported - 1, 0, ECX, 15, pdcm, Perfmon and Debug Capability present - 1, 0, ECX, 17, pcid, Process-Context Identifiers feature present - 1, 0, ECX, 18, dca, Prefetching data from a memory mapped device supported - 1, 0, ECX, 19, sse4_1, SSE4.1 feature present - 1, 0, ECX, 20, sse4_2, SSE4.2 feature present - 1, 0, ECX, 21, x2apic, x2APIC supported - 1, 0, ECX, 22, movbe, MOVBE instruction supported - 1, 0, ECX, 23, popcnt, POPCNT instruction supported - 1, 0, ECX, 24, tsc_deadline_timer, LAPIC supports one-shot operation using a TSC deadline value - 1, 0, ECX, 25, aesni, AESNI instruction supported - 1, 0, ECX, 26, xsave, XSAVE/XRSTOR processor extended states (XSETBV/XGETBV/XCR0) - 1, 0, ECX, 27, osxsave, OS has set CR4.OSXSAVE bit to enable XSETBV/XGETBV/XCR0 - 1, 0, ECX, 28, avx, AVX instruction supported - 1, 0, ECX, 29, f16c, 16-bit floating-point conversion instruction supported - 1, 0, ECX, 30, rdrand, RDRAND instruction supported - - 1, 0, EDX, 0, fpu, x87 FPU on chip - 1, 0, EDX, 1, vme, Virtual-8086 Mode Enhancement - 1, 0, EDX, 2, de, Debugging Extensions - 1, 0, EDX, 3, pse, Page Size Extensions - 1, 0, EDX, 4, tsc, Time Stamp Counter - 1, 0, EDX, 5, msr, RDMSR and WRMSR Support - 1, 0, EDX, 6, pae, 
Physical Address Extensions - 1, 0, EDX, 7, mce, Machine Check Exception - 1, 0, EDX, 8, cx8, CMPXCHG8B instr - 1, 0, EDX, 9, apic, APIC on Chip - 1, 0, EDX, 11, sep, SYSENTER and SYSEXIT instrs - 1, 0, EDX, 12, mtrr, Memory Type Range Registers - 1, 0, EDX, 13, pge, Page Global Bit - 1, 0, EDX, 14, mca, Machine Check Architecture - 1, 0, EDX, 15, cmov, Conditional Move Instrs - 1, 0, EDX, 16, pat, Page Attribute Table - 1, 0, EDX, 17, pse36, 36-Bit Page Size Extension - 1, 0, EDX, 18, psn, Processor Serial Number - 1, 0, EDX, 19, clflush, CLFLUSH instr -# 1, 0, EDX, 20, - 1, 0, EDX, 21, ds, Debug Store - 1, 0, EDX, 22, acpi, Thermal Monitor and Software Controlled Clock Facilities - 1, 0, EDX, 23, mmx, Intel MMX Technology - 1, 0, EDX, 24, fxsr, XSAVE and FXRSTOR Instrs - 1, 0, EDX, 25, sse, SSE - 1, 0, EDX, 26, sse2, SSE2 - 1, 0, EDX, 27, ss, Self Snoop - 1, 0, EDX, 28, hit, Max APIC IDs - 1, 0, EDX, 29, tm, Thermal Monitor -# 1, 0, EDX, 30, - 1, 0, EDX, 31, pbe, Pending Break Enable - -# Leaf 02H -# cache and TLB descriptor info - -# Leaf 03H -# Precessor Serial Number, introduced on Pentium III, not valid for -# latest models - -# Leaf 04H -# thread/core and cache topology - 4, 0, EAX, 4:0, cache_type, Cache type like instr/data or unified - 4, 0, EAX, 7:5, cache_level, Cache Level (starts at 1) - 4, 0, EAX, 8, cache_self_init, Cache Self Initialization - 4, 0, EAX, 9, fully_associate, Fully Associative cache -# 4, 0, EAX, 13:10, resvd, resvd - 4, 0, EAX, 25:14, max_logical_id, Max number of addressable IDs for logical processors sharing the cache - 4, 0, EAX, 31:26, max_phy_id, Max number of addressable IDs for processors in phy package - - 4, 0, EBX, 11:0, cache_linesize, Size of a cache line in bytes - 4, 0, EBX, 21:12, cache_partition, Physical Line partitions - 4, 0, EBX, 31:22, cache_ways, Ways of associativity - 4, 0, ECX, 31:0, cache_sets, Number of Sets - 1 - 4, 0, EDX, 0, c_wbinvd, 1 means WBINVD/INVD is not ganranteed to act upon lower level caches of non-originating threads sharing this cache - 4, 0, EDX, 1, c_incl, Whether cache is inclusive of lower cache level - 4, 0, EDX, 2, c_comp_index, Complex Cache Indexing - -# Leaf 05H -# MONITOR/MWAIT - 5, 0, EAX, 15:0, min_mon_size, Smallest monitor line size in bytes - 5, 0, EBX, 15:0, max_mon_size, Largest monitor line size in bytes - 5, 0, ECX, 0, mwait_ext, Enum of Monitor-Mwait extensions supported - 5, 0, ECX, 1, mwait_irq_break, Largest monitor line size in bytes - 5, 0, EDX, 3:0, c0_sub_stats, Number of C0* sub C-states supported using MWAIT - 5, 0, EDX, 7:4, c1_sub_stats, Number of C1* sub C-states supported using MWAIT - 5, 0, EDX, 11:8, c2_sub_stats, Number of C2* sub C-states supported using MWAIT - 5, 0, EDX, 15:12, c3_sub_stats, Number of C3* sub C-states supported using MWAIT - 5, 0, EDX, 19:16, c4_sub_stats, Number of C4* sub C-states supported using MWAIT - 5, 0, EDX, 23:20, c5_sub_stats, Number of C5* sub C-states supported using MWAIT - 5, 0, EDX, 27:24, c6_sub_stats, Number of C6* sub C-states supported using MWAIT - 5, 0, EDX, 31:28, c7_sub_stats, Number of C7* sub C-states supported using MWAIT - -# Leaf 06H -# Thermal & Power Management - - 6, 0, EAX, 0, dig_temp, Digital temperature sensor supported - 6, 0, EAX, 1, turbo, Intel Turbo Boost - 6, 0, EAX, 2, arat, Always running APIC timer -# 6, 0, EAX, 3, resv, Reserved - 6, 0, EAX, 4, pln, Power limit notifications supported - 6, 0, EAX, 5, ecmd, Clock modulation duty cycle extension supported - 6, 0, EAX, 6, ptm, Package thermal management supported - 6, 
0, EAX, 7, hwp, HWP base register - 6, 0, EAX, 8, hwp_notify, HWP notification - 6, 0, EAX, 9, hwp_act_window, HWP activity window - 6, 0, EAX, 10, hwp_energy, HWP energy performance preference - 6, 0, EAX, 11, hwp_pkg_req, HWP package level request -# 6, 0, EAX, 12, resv, Reserved - 6, 0, EAX, 13, hdc, HDC base registers supported - 6, 0, EAX, 14, turbo3, Turbo Boost Max 3.0 - 6, 0, EAX, 15, hwp_cap, Highest Performance change supported - 6, 0, EAX, 16, hwp_peci, HWP PECI override is supported - 6, 0, EAX, 17, hwp_flex, Flexible HWP is supported - 6, 0, EAX, 18, hwp_fast, Fast access mode for the IA32_HWP_REQUEST MSR is supported -# 6, 0, EAX, 19, resv, Reserved - 6, 0, EAX, 20, hwp_ignr, Ignoring Idle Logical Processor HWP request is supported - - 6, 0, EBX, 3:0, therm_irq_thresh, Number of Interrupt Thresholds in Digital Thermal Sensor - 6, 0, ECX, 0, aperfmperf, Presence of IA32_MPERF and IA32_APERF - 6, 0, ECX, 3, energ_bias, Performance-energy bias preference supported - -# Leaf 07H -# ECX == 0 -# AVX512 refers to https://en.wikipedia.org/wiki/AVX-512 -# XXX: Do we really need to enumerate each and every AVX512 sub features - - 7, 0, EBX, 0, fsgsbase, RDFSBASE/RDGSBASE/WRFSBASE/WRGSBASE supported - 7, 0, EBX, 1, tsc_adjust, TSC_ADJUST MSR supported - 7, 0, EBX, 2, sgx, Software Guard Extensions - 7, 0, EBX, 3, bmi1, BMI1 - 7, 0, EBX, 4, hle, Hardware Lock Elision - 7, 0, EBX, 5, avx2, AVX2 -# 7, 0, EBX, 6, fdp_excp_only, x87 FPU Data Pointer updated only on x87 exceptions - 7, 0, EBX, 7, smep, Supervisor-Mode Execution Prevention - 7, 0, EBX, 8, bmi2, BMI2 - 7, 0, EBX, 9, rep_movsb, Enhanced REP MOVSB/STOSB - 7, 0, EBX, 10, invpcid, INVPCID instruction - 7, 0, EBX, 11, rtm, Restricted Transactional Memory - 7, 0, EBX, 12, rdt_m, Intel RDT Monitoring capability - 7, 0, EBX, 13, depc_fpu_cs_ds, Deprecates FPU CS and FPU DS - 7, 0, EBX, 14, mpx, Memory Protection Extensions - 7, 0, EBX, 15, rdt_a, Intel RDT Allocation capability - 7, 0, EBX, 16, avx512f, AVX512 Foundation instr - 7, 0, EBX, 17, avx512dq, AVX512 Double and Quadword AVX512 instr - 7, 0, EBX, 18, rdseed, RDSEED instr - 7, 0, EBX, 19, adx, ADX instr - 7, 0, EBX, 20, smap, Supervisor Mode Access Prevention - 7, 0, EBX, 21, avx512ifma, AVX512 Integer Fused Multiply Add -# 7, 0, EBX, 22, resvd, resvd - 7, 0, EBX, 23, clflushopt, CLFLUSHOPT instr - 7, 0, EBX, 24, clwb, CLWB instr - 7, 0, EBX, 25, intel_pt, Intel Processor Trace instr - 7, 0, EBX, 26, avx512pf, Prefetch - 7, 0, EBX, 27, avx512er, AVX512 Exponent Reciproca instr - 7, 0, EBX, 28, avx512cd, AVX512 Conflict Detection instr - 7, 0, EBX, 29, sha, Intel Secure Hash Algorithm Extensions instr - 7, 0, EBX, 30, avx512bw, AVX512 Byte & Word instr - 7, 0, EBX, 31, avx512vl, AVX512 Vector Length Extentions (VL) - 7, 0, ECX, 0, prefetchwt1, X - 7, 0, ECX, 1, avx512vbmi, AVX512 Vector Byte Manipulation Instructions - 7, 0, ECX, 2, umip, User-mode Instruction Prevention - - 7, 0, ECX, 3, pku, Protection Keys for User-mode pages - 7, 0, ECX, 4, ospke, CR4 PKE set to enable protection keys -# 7, 0, ECX, 16:5, resvd, resvd - 7, 0, ECX, 21:17, mawau, The value of MAWAU used by the BNDLDX and BNDSTX instructions in 64-bit mode - 7, 0, ECX, 22, rdpid, RDPID and IA32_TSC_AUX -# 7, 0, ECX, 29:23, resvd, resvd - 7, 0, ECX, 30, sgx_lc, SGX Launch Configuration -# 7, 0, ECX, 31, resvd, resvd - -# Leaf 08H -# - - -# Leaf 09H -# Direct Cache Access (DCA) information - 9, 0, ECX, 31:0, dca_cap, The value of IA32_PLATFORM_DCA_CAP +# SPDX-License-Identifier: CC0-1.0 +# Generator: x86-cpuid-db 
v1.0 -# Leaf 0AH -# Architectural Performance Monitoring # -# Do we really need to print out the PMU related stuff? -# Does normal user really care about it? +# Auto-generated file. +# Please submit all updates and bugfixes to https://x86-cpuid.org # - 0xA, 0, EAX, 7:0, pmu_ver, Performance Monitoring Unit version - 0xA, 0, EAX, 15:8, pmu_gp_cnt_num, Numer of general-purose PMU counters per logical CPU - 0xA, 0, EAX, 23:16, pmu_cnt_bits, Bit wideth of PMU counter - 0xA, 0, EAX, 31:24, pmu_ebx_bits, Length of EBX bit vector to enumerate PMU events - - 0xA, 0, EBX, 0, pmu_no_core_cycle_evt, Core cycle event not available - 0xA, 0, EBX, 1, pmu_no_instr_ret_evt, Instruction retired event not available - 0xA, 0, EBX, 2, pmu_no_ref_cycle_evt, Reference cycles event not available - 0xA, 0, EBX, 3, pmu_no_llc_ref_evt, Last-level cache reference event not available - 0xA, 0, EBX, 4, pmu_no_llc_mis_evt, Last-level cache misses event not available - 0xA, 0, EBX, 5, pmu_no_br_instr_ret_evt, Branch instruction retired event not available - 0xA, 0, EBX, 6, pmu_no_br_mispredict_evt, Branch mispredict retired event not available - - 0xA, 0, ECX, 4:0, pmu_fixed_cnt_num, Performance Monitoring Unit version - 0xA, 0, ECX, 12:5, pmu_fixed_cnt_bits, Numer of PMU counters per logical CPU - -# Leaf 0BH -# Extended Topology Enumeration Leaf -# - - 0xB, 0, EAX, 4:0, id_shift, Number of bits to shift right on x2APIC ID to get a unique topology ID of the next level type - 0xB, 0, EBX, 15:0, cpu_nr, Number of logical processors at this level type - 0xB, 0, ECX, 15:8, lvl_type, 0-Invalid 1-SMT 2-Core - 0xB, 0, EDX, 31:0, x2apic_id, x2APIC ID the current logical processor - - -# Leaf 0DH -# Processor Extended State - 0xD, 0, EAX, 0, x87, X87 state - 0xD, 0, EAX, 1, sse, SSE state - 0xD, 0, EAX, 2, avx, AVX state - 0xD, 0, EAX, 4:3, mpx, MPX state - 0xD, 0, EAX, 7:5, avx512, AVX-512 state - 0xD, 0, EAX, 9, pkru, PKRU state - - 0xD, 0, EBX, 31:0, max_sz_xcr0, Maximum size (bytes) required by enabled features in XCR0 - 0xD, 0, ECX, 31:0, max_sz_xsave, Maximum size (bytes) of the XSAVE/XRSTOR save area - - 0xD, 1, EAX, 0, xsaveopt, XSAVEOPT available - 0xD, 1, EAX, 1, xsavec, XSAVEC and compacted form supported - 0xD, 1, EAX, 2, xgetbv, XGETBV supported - 0xD, 1, EAX, 3, xsaves, XSAVES/XRSTORS and IA32_XSS supported - - 0xD, 1, EBX, 31:0, max_sz_xcr0, Maximum size (bytes) required by enabled features in XCR0 - 0xD, 1, ECX, 8, pt, PT state - 0xD, 1, ECX, 11, cet_usr, CET user state - 0xD, 1, ECX, 12, cet_supv, CET supervisor state - 0xD, 1, ECX, 13, hdc, HDC state - 0xD, 1, ECX, 16, hwp, HWP state - -# Leaf 0FH -# Intel RDT Monitoring - - 0xF, 0, EBX, 31:0, rmid_range, Maximum range (zero-based) of RMID within this physical processor of all types - 0xF, 0, EDX, 1, l3c_rdt_mon, L3 Cache RDT Monitoring supported - - 0xF, 1, ECX, 31:0, rmid_range, Maximum range (zero-based) of RMID of this types - 0xF, 1, EDX, 0, l3c_ocp_mon, L3 Cache occupancy Monitoring supported - 0xF, 1, EDX, 1, l3c_tbw_mon, L3 Cache Total Bandwidth Monitoring supported - 0xF, 1, EDX, 2, l3c_lbw_mon, L3 Cache Local Bandwidth Monitoring supported +# The basic row format is: +# LEAF, SUBLEAVES, reg, bits, short_name , long_description + +# Leaf 0H +# Maximum standard leaf number + CPU vendor string + + 0, 0, eax, 31:0, max_std_leaf , Highest cpuid standard leaf supported + 0, 0, ebx, 31:0, cpu_vendorid_0 , CPU vendor ID string bytes 0 - 3 + 0, 0, ecx, 31:0, cpu_vendorid_2 , CPU vendor ID string bytes 8 - 11 + 0, 0, edx, 31:0, cpu_vendorid_1 , CPU vendor ID 
string bytes 4 - 7 + +# Leaf 1H +# CPU FMS (Family/Model/Stepping) + standard feature flags + + 1, 0, eax, 3:0, stepping , Stepping ID + 1, 0, eax, 7:4, base_model , Base CPU model ID + 1, 0, eax, 11:8, base_family_id , Base CPU family ID + 1, 0, eax, 13:12, cpu_type , CPU type + 1, 0, eax, 19:16, ext_model , Extended CPU model ID + 1, 0, eax, 27:20, ext_family , Extended CPU family ID + 1, 0, ebx, 7:0, brand_id , Brand index + 1, 0, ebx, 15:8, clflush_size , CLFLUSH instruction cache line size + 1, 0, ebx, 23:16, n_logical_cpu , Logical CPU (HW threads) count + 1, 0, ebx, 31:24, local_apic_id , Initial local APIC physical ID + 1, 0, ecx, 0, pni , Streaming SIMD Extensions 3 (SSE3) + 1, 0, ecx, 1, pclmulqdq , PCLMULQDQ instruction support + 1, 0, ecx, 2, dtes64 , 64-bit DS save area + 1, 0, ecx, 3, monitor , MONITOR/MWAIT support + 1, 0, ecx, 4, ds_cpl , CPL Qualified Debug Store + 1, 0, ecx, 5, vmx , Virtual Machine Extensions + 1, 0, ecx, 6, smx , Safer Mode Extensions + 1, 0, ecx, 7, est , Enhanced Intel SpeedStep + 1, 0, ecx, 8, tm2 , Thermal Monitor 2 + 1, 0, ecx, 9, ssse3 , Supplemental SSE3 + 1, 0, ecx, 10, cid , L1 Context ID + 1, 0, ecx, 11, sdbg , Sillicon Debug + 1, 0, ecx, 12, fma , FMA extensions using YMM state + 1, 0, ecx, 13, cx16 , CMPXCHG16B instruction support + 1, 0, ecx, 14, xtpr , xTPR Update Control + 1, 0, ecx, 15, pdcm , Perfmon and Debug Capability + 1, 0, ecx, 17, pcid , Process-context identifiers + 1, 0, ecx, 18, dca , Direct Cache Access + 1, 0, ecx, 19, sse4_1 , SSE4.1 + 1, 0, ecx, 20, sse4_2 , SSE4.2 + 1, 0, ecx, 21, x2apic , X2APIC support + 1, 0, ecx, 22, movbe , MOVBE instruction support + 1, 0, ecx, 23, popcnt , POPCNT instruction support + 1, 0, ecx, 24, tsc_deadline_timer , APIC timer one-shot operation + 1, 0, ecx, 25, aes , AES instructions + 1, 0, ecx, 26, xsave , XSAVE (and related instructions) support + 1, 0, ecx, 27, osxsave , XSAVE (and related instructions) are enabled by OS + 1, 0, ecx, 28, avx , AVX instructions support + 1, 0, ecx, 29, f16c , Half-precision floating-point conversion support + 1, 0, ecx, 30, rdrand , RDRAND instruction support + 1, 0, ecx, 31, guest_status , System is running as guest; (para-)virtualized system + 1, 0, edx, 0, fpu , Floating-Point Unit on-chip (x87) + 1, 0, edx, 1, vme , Virtual-8086 Mode Extensions + 1, 0, edx, 2, de , Debugging Extensions + 1, 0, edx, 3, pse , Page Size Extension + 1, 0, edx, 4, tsc , Time Stamp Counter + 1, 0, edx, 5, msr , Model-Specific Registers (RDMSR and WRMSR support) + 1, 0, edx, 6, pae , Physical Address Extensions + 1, 0, edx, 7, mce , Machine Check Exception + 1, 0, edx, 8, cx8 , CMPXCHG8B instruction + 1, 0, edx, 9, apic , APIC on-chip + 1, 0, edx, 11, sep , SYSENTER, SYSEXIT, and associated MSRs + 1, 0, edx, 12, mtrr , Memory Type Range Registers + 1, 0, edx, 13, pge , Page Global Extensions + 1, 0, edx, 14, mca , Machine Check Architecture + 1, 0, edx, 15, cmov , Conditional Move Instruction + 1, 0, edx, 16, pat , Page Attribute Table + 1, 0, edx, 17, pse36 , Page Size Extension (36-bit) + 1, 0, edx, 18, pn , Processor Serial Number + 1, 0, edx, 19, clflush , CLFLUSH instruction + 1, 0, edx, 21, dts , Debug Store + 1, 0, edx, 22, acpi , Thermal monitor and clock control + 1, 0, edx, 23, mmx , MMX instructions + 1, 0, edx, 24, fxsr , FXSAVE and FXRSTOR instructions + 1, 0, edx, 25, sse , SSE instructions + 1, 0, edx, 26, sse2 , SSE2 instructions + 1, 0, edx, 27, ss , Self Snoop + 1, 0, edx, 28, ht , Hyper-threading + 1, 0, edx, 29, tm , Thermal Monitor + 1, 0, edx, 30, ia64 , 
Legacy IA-64 (Itanium) support bit, now resreved + 1, 0, edx, 31, pbe , Pending Break Enable + +# Leaf 2H +# Intel cache and TLB information one-byte descriptors + + 2, 0, eax, 7:0, iteration_count , Number of times this CPUD leaf must be queried + 2, 0, eax, 15:8, desc1 , Descriptor #1 + 2, 0, eax, 23:16, desc2 , Descriptor #2 + 2, 0, eax, 30:24, desc3 , Descriptor #3 + 2, 0, eax, 31, eax_invalid , Descriptors 1-3 are invalid if set + 2, 0, ebx, 7:0, desc4 , Descriptor #4 + 2, 0, ebx, 15:8, desc5 , Descriptor #5 + 2, 0, ebx, 23:16, desc6 , Descriptor #6 + 2, 0, ebx, 30:24, desc7 , Descriptor #7 + 2, 0, ebx, 31, ebx_invalid , Descriptors 4-7 are invalid if set + 2, 0, ecx, 7:0, desc8 , Descriptor #8 + 2, 0, ecx, 15:8, desc9 , Descriptor #9 + 2, 0, ecx, 23:16, desc10 , Descriptor #10 + 2, 0, ecx, 30:24, desc11 , Descriptor #11 + 2, 0, ecx, 31, ecx_invalid , Descriptors 8-11 are invalid if set + 2, 0, edx, 7:0, desc12 , Descriptor #12 + 2, 0, edx, 15:8, desc13 , Descriptor #13 + 2, 0, edx, 23:16, desc14 , Descriptor #14 + 2, 0, edx, 30:24, desc15 , Descriptor #15 + 2, 0, edx, 31, edx_invalid , Descriptors 12-15 are invalid if set + +# Leaf 4H +# Intel deterministic cache parameters + + 4, 31:0, eax, 4:0, cache_type , Cache type field + 4, 31:0, eax, 7:5, cache_level , Cache level (1-based) + 4, 31:0, eax, 8, cache_self_init , Self-initialializing cache level + 4, 31:0, eax, 9, fully_associative , Fully-associative cache + 4, 31:0, eax, 25:14, num_threads_sharing , Number logical CPUs sharing this cache + 4, 31:0, eax, 31:26, num_cores_on_die , Number of cores in the physical package + 4, 31:0, ebx, 11:0, cache_linesize , System coherency line size (0-based) + 4, 31:0, ebx, 21:12, cache_npartitions , Physical line partitions (0-based) + 4, 31:0, ebx, 31:22, cache_nways , Ways of associativity (0-based) + 4, 31:0, ecx, 30:0, cache_nsets , Cache number of sets (0-based) + 4, 31:0, edx, 0, wbinvd_rll_no_guarantee, WBINVD/INVD not guaranteed for Remote Lower-Level caches + 4, 31:0, edx, 1, ll_inclusive , Cache is inclusive of Lower-Level caches + 4, 31:0, edx, 2, complex_indexing , Not a direct-mapped cache (complex function) + +# Leaf 5H +# MONITOR/MWAIT instructions enumeration + + 5, 0, eax, 15:0, min_mon_size , Smallest monitor-line size, in bytes + 5, 0, ebx, 15:0, max_mon_size , Largest monitor-line size, in bytes + 5, 0, ecx, 0, mwait_ext , Enumeration of MONITOR/MWAIT extensions is supported + 5, 0, ecx, 1, mwait_irq_break , Interrupts as a break-event for MWAIT is supported + 5, 0, edx, 3:0, n_c0_substates , Number of C0 sub C-states supported using MWAIT + 5, 0, edx, 7:4, n_c1_substates , Number of C1 sub C-states supported using MWAIT + 5, 0, edx, 11:8, n_c2_substates , Number of C2 sub C-states supported using MWAIT + 5, 0, edx, 15:12, n_c3_substates , Number of C3 sub C-states supported using MWAIT + 5, 0, edx, 19:16, n_c4_substates , Number of C4 sub C-states supported using MWAIT + 5, 0, edx, 23:20, n_c5_substates , Number of C5 sub C-states supported using MWAIT + 5, 0, edx, 27:24, n_c6_substates , Number of C6 sub C-states supported using MWAIT + 5, 0, edx, 31:28, n_c7_substates , Number of C7 sub C-states supported using MWAIT + +# Leaf 6H +# Thermal and Power Management enumeration + + 6, 0, eax, 0, dtherm , Digital temprature sensor + 6, 0, eax, 1, turbo_boost , Intel Turbo Boost + 6, 0, eax, 2, arat , Always-Running APIC Timer (not affected by p-state) + 6, 0, eax, 4, pln , Power Limit Notification (PLN) event + 6, 0, eax, 5, ecmd , Clock modulation duty cycle extension + 6, 
0, eax, 6, pts , Package thermal management + 6, 0, eax, 7, hwp , HWP (Hardware P-states) base registers are supported + 6, 0, eax, 8, hwp_notify , HWP notification (IA32_HWP_INTERRUPT MSR) + 6, 0, eax, 9, hwp_act_window , HWP activity window (IA32_HWP_REQUEST[bits 41:32]) supported + 6, 0, eax, 10, hwp_epp , HWP Energy Performance Preference + 6, 0, eax, 11, hwp_pkg_req , HWP Package Level Request + 6, 0, eax, 13, hdc_base_regs , HDC base registers are supported + 6, 0, eax, 14, turbo_boost_3_0 , Intel Turbo Boost Max 3.0 + 6, 0, eax, 15, hwp_capabilities , HWP Highest Performance change + 6, 0, eax, 16, hwp_peci_override , HWP PECI override + 6, 0, eax, 17, hwp_flexible , Flexible HWP + 6, 0, eax, 18, hwp_fast , IA32_HWP_REQUEST MSR fast access mode + 6, 0, eax, 19, hfi , HW_FEEDBACK MSRs supported + 6, 0, eax, 20, hwp_ignore_idle , Ignoring idle logical CPU HWP req is supported + 6, 0, eax, 23, thread_director , Intel thread director support + 6, 0, eax, 24, therm_interrupt_bit25 , IA32_THERM_INTERRUPT MSR bit 25 is supported + 6, 0, ebx, 3:0, n_therm_thresholds , Digital thermometer thresholds + 6, 0, ecx, 0, aperfmperf , MPERF/APERF MSRs (effective frequency interface) + 6, 0, ecx, 3, epb , IA32_ENERGY_PERF_BIAS MSR support + 6, 0, ecx, 15:8, thrd_director_nclasses , Number of classes, Intel thread director + 6, 0, edx, 0, perfcap_reporting , Performance capability reporting + 6, 0, edx, 1, encap_reporting , Energy efficiency capability reporting + 6, 0, edx, 11:8, feedback_sz , HW feedback interface struct size, in 4K pages + 6, 0, edx, 31:16, this_lcpu_hwfdbk_idx , This logical CPU index @ HW feedback struct, 0-based + +# Leaf 7H +# Extended CPU features enumeration + + 7, 0, eax, 31:0, leaf7_n_subleaves , Number of cpuid 0x7 subleaves + 7, 0, ebx, 0, fsgsbase , FSBASE/GSBASE read/write support + 7, 0, ebx, 1, tsc_adjust , IA32_TSC_ADJUST MSR supported + 7, 0, ebx, 2, sgx , Intel SGX (Software Guard Extensions) + 7, 0, ebx, 3, bmi1 , Bit manipulation extensions group 1 + 7, 0, ebx, 4, hle , Hardware Lock Elision + 7, 0, ebx, 5, avx2 , AVX2 instruction set + 7, 0, ebx, 6, fdp_excptn_only , FPU Data Pointer updated only on x87 exceptions + 7, 0, ebx, 7, smep , Supervisor Mode Execution Protection + 7, 0, ebx, 8, bmi2 , Bit manipulation extensions group 2 + 7, 0, ebx, 9, erms , Enhanced REP MOVSB/STOSB + 7, 0, ebx, 10, invpcid , INVPCID instruction (Invalidate Processor Context ID) + 7, 0, ebx, 11, rtm , Intel restricted transactional memory + 7, 0, ebx, 12, cqm , Intel RDT-CMT / AMD Platform-QoS cache monitoring + 7, 0, ebx, 13, zero_fcs_fds , Deprecated FPU CS/DS (stored as zero) + 7, 0, ebx, 14, mpx , Intel memory protection extensions + 7, 0, ebx, 15, rdt_a , Intel RDT / AMD Platform-QoS Enforcemeent + 7, 0, ebx, 16, avx512f , AVX-512 foundation instructions + 7, 0, ebx, 17, avx512dq , AVX-512 double/quadword instructions + 7, 0, ebx, 18, rdseed , RDSEED instruction + 7, 0, ebx, 19, adx , ADCX/ADOX instructions + 7, 0, ebx, 20, smap , Supervisor mode access prevention + 7, 0, ebx, 21, avx512ifma , AVX-512 integer fused multiply add + 7, 0, ebx, 23, clflushopt , CLFLUSHOPT instruction + 7, 0, ebx, 24, clwb , CLWB instruction + 7, 0, ebx, 25, intel_pt , Intel processor trace + 7, 0, ebx, 26, avx512pf , AVX-512 prefetch instructions + 7, 0, ebx, 27, avx512er , AVX-512 exponent/reciprocal instrs + 7, 0, ebx, 28, avx512cd , AVX-512 conflict detection instrs + 7, 0, ebx, 29, sha_ni , SHA/SHA256 instructions + 7, 0, ebx, 30, avx512bw , AVX-512 BW (byte/word granular) instructions + 7, 0, 
ebx, 31, avx512vl , AVX-512 VL (128/256 vector length) extensions + 7, 0, ecx, 0, prefetchwt1 , PREFETCHWT1 (Intel Xeon Phi only) + 7, 0, ecx, 1, avx512vbmi , AVX-512 Vector byte manipulation instrs + 7, 0, ecx, 2, umip , User mode instruction protection + 7, 0, ecx, 3, pku , Protection keys for user-space + 7, 0, ecx, 4, ospke , OS protection keys enable + 7, 0, ecx, 5, waitpkg , WAITPKG instructions + 7, 0, ecx, 6, avx512_vbmi2 , AVX-512 vector byte manipulation instrs group 2 + 7, 0, ecx, 7, cet_ss , CET shadow stack features + 7, 0, ecx, 8, gfni , Galois field new instructions + 7, 0, ecx, 9, vaes , Vector AES instrs + 7, 0, ecx, 10, vpclmulqdq , VPCLMULQDQ 256-bit instruction support + 7, 0, ecx, 11, avx512_vnni , Vector neural network instructions + 7, 0, ecx, 12, avx512_bitalg , AVX-512 bit count/shiffle + 7, 0, ecx, 13, tme , Intel total memory encryption + 7, 0, ecx, 14, avx512_vpopcntdq , AVX-512: POPCNT for vectors of DW/QW + 7, 0, ecx, 16, la57 , 57-bit linear addreses (five-level paging) + 7, 0, ecx, 21:17, mawau_val_lm , BNDLDX/BNDSTX MAWAU value in 64-bit mode + 7, 0, ecx, 22, rdpid , RDPID instruction + 7, 0, ecx, 23, key_locker , Intel key locker support + 7, 0, ecx, 24, bus_lock_detect , OS bus-lock detection + 7, 0, ecx, 25, cldemote , CLDEMOTE instruction + 7, 0, ecx, 27, movdiri , MOVDIRI instruction + 7, 0, ecx, 28, movdir64b , MOVDIR64B instruction + 7, 0, ecx, 29, enqcmd , Enqueue stores supported (ENQCMD{,S}) + 7, 0, ecx, 30, sgx_lc , Intel SGX launch configuration + 7, 0, ecx, 31, pks , Protection keys for supervisor-mode pages + 7, 0, edx, 1, sgx_keys , Intel SGX attestation services + 7, 0, edx, 2, avx512_4vnniw , AVX-512 neural network instructions + 7, 0, edx, 3, avx512_4fmaps , AVX-512 multiply accumulation single precision + 7, 0, edx, 4, fsrm , Fast short REP MOV + 7, 0, edx, 5, uintr , CPU supports user interrupts + 7, 0, edx, 8, avx512_vp2intersect , VP2INTERSECT{D,Q} instructions + 7, 0, edx, 9, srdbs_ctrl , SRBDS mitigation MSR available + 7, 0, edx, 10, md_clear , VERW MD_CLEAR microcode support + 7, 0, edx, 11, rtm_always_abort , XBEGIN (RTM transaction) always aborts + 7, 0, edx, 13, tsx_force_abort , MSR TSX_FORCE_ABORT, RTM_ABORT bit, supported + 7, 0, edx, 14, serialize , SERIALIZE instruction + 7, 0, edx, 15, hybrid_cpu , The CPU is identified as a 'hybrid part' + 7, 0, edx, 16, tsxldtrk , TSX suspend/resume load address tracking + 7, 0, edx, 18, pconfig , PCONFIG instruction + 7, 0, edx, 19, arch_lbr , Intel architectural LBRs + 7, 0, edx, 20, ibt , CET indirect branch tracking + 7, 0, edx, 22, amx_bf16 , AMX-BF16: tile bfloat16 support + 7, 0, edx, 23, avx512_fp16 , AVX-512 FP16 instructions + 7, 0, edx, 24, amx_tile , AMX-TILE: tile architecture support + 7, 0, edx, 25, amx_int8 , AMX-INT8: tile 8-bit integer support + 7, 0, edx, 26, spec_ctrl , Speculation Control (IBRS/IBPB: indirect branch restrictions) + 7, 0, edx, 27, intel_stibp , Single thread indirect branch predictors + 7, 0, edx, 28, flush_l1d , FLUSH L1D cache: IA32_FLUSH_CMD MSR + 7, 0, edx, 29, arch_capabilities , Intel IA32_ARCH_CAPABILITIES MSR + 7, 0, edx, 30, core_capabilities , IA32_CORE_CAPABILITIES MSR + 7, 0, edx, 31, spec_ctrl_ssbd , Speculative store bypass disable + 7, 1, eax, 4, avx_vnni , AVX-VNNI instructions + 7, 1, eax, 5, avx512_bf16 , AVX-512 bFloat16 instructions + 7, 1, eax, 6, lass , Linear address space separation + 7, 1, eax, 7, cmpccxadd , CMPccXADD instructions + 7, 1, eax, 8, arch_perfmon_ext , ArchPerfmonExt: CPUID leaf 0x23 is supported + 7, 1, eax, 10, 
fzrm , Fast zero-length REP MOVSB + 7, 1, eax, 11, fsrs , Fast short REP STOSB + 7, 1, eax, 12, fsrc , Fast Short REP CMPSB/SCASB + 7, 1, eax, 17, fred , FRED: Flexible return and event delivery transitions + 7, 1, eax, 18, lkgs , LKGS: Load 'kernel' (userspace) GS + 7, 1, eax, 19, wrmsrns , WRMSRNS instr (WRMSR-non-serializing) + 7, 1, eax, 21, amx_fp16 , AMX-FP16: FP16 tile operations + 7, 1, eax, 22, hreset , History reset support + 7, 1, eax, 23, avx_ifma , Integer fused multiply add + 7, 1, eax, 26, lam , Linear address masking + 7, 1, eax, 27, rd_wr_msrlist , RDMSRLIST/WRMSRLIST instructions + 7, 1, ebx, 0, intel_ppin , Protected processor inventory number (PPIN{,_CTL} MSRs) + 7, 1, edx, 4, avx_vnni_int8 , AVX-VNNI-INT8 instructions + 7, 1, edx, 5, avx_ne_convert , AVX-NE-CONVERT instructions + 7, 1, edx, 8, amx_complex , AMX-COMPLEX instructions (starting from Granite Rapids) + 7, 1, edx, 14, prefetchit_0_1 , PREFETCHIT0/1 instructions + 7, 1, edx, 18, cet_sss , CET supervisor shadow stacks safe to use + 7, 2, edx, 0, intel_psfd , Intel predictive store forward disable + 7, 2, edx, 1, ipred_ctrl , MSR bits IA32_SPEC_CTRL.IPRED_DIS_{U,S} + 7, 2, edx, 2, rrsba_ctrl , MSR bits IA32_SPEC_CTRL.RRSBA_DIS_{U,S} + 7, 2, edx, 3, ddp_ctrl , MSR bit IA32_SPEC_CTRL.DDPD_U + 7, 2, edx, 4, bhi_ctrl , MSR bit IA32_SPEC_CTRL.BHI_DIS_S + 7, 2, edx, 5, mcdt_no , MCDT mitigation not needed + 7, 2, edx, 6, uclock_disable , UC-lock disable is supported + +# Leaf 9H +# Intel DCA (Direct Cache Access) enumeration + + 9, 0, eax, 0, dca_enabled_in_bios , DCA is enabled in BIOS + +# Leaf AH +# Intel PMU (Performance Monitoring Unit) enumeration + + 0xa, 0, eax, 7:0, pmu_version , Performance monitoring unit version ID + 0xa, 0, eax, 15:8, pmu_n_gcounters , Number of general PMU counters per logical CPU + 0xa, 0, eax, 23:16, pmu_gcounters_nbits , Bitwidth of PMU general counters + 0xa, 0, eax, 31:24, pmu_cpuid_ebx_bits , Length of cpuid leaf 0xa EBX bit vector + 0xa, 0, ebx, 0, no_core_cycle_evt , Core cycle event not available + 0xa, 0, ebx, 1, no_insn_retired_evt , Instruction retired event not available + 0xa, 0, ebx, 2, no_refcycle_evt , Reference cycles event not available + 0xa, 0, ebx, 3, no_llc_ref_evt , LLC-reference event not available + 0xa, 0, ebx, 4, no_llc_miss_evt , LLC-misses event not available + 0xa, 0, ebx, 5, no_br_insn_ret_evt , Branch instruction retired event not available + 0xa, 0, ebx, 6, no_br_mispredict_evt , Branch mispredict retired event not available + 0xa, 0, ebx, 7, no_td_slots_evt , Topdown slots event not available + 0xa, 0, ecx, 31:0, pmu_fcounters_bitmap , Fixed-function PMU counters support bitmap + 0xa, 0, edx, 4:0, pmu_n_fcounters , Number of fixed PMU counters + 0xa, 0, edx, 12:5, pmu_fcounters_nbits , Bitwidth of PMU fixed counters + 0xa, 0, edx, 15, anythread_depr , AnyThread deprecation + +# Leaf BH +# CPUs v1 extended topology enumeration + + 0xb, 1:0, eax, 4:0, x2apic_id_shift , Bit width of this level (previous levels inclusive) + 0xb, 1:0, ebx, 15:0, domain_lcpus_count , Logical CPUs count across all instances of this domain + 0xb, 1:0, ecx, 7:0, domain_nr , This domain level (subleaf ID) + 0xb, 1:0, ecx, 15:8, domain_type , This domain type + 0xb, 1:0, edx, 31:0, x2apic_id , x2APIC ID of current logical CPU + +# Leaf DH +# Processor extended state enumeration + + 0xd, 0, eax, 0, xcr0_x87 , XCR0.X87 (bit 0) supported + 0xd, 0, eax, 1, xcr0_sse , XCR0.SEE (bit 1) supported + 0xd, 0, eax, 2, xcr0_avx , XCR0.AVX (bit 2) supported + 0xd, 0, eax, 3, xcr0_mpx_bndregs 
, XCR0.BNDREGS (bit 3) supported (MPX BND0-BND3 regs) + 0xd, 0, eax, 4, xcr0_mpx_bndcsr , XCR0.BNDCSR (bit 4) supported (MPX BNDCFGU/BNDSTATUS regs) + 0xd, 0, eax, 5, xcr0_avx512_opmask , XCR0.OPMASK (bit 5) supported (AVX-512 k0-k7 regs) + 0xd, 0, eax, 6, xcr0_avx512_zmm_hi256 , XCR0.ZMM_Hi256 (bit 6) supported (AVX-512 ZMM0->ZMM7/15 regs) + 0xd, 0, eax, 7, xcr0_avx512_hi16_zmm , XCR0.HI16_ZMM (bit 7) supported (AVX-512 ZMM16->ZMM31 regs) + 0xd, 0, eax, 9, xcr0_pkru , XCR0.PKRU (bit 9) supported (XSAVE PKRU reg) + 0xd, 0, eax, 11, xcr0_cet_u , AMD XCR0.CET_U (bit 11) supported (CET supervisor state) + 0xd, 0, eax, 12, xcr0_cet_s , AMD XCR0.CET_S (bit 12) support (CET user state) + 0xd, 0, eax, 17, xcr0_tileconfig , XCR0.TILECONFIG (bit 17) supported (AMX can manage TILECONFIG) + 0xd, 0, eax, 18, xcr0_tiledata , XCR0.TILEDATA (bit 18) supported (AMX can manage TILEDATA) + 0xd, 0, ebx, 31:0, xsave_sz_xcr0_enabled , XSAVE/XRSTR area byte size, for XCR0 enabled features + 0xd, 0, ecx, 31:0, xsave_sz_max , XSAVE/XRSTR area max byte size, all CPU features + 0xd, 0, edx, 30, xcr0_lwp , AMD XCR0.LWP (bit 62) supported (Light-weight Profiling) + 0xd, 1, eax, 0, xsaveopt , XSAVEOPT instruction + 0xd, 1, eax, 1, xsavec , XSAVEC instruction + 0xd, 1, eax, 2, xgetbv1 , XGETBV instruction with ECX = 1 + 0xd, 1, eax, 3, xsaves , XSAVES/XRSTORS instructions (and XSS MSR) + 0xd, 1, eax, 4, xfd , Extended feature disable support + 0xd, 1, ebx, 31:0, xsave_sz_xcr0_xmms_enabled, XSAVE area size, all XCR0 and XMMS features enabled + 0xd, 1, ecx, 8, xss_pt , PT state, supported + 0xd, 1, ecx, 10, xss_pasid , PASID state, supported + 0xd, 1, ecx, 11, xss_cet_u , CET user state, supported + 0xd, 1, ecx, 12, xss_cet_p , CET supervisor state, supported + 0xd, 1, ecx, 13, xss_hdc , HDC state, supported + 0xd, 1, ecx, 14, xss_uintr , UINTR state, supported + 0xd, 1, ecx, 15, xss_lbr , LBR state, supported + 0xd, 1, ecx, 16, xss_hwp , HWP state, supported + 0xd, 63:2, eax, 31:0, xsave_sz , Size of save area for subleaf-N feature, in bytes + 0xd, 63:2, ebx, 31:0, xsave_offset , Offset of save area for subleaf-N feature, in bytes + 0xd, 63:2, ecx, 0, is_xss_bit , Subleaf N describes an XSS bit, otherwise XCR0 bit + 0xd, 63:2, ecx, 1, compacted_xsave_64byte_aligned, When compacted, subleaf-N feature xsave area is 64-byte aligned + +# Leaf FH +# Intel RDT / AMD PQoS resource monitoring + + 0xf, 0, ebx, 31:0, core_rmid_max , RMID max, within this core, all types (0-based) + 0xf, 0, edx, 1, cqm_llc , LLC QoS-monitoring supported + 0xf, 1, eax, 7:0, l3c_qm_bitwidth , L3 QoS-monitoring counter bitwidth (24-based) + 0xf, 1, eax, 8, l3c_qm_overflow_bit , QM_CTR MSR bit 61 is an overflow bit + 0xf, 1, ebx, 31:0, l3c_qm_conver_factor , QM_CTR MSR conversion factor to bytes + 0xf, 1, ecx, 31:0, l3c_qm_rmid_max , L3 QoS-monitoring max RMID + 0xf, 1, edx, 0, cqm_occup_llc , L3 QoS occupancy monitoring supported + 0xf, 1, edx, 1, cqm_mbm_total , L3 QoS total bandwidth monitoring supported + 0xf, 1, edx, 2, cqm_mbm_local , L3 QoS local bandwidth monitoring supported # Leaf 10H -# Intel RDT Allocation - - 0x10, 0, EBX, 1, l3c_rdt_alloc, L3 Cache Allocation supported - 0x10, 0, EBX, 2, l2c_rdt_alloc, L2 Cache Allocation supported - 0x10, 0, EBX, 3, mem_bw_alloc, Memory Bandwidth Allocation supported - +# Intel RDT / AMD PQoS allocation enumeration + + 0x10, 0, ebx, 1, cat_l3 , L3 Cache Allocation Technology supported + 0x10, 0, ebx, 2, cat_l2 , L2 Cache Allocation Technology supported + 0x10, 0, ebx, 3, mba , Memory Bandwidth 
Allocation supported + 0x10, 2:1, eax, 4:0, cat_cbm_len , L3/L2_CAT capacity bitmask length, minus-one notation + 0x10, 2:1, ebx, 31:0, cat_units_bitmap , L3/L2_CAT bitmap of allocation units + 0x10, 2:1, ecx, 1, l3_cat_cos_infreq_updates, L3_CAT COS updates should be infrequent + 0x10, 2:1, ecx, 2, cdp_l3 , L3/L2_CAT CDP (Code and Data Prioritization) + 0x10, 2:1, ecx, 3, cat_sparse_1s , L3/L2_CAT non-contiguous 1s value supported + 0x10, 2:1, edx, 15:0, cat_cos_max , L3/L2_CAT max COS (Class of Service) supported + 0x10, 3, eax, 11:0, mba_max_delay , Max MBA throttling value; minus-one notation + 0x10, 3, ecx, 0, per_thread_mba , Per-thread MBA controls are supported + 0x10, 3, ecx, 2, mba_delay_linear , Delay values are linear + 0x10, 3, edx, 15:0, mba_cos_max , MBA max Class of Service supported # Leaf 12H -# SGX Capability -# -# Some detailed SGX features not added yet - - 0x12, 0, EAX, 0, sgx1, L3 Cache Allocation supported - 0x12, 1, EAX, 0, sgx2, L3 Cache Allocation supported - +# Intel Software Guard Extensions (SGX) enumeration + + 0x12, 0, eax, 0, sgx1 , SGX1 leaf functions supported + 0x12, 0, eax, 1, sgx2 , SGX2 leaf functions supported + 0x12, 0, eax, 5, enclv_leaves , ENCLV leaves (E{INC,DEC}VIRTCHILD, ESETCONTEXT) supported + 0x12, 0, eax, 6, encls_leaves , ENCLS leaves (ENCLS ETRACKC, ERDINFO, ELDBC, ELDUC) supported + 0x12, 0, eax, 7, enclu_everifyreport2 , ENCLU leaf EVERIFYREPORT2 supported + 0x12, 0, eax, 10, encls_eupdatesvn , ENCLS leaf EUPDATESVN supported + 0x12, 0, eax, 11, sgx_edeccssa , ENCLU leaf EDECCSSA supported + 0x12, 0, ebx, 0, miscselect_exinfo , SSA.MISC frame: reporting #PF and #GP exceptions inside enclave supported + 0x12, 0, ebx, 1, miscselect_cpinfo , SSA.MISC frame: reporting #CP exceptions inside enclave supported + 0x12, 0, edx, 7:0, max_enclave_sz_not64 , Maximum enclave size in non-64-bit mode (log2) + 0x12, 0, edx, 15:8, max_enclave_sz_64 , Maximum enclave size in 64-bit mode (log2) + 0x12, 1, eax, 0, secs_attr_init , ATTRIBUTES.INIT supported (enclave initialized by EINIT) + 0x12, 1, eax, 1, secs_attr_debug , ATTRIBUTES.DEBUG supported (enclave permits debugger read/write) + 0x12, 1, eax, 2, secs_attr_mode64bit , ATTRIBUTES.MODE64BIT supported (enclave runs in 64-bit mode) + 0x12, 1, eax, 4, secs_attr_provisionkey , ATTRIBUTES.PROVISIONKEY supported (provisioning key available) + 0x12, 1, eax, 5, secs_attr_einittoken_key, ATTRIBUTES.EINITTOKEN_KEY supported (EINIT token key available) + 0x12, 1, eax, 6, secs_attr_cet , ATTRIBUTES.CET supported (enable CET attributes) + 0x12, 1, eax, 7, secs_attr_kss , ATTRIBUTES.KSS supported (Key Separation and Sharing enabled) + 0x12, 1, eax, 10, secs_attr_aexnotify , ATTRIBUTES.AEXNOTIFY supported (enclave threads may get AEX notifications + 0x12, 1, ecx, 0, xfrm_x87 , Enclave XFRM.X87 (bit 0) supported + 0x12, 1, ecx, 1, xfrm_sse , Enclave XFRM.SEE (bit 1) supported + 0x12, 1, ecx, 2, xfrm_avx , Enclave XFRM.AVX (bit 2) supported + 0x12, 1, ecx, 3, xfrm_mpx_bndregs , Enclave XFRM.BNDREGS (bit 3) supported (MPX BND0-BND3 regs) + 0x12, 1, ecx, 4, xfrm_mpx_bndcsr , Enclave XFRM.BNDCSR (bit 4) supported (MPX BNDCFGU/BNDSTATUS regs) + 0x12, 1, ecx, 5, xfrm_avx512_opmask , Enclave XFRM.OPMASK (bit 5) supported (AVX-512 k0-k7 regs) + 0x12, 1, ecx, 6, xfrm_avx512_zmm_hi256 , Enclave XFRM.ZMM_Hi256 (bit 6) supported (AVX-512 ZMM0->ZMM7/15 regs) + 0x12, 1, ecx, 7, xfrm_avx512_hi16_zmm , Enclave XFRM.HI16_ZMM (bit 7) supported (AVX-512 ZMM16->ZMM31 regs) + 0x12, 1, ecx, 9, xfrm_pkru , Enclave XFRM.PKRU (bit 9) 
supported (XSAVE PKRU reg) + 0x12, 1, ecx, 17, xfrm_tileconfig , Enclave XFRM.TILECONFIG (bit 17) supported (AMX can manage TILECONFIG) + 0x12, 1, ecx, 18, xfrm_tiledata , Enclave XFRM.TILEDATA (bit 18) supported (AMX can manage TILEDATA) + 0x12, 31:2, eax, 3:0, subleaf_type , Subleaf type (dictates output layout) + 0x12, 31:2, eax, 31:12, epc_sec_base_addr_0 , EPC section base addr, bits[12:31] + 0x12, 31:2, ebx, 19:0, epc_sec_base_addr_1 , EPC section base addr, bits[32:51] + 0x12, 31:2, ecx, 3:0, epc_sec_type , EPC section type / property encoding + 0x12, 31:2, ecx, 31:12, epc_sec_size_0 , EPC section size, bits[12:31] + 0x12, 31:2, edx, 19:0, epc_sec_size_1 , EPC section size, bits[32:51] # Leaf 14H -# Intel Processor Tracer -# +# Intel Processor Trace enumeration + + 0x14, 0, eax, 31:0, pt_max_subleaf , Max cpuid 0x14 subleaf + 0x14, 0, ebx, 0, cr3_filtering , IA32_RTIT_CR3_MATCH is accessible + 0x14, 0, ebx, 1, psb_cyc , Configurable PSB and cycle-accurate mode + 0x14, 0, ebx, 2, ip_filtering , IP/TraceStop filtering; Warm-reset PT MSRs preservation + 0x14, 0, ebx, 3, mtc_timing , MTC timing packet; COFI-based packets suppression + 0x14, 0, ebx, 4, ptwrite , PTWRITE support + 0x14, 0, ebx, 5, power_event_trace , Power Event Trace support + 0x14, 0, ebx, 6, psb_pmi_preserve , PSB and PMI preservation support + 0x14, 0, ebx, 7, event_trace , Event Trace packet generation through IA32_RTIT_CTL.EventEn + 0x14, 0, ebx, 8, tnt_disable , TNT packet generation disable through IA32_RTIT_CTL.DisTNT + 0x14, 0, ecx, 0, topa_output , ToPA output scheme support + 0x14, 0, ecx, 1, topa_multiple_entries , ToPA tables can hold multiple entries + 0x14, 0, ecx, 2, single_range_output , Single-range output scheme supported + 0x14, 0, ecx, 3, trance_transport_output, Trace Transport subsystem output support + 0x14, 0, ecx, 31, ip_payloads_lip , IP payloads have LIP values (CS base included) + 0x14, 1, eax, 2:0, num_address_ranges , Filtering number of configurable Address Ranges + 0x14, 1, eax, 31:16, mtc_periods_bmp , Bitmap of supported MTC period encodings + 0x14, 1, ebx, 15:0, cycle_thresholds_bmp , Bitmap of supported Cycle Threshold encodings + 0x14, 1, ebx, 31:16, psb_periods_bmp , Bitmap of supported Configurable PSB frequency encodings # Leaf 15H -# Time Stamp Counter and Nominal Core Crystal Clock Information +# Intel TSC (Time Stamp Counter) enumeration - 0x15, 0, EAX, 31:0, tsc_denominator, The denominator of the TSC/”core crystal clock” ratio - 0x15, 0, EBX, 31:0, tsc_numerator, The numerator of the TSC/”core crystal clock” ratio - 0x15, 0, ECX, 31:0, nom_freq, Nominal frequency of the core crystal clock in Hz + 0x15, 0, eax, 31:0, tsc_denominator , Denominator of the TSC/'core crystal clock' ratio + 0x15, 0, ebx, 31:0, tsc_numerator , Numerator of the TSC/'core crystal clock' ratio + 0x15, 0, ecx, 31:0, cpu_crystal_hz , Core crystal clock nominal frequency, in Hz # Leaf 16H -# Processor Frequency Information +# Intel processor fequency enumeration - 0x16, 0, EAX, 15:0, cpu_base_freq, Processor Base Frequency in MHz - 0x16, 0, EBX, 15:0, cpu_max_freq, Maximum Frequency in MHz - 0x16, 0, ECX, 15:0, bus_freq, Bus (Reference) Frequency in MHz + 0x16, 0, eax, 15:0, cpu_base_mhz , Processor base frequency, in MHz + 0x16, 0, ebx, 15:0, cpu_max_mhz , Processor max frequency, in MHz + 0x16, 0, ecx, 15:0, bus_mhz , Bus reference frequency, in MHz # Leaf 17H -# System-On-Chip Vendor Attribute - - 0x17, 0, EAX, 31:0, max_socid, Maximum input value of supported sub-leaf - 0x17, 0, EBX, 15:0, soc_vid, 
SOC Vendor ID
-	0x17, 0, EBX, 16, std_vid, SOC Vendor ID is assigned via an industry standard scheme
-	0x17, 0, ECX, 31:0, soc_pid, SOC Project ID assigned by vendor
-	0x17, 0, EDX, 31:0, soc_sid, SOC Stepping ID
+# Intel SoC vendor attributes enumeration
+
+        0x17,        0,          eax,   31:0, soc_max_subleaf          , Max cpuid leaf 0x17 subleaf
+        0x17,        0,          ebx,   15:0, soc_vendor_id            , SoC vendor ID
+        0x17,        0,          ebx,     16, is_vendor_scheme         , Assigned by industry enumeration scheme (not Intel)
+        0x17,        0,          ecx,   31:0, soc_proj_id              , SoC project ID, assigned by vendor
+        0x17,        0,          edx,   31:0, soc_stepping_id          , SoC project stepping ID, assigned by vendor
+        0x17,      3:1,          eax,   31:0, vendor_brand_a           , Vendor Brand ID string, bytes subleaf_nr * (0 -> 3)
+        0x17,      3:1,          ebx,   31:0, vendor_brand_b           , Vendor Brand ID string, bytes subleaf_nr * (4 -> 7)
+        0x17,      3:1,          ecx,   31:0, vendor_brand_c           , Vendor Brand ID string, bytes subleaf_nr * (8 -> 11)
+        0x17,      3:1,          edx,   31:0, vendor_brand_d           , Vendor Brand ID string, bytes subleaf_nr * (12 -> 15)
 # Leaf 18H
-# Deterministic Address Translation Parameters
-
+# Intel deterministic address translation (TLB) parameters
+
+        0x18,     31:0,          eax,   31:0, tlb_max_subleaf          , Max cpuid 0x18 subleaf
+        0x18,     31:0,          ebx,      0, tlb_4k_page              , TLB 4KB-page entries supported
+        0x18,     31:0,          ebx,      1, tlb_2m_page              , TLB 2MB-page entries supported
+        0x18,     31:0,          ebx,      2, tlb_4m_page              , TLB 4MB-page entries supported
+        0x18,     31:0,          ebx,      3, tlb_1g_page              , TLB 1GB-page entries supported
+        0x18,     31:0,          ebx,   10:8, hard_partitioning        , (Hard/Soft) partitioning between logical CPUs sharing this struct
+        0x18,     31:0,          ebx,  31:16, n_way_associative        , Ways of associativity
+        0x18,     31:0,          ecx,   31:0, n_sets                   , Number of sets
+        0x18,     31:0,          edx,    4:0, tlb_type                 , Translation cache type (TLB type)
+        0x18,     31:0,          edx,    7:5, tlb_cache_level          , Translation cache level (1-based)
+        0x18,     31:0,          edx,      8, is_fully_associative     , Fully-associative structure
+        0x18,     31:0,          edx,  25:14, tlb_max_addressible_ids, Max num of addressable IDs for logical CPUs sharing this TLB - 1
 # Leaf 19H
-# Key Locker Leaf
+# Intel Key Locker enumeration
+        0x19,        0,          eax,      0, kl_cpl0_only             , CPL0-only key locker restriction supported
+        0x19,        0,          eax,      1, kl_no_encrypt            , No-encrypt key locker restriction supported
+        0x19,        0,          eax,      2, kl_no_decrypt            , No-decrypt key locker restriction supported
+        0x19,        0,          ebx,      0, aes_keylocker            , AES key locker instructions supported
+        0x19,        0,          ebx,      2, aes_keylocker_wide       , AES wide key locker instructions supported
+        0x19,        0,          ebx,      4, kl_msr_iwkey             , Key locker MSRs and IWKEY backups supported
+        0x19,        0,          ecx,      0, loadiwkey_no_backup      , LOADIWKEY NoBackup parameter supported
+        0x19,        0,          ecx,      1, iwkey_rand               , IWKEY randomization (KeySource encoding 1) supported
 # Leaf 1AH
-# Hybrid Information
-
-	0x1A, 0, EAX, 31:24, core_type, 20H-Intel_Atom 40H-Intel_Core
-
+# Intel hybrid CPUs identification (e.g.
Atom, Core)
+
+        0x1a,        0,          eax,   23:0, core_native_model        , This core's native model ID
+        0x1a,        0,          eax,  31:24, core_type                , This core's type
+
+# Leaf 1BH
+# Intel PCONFIG (Platform configuration) enumeration
+
+        0x1b,     31:0,          eax,   11:0, pconfig_subleaf_type     , CPUID 0x1b subleaf type
+        0x1b,     31:0,          ebx,   31:0, pconfig_target_id_x      , A supported PCONFIG target ID
+        0x1b,     31:0,          ecx,   31:0, pconfig_target_id_y      , A supported PCONFIG target ID
+        0x1b,     31:0,          edx,   31:0, pconfig_target_id_z      , A supported PCONFIG target ID
+
+# Leaf 1CH
+# Intel LBR (Last Branch Record) enumeration
+
+        0x1c,        0,          eax,      0, lbr_depth_8              , Max stack depth (number of LBR entries) = 8
+        0x1c,        0,          eax,      1, lbr_depth_16             , Max stack depth (number of LBR entries) = 16
+        0x1c,        0,          eax,      2, lbr_depth_24             , Max stack depth (number of LBR entries) = 24
+        0x1c,        0,          eax,      3, lbr_depth_32             , Max stack depth (number of LBR entries) = 32
+        0x1c,        0,          eax,      4, lbr_depth_40             , Max stack depth (number of LBR entries) = 40
+        0x1c,        0,          eax,      5, lbr_depth_48             , Max stack depth (number of LBR entries) = 48
+        0x1c,        0,          eax,      6, lbr_depth_56             , Max stack depth (number of LBR entries) = 56
+        0x1c,        0,          eax,      7, lbr_depth_64             , Max stack depth (number of LBR entries) = 64
+        0x1c,        0,          eax,     30, lbr_deep_c_reset         , LBRs may be cleared on MWAIT C-state > C1
+        0x1c,        0,          eax,     31, lbr_ip_is_lip            , LBR IP contains Last IP, otherwise effective IP
+        0x1c,        0,          ebx,      0, lbr_cpl                  , CPL filtering (non-zero IA32_LBR_CTL[2:1]) supported
+        0x1c,        0,          ebx,      1, lbr_branch_filter        , Branch filtering (non-zero IA32_LBR_CTL[22:16]) supported
+        0x1c,        0,          ebx,      2, lbr_call_stack           , Call-stack mode (IA32_LBR_CTL[3] = 1) supported
+        0x1c,        0,          ecx,      0, lbr_mispredict           , Branch misprediction bit supported (IA32_LBR_x_INFO[63])
+        0x1c,        0,          ecx,      1, lbr_timed_lbr            , Timed LBRs (CPU cycles since last LBR entry) supported
+        0x1c,        0,          ecx,      2, lbr_branch_type          , Branch type field (IA32_LBR_INFO_x[59:56]) supported
+        0x1c,        0,          ecx,  19:16, lbr_events_gpc_bmp       , LBR PMU-events logging support; bitmap for first 4 GP (general-purpose) Counters
+
+# Leaf 1DH
+# Intel AMX (Advanced Matrix Extensions) tile information
+
+        0x1d,        0,          eax,   31:0, amx_max_palette          , Highest palette ID / subleaf ID
+        0x1d,        1,          eax,   15:0, amx_palette_size         , AMX palette total tiles size, in bytes
+        0x1d,        1,          eax,  31:16, amx_tile_size            , AMX single tile's size, in bytes
+        0x1d,        1,          ebx,   15:0, amx_tile_row_size        , AMX tile single row's size, in bytes
+        0x1d,        1,          ebx,  31:16, amx_palette_nr_tiles     , AMX palette number of tiles
+        0x1d,        1,          ecx,   15:0, amx_tile_nr_rows         , AMX tile max number of rows
+
+# Leaf 1EH
+# Intel AMX, TMUL (Tile-matrix MULtiply) accelerator unit enumeration
+
+        0x1e,        0,          ebx,    7:0, tmul_maxk                , TMUL unit maximum height, K (rows or columns)
+        0x1e,        0,          ebx,   23:8, tmul_maxn                , TMUL unit maximum SIMD dimension, N (column bytes)
 # Leaf 1FH
-# V2 Extended Topology - A preferred superset to leaf 0BH
-
-
-# According to SDM
-# 40000000H - 4FFFFFFFH is invalid range
+# Intel extended topology enumeration v2
+
+        0x1f,      5:0,          eax,    4:0, x2apic_id_shift          , Bit width of this level (previous levels inclusive)
+        0x1f,      5:0,          ebx,   15:0, domain_lcpus_count       , Logical CPUs count across all instances of this domain
+        0x1f,      5:0,          ecx,    7:0, domain_level             , This domain level (subleaf ID)
+        0x1f,      5:0,          ecx,   15:8, domain_type              , This domain type
+        0x1f,      5:0,          edx,   31:0, x2apic_id                , x2APIC ID of current logical CPU
+
+# Leaf 20H
+# Intel HRESET (History Reset) enumeration
+
+        0x20,        0,          eax,   31:0, hreset_nr_subleaves      , CPUID 0x20 max subleaf + 1
+        0x20,        0,          ebx,      0, hreset_thread_director   , HRESET of Intel
thread director is supported
+
+# Leaf 21H
+# Intel TD (Trust Domain) guest execution environment enumeration
+
+        0x21,        0,          ebx,   31:0, tdx_vendorid_0           , TDX vendor ID string bytes 0 - 3
+        0x21,        0,          ecx,   31:0, tdx_vendorid_2           , TDX vendor ID string bytes 8 - 11
+        0x21,        0,          edx,   31:0, tdx_vendorid_1           , TDX vendor ID string bytes 4 - 7
+
+# Leaf 23H
+# Intel Architectural Performance Monitoring Extended (ArchPerfmonExt)
+
+        0x23,        0,          eax,      1, subleaf_1_counters       , Subleaf 1, PMU counters bitmaps, is valid
+        0x23,        0,          eax,      3, subleaf_3_events         , Subleaf 3, PMU events bitmaps, is valid
+        0x23,        0,          ebx,      0, unitmask2                , IA32_PERFEVTSELx MSRs UnitMask2 is supported
+        0x23,        0,          ebx,      1, zbit                     , IA32_PERFEVTSELx MSRs Z-bit is supported
+        0x23,        1,          eax,   31:0, pmu_gp_counters_bitmap   , General-purpose PMU counters bitmap
+        0x23,        1,          ebx,   31:0, pmu_f_counters_bitmap    , Fixed PMU counters bitmap
+        0x23,        3,          eax,      0, core_cycles_evt          , Core cycles event supported
+        0x23,        3,          eax,      1, insn_retired_evt         , Instructions retired event supported
+        0x23,        3,          eax,      2, ref_cycles_evt           , Reference cycles event supported
+        0x23,        3,          eax,      3, llc_refs_evt             , Last-level cache references event supported
+        0x23,        3,          eax,      4, llc_misses_evt           , Last-level cache misses event supported
+        0x23,        3,          eax,      5, br_insn_ret_evt          , Branch instruction retired event supported
+        0x23,        3,          eax,      6, br_mispr_evt             , Branch mispredict retired event supported
+        0x23,        3,          eax,      7, td_slots_evt             , Topdown slots event supported
+        0x23,        3,          eax,      8, td_backend_bound_evt     , Topdown backend bound event supported
+        0x23,        3,          eax,      9, td_bad_spec_evt          , Topdown bad speculation event supported
+        0x23,        3,          eax,     10, td_frontend_bound_evt    , Topdown frontend bound event supported
+        0x23,        3,          eax,     11, td_retiring_evt          , Topdown retiring event supported
+
+# Leaf 40000000H
+# Maximum hypervisor standard leaf + hypervisor vendor string
+
+0x40000000,        0,          eax,   31:0, max_hyp_leaf             , Maximum hypervisor standard leaf number
+0x40000000,        0,          ebx,   31:0, hypervisor_id_0          , Hypervisor ID string bytes 0 - 3
+0x40000000,        0,          ecx,   31:0, hypervisor_id_1          , Hypervisor ID string bytes 4 - 7
+0x40000000,        0,          edx,   31:0, hypervisor_id_2          , Hypervisor ID string bytes 8 - 11
+
+# Leaf 80000000H
+# Maximum extended leaf number + CPU vendor string (AMD)
+
+0x80000000,        0,          eax,   31:0, max_ext_leaf             , Maximum extended cpuid leaf supported
+0x80000000,        0,          ebx,   31:0, cpu_vendorid_0           , Vendor ID string bytes 0 - 3
+0x80000000,        0,          ecx,   31:0, cpu_vendorid_2           , Vendor ID string bytes 8 - 11
+0x80000000,        0,          edx,   31:0, cpu_vendorid_1           , Vendor ID string bytes 4 - 7
 # Leaf 80000001H
-# Extended Processor Signature and Feature Bits
-
-0x80000001, 0, EAX, 27:20, extfamily, Extended family
-0x80000001, 0, EAX, 19:16, extmodel, Extended model
-0x80000001, 0, EAX, 11:8, basefamily, Description of Family
-0x80000001, 0, EAX, 11:8, basemodel, Model numbers vary with product
-0x80000001, 0, EAX, 3:0, stepping, Processor stepping (revision) for a specific model
-
-0x80000001, 0, EBX, 31:28, pkgtype, Specifies the package type
-
-0x80000001, 0, ECX, 0, lahf_lm, LAHF/SAHF available in 64-bit mode
-0x80000001, 0, ECX, 1, cmplegacy, Core multi-processing legacy mode
-0x80000001, 0, ECX, 2, svm, Indicates support for: VMRUN, VMLOAD, VMSAVE, CLGI, VMMCALL, and INVLPGA
-0x80000001, 0, ECX, 3, extapicspace, Extended APIC register space
-0x80000001, 0, ECX, 4, altmovecr8, Indicates support for LOCK MOV CR0 means MOV CR8
-0x80000001, 0, ECX, 5, lzcnt, LZCNT
-0x80000001, 0, ECX, 6, sse4a, EXTRQ, INSERTQ, MOVNTSS, and MOVNTSD instruction support
-0x80000001, 0, ECX, 7,
misalignsse, Misaligned SSE Mode -0x80000001, 0, ECX, 8, prefetchw, PREFETCHW -0x80000001, 0, ECX, 9, osvw, OS Visible Work-around support -0x80000001, 0, ECX, 10, ibs, Instruction Based Sampling -0x80000001, 0, ECX, 11, xop, Extended operation support -0x80000001, 0, ECX, 12, skinit, SKINIT and STGI support -0x80000001, 0, ECX, 13, wdt, Watchdog timer support -0x80000001, 0, ECX, 15, lwp, Lightweight profiling support -0x80000001, 0, ECX, 16, fma4, Four-operand FMA instruction support -0x80000001, 0, ECX, 17, tce, Translation cache extension -0x80000001, 0, ECX, 22, TopologyExtensions, Indicates support for Core::X86::Cpuid::CachePropEax0 and Core::X86::Cpuid::ExtApicId -0x80000001, 0, ECX, 23, perfctrextcore, Indicates support for Core::X86::Msr::PERF_CTL0 - 5 and Core::X86::Msr::PERF_CTR -0x80000001, 0, ECX, 24, perfctrextdf, Indicates support for Core::X86::Msr::DF_PERF_CTL and Core::X86::Msr::DF_PERF_CTR -0x80000001, 0, ECX, 26, databreakpointextension, Indicates data breakpoint support for Core::X86::Msr::DR0_ADDR_MASK, Core::X86::Msr::DR1_ADDR_MASK, Core::X86::Msr::DR2_ADDR_MASK and Core::X86::Msr::DR3_ADDR_MASK -0x80000001, 0, ECX, 27, perftsc, Performance time-stamp counter supported -0x80000001, 0, ECX, 28, perfctrextllc, Indicates support for L3 performance counter extensions -0x80000001, 0, ECX, 29, mwaitextended, MWAITX and MONITORX capability is supported -0x80000001, 0, ECX, 30, admskextn, Indicates support for address mask extension (to 32 bits and to all 4 DRs) for instruction breakpoints - -0x80000001, 0, EDX, 0, fpu, x87 floating point unit on-chip -0x80000001, 0, EDX, 1, vme, Virtual-mode enhancements -0x80000001, 0, EDX, 2, de, Debugging extensions, IO breakpoints, CR4.DE -0x80000001, 0, EDX, 3, pse, Page-size extensions (4 MB pages) -0x80000001, 0, EDX, 4, tsc, Time stamp counter, RDTSC/RDTSCP instructions, CR4.TSD -0x80000001, 0, EDX, 5, msr, Model-specific registers (MSRs), with RDMSR and WRMSR instructions -0x80000001, 0, EDX, 6, pae, Physical-address extensions (PAE) -0x80000001, 0, EDX, 7, mce, Machine Check Exception, CR4.MCE -0x80000001, 0, EDX, 8, cmpxchg8b, CMPXCHG8B instruction -0x80000001, 0, EDX, 9, apic, advanced programmable interrupt controller (APIC) exists and is enabled -0x80000001, 0, EDX, 11, sysret, SYSCALL/SYSRET supported -0x80000001, 0, EDX, 12, mtrr, Memory-type range registers -0x80000001, 0, EDX, 13, pge, Page global extension, CR4.PGE -0x80000001, 0, EDX, 14, mca, Machine check architecture, MCG_CAP -0x80000001, 0, EDX, 15, cmov, Conditional move instructions, CMOV, FCOMI, FCMOV -0x80000001, 0, EDX, 16, pat, Page attribute table -0x80000001, 0, EDX, 17, pse36, Page-size extensions -0x80000001, 0, EDX, 20, exec_dis, Execute Disable Bit available -0x80000001, 0, EDX, 22, mmxext, AMD extensions to MMX instructions -0x80000001, 0, EDX, 23, mmx, MMX instructions -0x80000001, 0, EDX, 24, fxsr, FXSAVE and FXRSTOR instructions -0x80000001, 0, EDX, 25, ffxsr, FXSAVE and FXRSTOR instruction optimizations -0x80000001, 0, EDX, 26, 1gb_page, 1GB page supported -0x80000001, 0, EDX, 27, rdtscp, RDTSCP and IA32_TSC_AUX are available -0x80000001, 0, EDX, 29, lm, 64b Architecture supported -0x80000001, 0, EDX, 30, threednowext, AMD extensions to 3DNow! instructions -0x80000001, 0, EDX, 31, threednow, 3DNow! 
instructions - -# Leaf 80000002H/80000003H/80000004H -# Processor Brand String +# Extended CPU feature identifiers + +0x80000001, 0, eax, 3:0, e_stepping_id , Stepping ID +0x80000001, 0, eax, 7:4, e_base_model , Base processor model +0x80000001, 0, eax, 11:8, e_base_family , Base processor family +0x80000001, 0, eax, 19:16, e_ext_model , Extended processor model +0x80000001, 0, eax, 27:20, e_ext_family , Extended processor family +0x80000001, 0, ebx, 15:0, brand_id , Brand ID +0x80000001, 0, ebx, 31:28, pkg_type , Package type +0x80000001, 0, ecx, 0, lahf_lm , LAHF and SAHF in 64-bit mode +0x80000001, 0, ecx, 1, cmp_legacy , Multi-processing legacy mode (No HT) +0x80000001, 0, ecx, 2, svm , Secure Virtual Machine +0x80000001, 0, ecx, 3, extapic , Extended APIC space +0x80000001, 0, ecx, 4, cr8_legacy , LOCK MOV CR0 means MOV CR8 +0x80000001, 0, ecx, 5, abm , LZCNT advanced bit manipulation +0x80000001, 0, ecx, 6, sse4a , SSE4A support +0x80000001, 0, ecx, 7, misalignsse , Misaligned SSE mode +0x80000001, 0, ecx, 8, 3dnowprefetch , 3DNow PREFETCH/PREFETCHW support +0x80000001, 0, ecx, 9, osvw , OS visible workaround +0x80000001, 0, ecx, 10, ibs , Instruction based sampling +0x80000001, 0, ecx, 11, xop , XOP: extended operation (AVX instructions) +0x80000001, 0, ecx, 12, skinit , SKINIT/STGI support +0x80000001, 0, ecx, 13, wdt , Watchdog timer support +0x80000001, 0, ecx, 15, lwp , Lightweight profiling +0x80000001, 0, ecx, 16, fma4 , 4-operand FMA instruction +0x80000001, 0, ecx, 17, tce , Translation cache extension +0x80000001, 0, ecx, 19, nodeid_msr , NodeId MSR (0xc001100c) +0x80000001, 0, ecx, 21, tbm , Trailing bit manipulations +0x80000001, 0, ecx, 22, topoext , Topology Extensions (cpuid leaf 0x8000001d) +0x80000001, 0, ecx, 23, perfctr_core , Core performance counter extensions +0x80000001, 0, ecx, 24, perfctr_nb , NB/DF performance counter extensions +0x80000001, 0, ecx, 26, bpext , Data access breakpoint extension +0x80000001, 0, ecx, 27, ptsc , Performance time-stamp counter +0x80000001, 0, ecx, 28, perfctr_llc , LLC (L3) performance counter extensions +0x80000001, 0, ecx, 29, mwaitx , MWAITX/MONITORX support +0x80000001, 0, ecx, 30, addr_mask_ext , Breakpoint address mask extension (to bit 31) +0x80000001, 0, edx, 0, e_fpu , Floating-Point Unit on-chip (x87) +0x80000001, 0, edx, 1, e_vme , Virtual-8086 Mode Extensions +0x80000001, 0, edx, 2, e_de , Debugging Extensions +0x80000001, 0, edx, 3, e_pse , Page Size Extension +0x80000001, 0, edx, 4, e_tsc , Time Stamp Counter +0x80000001, 0, edx, 5, e_msr , Model-Specific Registers (RDMSR and WRMSR support) +0x80000001, 0, edx, 6, pae , Physical Address Extensions +0x80000001, 0, edx, 7, mce , Machine Check Exception +0x80000001, 0, edx, 8, cx8 , CMPXCHG8B instruction +0x80000001, 0, edx, 9, apic , APIC on-chip +0x80000001, 0, edx, 11, syscall , SYSCALL and SYSRET instructions +0x80000001, 0, edx, 12, mtrr , Memory Type Range Registers +0x80000001, 0, edx, 13, pge , Page Global Extensions +0x80000001, 0, edx, 14, mca , Machine Check Architecture +0x80000001, 0, edx, 15, cmov , Conditional Move Instruction +0x80000001, 0, edx, 16, pat , Page Attribute Table +0x80000001, 0, edx, 17, pse36 , Page Size Extension (36-bit) +0x80000001, 0, edx, 19, mp , Out-of-spec AMD Multiprocessing bit +0x80000001, 0, edx, 20, nx , No-execute page protection +0x80000001, 0, edx, 22, mmxext , AMD MMX extensions +0x80000001, 0, edx, 24, e_fxsr , FXSAVE and FXRSTOR instructions +0x80000001, 0, edx, 25, fxsr_opt , FXSAVE and FXRSTOR optimizations 
+0x80000001,        0,          edx,     26, pdpe1gb                  , 1-GB large page support
+0x80000001,        0,          edx,     27, rdtscp                   , RDTSCP instruction
+0x80000001,        0,          edx,     29, lm                       , Long mode (x86-64, 64-bit support)
+0x80000001,        0,          edx,     30, 3dnowext                 , AMD 3DNow extensions
+0x80000001,        0,          edx,     31, 3dnow                    , 3DNow instructions
+
+# Leaf 80000002H
+# CPU brand ID string, bytes 0 - 15
+
+0x80000002,        0,          eax,   31:0, cpu_brandid_0            , CPU brand ID string, bytes 0 - 3
+0x80000002,        0,          ebx,   31:0, cpu_brandid_1            , CPU brand ID string, bytes 4 - 7
+0x80000002,        0,          ecx,   31:0, cpu_brandid_2            , CPU brand ID string, bytes 8 - 11
+0x80000002,        0,          edx,   31:0, cpu_brandid_3            , CPU brand ID string, bytes 12 - 15
+
+# Leaf 80000003H
+# CPU brand ID string, bytes 16 - 31
+
+0x80000003,        0,          eax,   31:0, cpu_brandid_4            , CPU brand ID string, bytes 16 - 19
+0x80000003,        0,          ebx,   31:0, cpu_brandid_5            , CPU brand ID string, bytes 20 - 23
+0x80000003,        0,          ecx,   31:0, cpu_brandid_6            , CPU brand ID string, bytes 24 - 27
+0x80000003,        0,          edx,   31:0, cpu_brandid_7            , CPU brand ID string, bytes 28 - 31
+
+# Leaf 80000004H
+# CPU brand ID string, bytes 32 - 47
+
+0x80000004,        0,          eax,   31:0, cpu_brandid_8            , CPU brand ID string, bytes 32 - 35
+0x80000004,        0,          ebx,   31:0, cpu_brandid_9            , CPU brand ID string, bytes 36 - 39
+0x80000004,        0,          ecx,   31:0, cpu_brandid_10           , CPU brand ID string, bytes 40 - 43
+0x80000004,        0,          edx,   31:0, cpu_brandid_11           , CPU brand ID string, bytes 44 - 47
 # Leaf 80000005H
-# Reserved
+# AMD L1 cache and L1 TLB enumeration
+
+0x80000005,        0,          eax,    7:0, l1_itlb_2m_4m_nentries   , L1 ITLB #entries, 2M and 4M pages
+0x80000005,        0,          eax,   15:8, l1_itlb_2m_4m_assoc      , L1 ITLB associativity, 2M and 4M pages
+0x80000005,        0,          eax,  23:16, l1_dtlb_2m_4m_nentries   , L1 DTLB #entries, 2M and 4M pages
+0x80000005,        0,          eax,  31:24, l1_dtlb_2m_4m_assoc      , L1 DTLB associativity, 2M and 4M pages
+0x80000005,        0,          ebx,    7:0, l1_itlb_4k_nentries      , L1 ITLB #entries, 4K pages
+0x80000005,        0,          ebx,   15:8, l1_itlb_4k_assoc         , L1 ITLB associativity, 4K pages
+0x80000005,        0,          ebx,  23:16, l1_dtlb_4k_nentries      , L1 DTLB #entries, 4K pages
+0x80000005,        0,          ebx,  31:24, l1_dtlb_4k_assoc         , L1 DTLB associativity, 4K pages
+0x80000005,        0,          ecx,    7:0, l1_dcache_line_size      , L1 dcache line size, in bytes
+0x80000005,        0,          ecx,   15:8, l1_dcache_nlines         , L1 dcache lines per tag
+0x80000005,        0,          ecx,  23:16, l1_dcache_assoc          , L1 dcache associativity
+0x80000005,        0,          ecx,  31:24, l1_dcache_size_kb        , L1 dcache size, in KB
+0x80000005,        0,          edx,    7:0, l1_icache_line_size      , L1 icache line size, in bytes
+0x80000005,        0,          edx,   15:8, l1_icache_nlines         , L1 icache lines per tag
+0x80000005,        0,          edx,  23:16, l1_icache_assoc          , L1 icache associativity
+0x80000005,        0,          edx,  31:24, l1_icache_size_kb        , L1 icache size, in KB
 # Leaf 80000006H
-# Extended L2 Cache Features
-
-0x80000006, 0, ECX, 7:0, clsize, Cache Line size in bytes
-0x80000006, 0, ECX, 15:12, l2c_assoc, L2 Associativity
-0x80000006, 0, ECX, 31:16, csize, Cache size in 1K units
-
+# (Mostly AMD) L2 TLB, L2 cache, and L3 cache enumeration
+
+0x80000006,        0,          eax,   11:0, l2_itlb_2m_4m_nentries   , L2 iTLB #entries, 2M and 4M pages
+0x80000006,        0,          eax,  15:12, l2_itlb_2m_4m_assoc      , L2 iTLB associativity, 2M and 4M pages
+0x80000006,        0,          eax,  27:16, l2_dtlb_2m_4m_nentries   , L2 dTLB #entries, 2M and 4M pages
+0x80000006,        0,          eax,  31:28, l2_dtlb_2m_4m_assoc      , L2 dTLB associativity, 2M and 4M pages
+0x80000006,        0,          ebx,   11:0, l2_itlb_4k_nentries      , L2 iTLB #entries, 4K pages
+0x80000006,        0,          ebx,  15:12, l2_itlb_4k_assoc         , L2 iTLB associativity, 4K pages
+0x80000006,        0,          ebx,  27:16, l2_dtlb_4k_nentries      , L2 dTLB
#entries, 4K pages
+0x80000006,        0,          ebx,  31:28, l2_dtlb_4k_assoc         , L2 dTLB associativity, 4K pages
+0x80000006,        0,          ecx,    7:0, l2_line_size             , L2 cache line size, in bytes
+0x80000006,        0,          ecx,   11:8, l2_nlines                , L2 cache number of lines per tag
+0x80000006,        0,          ecx,  15:12, l2_assoc                 , L2 cache associativity
+0x80000006,        0,          ecx,  31:16, l2_size_kb               , L2 cache size, in KB
+0x80000006,        0,          edx,    7:0, l3_line_size             , L3 cache line size, in bytes
+0x80000006,        0,          edx,   11:8, l3_nlines                , L3 cache number of lines per tag
+0x80000006,        0,          edx,  15:12, l3_assoc                 , L3 cache associativity
+0x80000006,        0,          edx,  31:18, l3_size_range            , L3 cache size range
 # Leaf 80000007H
-
-0x80000007, 0, EDX, 8, nonstop_tsc, Invariant TSC available
-
+# CPU power management (mostly AMD) and AMD RAS enumeration
+
+0x80000007,        0,          ebx,      0, overflow_recov           , MCA overflow conditions not fatal
+0x80000007,        0,          ebx,      1, succor                   , Software containment of UnCORRectable errors
+0x80000007,        0,          ebx,      2, hw_assert                , Hardware assert MSRs
+0x80000007,        0,          ebx,      3, smca                     , Scalable MCA (MCAX MSRs)
+0x80000007,        0,          ecx,   31:0, cpu_pwr_sample_ratio     , CPU power sample time ratio
+0x80000007,        0,          edx,      0, digital_temp             , Digital temperature sensor
+0x80000007,        0,          edx,      1, powernow_freq_id         , PowerNOW! frequency scaling
+0x80000007,        0,          edx,      2, powernow_volt_id         , PowerNOW! voltage scaling
+0x80000007,        0,          edx,      3, thermal_trip             , THERMTRIP (Thermal Trip)
+0x80000007,        0,          edx,      4, hw_thermal_control       , Hardware thermal control
+0x80000007,        0,          edx,      5, sw_thermal_control       , Software thermal control
+0x80000007,        0,          edx,      6, 100mhz_steps             , 100 MHz multiplier control
+0x80000007,        0,          edx,      7, hw_pstate                , Hardware P-state control
+0x80000007,        0,          edx,      8, constant_tsc             , TSC ticks at constant rate across all P and C states
+0x80000007,        0,          edx,      9, cpb                      , Core performance boost
+0x80000007,        0,          edx,     10, eff_freq_ro              , Read-only effective frequency interface
+0x80000007,        0,          edx,     11, proc_feedback            , Processor feedback interface (deprecated)
+0x80000007,        0,          edx,     12, acc_power                , Processor power reporting interface
+0x80000007,        0,          edx,     13, connected_standby        , CPU Connected Standby support
+0x80000007,        0,          edx,     14, rapl                     , Runtime Average Power Limit interface
 # Leaf 80000008H
-
-0x80000008, 0, EAX, 7:0, phy_adr_bits, Physical Address Bits
-0x80000008, 0, EAX, 15:8, lnr_adr_bits, Linear Address Bits
-0x80000007, 0, EBX, 9, wbnoinvd, WBNOINVD
-
-# 0x8000001E
-# EAX: Extended APIC ID
-0x8000001E, 0, EAX, 31:0, extended_apic_id, Extended APIC ID
-# EBX: Core Identifiers
-0x8000001E, 0, EBX, 7:0, core_id, Identifies the logical core ID
-0x8000001E, 0, EBX, 15:8, threads_per_core, The number of threads per core is threads_per_core + 1
-# ECX: Node Identifiers
-0x8000001E, 0, ECX, 7:0, node_id, Node ID
-0x8000001E, 0, ECX, 10:8, nodes_per_processor, Nodes per processor { 0: 1 node, else reserved }
-
-# 8000001F: AMD Secure Encryption
-0x8000001F, 0, EAX, 0, sme, Secure Memory Encryption
-0x8000001F, 0, EAX, 1, sev, Secure Encrypted Virtualization
-0x8000001F, 0, EAX, 2, vmpgflush, VM Page Flush MSR
-0x8000001F, 0, EAX, 3, seves, SEV Encrypted State
-0x8000001F, 0, EBX, 5:0, c-bit, Page table bit number used to enable memory encryption
-0x8000001F, 0, EBX, 11:6, mem_encrypt_physaddr_width, Reduction of physical address space in bits with SME enabled
-0x8000001F, 0, ECX, 31:0, num_encrypted_guests, Maximum ASID value that may be used for an SEV-enabled guest
-0x8000001F, 0, EDX, 31:0, minimum_sev_asid, Minimum ASID value that must be used for an SEV-enabled, SEV-ES-disabled guest
+# CPU capacity parameters and
extended feature flags (mostly AMD) + +0x80000008, 0, eax, 7:0, phys_addr_bits , Max physical address bits +0x80000008, 0, eax, 15:8, virt_addr_bits , Max virtual address bits +0x80000008, 0, eax, 23:16, guest_phys_addr_bits , Max nested-paging guest physical address bits +0x80000008, 0, ebx, 0, clzero , CLZERO supported +0x80000008, 0, ebx, 1, irperf , Instruction retired counter MSR +0x80000008, 0, ebx, 2, xsaveerptr , XSAVE/XRSTOR always saves/restores FPU error pointers +0x80000008, 0, ebx, 3, invlpgb , INVLPGB broadcasts a TLB invalidate to all threads +0x80000008, 0, ebx, 4, rdpru , RDPRU (Read Processor Register at User level) supported +0x80000008, 0, ebx, 6, mba , Memory Bandwidth Allocation (AMD bit) +0x80000008, 0, ebx, 8, mcommit , MCOMMIT (Memory commit) supported +0x80000008, 0, ebx, 9, wbnoinvd , WBNOINVD supported +0x80000008, 0, ebx, 12, amd_ibpb , Indirect Branch Prediction Barrier +0x80000008, 0, ebx, 13, wbinvd_int , Interruptible WBINVD/WBNOINVD +0x80000008, 0, ebx, 14, amd_ibrs , Indirect Branch Restricted Speculation +0x80000008, 0, ebx, 15, amd_stibp , Single Thread Indirect Branch Prediction mode +0x80000008, 0, ebx, 16, ibrs_always_on , IBRS always-on preferred +0x80000008, 0, ebx, 17, amd_stibp_always_on , STIBP always-on preferred +0x80000008, 0, ebx, 18, ibrs_fast , IBRS is preferred over software solution +0x80000008, 0, ebx, 19, ibrs_same_mode , IBRS provides same mode protection +0x80000008, 0, ebx, 20, no_efer_lmsle , EFER[LMSLE] bit (Long-Mode Segment Limit Enable) unsupported +0x80000008, 0, ebx, 21, tlb_flush_nested , INVLPGB RAX[5] bit can be set (nested translations) +0x80000008, 0, ebx, 23, amd_ppin , Protected Processor Inventory Number +0x80000008, 0, ebx, 24, amd_ssbd , Speculative Store Bypass Disable +0x80000008, 0, ebx, 25, virt_ssbd , virtualized SSBD (Speculative Store Bypass Disable) +0x80000008, 0, ebx, 26, amd_ssb_no , SSBD not needed (fixed in HW) +0x80000008, 0, ebx, 27, cppc , Collaborative Processor Performance Control +0x80000008, 0, ebx, 28, amd_psfd , Predictive Store Forward Disable +0x80000008, 0, ebx, 29, btc_no , CPU not affected by Branch Type Confusion +0x80000008, 0, ebx, 30, ibpb_ret , IBPB clears RSB/RAS too +0x80000008, 0, ebx, 31, brs , Branch Sampling supported +0x80000008, 0, ecx, 7:0, cpu_nthreads , Number of physical threads - 1 +0x80000008, 0, ecx, 15:12, apicid_coreid_len , Number of thread core ID bits (shift) in APIC ID +0x80000008, 0, ecx, 17:16, perf_tsc_len , Performance time-stamp counter size +0x80000008, 0, edx, 15:0, invlpgb_max_pages , INVLPGB maximum page count +0x80000008, 0, edx, 31:16, rdpru_max_reg_id , RDPRU max register ID (ECX input) + +# Leaf 8000000AH +# AMD SVM (Secure Virtual Machine) enumeration + +0x8000000a, 0, eax, 7:0, svm_version , SVM revision number +0x8000000a, 0, ebx, 31:0, svm_nasid , Number of address space identifiers (ASID) +0x8000000a, 0, edx, 0, npt , Nested paging +0x8000000a, 0, edx, 1, lbrv , LBR virtualization +0x8000000a, 0, edx, 2, svm_lock , SVM lock +0x8000000a, 0, edx, 3, nrip_save , NRIP save support on #VMEXIT +0x8000000a, 0, edx, 4, tsc_scale , MSR based TSC rate control +0x8000000a, 0, edx, 5, vmcb_clean , VMCB clean bits support +0x8000000a, 0, edx, 6, flushbyasid , Flush by ASID + Extended VMCB TLB_Control +0x8000000a, 0, edx, 7, decodeassists , Decode Assists support +0x8000000a, 0, edx, 10, pausefilter , Pause intercept filter +0x8000000a, 0, edx, 12, pfthreshold , Pause filter threshold +0x8000000a, 0, edx, 13, avic , Advanced virtual interrupt controller 
+0x8000000a, 0, edx, 15, v_vmsave_vmload , Virtual VMSAVE/VMLOAD (nested virt) +0x8000000a, 0, edx, 16, vgif , Virtualize the Global Interrupt Flag +0x8000000a, 0, edx, 17, gmet , Guest mode execution trap +0x8000000a, 0, edx, 18, x2avic , Virtual x2APIC +0x8000000a, 0, edx, 19, sss_check , Supervisor Shadow Stack restrictions +0x8000000a, 0, edx, 20, v_spec_ctrl , Virtual SPEC_CTRL +0x8000000a, 0, edx, 21, ro_gpt , Read-Only guest page table support +0x8000000a, 0, edx, 23, h_mce_override , Host MCE override +0x8000000a, 0, edx, 24, tlbsync_int , TLBSYNC intercept + INVLPGB/TLBSYNC in VMCB +0x8000000a, 0, edx, 25, vnmi , NMI virtualization +0x8000000a, 0, edx, 26, ibs_virt , IBS Virtualization +0x8000000a, 0, edx, 27, ext_lvt_off_chg , Extended LVT offset fault change +0x8000000a, 0, edx, 28, svme_addr_chk , Guest SVME addr check + +# Leaf 80000019H +# AMD TLB 1G-pages enumeration + +0x80000019, 0, eax, 11:0, l1_itlb_1g_nentries , L1 iTLB #entries, 1G pages +0x80000019, 0, eax, 15:12, l1_itlb_1g_assoc , L1 iTLB associativity, 1G pages +0x80000019, 0, eax, 27:16, l1_dtlb_1g_nentries , L1 dTLB #entries, 1G pages +0x80000019, 0, eax, 31:28, l1_dtlb_1g_assoc , L1 dTLB associativity, 1G pages +0x80000019, 0, ebx, 11:0, l2_itlb_1g_nentries , L2 iTLB #entries, 1G pages +0x80000019, 0, ebx, 15:12, l2_itlb_1g_assoc , L2 iTLB associativity, 1G pages +0x80000019, 0, ebx, 27:16, l2_dtlb_1g_nentries , L2 dTLB #entries, 1G pages +0x80000019, 0, ebx, 31:28, l2_dtlb_1g_assoc , L2 dTLB associativity, 1G pages + +# Leaf 8000001AH +# AMD instruction optimizations enumeration + +0x8000001a, 0, eax, 0, fp_128 , Internal FP/SIMD exec data path is 128-bits wide +0x8000001a, 0, eax, 1, movu_preferred , SSE: MOVU* better than MOVL*/MOVH* +0x8000001a, 0, eax, 2, fp_256 , internal FP/SSE exec data path is 256-bits wide + +# Leaf 8000001BH +# AMD IBS (Instruction-Based Sampling) enumeration + +0x8000001b, 0, eax, 0, ibs_flags_valid , IBS feature flags valid +0x8000001b, 0, eax, 1, ibs_fetch_sampling , IBS fetch sampling supported +0x8000001b, 0, eax, 2, ibs_op_sampling , IBS execution sampling supported +0x8000001b, 0, eax, 3, ibs_rdwr_op_counter , IBS read/write of op counter supported +0x8000001b, 0, eax, 4, ibs_op_count , IBS OP counting mode supported +0x8000001b, 0, eax, 5, ibs_branch_target , IBS branch target address reporting supported +0x8000001b, 0, eax, 6, ibs_op_counters_ext , IBS IbsOpCurCnt/IbsOpMaxCnt extend by 7 bits +0x8000001b, 0, eax, 7, ibs_rip_invalid_chk , IBS invalid RIP indication supported +0x8000001b, 0, eax, 8, ibs_op_branch_fuse , IBS fused branch micro-op indication supported +0x8000001b, 0, eax, 9, ibs_fetch_ctl_ext , IBS Fetch Control Extended MSR (0xc001103c) supported +0x8000001b, 0, eax, 10, ibs_op_data_4 , IBS op data 4 MSR supported +0x8000001b, 0, eax, 11, ibs_l3_miss_filter , IBS L3-miss filtering supported (Zen4+) + +# Leaf 8000001CH +# AMD LWP (Lightweight Profiling) + +0x8000001c, 0, eax, 0, os_lwp_avail , LWP is available to application programs (supported by OS) +0x8000001c, 0, eax, 1, os_lpwval , LWPVAL instruction (EventId=1) is supported by OS +0x8000001c, 0, eax, 2, os_lwp_ire , Instructions Retired Event (EventId=2) is supported by OS +0x8000001c, 0, eax, 3, os_lwp_bre , Branch Retired Event (EventId=3) is supported by OS +0x8000001c, 0, eax, 4, os_lwp_dme , DCache Miss Event (EventId=4) is supported by OS +0x8000001c, 0, eax, 5, os_lwp_cnh , CPU Clocks Not Halted event (EventId=5) is supported by OS +0x8000001c, 0, eax, 6, os_lwp_rnh , CPU Reference clocks Not Halted 
event (EventId=6) is supported by OS +0x8000001c, 0, eax, 29, os_lwp_cont , LWP sampling in continuous mode is supported by OS +0x8000001c, 0, eax, 30, os_lwp_ptsc , Performance Time Stamp Counter in event records is supported by OS +0x8000001c, 0, eax, 31, os_lwp_int , Interrupt on threshold overflow is supported by OS +0x8000001c, 0, ebx, 7:0, lwp_lwpcb_sz , LWP Control Block size, in quadwords +0x8000001c, 0, ebx, 15:8, lwp_event_sz , LWP event record size, in bytes +0x8000001c, 0, ebx, 23:16, lwp_max_events , LWP max supported EventId value (EventID 255 not included) +0x8000001c, 0, ebx, 31:24, lwp_event_offset , LWP events area offset in the LWP Control Block +0x8000001c, 0, ecx, 4:0, lwp_latency_max , Num of bits in cache latency counters (10 to 31) +0x8000001c, 0, ecx, 5, lwp_data_adddr , Cache miss events report the data address of the reference +0x8000001c, 0, ecx, 8:6, lwp_latency_rnd , Amount by which cache latency is rounded +0x8000001c, 0, ecx, 15:9, lwp_version , LWP implementation version +0x8000001c, 0, ecx, 23:16, lwp_buf_min_sz , LWP event ring buffer min size, in units of 32 event records +0x8000001c, 0, ecx, 28, lwp_branch_predict , Branches Retired events can be filtered +0x8000001c, 0, ecx, 29, lwp_ip_filtering , IP filtering (IPI, IPF, BaseIP, and LimitIP @ LWPCP) supported +0x8000001c, 0, ecx, 30, lwp_cache_levels , Cache-related events can be filtered by cache level +0x8000001c, 0, ecx, 31, lwp_cache_latency , Cache-related events can be filtered by latency +0x8000001c, 0, edx, 0, hw_lwp_avail , LWP is available in Hardware +0x8000001c, 0, edx, 1, hw_lpwval , LWPVAL instruction (EventId=1) is available in HW +0x8000001c, 0, edx, 2, hw_lwp_ire , Instructions Retired Event (EventId=2) is available in HW +0x8000001c, 0, edx, 3, hw_lwp_bre , Branch Retired Event (EventId=3) is available in HW +0x8000001c, 0, edx, 4, hw_lwp_dme , DCache Miss Event (EventId=4) is available in HW +0x8000001c, 0, edx, 5, hw_lwp_cnh , CPU Clocks Not Halted event (EventId=5) is available in HW +0x8000001c, 0, edx, 6, hw_lwp_rnh , CPU Reference clocks Not Halted event (EventId=6) is available in HW +0x8000001c, 0, edx, 29, hw_lwp_cont , LWP sampling in continuous mode is available in HW +0x8000001c, 0, edx, 30, hw_lwp_ptsc , Performance Time Stamp Counter in event records is available in HW +0x8000001c, 0, edx, 31, hw_lwp_int , Interrupt on threshold overflow is available in HW + +# Leaf 8000001DH +# AMD deterministic cache parameters + +0x8000001d, 31:0, eax, 4:0, cache_type , Cache type field +0x8000001d, 31:0, eax, 7:5, cache_level , Cache level (1-based) +0x8000001d, 31:0, eax, 8, cache_self_init , Self-initializing cache level +0x8000001d, 31:0, eax, 9, fully_associative , Fully-associative cache +0x8000001d, 31:0, eax, 25:14, num_threads_sharing , Number of logical CPUs sharing cache +0x8000001d, 31:0, ebx, 11:0, cache_linesize , System coherency line size (0-based) +0x8000001d, 31:0, ebx, 21:12, cache_npartitions , Physical line partitions (0-based) +0x8000001d, 31:0, ebx, 31:22, cache_nways , Ways of associativity (0-based) +0x8000001d, 31:0, ecx, 30:0, cache_nsets , Cache number of sets (0-based) +0x8000001d, 31:0, edx, 0, wbinvd_rll_no_guarantee, WBINVD/INVD not guaranteed for Remote Lower-Level caches +0x8000001d, 31:0, edx, 1, ll_inclusive , Cache is inclusive of Lower-Level caches + +# Leaf 8000001EH +# AMD CPU topology enumeration + +0x8000001e, 0, eax, 31:0, ext_apic_id , Extended APIC ID +0x8000001e, 0, ebx, 7:0, core_id , Unique per-socket logical core unit ID +0x8000001e, 0, 
ebx,   15:8, core_nthreads            , #Threads per core (zero-based)
+0x8000001e,        0,          ecx,    7:0, node_id                  , Node (die) ID of invoking logical CPU
+0x8000001e,        0,          ecx,   10:8, nnodes_per_socket        , #nodes in invoking logical CPU's package/socket
+
+# Leaf 8000001FH
+# AMD encrypted memory capabilities enumeration (SME/SEV)
+
+0x8000001f,        0,          eax,      0, sme                      , Secure Memory Encryption supported
+0x8000001f,        0,          eax,      1, sev                      , Secure Encrypted Virtualization supported
+0x8000001f,        0,          eax,      2, vm_page_flush            , VM Page Flush MSR (0xc001011e) available
+0x8000001f,        0,          eax,      3, sev_es                   , SEV Encrypted State supported
+0x8000001f,        0,          eax,      4, sev_nested_paging        , SEV secure nested paging supported
+0x8000001f,        0,          eax,      5, vm_permission_levels     , VMPL supported
+0x8000001f,        0,          eax,      6, rpmquery                 , RPMQUERY instruction supported
+0x8000001f,        0,          eax,      7, vmpl_sss                 , VMPL supervisor shadow stack supported
+0x8000001f,        0,          eax,      8, secure_tsc               , Secure TSC supported
+0x8000001f,        0,          eax,      9, v_tsc_aux                , Hardware virtualizes TSC_AUX
+0x8000001f,        0,          eax,     10, sme_coherent             , HW enforces cache coherency across encryption domains
+0x8000001f,        0,          eax,     11, req_64bit_hypervisor     , SEV guest mandates 64-bit hypervisor
+0x8000001f,        0,          eax,     12, restricted_injection     , Restricted Injection supported
+0x8000001f,        0,          eax,     13, alternate_injection      , Alternate Injection supported
+0x8000001f,        0,          eax,     14, debug_swap               , SEV-ES: full debug state swap is supported
+0x8000001f,        0,          eax,     15, disallow_host_ibs        , SEV-ES: Disallowing IBS use by the host is supported
+0x8000001f,        0,          eax,     16, virt_transparent_enc     , Virtual Transparent Encryption
+0x8000001f,        0,          eax,     17, vmgexit_parameter        , VmgexitParameter is supported in SEV_FEATURES
+0x8000001f,        0,          eax,     18, virt_tom_msr             , Virtual TOM MSR is supported
+0x8000001f,        0,          eax,     19, virt_ibs                 , IBS state virtualization is supported for SEV-ES guests
+0x8000001f,        0,          eax,     24, vmsa_reg_protection      , VMSA register protection is supported
+0x8000001f,        0,          eax,     25, smt_protection           , SMT protection is supported
+0x8000001f,        0,          eax,     28, svsm_page_msr            , SVSM communication page MSR (0xc001f000) is supported
+0x8000001f,        0,          eax,     29, nested_virt_snp_msr      , VIRT_RMPUPDATE/VIRT_PSMASH MSRs are supported
+0x8000001f,        0,          ebx,    5:0, pte_cbit_pos             , PTE bit number used to enable memory encryption
+0x8000001f,        0,          ebx,   11:6, phys_addr_reduction_nbits, Reduction of phys address space when encryption is enabled, in bits
+0x8000001f,        0,          ebx,  15:12, vmpl_count               , Number of VM permission levels (VMPL) supported
+0x8000001f,        0,          ecx,   31:0, enc_guests_max           , Max supported number of simultaneous encrypted guests
+0x8000001f,        0,          edx,   31:0, min_sev_asid_no_sev_es   , Minimum ASID for SEV-enabled SEV-ES-disabled guest
+
+# Leaf 80000020H
+# AMD Platform QoS extended feature IDs
+
+0x80000020,        0,          ebx,      1, mba                      , Memory Bandwidth Allocation support
+0x80000020,        0,          ebx,      2, smba                     , Slow Memory Bandwidth Allocation support
+0x80000020,        0,          ebx,      3, bmec                     , Bandwidth Monitoring Event Configuration support
+0x80000020,        0,          ebx,      4, l3rr                     , L3 Range Reservation support
+0x80000020,        1,          eax,   31:0, mba_limit_len            , MBA enforcement limit size
+0x80000020,        1,          edx,   31:0, mba_cos_max              , MBA max Class of Service number (zero-based)
+0x80000020,        2,          eax,   31:0, smba_limit_len           , SMBA enforcement limit size
+0x80000020,        2,          edx,   31:0, smba_cos_max             , SMBA max Class of Service number (zero-based)
+0x80000020,        3,          ebx,    7:0, bmec_num_events          , BMEC number of bandwidth events available
+0x80000020,        3,          ecx,      0, bmec_local_reads         , Local NUMA reads can be tracked
+0x80000020,        3,          ecx,      1, bmec_remote_reads        , Remote
NUMA reads can be tracked
+0x80000020,        3,          ecx,      2, bmec_local_nontemp_wr    , Local NUMA non-temporal writes can be tracked
+0x80000020,        3,          ecx,      3, bmec_remote_nontemp_wr   , Remote NUMA non-temporal writes can be tracked
+0x80000020,        3,          ecx,      4, bmec_local_slow_mem_rd   , Local NUMA slow-memory reads can be tracked
+0x80000020,        3,          ecx,      5, bmec_remote_slow_mem_rd, Remote NUMA slow-memory reads can be tracked
+0x80000020,        3,          ecx,      6, bmec_all_dirty_victims   , Dirty QoS victims to all types of memory can be tracked
+
+# Leaf 80000021H
+# AMD extended features enumeration 2
+
+0x80000021,        0,          eax,      0, no_nested_data_bp        , No nested data breakpoints
+0x80000021,        0,          eax,      1, fsgs_non_serializing     , WRMSR to {FS,GS,KERNEL_GS}_BASE is non-serializing
+0x80000021,        0,          eax,      2, lfence_rdtsc             , LFENCE always serializing / synchronizes RDTSC
+0x80000021,        0,          eax,      3, smm_page_cfg_lock        , SMM paging configuration lock is supported
+0x80000021,        0,          eax,      6, null_sel_clr_base        , Null selector clears base
+0x80000021,        0,          eax,      7, upper_addr_ignore        , EFER MSR Upper Address Ignore Enable bit supported
+0x80000021,        0,          eax,      8, autoibrs                 , EFER MSR Automatic IBRS enable bit supported
+0x80000021,        0,          eax,      9, no_smm_ctl_msr           , SMM_CTL MSR (0xc0010116) is not present
+0x80000021,        0,          eax,     10, fsrs_supported           , Fast Short Rep Stosb (FSRS) is supported
+0x80000021,        0,          eax,     11, fsrc_supported           , Fast Short Repe Cmpsb (FSRC) is supported
+0x80000021,        0,          eax,     13, prefetch_ctl_msr         , Prefetch control MSR is supported
+0x80000021,        0,          eax,     17, user_cpuid_disable       , #GP when executing CPUID at CPL > 0 is supported
+0x80000021,        0,          eax,     18, epsf_supported           , Enhanced Predictive Store Forwarding (EPSF) is supported
+0x80000021,        0,          ebx,   11:0, microcode_patch_size     , Size of microcode patch, in 16-byte units
+
+# Leaf 80000022H
+# AMD Performance Monitoring v2 enumeration
+
+0x80000022,        0,          eax,      0, perfmon_v2               , Performance monitoring v2 supported
+0x80000022,        0,          eax,      1, lbr_v2                   , Last Branch Record v2 extensions (LBR Stack)
+0x80000022,        0,          eax,      2, lbr_pmc_freeze           , Freezing core performance counters / LBR Stack supported
+0x80000022,        0,          ebx,    3:0, n_pmc_core               , Number of core performance counters
+0x80000022,        0,          ebx,    9:4, lbr_v2_stack_size        , Number of available LBR stack entries
+0x80000022,        0,          ebx,  15:10, n_pmc_northbridge        , Number of available northbridge (data fabric) performance counters
+0x80000022,        0,          ebx,  21:16, n_pmc_umc                , Number of available UMC performance counters
+0x80000022,        0,          ecx,   31:0, active_umc_bitmask       , Active UMCs bitmask
+
+# Leaf 80000023H
+# AMD Secure Multi-key Encryption enumeration
+
+0x80000023,        0,          eax,      0, mem_hmk_mode             , MEM-HMK encryption mode is supported
+0x80000023,        0,          ebx,   15:0, mem_hmk_avail_keys       , MEM-HMK mode: total num of available encryption keys
+
+# Leaf 80000026H
+# AMD extended topology enumeration v2
+
+0x80000026,      3:0,          eax,    4:0, x2apic_id_shift          , Bit width of this level (previous levels inclusive)
+0x80000026,      3:0,          eax,     29, core_has_pwreff_ranking, This core has a power efficiency ranking
+0x80000026,      3:0,          eax,     30, domain_has_hybrid_cores, This domain level has hybrid (E, P) cores
+0x80000026,      3:0,          eax,     31, domain_core_count_asymm, The 'Core' domain has asymmetric cores count
+0x80000026,      3:0,          ebx,   15:0, domain_lcpus_count       , Number of logical CPUs at this domain instance
+0x80000026,      3:0,          ebx,  23:16, core_pwreff_ranking      , This core's static power efficiency ranking
+0x80000026,      3:0,          ebx,  27:24, core_native_model_id     , This core's native model ID
+0x80000026,      3:0,          ebx,  31:28, core_type                , This core's type
+0x80000026,      3:0,
ecx, 7:0, domain_level , This domain level (subleaf ID) +0x80000026, 3:0, ecx, 15:8, domain_type , This domain type +0x80000026, 3:0, edx, 31:0, x2apic_id , x2APIC ID of current logical CPU -- cgit v1.2.3 From 18ee44ce97c18ee72f5807140d07ff8cebe3cab5 Mon Sep 17 00:00:00 2001 From: Luigi Leonardi Date: Tue, 30 Jul 2024 21:43:08 +0200 Subject: test/vsock: add ioctl unsent bytes test Introduce two tests, one for SOCK_STREAM and one for SOCK_SEQPACKET, which use SIOCOUTQ ioctl to check that the number of unsent bytes is zero after delivering a packet. vsock_connect and vsock_accept are no longer static: this is to create more generic tests, allowing code to be reused for SEQPACKET and STREAM. Signed-off-by: Luigi Leonardi Reviewed-by: Stefano Garzarella Signed-off-by: David S. Miller --- tools/testing/vsock/util.c | 6 +-- tools/testing/vsock/util.h | 3 ++ tools/testing/vsock/vsock_test.c | 85 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 91 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c index 554b290fefdc..a3d448a075e3 100644 --- a/tools/testing/vsock/util.c +++ b/tools/testing/vsock/util.c @@ -139,7 +139,7 @@ int vsock_bind_connect(unsigned int cid, unsigned int port, unsigned int bind_po } /* Connect to and return the file descriptor. */ -static int vsock_connect(unsigned int cid, unsigned int port, int type) +int vsock_connect(unsigned int cid, unsigned int port, int type) { union { struct sockaddr sa; @@ -226,8 +226,8 @@ static int vsock_listen(unsigned int cid, unsigned int port, int type) /* Listen on and return the first incoming connection. The remote * address is stored to clientaddrp. clientaddrp may be NULL. */ -static int vsock_accept(unsigned int cid, unsigned int port, - struct sockaddr_vm *clientaddrp, int type) +int vsock_accept(unsigned int cid, unsigned int port, + struct sockaddr_vm *clientaddrp, int type) { union { struct sockaddr sa; diff --git a/tools/testing/vsock/util.h b/tools/testing/vsock/util.h index e95e62485959..fff22d4a14c0 100644 --- a/tools/testing/vsock/util.h +++ b/tools/testing/vsock/util.h @@ -39,6 +39,9 @@ struct test_case { void init_signals(void); unsigned int parse_cid(const char *str); unsigned int parse_port(const char *str); +int vsock_connect(unsigned int cid, unsigned int port, int type); +int vsock_accept(unsigned int cid, unsigned int port, + struct sockaddr_vm *clientaddrp, int type); int vsock_stream_connect(unsigned int cid, unsigned int port); int vsock_bind_connect(unsigned int cid, unsigned int port, unsigned int bind_port, int type); diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index f851f8961247..8d38dbf8f41f 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include "vsock_test_zerocopy.h" #include "timeout.h" @@ -1238,6 +1240,79 @@ static void test_double_bind_connect_client(const struct test_opts *opts) } } +#define MSG_BUF_IOCTL_LEN 64 +static void test_unsent_bytes_server(const struct test_opts *opts, int type) +{ + unsigned char buf[MSG_BUF_IOCTL_LEN]; + int client_fd; + + client_fd = vsock_accept(VMADDR_CID_ANY, opts->peer_port, NULL, type); + if (client_fd < 0) { + perror("accept"); + exit(EXIT_FAILURE); + } + + recv_buf(client_fd, buf, sizeof(buf), 0, sizeof(buf)); + control_writeln("RECEIVED"); + + close(client_fd); +} + +static void test_unsent_bytes_client(const struct test_opts *opts, int type) +{ + unsigned 
char buf[MSG_BUF_IOCTL_LEN]; + int ret, fd, sock_bytes_unsent; + + fd = vsock_connect(opts->peer_cid, opts->peer_port, type); + if (fd < 0) { + perror("connect"); + exit(EXIT_FAILURE); + } + + for (int i = 0; i < sizeof(buf); i++) + buf[i] = rand() & 0xFF; + + send_buf(fd, buf, sizeof(buf), 0, sizeof(buf)); + control_expectln("RECEIVED"); + + ret = ioctl(fd, SIOCOUTQ, &sock_bytes_unsent); + if (ret < 0) { + if (errno == EOPNOTSUPP) { + fprintf(stderr, "Test skipped, SIOCOUTQ not supported.\n"); + } else { + perror("ioctl"); + exit(EXIT_FAILURE); + } + } else if (ret == 0 && sock_bytes_unsent != 0) { + fprintf(stderr, + "Unexpected 'SIOCOUTQ' value, expected 0, got %i\n", + sock_bytes_unsent); + exit(EXIT_FAILURE); + } + + close(fd); +} + +static void test_stream_unsent_bytes_client(const struct test_opts *opts) +{ + test_unsent_bytes_client(opts, SOCK_STREAM); +} + +static void test_stream_unsent_bytes_server(const struct test_opts *opts) +{ + test_unsent_bytes_server(opts, SOCK_STREAM); +} + +static void test_seqpacket_unsent_bytes_client(const struct test_opts *opts) +{ + test_unsent_bytes_client(opts, SOCK_SEQPACKET); +} + +static void test_seqpacket_unsent_bytes_server(const struct test_opts *opts) +{ + test_unsent_bytes_server(opts, SOCK_SEQPACKET); +} + #define RCVLOWAT_CREDIT_UPD_BUF_SIZE (1024 * 128) /* This define is the same as in 'include/linux/virtio_vsock.h': * it is used to decide when to send credit update message during @@ -1523,6 +1598,16 @@ static struct test_case test_cases[] = { .run_client = test_stream_rcvlowat_def_cred_upd_client, .run_server = test_stream_cred_upd_on_low_rx_bytes, }, + { + .name = "SOCK_STREAM ioctl(SIOCOUTQ) 0 unsent bytes", + .run_client = test_stream_unsent_bytes_client, + .run_server = test_stream_unsent_bytes_server, + }, + { + .name = "SOCK_SEQPACKET ioctl(SIOCOUTQ) 0 unsent bytes", + .run_client = test_seqpacket_unsent_bytes_client, + .run_server = test_seqpacket_unsent_bytes_server, + }, {}, }; -- cgit v1.2.3 From db61e6a4eee5a7884b2cafeaf407895f253bbaa7 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 1 Aug 2024 15:27:24 +0200 Subject: selftests/bpf: fix uprobe.path leak in bpf_testmod testmod_unregister_uprobe() forgets to path_put(&uprobe.path). Signed-off-by: Jiri Olsa Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20240801132724.GA8791@redhat.com --- tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index fd28c1157bd3..72f565af4f82 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -477,6 +477,7 @@ static void testmod_unregister_uprobe(void) if (uprobe.offset) { uprobe_unregister(d_real_inode(uprobe.path.dentry), uprobe.offset, &uprobe.consumer); + path_put(&uprobe.path); uprobe.offset = 0; } -- cgit v1.2.3 From e04332ebc8ac128fa551e83f1161ab1c094d13a9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 1 Aug 2024 15:27:28 +0200 Subject: uprobes: kill uprobe_register_refctr() It doesn't make any sense to have 2 versions of _register(). Note that trace_uprobe_enable(), the only user of uprobe_register(), doesn't need to check tu->ref_ctr_offset to decide which one should be used, it could safely pass ref_ctr_offset == 0 to uprobe_register_refctr(). 
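As an illustrative sketch only (placeholder identifiers, not part of the patch), the equivalence that makes the removal safe is simply:

	/* With no SDT reference counter, these two calls behave identically: */
	ret = uprobe_register(inode, offset, &uc);
	ret = uprobe_register_refctr(inode, offset, 0 /* ref_ctr_offset */, &uc);
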
Add this argument to uprobe_register(), update the callers, and kill uprobe_register_refctr(). Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Jiri Olsa Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240801132728.GA8800@redhat.com --- include/linux/uprobes.h | 9 ++------ kernel/events/uprobes.c | 24 ++++++---------------- kernel/trace/bpf_trace.c | 8 ++++---- kernel/trace/trace_uprobe.c | 7 +------ .../selftests/bpf/bpf_testmod/bpf_testmod.c | 4 ++-- 5 files changed, 15 insertions(+), 37 deletions(-) (limited to 'tools') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index a270a5892ab4..788813c0b8fc 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -112,8 +112,7 @@ extern bool is_trap_insn(uprobe_opcode_t *insn); extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); -extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); -extern int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); +extern int uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool); extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); extern int uprobe_mmap(struct vm_area_struct *vma); @@ -154,11 +153,7 @@ static inline void uprobes_init(void) #define uprobe_get_trap_addr(regs) instruction_pointer(regs) static inline int -uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) -{ - return -ENOSYS; -} -static inline int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc) +uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc) { return -ENOSYS; } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index e9b092acc71b..3a80154bc4c7 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1121,25 +1121,26 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume EXPORT_SYMBOL_GPL(uprobe_unregister); /* - * __uprobe_register - register a probe + * uprobe_register - register a probe * @inode: the file in which the probe has to be placed. * @offset: offset from the start of the file. + * @ref_ctr_offset: offset of SDT marker / reference counter * @uc: information on howto handle the probe.. * - * Apart from the access refcount, __uprobe_register() takes a creation + * Apart from the access refcount, uprobe_register() takes a creation * refcount (thro alloc_uprobe) if and only if this @uprobe is getting * inserted into the rbtree (i.e first consumer for a @inode:@offset * tuple). Creation refcount stops uprobe_unregister from freeing the * @uprobe even before the register operation is complete. Creation * refcount is released when the last @uc for the @uprobe - * unregisters. Caller of __uprobe_register() is required to keep @inode + * unregisters. Caller of uprobe_register() is required to keep @inode * (and the containing mount) referenced. 
* * Return errno if it cannot successully install probes * else return 0 (success) */ -static int __uprobe_register(struct inode *inode, loff_t offset, - loff_t ref_ctr_offset, struct uprobe_consumer *uc) +int uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, + struct uprobe_consumer *uc) { struct uprobe *uprobe; int ret; @@ -1189,21 +1190,8 @@ static int __uprobe_register(struct inode *inode, loff_t offset, goto retry; return ret; } - -int uprobe_register(struct inode *inode, loff_t offset, - struct uprobe_consumer *uc) -{ - return __uprobe_register(inode, offset, 0, uc); -} EXPORT_SYMBOL_GPL(uprobe_register); -int uprobe_register_refctr(struct inode *inode, loff_t offset, - loff_t ref_ctr_offset, struct uprobe_consumer *uc) -{ - return __uprobe_register(inode, offset, ref_ctr_offset, uc); -} -EXPORT_SYMBOL_GPL(uprobe_register_refctr); - /* * uprobe_apply - unregister an already registered probe. * @inode: the file in which the probe has to be removed. diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index cd098846e251..afa909e17824 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3480,10 +3480,10 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr &bpf_uprobe_multi_link_lops, prog); for (i = 0; i < cnt; i++) { - err = uprobe_register_refctr(d_real_inode(link->path.dentry), - uprobes[i].offset, - uprobes[i].ref_ctr_offset, - &uprobes[i].consumer); + err = uprobe_register(d_real_inode(link->path.dentry), + uprobes[i].offset, + uprobes[i].ref_ctr_offset, + &uprobes[i].consumer); if (err) { bpf_uprobe_unregister(&path, uprobes, i); goto error_free; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c98e3b3386ba..1f590f989c1e 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1089,12 +1089,7 @@ static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter) tu->consumer.filter = filter; tu->inode = d_real_inode(tu->path.dentry); - if (tu->ref_ctr_offset) - ret = uprobe_register_refctr(tu->inode, tu->offset, - tu->ref_ctr_offset, &tu->consumer); - else - ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); - + ret = uprobe_register(tu->inode, tu->offset, tu->ref_ctr_offset, &tu->consumer); if (ret) tu->inode = NULL; diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index 72f565af4f82..55f6905de743 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -458,8 +458,8 @@ static int testmod_register_uprobe(loff_t offset) if (err) goto out; - err = uprobe_register_refctr(d_real_inode(uprobe.path.dentry), - offset, 0, &uprobe.consumer); + err = uprobe_register(d_real_inode(uprobe.path.dentry), + offset, 0, &uprobe.consumer); if (err) path_put(&uprobe.path); else -- cgit v1.2.3 From 3c83a9ad0295eb63bdeb81d821b8c3b9417fbcac Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 1 Aug 2024 15:27:34 +0200 Subject: uprobes: make uprobe_register() return struct uprobe * This way uprobe_unregister() and uprobe_apply() can use "struct uprobe *" rather than inode + offset. This simplifies the code and allows to avoid the unnecessary find_uprobe() + put_uprobe() in these functions. TODO: uprobe_unregister() still needs get_uprobe/put_uprobe to ensure that this uprobe can't be freed before up_write(&uprobe->register_rwsem). 
Co-developed-by: Andrii Nakryiko Signed-off-by: Andrii Nakryiko Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Jiri Olsa Link: https://lore.kernel.org/r/20240801132734.GA8803@redhat.com --- include/linux/uprobes.h | 15 +++--- kernel/events/uprobes.c | 56 +++++++++------------- kernel/trace/bpf_trace.c | 25 +++++----- kernel/trace/trace_uprobe.c | 26 +++++----- .../selftests/bpf/bpf_testmod/bpf_testmod.c | 25 +++++----- 5 files changed, 67 insertions(+), 80 deletions(-) (limited to 'tools') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 788813c0b8fc..f50df1fa93e7 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -16,6 +16,7 @@ #include #include +struct uprobe; struct vm_area_struct; struct mm_struct; struct inode; @@ -112,9 +113,9 @@ extern bool is_trap_insn(uprobe_opcode_t *insn); extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); -extern int uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); -extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool); -extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); +extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); +extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool); +extern void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc); extern int uprobe_mmap(struct vm_area_struct *vma); extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void uprobe_start_dup_mmap(void); @@ -152,18 +153,18 @@ static inline void uprobes_init(void) #define uprobe_get_trap_addr(regs) instruction_pointer(regs) -static inline int +static inline struct uprobe * uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc) { - return -ENOSYS; + return ERR_PTR(-ENOSYS); } static inline int -uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool add) +uprobe_apply(struct uprobe* uprobe, struct uprobe_consumer *uc, bool add) { return -ENOSYS; } static inline void -uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) +uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) { } static inline int uprobe_mmap(struct vm_area_struct *vma) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 3a80154bc4c7..b33f1397dae0 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1099,20 +1099,14 @@ __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) delete_uprobe(uprobe); } -/* +/** * uprobe_unregister - unregister an already registered probe. - * @inode: the file in which the probe has to be removed. - * @offset: offset from the start of the file. + * @uprobe: uprobe to remove * @uc: identify which probe if multiple probes are colocated. 
*/ -void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) +void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) { - struct uprobe *uprobe; - - uprobe = find_uprobe(inode, offset); - if (WARN_ON(!uprobe)) - return; - + get_uprobe(uprobe); down_write(&uprobe->register_rwsem); __uprobe_unregister(uprobe, uc); up_write(&uprobe->register_rwsem); @@ -1120,7 +1114,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume } EXPORT_SYMBOL_GPL(uprobe_unregister); -/* +/** * uprobe_register - register a probe * @inode: the file in which the probe has to be placed. * @offset: offset from the start of the file. @@ -1136,40 +1130,40 @@ EXPORT_SYMBOL_GPL(uprobe_unregister); * unregisters. Caller of uprobe_register() is required to keep @inode * (and the containing mount) referenced. * - * Return errno if it cannot successully install probes - * else return 0 (success) + * Return: pointer to the new uprobe on success or an ERR_PTR on failure. */ -int uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, - struct uprobe_consumer *uc) +struct uprobe *uprobe_register(struct inode *inode, + loff_t offset, loff_t ref_ctr_offset, + struct uprobe_consumer *uc) { struct uprobe *uprobe; int ret; /* Uprobe must have at least one set consumer */ if (!uc->handler && !uc->ret_handler) - return -EINVAL; + return ERR_PTR(-EINVAL); /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */ if (!inode->i_mapping->a_ops->read_folio && !shmem_mapping(inode->i_mapping)) - return -EIO; + return ERR_PTR(-EIO); /* Racy, just to catch the obvious mistakes */ if (offset > i_size_read(inode)) - return -EINVAL; + return ERR_PTR(-EINVAL); /* * This ensures that copy_from_page(), copy_to_page() and * __update_ref_ctr() can't cross page boundary. */ if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE)) - return -EINVAL; + return ERR_PTR(-EINVAL); if (!IS_ALIGNED(ref_ctr_offset, sizeof(short))) - return -EINVAL; + return ERR_PTR(-EINVAL); retry: uprobe = alloc_uprobe(inode, offset, ref_ctr_offset); if (IS_ERR(uprobe)) - return PTR_ERR(uprobe); + return uprobe; /* * We can race with uprobe_unregister()->delete_uprobe(). @@ -1188,35 +1182,29 @@ int uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, if (unlikely(ret == -EAGAIN)) goto retry; - return ret; + + return ret ? ERR_PTR(ret) : uprobe; } EXPORT_SYMBOL_GPL(uprobe_register); -/* - * uprobe_apply - unregister an already registered probe. - * @inode: the file in which the probe has to be removed. - * @offset: offset from the start of the file. +/** + * uprobe_apply - add or remove the breakpoints according to @uc->filter + * @uprobe: uprobe which "owns" the breakpoint * @uc: consumer which wants to add more or remove some breakpoints * @add: add or remove the breakpoints + * Return: 0 on success or negative error code. */ -int uprobe_apply(struct inode *inode, loff_t offset, - struct uprobe_consumer *uc, bool add) +int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add) { - struct uprobe *uprobe; struct uprobe_consumer *con; int ret = -ENOENT; - uprobe = find_uprobe(inode, offset); - if (WARN_ON(!uprobe)) - return ret; - down_write(&uprobe->register_rwsem); for (con = uprobe->consumers; con && con != uc ; con = con->next) ; if (con) ret = register_for_each_vma(uprobe, add ? 
uc : NULL); up_write(&uprobe->register_rwsem); - put_uprobe(uprobe); return ret; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index afa909e17824..4e391daafa64 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3160,6 +3160,7 @@ struct bpf_uprobe { loff_t offset; unsigned long ref_ctr_offset; u64 cookie; + struct uprobe *uprobe; struct uprobe_consumer consumer; }; @@ -3178,15 +3179,12 @@ struct bpf_uprobe_multi_run_ctx { struct bpf_uprobe *uprobe; }; -static void bpf_uprobe_unregister(struct path *path, struct bpf_uprobe *uprobes, - u32 cnt) +static void bpf_uprobe_unregister(struct bpf_uprobe *uprobes, u32 cnt) { u32 i; - for (i = 0; i < cnt; i++) { - uprobe_unregister(d_real_inode(path->dentry), uprobes[i].offset, - &uprobes[i].consumer); - } + for (i = 0; i < cnt; i++) + uprobe_unregister(uprobes[i].uprobe, &uprobes[i].consumer); } static void bpf_uprobe_multi_link_release(struct bpf_link *link) @@ -3194,7 +3192,7 @@ static void bpf_uprobe_multi_link_release(struct bpf_link *link) struct bpf_uprobe_multi_link *umulti_link; umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); - bpf_uprobe_unregister(&umulti_link->path, umulti_link->uprobes, umulti_link->cnt); + bpf_uprobe_unregister(umulti_link->uprobes, umulti_link->cnt); if (umulti_link->task) put_task_struct(umulti_link->task); path_put(&umulti_link->path); @@ -3480,12 +3478,13 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr &bpf_uprobe_multi_link_lops, prog); for (i = 0; i < cnt; i++) { - err = uprobe_register(d_real_inode(link->path.dentry), - uprobes[i].offset, - uprobes[i].ref_ctr_offset, - &uprobes[i].consumer); - if (err) { - bpf_uprobe_unregister(&path, uprobes, i); + uprobes[i].uprobe = uprobe_register(d_real_inode(link->path.dentry), + uprobes[i].offset, + uprobes[i].ref_ctr_offset, + &uprobes[i].consumer); + if (IS_ERR(uprobes[i].uprobe)) { + err = PTR_ERR(uprobes[i].uprobe); + bpf_uprobe_unregister(uprobes, i); goto error_free; } } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 1f590f989c1e..52e76a73fa7c 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -58,8 +58,8 @@ struct trace_uprobe { struct dyn_event devent; struct uprobe_consumer consumer; struct path path; - struct inode *inode; char *filename; + struct uprobe *uprobe; unsigned long offset; unsigned long ref_ctr_offset; unsigned long nhit; @@ -1084,16 +1084,16 @@ typedef bool (*filter_func_t)(struct uprobe_consumer *self, static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter) { - int ret; + struct inode *inode = d_real_inode(tu->path.dentry); + struct uprobe *uprobe; tu->consumer.filter = filter; - tu->inode = d_real_inode(tu->path.dentry); - - ret = uprobe_register(tu->inode, tu->offset, tu->ref_ctr_offset, &tu->consumer); - if (ret) - tu->inode = NULL; + uprobe = uprobe_register(inode, tu->offset, tu->ref_ctr_offset, &tu->consumer); + if (IS_ERR(uprobe)) + return PTR_ERR(uprobe); - return ret; + tu->uprobe = uprobe; + return 0; } static void __probe_event_disable(struct trace_probe *tp) @@ -1104,11 +1104,11 @@ static void __probe_event_disable(struct trace_probe *tp) WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter)); list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { - if (!tu->inode) + if (!tu->uprobe) continue; - uprobe_unregister(tu->inode, tu->offset, &tu->consumer); - tu->inode = NULL; + uprobe_unregister(tu->uprobe, &tu->consumer); + tu->uprobe = NULL; } } @@ -1305,7 
+1305,7 @@ static int uprobe_perf_close(struct trace_event_call *call, return 0; list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { - ret = uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); + ret = uprobe_apply(tu->uprobe, &tu->consumer, false); if (ret) break; } @@ -1329,7 +1329,7 @@ static int uprobe_perf_open(struct trace_event_call *call, return 0; list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { - err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); + err = uprobe_apply(tu->uprobe, &tu->consumer, true); if (err) { uprobe_perf_close(call, event); break; diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index 55f6905de743..3c0515a27842 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -432,7 +432,7 @@ uprobe_ret_handler(struct uprobe_consumer *self, unsigned long func, struct testmod_uprobe { struct path path; - loff_t offset; + struct uprobe *uprobe; struct uprobe_consumer consumer; }; @@ -446,25 +446,25 @@ static int testmod_register_uprobe(loff_t offset) { int err = -EBUSY; - if (uprobe.offset) + if (uprobe.uprobe) return -EBUSY; mutex_lock(&testmod_uprobe_mutex); - if (uprobe.offset) + if (uprobe.uprobe) goto out; err = kern_path("/proc/self/exe", LOOKUP_FOLLOW, &uprobe.path); if (err) goto out; - err = uprobe_register(d_real_inode(uprobe.path.dentry), - offset, 0, &uprobe.consumer); - if (err) + uprobe.uprobe = uprobe_register(d_real_inode(uprobe.path.dentry), + offset, 0, &uprobe.consumer); + if (IS_ERR(uprobe.uprobe)) { + err = PTR_ERR(uprobe.uprobe); path_put(&uprobe.path); - else - uprobe.offset = offset; - + uprobe.uprobe = NULL; + } out: mutex_unlock(&testmod_uprobe_mutex); return err; @@ -474,11 +474,10 @@ static void testmod_unregister_uprobe(void) { mutex_lock(&testmod_uprobe_mutex); - if (uprobe.offset) { - uprobe_unregister(d_real_inode(uprobe.path.dentry), - uprobe.offset, &uprobe.consumer); + if (uprobe.uprobe) { + uprobe_unregister(uprobe.uprobe, &uprobe.consumer); path_put(&uprobe.path); - uprobe.offset = 0; + uprobe.uprobe = NULL; } mutex_unlock(&testmod_uprobe_mutex); -- cgit v1.2.3 From 6998a73efbb8a87f4dd0bddde90b7f5b0d47b5e0 Mon Sep 17 00:00:00 2001 From: Keith Lucas Date: Fri, 2 Aug 2024 06:13:18 +0000 Subject: selftests/mm: Add new testcases for pkeys Add a few new tests to exercise the signal handler flow, especially with PKEY 0 disabled: - Verify that the SIGSEGV handler is invoked when pkey 0 is disabled. - Verify that a thread which disables PKEY 0 segfaults with PKUERR when accessing the stack. - Verify that the SIGSEGV handler that uses an alternate signal stack is correctly invoked when the thread disabled PKEY 0 - Verify that the PKRU value set by the application is correctly restored upon return from signal handling. - Verify that sigreturn() is able to restore the altstack even if the thread had PKEY 0 disabled [ Aruna: Adapted to upstream ] [ tglx: Made it actually compile. Restored protection_keys compile. Added useful info to the changelog instead of bare function names. 
] Signed-off-by: Keith Lucas Signed-off-by: Aruna Ramakrishna Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240802061318.2140081-6-aruna.ramakrishna@oracle.com --- tools/testing/selftests/mm/Makefile | 1 + tools/testing/selftests/mm/pkey-helpers.h | 13 +- tools/testing/selftests/mm/pkey_sighandler_tests.c | 481 +++++++++++++++++++++ tools/testing/selftests/mm/protection_keys.c | 10 - 4 files changed, 494 insertions(+), 11 deletions(-) create mode 100644 tools/testing/selftests/mm/pkey_sighandler_tests.c (limited to 'tools') diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 901e0d07765b..1f176fff7054 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -88,6 +88,7 @@ CAN_BUILD_X86_64 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_64bit_pr CAN_BUILD_WITH_NOPIE := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_program.c -no-pie) VMTARGETS := protection_keys +VMTARGETS += pkey_sighandler_tests BINARIES_32 := $(VMTARGETS:%=%_32) BINARIES_64 := $(VMTARGETS:%=%_64) diff --git a/tools/testing/selftests/mm/pkey-helpers.h b/tools/testing/selftests/mm/pkey-helpers.h index 1af3156a9db8..4d31a309a46b 100644 --- a/tools/testing/selftests/mm/pkey-helpers.h +++ b/tools/testing/selftests/mm/pkey-helpers.h @@ -79,7 +79,18 @@ extern void abort_hooks(void); } \ } while (0) -__attribute__((noinline)) int read_ptr(int *ptr); +#define barrier() __asm__ __volatile__("": : :"memory") +#ifndef noinline +# define noinline __attribute__((noinline)) +#endif + +noinline int read_ptr(int *ptr) +{ + /* Keep GCC from optimizing this away somehow */ + barrier(); + return *ptr; +} + void expected_pkey_fault(int pkey); int sys_pkey_alloc(unsigned long flags, unsigned long init_val); int sys_pkey_free(unsigned long pkey); diff --git a/tools/testing/selftests/mm/pkey_sighandler_tests.c b/tools/testing/selftests/mm/pkey_sighandler_tests.c new file mode 100644 index 000000000000..a8088b645ad6 --- /dev/null +++ b/tools/testing/selftests/mm/pkey_sighandler_tests.c @@ -0,0 +1,481 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Tests Memory Protection Keys (see Documentation/core-api/protection-keys.rst) + * + * The testcases in this file exercise various flows related to signal handling, + * using an alternate signal stack, with the default pkey (pkey 0) disabled. + * + * Compile with: + * gcc -mxsave -o pkey_sighandler_tests -O2 -g -std=gnu99 -pthread -Wall pkey_sighandler_tests.c -I../../../../tools/include -lrt -ldl -lm + * gcc -mxsave -m32 -o pkey_sighandler_tests -O2 -g -std=gnu99 -pthread -Wall pkey_sighandler_tests.c -I../../../../tools/include -lrt -ldl -lm + */ +#define _GNU_SOURCE +#define __SANE_USERSPACE_TYPES__ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pkey-helpers.h" + +#define STACK_SIZE PTHREAD_STACK_MIN + +void expected_pkey_fault(int pkey) {} + +pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; +pthread_cond_t cond = PTHREAD_COND_INITIALIZER; +siginfo_t siginfo = {0}; + +/* + * We need to use inline assembly instead of glibc's syscall because glibc's + * syscall will attempt to access the PLT in order to call a library function + * which is protected by MPK 0 which we don't have access to. 
+ */ +static inline __always_inline +long syscall_raw(long n, long a1, long a2, long a3, long a4, long a5, long a6) +{ + unsigned long ret; +#ifdef __x86_64__ + register long r10 asm("r10") = a4; + register long r8 asm("r8") = a5; + register long r9 asm("r9") = a6; + asm volatile ("syscall" + : "=a"(ret) + : "a"(n), "D"(a1), "S"(a2), "d"(a3), "r"(r10), "r"(r8), "r"(r9) + : "rcx", "r11", "memory"); +#elif defined __i386__ + asm volatile ("int $0x80" + : "=a"(ret) + : "a"(n), "b"(a1), "c"(a2), "d"(a3), "S"(a4), "D"(a5) + : "memory"); +#else +# error syscall_raw() not implemented +#endif + return ret; +} + +static void sigsegv_handler(int signo, siginfo_t *info, void *ucontext) +{ + pthread_mutex_lock(&mutex); + + memcpy(&siginfo, info, sizeof(siginfo_t)); + + pthread_cond_signal(&cond); + pthread_mutex_unlock(&mutex); + + syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0); +} + +static void sigusr1_handler(int signo, siginfo_t *info, void *ucontext) +{ + pthread_mutex_lock(&mutex); + + memcpy(&siginfo, info, sizeof(siginfo_t)); + + pthread_cond_signal(&cond); + pthread_mutex_unlock(&mutex); +} + +static void sigusr2_handler(int signo, siginfo_t *info, void *ucontext) +{ + /* + * pkru should be the init_pkru value which enabled MPK 0 so + * we can use library functions. + */ + printf("%s invoked.\n", __func__); +} + +static void raise_sigusr2(void) +{ + pid_t tid = 0; + + tid = syscall_raw(SYS_gettid, 0, 0, 0, 0, 0, 0); + + syscall_raw(SYS_tkill, tid, SIGUSR2, 0, 0, 0, 0); + + /* + * We should return from the signal handler here and be able to + * return to the interrupted thread. + */ +} + +static void *thread_segv_with_pkey0_disabled(void *ptr) +{ + /* Disable MPK 0 (and all others too) */ + __write_pkey_reg(0x55555555); + + /* Segfault (with SEGV_MAPERR) */ + *(int *) (0x1) = 1; + return NULL; +} + +static void *thread_segv_pkuerr_stack(void *ptr) +{ + /* Disable MPK 0 (and all others too) */ + __write_pkey_reg(0x55555555); + + /* After we disable MPK 0, we can't access the stack to return */ + return NULL; +} + +static void *thread_segv_maperr_ptr(void *ptr) +{ + stack_t *stack = ptr; + int *bad = (int *)1; + + /* + * Setup alternate signal stack, which should be pkey_mprotect()ed by + * MPK 0. The thread's stack cannot be used for signals because it is + * not accessible by the default init_pkru value of 0x55555554. + */ + syscall_raw(SYS_sigaltstack, (long)stack, 0, 0, 0, 0, 0); + + /* Disable MPK 0. Only MPK 1 is enabled. */ + __write_pkey_reg(0x55555551); + + /* Segfault */ + *bad = 1; + syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0); + return NULL; +} + +/* + * Verify that the sigsegv handler is invoked when pkey 0 is disabled. + * Note that the new thread stack and the alternate signal stack is + * protected by MPK 0. 
+ */ +static void test_sigsegv_handler_with_pkey0_disabled(void) +{ + struct sigaction sa; + pthread_attr_t attr; + pthread_t thr; + + sa.sa_flags = SA_SIGINFO; + + sa.sa_sigaction = sigsegv_handler; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGSEGV, &sa, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + memset(&siginfo, 0, sizeof(siginfo)); + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + + pthread_create(&thr, &attr, thread_segv_with_pkey0_disabled, NULL); + + pthread_mutex_lock(&mutex); + while (siginfo.si_signo == 0) + pthread_cond_wait(&cond, &mutex); + pthread_mutex_unlock(&mutex); + + ksft_test_result(siginfo.si_signo == SIGSEGV && + siginfo.si_code == SEGV_MAPERR && + siginfo.si_addr == (void *)1, + "%s\n", __func__); +} + +/* + * Verify that the sigsegv handler is invoked when pkey 0 is disabled. + * Note that the new thread stack and the alternate signal stack is + * protected by MPK 0, which renders them inaccessible when MPK 0 + * is disabled. So just the return from the thread should cause a + * segfault with SEGV_PKUERR. + */ +static void test_sigsegv_handler_cannot_access_stack(void) +{ + struct sigaction sa; + pthread_attr_t attr; + pthread_t thr; + + sa.sa_flags = SA_SIGINFO; + + sa.sa_sigaction = sigsegv_handler; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGSEGV, &sa, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + memset(&siginfo, 0, sizeof(siginfo)); + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + + pthread_create(&thr, &attr, thread_segv_pkuerr_stack, NULL); + + pthread_mutex_lock(&mutex); + while (siginfo.si_signo == 0) + pthread_cond_wait(&cond, &mutex); + pthread_mutex_unlock(&mutex); + + ksft_test_result(siginfo.si_signo == SIGSEGV && + siginfo.si_code == SEGV_PKUERR, + "%s\n", __func__); +} + +/* + * Verify that the sigsegv handler that uses an alternate signal stack + * is correctly invoked for a thread which uses a non-zero MPK to protect + * its own stack, and disables all other MPKs (including 0). 
+ */ +static void test_sigsegv_handler_with_different_pkey_for_stack(void) +{ + struct sigaction sa; + static stack_t sigstack; + void *stack; + int pkey; + int parent_pid = 0; + int child_pid = 0; + + sa.sa_flags = SA_SIGINFO | SA_ONSTACK; + + sa.sa_sigaction = sigsegv_handler; + + sigemptyset(&sa.sa_mask); + if (sigaction(SIGSEGV, &sa, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + stack = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + assert(stack != MAP_FAILED); + + /* Allow access to MPK 0 and MPK 1 */ + __write_pkey_reg(0x55555550); + + /* Protect the new stack with MPK 1 */ + pkey = pkey_alloc(0, 0); + pkey_mprotect(stack, STACK_SIZE, PROT_READ | PROT_WRITE, pkey); + + /* Set up alternate signal stack that will use the default MPK */ + sigstack.ss_sp = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + sigstack.ss_flags = 0; + sigstack.ss_size = STACK_SIZE; + + memset(&siginfo, 0, sizeof(siginfo)); + + /* Use clone to avoid newer glibcs using rseq on new threads */ + long ret = syscall_raw(SYS_clone, + CLONE_VM | CLONE_FS | CLONE_FILES | + CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM | + CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID | + CLONE_DETACHED, + (long) ((char *)(stack) + STACK_SIZE), + (long) &parent_pid, + (long) &child_pid, 0, 0); + + if (ret < 0) { + errno = -ret; + perror("clone"); + } else if (ret == 0) { + thread_segv_maperr_ptr(&sigstack); + syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0); + } + + pthread_mutex_lock(&mutex); + while (siginfo.si_signo == 0) + pthread_cond_wait(&cond, &mutex); + pthread_mutex_unlock(&mutex); + + ksft_test_result(siginfo.si_signo == SIGSEGV && + siginfo.si_code == SEGV_MAPERR && + siginfo.si_addr == (void *)1, + "%s\n", __func__); +} + +/* + * Verify that the PKRU value set by the application is correctly + * restored upon return from signal handling. + */ +static void test_pkru_preserved_after_sigusr1(void) +{ + struct sigaction sa; + unsigned long pkru = 0x45454544; + + sa.sa_flags = SA_SIGINFO; + + sa.sa_sigaction = sigusr1_handler; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGUSR1, &sa, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + memset(&siginfo, 0, sizeof(siginfo)); + + __write_pkey_reg(pkru); + + raise(SIGUSR1); + + pthread_mutex_lock(&mutex); + while (siginfo.si_signo == 0) + pthread_cond_wait(&cond, &mutex); + pthread_mutex_unlock(&mutex); + + /* Ensure the pkru value is the same after returning from signal. */ + ksft_test_result(pkru == __read_pkey_reg() && + siginfo.si_signo == SIGUSR1, + "%s\n", __func__); +} + +static noinline void *thread_sigusr2_self(void *ptr) +{ + /* + * A const char array like "Resuming after SIGUSR2" won't be stored on + * the stack and the code could access it via an offset from the program + * counter. This makes sure it's on the function's stack frame. + */ + char str[] = {'R', 'e', 's', 'u', 'm', 'i', 'n', 'g', ' ', + 'a', 'f', 't', 'e', 'r', ' ', + 'S', 'I', 'G', 'U', 'S', 'R', '2', + '.', '.', '.', '\n', '\0'}; + stack_t *stack = ptr; + + /* + * Setup alternate signal stack, which should be pkey_mprotect()ed by + * MPK 0. The thread's stack cannot be used for signals because it is + * not accessible by the default init_pkru value of 0x55555554. + */ + syscall(SYS_sigaltstack, (long)stack, 0, 0, 0, 0, 0); + + /* Disable MPK 0. Only MPK 2 is enabled. 
*/ + __write_pkey_reg(0x55555545); + + raise_sigusr2(); + + /* Do something, to show the thread resumed execution after the signal */ + syscall_raw(SYS_write, 1, (long) str, sizeof(str) - 1, 0, 0, 0); + + /* + * We can't return to test_pkru_sigreturn because it + * will attempt to use a %rbp value which is on the stack + * of the main thread. + */ + syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0); + return NULL; +} + +/* + * Verify that sigreturn is able to restore altstack even if the thread had + * disabled pkey 0. + */ +static void test_pkru_sigreturn(void) +{ + struct sigaction sa = {0}; + static stack_t sigstack; + void *stack; + int pkey; + int parent_pid = 0; + int child_pid = 0; + + sa.sa_handler = SIG_DFL; + sa.sa_flags = 0; + sigemptyset(&sa.sa_mask); + + /* + * For this testcase, we do not want to handle SIGSEGV. Reset handler + * to default so that the application can crash if it receives SIGSEGV. + */ + if (sigaction(SIGSEGV, &sa, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + sa.sa_flags = SA_SIGINFO | SA_ONSTACK; + sa.sa_sigaction = sigusr2_handler; + sigemptyset(&sa.sa_mask); + + if (sigaction(SIGUSR2, &sa, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + stack = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + assert(stack != MAP_FAILED); + + /* + * Allow access to MPK 0 and MPK 2. The child thread (to be created + * later in this flow) will have its stack protected by MPK 2, whereas + * the current thread's stack is protected by the default MPK 0. Hence + * both need to be enabled. + */ + __write_pkey_reg(0x55555544); + + /* Protect the stack with MPK 2 */ + pkey = pkey_alloc(0, 0); + pkey_mprotect(stack, STACK_SIZE, PROT_READ | PROT_WRITE, pkey); + + /* Set up alternate signal stack that will use the default MPK */ + sigstack.ss_sp = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + sigstack.ss_flags = 0; + sigstack.ss_size = STACK_SIZE; + + /* Use clone to avoid newer glibcs using rseq on new threads */ + long ret = syscall_raw(SYS_clone, + CLONE_VM | CLONE_FS | CLONE_FILES | + CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM | + CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID | + CLONE_DETACHED, + (long) ((char *)(stack) + STACK_SIZE), + (long) &parent_pid, + (long) &child_pid, 0, 0); + + if (ret < 0) { + errno = -ret; + perror("clone"); + } else if (ret == 0) { + thread_sigusr2_self(&sigstack); + syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0); + } + + child_pid = ret; + /* Check that thread exited */ + do { + sched_yield(); + ret = syscall_raw(SYS_tkill, child_pid, 0, 0, 0, 0, 0); + } while (ret != -ESRCH && ret != -EINVAL); + + ksft_test_result_pass("%s\n", __func__); +} + +static void (*pkey_tests[])(void) = { + test_sigsegv_handler_with_pkey0_disabled, + test_sigsegv_handler_cannot_access_stack, + test_sigsegv_handler_with_different_pkey_for_stack, + test_pkru_preserved_after_sigusr1, + test_pkru_sigreturn +}; + +int main(int argc, char *argv[]) +{ + int i; + + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(pkey_tests)); + + for (i = 0; i < ARRAY_SIZE(pkey_tests); i++) + (*pkey_tests[i])(); + + ksft_finished(); + return 0; +} diff --git a/tools/testing/selftests/mm/protection_keys.c b/tools/testing/selftests/mm/protection_keys.c index eaa6d1fc5328..cc6de1644360 100644 --- a/tools/testing/selftests/mm/protection_keys.c +++ b/tools/testing/selftests/mm/protection_keys.c @@ -950,16 +950,6 @@ void close_test_fds(void) nr_test_fds = 0; } -#define barrier() __asm__ 
__volatile__("": : :"memory") -__attribute__((noinline)) int read_ptr(int *ptr) -{ - /* - * Keep GCC from optimizing this away somehow - */ - barrier(); - return *ptr; -} - void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey) { int i, err; -- cgit v1.2.3 From 3d650ab5e7d9c4d7306e4c116f8aa9980bf13295 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 2 Aug 2024 11:54:34 -0700 Subject: selftests/bpf: Fix a btf_dump selftest failure Jakub reported bpf selftest "btf_dump" failure after forwarding to v6.11-rc1 with netdev. Error: #33 btf_dump Error: #33/15 btf_dump/btf_dump: var_data btf_dump_data:FAIL:find type id unexpected find type id: actual -2 < expected 0 The reason for the failure is due to commit 94ede2a3e913 ("profiling: remove stale percpu flip buffer variables") where percpu static variable "cpu_profile_flip" is removed. Let us replace "cpu_profile_flip" with a variable in bpf subsystem so whenever that variable gets deleted or renamed, we can detect the failure immediately. In this case, I picked a static percpu variable "bpf_cgrp_storage_busy" which is defined in kernel/bpf/bpf_cgrp_storage.c. Reported-by: Jakub Kicinski Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240802185434.1749056-1-yonghong.song@linux.dev --- tools/testing/selftests/bpf/prog_tests/btf_dump.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c index 09a8e6f9b379..b293b8501fd6 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c @@ -805,8 +805,8 @@ static void test_btf_dump_var_data(struct btf *btf, struct btf_dump *d, TEST_BTF_DUMP_VAR(btf, d, NULL, str, "cpu_number", int, BTF_F_COMPACT, "int cpu_number = (int)100", 100); #endif - TEST_BTF_DUMP_VAR(btf, d, NULL, str, "cpu_profile_flip", int, BTF_F_COMPACT, - "static int cpu_profile_flip = (int)2", 2); + TEST_BTF_DUMP_VAR(btf, d, NULL, str, "bpf_cgrp_storage_busy", int, BTF_F_COMPACT, + "static int bpf_cgrp_storage_busy = (int)2", 2); } static void test_btf_datasec(struct btf *btf, struct btf_dump *d, char *str, -- cgit v1.2.3 From ab1000976cc7de8e57bdef811dfcfcb6c17a929f Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 1 Aug 2024 17:03:07 -0700 Subject: selftests: net-drv: exercise queue stats when the device is down Verify that total device stats don't decrease after it has been turned down. Also make sure the device doesn't crash when we access per-queue stats when it's down (in case it tries to access some pointers that are NULL). 
KTAP version 1 1..5 ok 1 stats.check_pause ok 2 stats.check_fec ok 3 stats.pkt_byte_sum ok 4 stats.qstat_by_ifindex ok 5 stats.check_down # Totals: pass:5 fail:0 xfail:0 xpass:0 skip:0 error:0 v3: - use errno.EOPNOTSUPP (Petr) - move qstat[0] under try (Petr) v2: - KTAP output formatting (Jakub) - defer instead of try/finally (Jakub) - disappearing stats is an error (Jakub) - ksft_ge instead of open coding (Jakub) Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20240802000309.2368-1-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/stats.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/stats.py b/tools/testing/selftests/drivers/net/stats.py index 820b8e0a22c6..2fdde8cf0307 100755 --- a/tools/testing/selftests/drivers/net/stats.py +++ b/tools/testing/selftests/drivers/net/stats.py @@ -1,10 +1,12 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: GPL-2.0 +import errno from lib.py import ksft_run, ksft_exit, ksft_pr from lib.py import ksft_ge, ksft_eq, ksft_in, ksft_true, ksft_raises, KsftSkipEx, KsftXfailEx from lib.py import EthtoolFamily, NetdevFamily, RtnlFamily, NlError from lib.py import NetDrvEnv +from lib.py import ip, defer ethnl = EthtoolFamily() netfam = NetdevFamily() @@ -133,9 +135,30 @@ def qstat_by_ifindex(cfg) -> None: ksft_eq(cm.exception.nl_msg.extack['bad-attr'], '.ifindex') +def check_down(cfg) -> None: + try: + qstat = netfam.qstats_get({"ifindex": cfg.ifindex}, dump=True)[0] + except NlError as e: + if e.error == errno.EOPNOTSUPP: + raise KsftSkipEx("qstats not supported by the device") + raise + + ip(f"link set dev {cfg.dev['ifname']} down") + defer(ip, f"link set dev {cfg.dev['ifname']} up") + + qstat2 = netfam.qstats_get({"ifindex": cfg.ifindex}, dump=True)[0] + for k, v in qstat.items(): + ksft_ge(qstat2[k], qstat[k], comment=f"{k} went backwards on device down") + + # exercise per-queue API to make sure that "device down" state + # is handled correctly and doesn't crash + netfam.qstats_get({"ifindex": cfg.ifindex, "scope": "queue"}, dump=True) + + def main() -> None: with NetDrvEnv(__file__) as cfg: - ksft_run([check_pause, check_fec, pkt_byte_sum, qstat_by_ifindex], + ksft_run([check_pause, check_fec, pkt_byte_sum, qstat_by_ifindex, + check_down], args=(cfg, )) ksft_exit() -- cgit v1.2.3 From f879306834818ebd1722a4372079610cdd466fec Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 1 Aug 2024 17:03:08 -0700 Subject: selftests: net: ksft: support marking tests as disruptive Add a new @ksft_disruptive decorator to mark the tests that might be disruptive to the system. Depending on how well the previous test works in the CI we might want to disable disruptive tests by default and only let the developers run them manually. The KSFT framework runs disruptive tests by default. The DISRUPTIVE=False environment variable (or config file entry) can be used to disable these tests. ksft_setup should be called by the test cases that want to use the new decorator (ksft_setup is only called via NetDrvEnv/NetDrvEpEnv for now). In the future we can add similar decorators to, for example, avoid running slow tests all the time. And/or have some option to run only 'fast' tests for some sort of smoke test scenario.
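A minimal opt-in looks like this (sketch; takes_link_down() is a hypothetical test, while check_down in the previous patch is the real first user):

```python
from lib.py import ksft_run, ksft_exit, ksft_disruptive
from lib.py import NetDrvEnv

@ksft_disruptive              # raises KsftSkipEx when DISRUPTIVE=False
def takes_link_down(cfg) -> None:
    ...                       # test body that may take the interface down

def main() -> None:
    # NetDrvEnv calls ksft_setup(), which parses DISRUPTIVE from the
    # environment (or net.config) and updates KSFT_DISRUPTIVE.
    with NetDrvEnv(__file__) as cfg:
        ksft_run([takes_link_down], args=(cfg, ))
    ksft_exit()
```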
$ DISRUPTIVE=False ./stats.py KTAP version 1 1..5 ok 1 stats.check_pause ok 2 stats.check_fec ok 3 stats.pkt_byte_sum ok 4 stats.qstat_by_ifindex ok 5 stats.check_down # SKIP marked as disruptive # Totals: pass:4 fail:0 xfail:0 xpass:0 skip:1 error:0 v3: - parse yes and properly treat non-zero nums as true (Petr) v2: - convert from cli argument to env variable (Jakub) Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20240802000309.2368-2-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/lib/py/env.py | 5 +-- tools/testing/selftests/drivers/net/stats.py | 2 ++ tools/testing/selftests/net/lib/py/ksft.py | 40 +++++++++++++++++++++++ 3 files changed, 45 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py index a5e800b8f103..1ea9bb695e94 100644 --- a/tools/testing/selftests/drivers/net/lib/py/env.py +++ b/tools/testing/selftests/drivers/net/lib/py/env.py @@ -4,6 +4,7 @@ import os import time from pathlib import Path from lib.py import KsftSkipEx, KsftXfailEx +from lib.py import ksft_setup from lib.py import cmd, ethtool, ip from lib.py import NetNS, NetdevSimDev from .remote import Remote @@ -14,7 +15,7 @@ def _load_env_file(src_path): src_dir = Path(src_path).parent.resolve() if not (src_dir / "net.config").exists(): - return env + return ksft_setup(env) with open((src_dir / "net.config").as_posix(), 'r') as fp: for line in fp.readlines(): @@ -30,7 +31,7 @@ def _load_env_file(src_path): if len(pair) != 2: raise Exception("Can't parse configuration line:", full_file) env[pair[0]] = pair[1] - return env + return ksft_setup(env) class NetDrvEnv: diff --git a/tools/testing/selftests/drivers/net/stats.py b/tools/testing/selftests/drivers/net/stats.py index 2fdde8cf0307..d17dfed2788f 100755 --- a/tools/testing/selftests/drivers/net/stats.py +++ b/tools/testing/selftests/drivers/net/stats.py @@ -4,6 +4,7 @@ import errno from lib.py import ksft_run, ksft_exit, ksft_pr from lib.py import ksft_ge, ksft_eq, ksft_in, ksft_true, ksft_raises, KsftSkipEx, KsftXfailEx +from lib.py import ksft_disruptive from lib.py import EthtoolFamily, NetdevFamily, RtnlFamily, NlError from lib.py import NetDrvEnv from lib.py import ip, defer @@ -135,6 +136,7 @@ def qstat_by_ifindex(cfg) -> None: ksft_eq(cm.exception.nl_msg.extack['bad-attr'], '.ifindex') +@ksft_disruptive def check_down(cfg) -> None: try: qstat = netfam.qstats_get({"ifindex": cfg.ifindex}, dump=True)[0] diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py index f26c20df9db4..353860fe6223 100644 --- a/tools/testing/selftests/net/lib/py/ksft.py +++ b/tools/testing/selftests/net/lib/py/ksft.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 import builtins +import functools import inspect import sys import time @@ -10,6 +11,7 @@ from .utils import global_defer_queue KSFT_RESULT = None KSFT_RESULT_ALL = True +KSFT_DISRUPTIVE = True class KsftFailEx(Exception): @@ -127,6 +129,44 @@ def ksft_flush_defer(): KSFT_RESULT = False +def ksft_disruptive(func): + """ + Decorator that marks the test as disruptive (e.g. the test + that can down the interface). Disruptive tests can be skipped + by passing DISRUPTIVE=False environment variable. 
+ """ + + @functools.wraps(func) + def wrapper(*args, **kwargs): + if not KSFT_DISRUPTIVE: + raise KsftSkipEx(f"marked as disruptive") + return func(*args, **kwargs) + return wrapper + + +def ksft_setup(env): + """ + Setup test framework global state from the environment. + """ + + def get_bool(env, name): + value = env.get(name, "").lower() + if value in ["yes", "true"]: + return True + if value in ["no", "false"]: + return False + try: + return bool(int(value)) + except: + raise Exception(f"failed to parse {name}") + + if "DISRUPTIVE" in env: + global KSFT_DISRUPTIVE + KSFT_DISRUPTIVE = get_bool(env, "DISRUPTIVE") + + return env + + def ksft_run(cases=None, globs=None, case_pfx=None, args=()): cases = cases or [] -- cgit v1.2.3 From a48395f22b8c8687ceb77ae3014a0eabcd4bf688 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 1 Aug 2024 17:03:09 -0700 Subject: selftests: net: ksft: replace 95 with errno.EOPNOTSUPP Petr suggested to use errno.EOPNOTSUPP instead of hard-coded 95 in the new test case. Adjust existing ones to match this style. Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20240802000309.2368-3-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py | 3 ++- tools/testing/selftests/drivers/net/stats.py | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py b/tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py index 026d98976c35..05b6fbb3fcdd 100755 --- a/tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py +++ b/tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: GPL-2.0 +import errno import time import os from lib.py import ksft_run, ksft_exit, ksft_pr @@ -61,7 +62,7 @@ def test_pp_alloc(cfg, netdevnl): try: stats = get_stats() except NlError as e: - if e.nl_msg.error == -95: + if e.nl_msg.error == -errno.EOPNOTSUPP: stats = {} else: raise diff --git a/tools/testing/selftests/drivers/net/stats.py b/tools/testing/selftests/drivers/net/stats.py index d17dfed2788f..63e3c045a3b2 100755 --- a/tools/testing/selftests/drivers/net/stats.py +++ b/tools/testing/selftests/drivers/net/stats.py @@ -20,7 +20,7 @@ def check_pause(cfg) -> None: try: ethnl.pause_get({"header": {"dev-index": cfg.ifindex}}) except NlError as e: - if e.error == 95: + if e.error == errno.EOPNOTSUPP: raise KsftXfailEx("pause not supported by the device") raise @@ -35,7 +35,7 @@ def check_fec(cfg) -> None: try: ethnl.fec_get({"header": {"dev-index": cfg.ifindex}}) except NlError as e: - if e.error == 95: + if e.error == errno.EOPNOTSUPP: raise KsftXfailEx("FEC not supported by the device") raise @@ -120,7 +120,7 @@ def qstat_by_ifindex(cfg) -> None: # loopback has no stats with ksft_raises(NlError) as cm: netfam.qstats_get({"ifindex": 1}, dump=True) - ksft_eq(cm.exception.nl_msg.error, -95) + ksft_eq(cm.exception.nl_msg.error, -errno.EOPNOTSUPP) ksft_eq(cm.exception.nl_msg.extack['bad-attr'], '.ifindex') # Try to get stats for lowest unused ifindex but not 0 -- cgit v1.2.3 From 46619175f1b7e4f7fc5dc98547f5eca27193f8dc Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 1 Aug 2024 16:23:17 -0700 Subject: selftests: net: ksft: print more of the stack for checks Print more stack frames and the failing line when check fails. This helps when tests use helpers to do the checks. 
Before: # At ./ksft/drivers/net/hw/rss_ctx.py line 92: # Check failed 1037698 >= 396893.0 traffic on other queues:[344612, 462380, 233020, 449174, 342298] not ok 8 rss_ctx.test_rss_context_queue_reconfigure After: # Check| At ./ksft/drivers/net/hw/rss_ctx.py, line 387, in test_rss_context_queue_reconfigure: # Check| test_rss_queue_reconfigure(cfg, main_ctx=False) # Check| At ./ksft/drivers/net/hw/rss_ctx.py, line 230, in test_rss_queue_reconfigure: # Check| _send_traffic_check(cfg, port, ctx_ref, { 'target': (0, 3), # Check| At ./ksft/drivers/net/hw/rss_ctx.py, line 92, in _send_traffic_check: # Check| ksft_lt(sum(cnts[i] for i in params['noise']), directed / 2, # Check failed 1045235 >= 405823.5 traffic on other queues (context 1)':[460068, 351995, 565970, 351579, 127270] not ok 8 rss_ctx.test_rss_context_queue_reconfigure Link: https://patch.msgid.link/20240801232317.545577-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib/py/ksft.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py index 353860fe6223..f663e0daec0d 100644 --- a/tools/testing/selftests/net/lib/py/ksft.py +++ b/tools/testing/selftests/net/lib/py/ksft.py @@ -34,8 +34,18 @@ def _fail(*args): global KSFT_RESULT KSFT_RESULT = False - frame = inspect.stack()[2] - ksft_pr("At " + frame.filename + " line " + str(frame.lineno) + ":") + stack = inspect.stack() + started = False + for frame in reversed(stack[2:]): + # Start printing from the test case function + if not started: + if frame.function == 'ksft_run': + started = True + continue + + ksft_pr("Check| At " + frame.filename + ", line " + str(frame.lineno) + + ", in " + frame.function + ":") + ksft_pr("Check| " + frame.code_context[0].strip()) ksft_pr(*args) -- cgit v1.2.3 From c6e2b45a544b45ce1a26858ded61a0dc4896d54a Mon Sep 17 00:00:00 2001 From: Anthony Nandaa Date: Tue, 2 Jul 2024 10:22:50 +0000 Subject: tools: hv: lsvmbus: change shebang to use python3 In many modern Linux distros, running `lsvmbus` returns the error: ``` /usr/bin/env: 'python': No such file or directory ``` because 'python' doesn't point anywhere. Now that python2 has reached EOL as of January 1, 2020 and is no longer maintained[1], these distros have python3 instead. Also, the script isn't executable by default because the permissions are set to mode 644. Fix this by updating the shebang in `lsvmbus` to use python3 instead of python. Also fix the permissions to be 755 so that it is executable by default, which matches other similar scripts in `tools/hv`. The script is also tested and verified to be compatible with python3.
[1] https://www.python.org/doc/sunset-python-2/ Signed-off-by: Anthony Nandaa Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20240702102250.13935-1-profnandaa@gmail.com Signed-off-by: Wei Liu Message-ID: <20240702102250.13935-1-profnandaa@gmail.com> --- tools/hv/lsvmbus | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) mode change 100644 => 100755 tools/hv/lsvmbus (limited to 'tools') diff --git a/tools/hv/lsvmbus b/tools/hv/lsvmbus old mode 100644 new mode 100755 index 099f2c44dbed..f83698f14da2 --- a/tools/hv/lsvmbus +++ b/tools/hv/lsvmbus @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # SPDX-License-Identifier: GPL-2.0 import os -- cgit v1.2.3 From 13159a139d85fa12bbbaefcde5a647e2b9721b19 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 2 Aug 2024 11:09:13 -0700 Subject: perf mem: Update documentation for new options Add a common options section and move some items to the section. Also add description of new options to report options. Suggested-by: Ian Rogers Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/lkml/20240802180913.1023886-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-mem.txt | 94 +++++++++++++++++++++++------------ 1 file changed, 63 insertions(+), 31 deletions(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-mem.txt b/tools/perf/Documentation/perf-mem.txt index 47456b212e99..8a1bd9ff0f86 100644 --- a/tools/perf/Documentation/perf-mem.txt +++ b/tools/perf/Documentation/perf-mem.txt @@ -28,15 +28,8 @@ and kernel support is required. See linkperf:perf-arm-spe[1] for a setup guide. Due to the statistical nature of SPE sampling, not every memory operation will be sampled. -OPTIONS -------- -...:: - Any command you can specify in a shell. - --i:: ---input=:: - Input file name. - +COMMON OPTIONS +-------------- -f:: --force:: Don't do ownership validation @@ -45,24 +38,9 @@ OPTIONS --type=:: Select the memory operation type: load or store (default: load,store) --D:: ---dump-raw-samples:: - Dump the raw decoded samples on the screen in a format that is easy to parse with - one sample per line. - --x:: ---field-separator=:: - Specify the field separator used when dump raw samples (-D option). By default, - The separator is the space character. - --C:: ---cpu=:: - Monitor only on the list of CPUs provided. Multiple CPUs can be provided as a - comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2. Default - is to monitor all CPUS. --U:: ---hide-unresolved:: - Only display entries resolved to a symbol. +-v:: +--verbose:: + Be more verbose (show counter open errors, etc) -p:: --phys-data:: @@ -73,6 +51,9 @@ OPTIONS RECORD OPTIONS -------------- +...:: + Any command you can specify in a shell. + -e:: --event :: Event selector. Use 'perf mem record -e list' to list available events. @@ -85,14 +66,65 @@ RECORD OPTIONS --all-user:: Configure all used events to run in user space. --v:: ---verbose:: - Be more verbose (show counter open errors, etc) - --ldlat :: Specify desired latency for loads event. Supported on Intel and Arm64 processors only. Ignored on other archs. +REPORT OPTIONS +-------------- +-i:: +--input=:: + Input file name. + +-C:: +--cpu=:: + Monitor only on the list of CPUs provided. Multiple CPUs can be provided as a + comma-separated list with no space: 0,1. 
Ranges of CPUs are specified with - + like 0-2. Default is to monitor all CPUS. + +-D:: +--dump-raw-samples:: + Dump the raw decoded samples on the screen in a format that is easy to parse with + one sample per line. + +-s:: +--sort=:: + Group result by given key(s) - multiple keys can be specified + in CSV format. The keys are specific to memory samples are: + symbol_daddr, symbol_iaddr, dso_daddr, locked, tlb, mem, snoop, + dcacheline, phys_daddr, data_page_size, blocked. + + - symbol_daddr: name of data symbol being executed on at the time of sample + - symbol_iaddr: name of code symbol being executed on at the time of sample + - dso_daddr: name of library or module containing the data being executed + on at the time of the sample + - locked: whether the bus was locked at the time of the sample + - tlb: type of tlb access for the data at the time of the sample + - mem: type of memory access for the data at the time of the sample + - snoop: type of snoop (if any) for the data at the time of the sample + - dcacheline: the cacheline the data address is on at the time of the sample + - phys_daddr: physical address of data being executed on at the time of sample + - data_page_size: the data page size of data being executed on at the time of sample + - blocked: reason of blocked load access for the data at the time of the sample + + And the default sort keys are changed to local_weight, mem, sym, dso, + symbol_daddr, dso_daddr, snoop, tlb, locked, blocked, local_ins_lat. + +-T:: +--type-profile:: + Show data-type profile result instead of code symbols. This requires + the debug information and it will change the default sort keys to: + mem, snoop, tlb, type. + +-U:: +--hide-unresolved:: + Only display entries resolved to a symbol. + +-x:: +--field-separator=:: + Specify the field separator used when dump raw samples (-D option). By default, + The separator is the space character. + In addition, for report all perf report options are valid, and for record all perf record options. -- cgit v1.2.3 From 96f30c8f0aa9923aa39b30bcaefeacf88b490231 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Mon, 15 Jul 2024 13:32:42 -0700 Subject: tools build: Correct libsubcmd fixdep dependencies All built targets need fixdep to be built first, before handling object dependencies [1]. We're missing one such dependency before the libsubcmd target. This resolves .cmd file generation issues such that the following sequence produces many fewer results: $ git clean -xfd tools/ $ make tools/objtool $ grep "cannot find fixdep" $(find tools/objtool -name '*.cmd') In particular, only a buggy tools/objtool/libsubcmd/.fixdep.o.cmd remains, due to circular dependencies of fixdep on itself. Such incomplete .cmd files don't usually cause a direct problem, since they're designed to fail "open", but they can cause some subtle problems that would otherwise be handled by proper fixdep'd dependency files. [2] [1] This problem is better described in commit abb26210a395 ("perf tools: Force fixdep compilation at the start of the build"). I don't apply its solution here, because additional recursive make can be a bit of overkill. [2] Example failure case: cp -arl linux-src linux-src2 cd linux-src2 make O=/path/to/out cd ../linux-src rm -rf ../linux-src2 make O=/path/to/out Previously, we'd see errors like: make[6]: *** No rule to make target '/path/to/linux-src2/tools/include/linux/compiler.h', needed by '/path/to/out/tools/bpf/resolve_btfids/libsubcmd/exec-cmd.o'. Stop. 
Now, the properly-fixdep'd .cmd files will ignore a missing /path/to/linux-src2/... Signed-off-by: Brian Norris Acked-by: Jiri Olsa Cc: Ian Rogers Cc: Josh Poimboeuf Cc: Masahiro Yamada Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Richter Link: https://lore.kernel.org/all/ZGVi9HbI43R5trN8@bhelgaas/ Link: https://lore.kernel.org/all/Zk-C5Eg84yt6_nml@google.com/ Link: https://lore.kernel.org/r/20240715203325.3832977-2-briannorris@chromium.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/subcmd/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/subcmd/Makefile b/tools/lib/subcmd/Makefile index b87213263a5e..59b09f280e49 100644 --- a/tools/lib/subcmd/Makefile +++ b/tools/lib/subcmd/Makefile @@ -76,7 +76,7 @@ include $(srctree)/tools/build/Makefile.include all: fixdep $(LIBFILE) -$(SUBCMD_IN): FORCE +$(SUBCMD_IN): fixdep FORCE @$(MAKE) $(build)=libsubcmd $(LIBFILE): $(SUBCMD_IN) -- cgit v1.2.3 From ea974028a049f2cea4bb6be963ee3e3844a03f6d Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Mon, 15 Jul 2024 13:32:43 -0700 Subject: tools build: Avoid circular .fixdep-in.o.cmd issues The 'fixdep' tool is used to post-process dependency files for various reasons, and it runs after every object file generation command. This even includes 'fixdep' itself. In Kbuild, this isn't actually a problem, because it uses a single command to generate fixdep (a compile-and-link command on fixdep.c), and afterward runs the fixdep command on the accompanying .fixdep.cmd file. In tools/ builds (which notably is maintained separately from Kbuild), fixdep is generated in several phases: 1. fixdep.c -> fixdep-in.o 2. fixdep-in.o -> fixdep Thus, fixdep is not available in the post-processing for step 1, and instead, we generate .cmd files that look like: ## from tools/objtool/libsubcmd/.fixdep.o.cmd # cannot find fixdep (/path/to/linux/tools/objtool/libsubcmd//fixdep) [...] These invalid .cmd files are benign in some respects, but cause problems in others (such as the linked reports). Because the tools/ build system is rather complicated in its own right (and pointedly different than Kbuild), I choose to simply open-code the rule for building fixdep, and avoid the recursive-make indirection that produces the problem in the first place. 
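This fix and the previous one come down to the same rule: a fixdep binary must exist before any $(build)= recursion generates objects. Schematically (a sketch with a hypothetical library name, not a verbatim Makefile):

```make
# Make fixdep a prerequisite of anything built via $(build)=, otherwise
# the generated .cmd files record "cannot find fixdep".
$(MYLIB_IN): fixdep FORCE
	$(Q)$(MAKE) $(build)=mylib

# Build fixdep itself in a single compile-and-link step so there is no
# intermediate object whose .cmd post-processing would already need fixdep.
$(OUTPUT)fixdep: $(srctree)/tools/build/fixdep.c
	$(QUIET_CC)$(HOSTCC) $(KBUILD_HOSTLDFLAGS) -o $@ $<
```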
Signed-off-by: Brian Norris Acked-by: Jiri Olsa Cc: Ian Rogers Cc: Josh Poimboeuf Cc: Masahiro Yamada Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Richter Link: https://lore.kernel.org/all/Zk-C5Eg84yt6_nml@google.com/ Link: https://lore.kernel.org/r/20240715203325.3832977-3-briannorris@chromium.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/build/Build | 3 --- tools/build/Makefile | 11 ++--------- 2 files changed, 2 insertions(+), 12 deletions(-) delete mode 100644 tools/build/Build (limited to 'tools') diff --git a/tools/build/Build b/tools/build/Build deleted file mode 100644 index 76d1a4960973..000000000000 --- a/tools/build/Build +++ /dev/null @@ -1,3 +0,0 @@ -hostprogs := fixdep - -fixdep-y := fixdep.o diff --git a/tools/build/Makefile b/tools/build/Makefile index 17cdf01e29a0..fea3cf647f5b 100644 --- a/tools/build/Makefile +++ b/tools/build/Makefile @@ -43,12 +43,5 @@ ifneq ($(wildcard $(TMP_O)),) $(Q)$(MAKE) -C feature OUTPUT=$(TMP_O) clean >/dev/null endif -$(OUTPUT)fixdep-in.o: FORCE - $(Q)$(MAKE) $(build)=fixdep - -$(OUTPUT)fixdep: $(OUTPUT)fixdep-in.o - $(QUIET_LINK)$(HOSTCC) $(KBUILD_HOSTLDFLAGS) -o $@ $< - -FORCE: - -.PHONY: FORCE +$(OUTPUT)fixdep: $(srctree)/tools/build/fixdep.c + $(QUIET_CC)$(HOSTCC) $(KBUILD_HOSTLDFLAGS) -o $@ $< -- cgit v1.2.3 From dbb2a7a986971ef43d5a60d235c05491647e16f4 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Mon, 15 Jul 2024 13:32:44 -0700 Subject: tools build: Correct bpf fixdep dependencies The dependencies in tools/lib/bpf/Makefile are incorrect. Before we recurse to build $(BPF_IN_STATIC), we need to build its 'fixdep' executable. I can't use the usual shortcut from Makefile.include: : fixdep because its 'fixdep' target relies on $(OUTPUT), and $(OUTPUT) differs in the parent 'make' versus the child 'make' -- so I imitate it via open-coding. I tweak a few $(MAKE) invocations while I'm at it, because 1. I'm adding a new recursive make; and 2. these recursive 'make's print spurious lines about files that are "up to date" (which isn't normally a feature in Kbuild subtargets) or "jobserver not available" (see [1]) I also need to tweak the assignment of the OUTPUT variable, so that relative path builds work. For example, for 'make tools/lib/bpf', OUTPUT is unset, and is usually treated as "cwd" -- but recursive make will change cwd and so OUTPUT has a new meaning. For consistency, I ensure OUTPUT is always an absolute path. And $(Q) gets a backup definition in tools/build/Makefile.include, because Makefile.include is sometimes included without tools/build/Makefile, so the "quiet command" stuff doesn't actually work consistently without it. After this change, top-level builds result in an empty grep result from: $ grep 'cannot find fixdep' $(find tools/ -name '*.cmd') [1] https://www.gnu.org/software/make/manual/html_node/MAKE-Variable.html If we're not using $(MAKE) directly, then we need to use more '+'. 
Signed-off-by: Brian Norris Acked-by: Andrii Nakryiko Acked-by: Jiri Olsa Cc: Ian Rogers Cc: Josh Poimboeuf Cc: Masahiro Yamada Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Richter Link: https://lore.kernel.org/r/20240715203325.3832977-4-briannorris@chromium.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/build/Makefile.include | 12 +++++++++++- tools/lib/bpf/.gitignore | 1 + tools/lib/bpf/Makefile | 13 ++++++++++--- 3 files changed, 22 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/build/Makefile.include b/tools/build/Makefile.include index 8dadaa0fbb43..0e4de83400ac 100644 --- a/tools/build/Makefile.include +++ b/tools/build/Makefile.include @@ -1,8 +1,18 @@ # SPDX-License-Identifier: GPL-2.0-only build := -f $(srctree)/tools/build/Makefile.build dir=. obj +# More than just $(Q), we sometimes want to suppress all command output from a +# recursive make -- even the 'up to date' printout. +ifeq ($(V),1) + Q ?= + SILENT_MAKE = +$(Q)$(MAKE) +else + Q ?= @ + SILENT_MAKE = +$(Q)$(MAKE) --silent +endif + fixdep: - $(Q)$(MAKE) -C $(srctree)/tools/build CFLAGS= LDFLAGS= $(OUTPUT)fixdep + $(SILENT_MAKE) -C $(srctree)/tools/build CFLAGS= LDFLAGS= $(OUTPUT)fixdep fixdep-clean: $(Q)$(MAKE) -C $(srctree)/tools/build clean diff --git a/tools/lib/bpf/.gitignore b/tools/lib/bpf/.gitignore index 0da84cb9e66d..f02725b123b3 100644 --- a/tools/lib/bpf/.gitignore +++ b/tools/lib/bpf/.gitignore @@ -5,3 +5,4 @@ TAGS tags cscope.* /bpf_helper_defs.h +fixdep diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index 2cf892774346..1b22f0f37288 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -108,6 +108,8 @@ MAKEOVERRIDES= all: +OUTPUT ?= ./ +OUTPUT := $(abspath $(OUTPUT))/ export srctree OUTPUT CC LD CFLAGS V include $(srctree)/tools/build/Makefile.include @@ -141,7 +143,10 @@ all: fixdep all_cmd: $(CMD_TARGETS) check -$(BPF_IN_SHARED): force $(BPF_GENERATED) +$(SHARED_OBJDIR) $(STATIC_OBJDIR): + $(Q)mkdir -p $@ + +$(BPF_IN_SHARED): force $(BPF_GENERATED) | $(SHARED_OBJDIR) @(test -f ../../include/uapi/linux/bpf.h -a -f ../../../include/uapi/linux/bpf.h && ( \ (diff -B ../../include/uapi/linux/bpf.h ../../../include/uapi/linux/bpf.h >/dev/null) || \ echo "Warning: Kernel ABI header at 'tools/include/uapi/linux/bpf.h' differs from latest version at 'include/uapi/linux/bpf.h'" >&2 )) || true @@ -151,9 +156,11 @@ $(BPF_IN_SHARED): force $(BPF_GENERATED) @(test -f ../../include/uapi/linux/if_xdp.h -a -f ../../../include/uapi/linux/if_xdp.h && ( \ (diff -B ../../include/uapi/linux/if_xdp.h ../../../include/uapi/linux/if_xdp.h >/dev/null) || \ echo "Warning: Kernel ABI header at 'tools/include/uapi/linux/if_xdp.h' differs from latest version at 'include/uapi/linux/if_xdp.h'" >&2 )) || true + $(SILENT_MAKE) -C $(srctree)/tools/build CFLAGS= LDFLAGS= OUTPUT=$(SHARED_OBJDIR) $(SHARED_OBJDIR)fixdep $(Q)$(MAKE) $(build)=libbpf OUTPUT=$(SHARED_OBJDIR) CFLAGS="$(CFLAGS) $(SHLIB_FLAGS)" -$(BPF_IN_STATIC): force $(BPF_GENERATED) +$(BPF_IN_STATIC): force $(BPF_GENERATED) | $(STATIC_OBJDIR) + $(SILENT_MAKE) -C $(srctree)/tools/build CFLAGS= LDFLAGS= OUTPUT=$(STATIC_OBJDIR) $(STATIC_OBJDIR)fixdep $(Q)$(MAKE) $(build)=libbpf OUTPUT=$(STATIC_OBJDIR) $(BPF_HELPER_DEFS): $(srctree)/tools/include/uapi/linux/bpf.h @@ -263,7 +270,7 @@ install_pkgconfig: $(PC_FILE) install: install_lib install_pkgconfig install_headers -clean: +clean: fixdep-clean $(call QUIET_CLEAN, libbpf) $(RM) -rf $(CMD_TARGETS) \ *~ .*.d .*.cmd LIBBPF-CFLAGS $(BPF_GENERATED) \ $(SHARED_OBJDIR) $(STATIC_OBJDIR) 
\ -- cgit v1.2.3 From 2dc02c26419b984692e20949252ca6cba515abaf Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sat, 3 Aug 2024 14:13:28 -0700 Subject: perf annotate: Use al->data_nr if possible The data_nr field keeps the number of entries in al->data[], so code iterating the array should use it. notes->src->nr_events should have the same value, but it's more natural to use al->data_nr. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240803211332.1107222-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index a2ee4074f768..91ad948c89d5 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1594,13 +1594,12 @@ bool ui__has_annotation(void) static double annotation_line__max_percent(struct annotation_line *al, - struct annotation *notes, unsigned int percent_type) { double percent_max = 0.0; int i; - for (i = 0; i < notes->src->nr_events; i++) { + for (i = 0; i < al->data_nr; i++) { double percent; percent = annotation_data__percent(&al->data[i], @@ -1672,7 +1671,7 @@ static void __annotation_line__write(struct annotation_line *al, struct annotati void (*obj__write_graph)(void *obj, int graph)) { - double percent_max = annotation_line__max_percent(al, notes, percent_type); + double percent_max = annotation_line__max_percent(al, percent_type); int pcnt_width = annotation__pcnt_width(notes), cycles_width = annotation__cycles_width(notes); bool show_title = false; @@ -1690,7 +1689,7 @@ static void __annotation_line__write(struct annotation_line *al, struct annotati if (al->offset != -1 && percent_max != 0.0) { int i; - for (i = 0; i < notes->src->nr_events; i++) { + for (i = 0; i < al->data_nr; i++) { double percent; percent = annotation_data__percent(&al->data[i], percent_type); -- cgit v1.2.3 From cb1e8bfc7914747692c06b215539ef80105fddbc Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sat, 3 Aug 2024 14:13:29 -0700 Subject: perf annotate: Set notes->src->nr_events early We want to use it in different places, so make sure it is set properly in symbol__annotate() before creating the disasm lines.
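Since the hunks below are dense, the net effect of this patch and the later disasm_line__new() change can be sketched as: compute the group size once, then let every consumer read it back:

```c
/* symbol__annotate(): record the number of events up front */
if (evsel__is_group_event(evsel))
	notes->src->nr_events = evsel->core.nr_members;
else
	notes->src->nr_events = 1;

/* consumers such as disasm_line__new() then just read it back */
int nr = notes->src->nr_events;
dl = zalloc(disasm_line_size(nr));
```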
Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240803211332.1107222-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 91ad948c89d5..09e6fdf344db 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -925,6 +925,11 @@ int symbol__annotate(struct map_symbol *ms, struct evsel *evsel, return -1; } + if (evsel__is_group_event(evsel)) + notes->src->nr_events = evsel->core.nr_members; + else + notes->src->nr_events = 1; + if (annotate_opts.full_addr) notes->src->start = map__objdump_2mem(ms->map, ms->sym->start); else @@ -1842,10 +1847,7 @@ int symbol__annotate2(struct map_symbol *ms, struct evsel *evsel, struct symbol *sym = ms->sym; struct annotation *notes = symbol__annotation(sym); size_t size = symbol__size(sym); - int nr_pcnt = 1, err; - - if (evsel__is_group_event(evsel)) - nr_pcnt = evsel->core.nr_members; + int err; err = symbol__annotate(ms, evsel, parch); if (err) @@ -1861,8 +1863,6 @@ int symbol__annotate2(struct map_symbol *ms, struct evsel *evsel, return err; annotation__init_column_widths(notes, sym); - notes->src->nr_events = nr_pcnt; - annotation__update_column_widths(notes); sym->annotate2 = 1; -- cgit v1.2.3 From b00e4d0d93d30a693ee88b2c25f40181d327e53b Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sat, 3 Aug 2024 14:13:30 -0700 Subject: perf annotate: Use annotation__pcnt_width() consistently The annotation__pcnt_width() calculates the screen width for the overhead (percent) area considering event groups properly. Use this function consistently so that we can make sure it has similar output in different modes. But there's a difference in stdio and tui output: stdio uses 8 and tui uses 7 for a percent. Let's use 8 and adjust the print width in __annotation_line__write() properly. Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240803211332.1107222-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 14 +++++--------- tools/perf/util/annotate.h | 2 +- 2 files changed, 6 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 09e6fdf344db..917897fe44a2 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -699,13 +699,13 @@ annotation_line__print(struct annotation_line *al, struct symbol *sym, u64 start int percent_type) { struct disasm_line *dl = container_of(al, struct disasm_line, al); + struct annotation *notes = symbol__annotation(sym); static const char *prev_line; if (al->offset != -1) { double max_percent = 0.0; int i, nr_percent = 1; const char *color; - struct annotation *notes = symbol__annotation(sym); for (i = 0; i < al->data_nr; i++) { double percent; @@ -775,14 +775,11 @@ annotation_line__print(struct annotation_line *al, struct symbol *sym, u64 start } else if (max_lines && printed >= max_lines) return 1; else { - int width = symbol_conf.show_total_period ? 
12 : 8; + int width = annotation__pcnt_width(notes); if (queue) return -1; - if (evsel__is_group_event(evsel)) - width *= evsel->core.nr_members; - if (!*al->line) printf(" %*s:\n", width, " "); else @@ -1111,7 +1108,7 @@ int symbol__annotate_printf(struct map_symbol *ms, struct evsel *evsel) int more = 0; bool context = opts->context; u64 len; - int width = symbol_conf.show_total_period ? 12 : 8; + int width = annotation__pcnt_width(notes); int graph_dotted_len; char buf[512]; @@ -1127,7 +1124,6 @@ int symbol__annotate_printf(struct map_symbol *ms, struct evsel *evsel) len = symbol__size(sym); if (evsel__is_group_event(evsel)) { - width *= evsel->core.nr_members; evsel__group_desc(evsel, buf, sizeof(buf)); evsel_name = buf; } @@ -1703,10 +1699,10 @@ static void __annotation_line__write(struct annotation_line *al, struct annotati if (symbol_conf.show_total_period) { obj__printf(obj, "%11" PRIu64 " ", al->data[i].he.period); } else if (symbol_conf.show_nr_samples) { - obj__printf(obj, "%6" PRIu64 " ", + obj__printf(obj, "%7" PRIu64 " ", al->data[i].he.nr_samples); } else { - obj__printf(obj, "%6.2f ", percent); + obj__printf(obj, "%7.2f ", percent); } } } else { diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 9ba772f46270..64e70d716ff1 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -339,7 +339,7 @@ static inline int annotation__cycles_width(struct annotation *notes) static inline int annotation__pcnt_width(struct annotation *notes) { - return (symbol_conf.show_total_period ? 12 : 7) * notes->src->nr_events; + return (symbol_conf.show_total_period ? 12 : 8) * notes->src->nr_events; } static inline bool annotation_line__filter(struct annotation_line *al) -- cgit v1.2.3 From bb588e38290fb7236e7f2c0bcbd720150277ca83 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sat, 3 Aug 2024 14:13:31 -0700 Subject: perf annotate: Set al->data_nr using the notes->src->nr_events This is a preparation to support skipping empty events. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240803211332.1107222-5-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/disasm.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 85fb0cfedf94..22289003e16d 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -1037,10 +1037,8 @@ static size_t disasm_line_size(int nr) struct disasm_line *disasm_line__new(struct annotate_args *args) { struct disasm_line *dl = NULL; - int nr = 1; - - if (evsel__is_group_event(args->evsel)) - nr = args->evsel->core.nr_members; + struct annotation *notes = symbol__annotation(args->ms.sym); + int nr = notes->src->nr_events; dl = zalloc(disasm_line_size(nr)); if (!dl) -- cgit v1.2.3 From ce533c9bc6deb12535348693fd48fab92a748c2a Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sat, 3 Aug 2024 14:13:32 -0700 Subject: perf annotate: Add --skip-empty option Like in 'perf report', we want to hide empty events in the 'perf annotate' output. This keeps 'perf annotate' consistent with 'perf report' when the option is set there. For example, the following command would use 3 events including dummy. $ perf mem record -a -- perf test -w noploop $ perf evlist cpu/mem-loads,ldlat=30/P cpu/mem-stores/P dummy:u Just using perf annotate with --group will show all 3 events.
$ perf annotate --group --stdio | head Percent | Source code & Disassembly of ... -------------------------------------------------------------- : 0 0xe060 <_dl_relocate_object>: 0.00 0.00 0.00 : e060: pushq %rbp 0.00 0.00 0.00 : e061: movq %rsp, %rbp 0.00 0.00 0.00 : e064: pushq %r15 0.00 0.00 0.00 : e066: movq %rdi, %r15 0.00 0.00 0.00 : e069: pushq %r14 0.00 0.00 0.00 : e06b: pushq %r13 0.00 0.00 0.00 : e06d: movl %edx, %r13d Now with --skip-empty, it'll hide the last dummy event. $ perf annotate --group --stdio --skip-empty | head Percent | Source code & Disassembly of ... ------------------------------------------------------ : 0 0xe060 <_dl_relocate_object>: 0.00 0.00 : e060: pushq %rbp 0.00 0.00 : e061: movq %rsp, %rbp 0.00 0.00 : e064: pushq %r15 0.00 0.00 : e066: movq %rdi, %r15 0.00 0.00 : e069: pushq %r14 0.00 0.00 : e06b: pushq %r13 0.00 0.00 : e06d: movl %edx, %r13d Committer testing: root@x1:~# perf evlist cpu_atom/mem-loads,ldlat=30/P cpu_atom/mem-stores/P dummy:u root@x1:~# Before: root@x1:~# perf annotate --group --stdio2 do_lookup_x | head -25 Samples: 20 of events 'cpu_atom/mem-loads,ldlat=30/P, cpu_atom/mem-stores/P, dummy:u', 4000 Hz, Event count (approx.): 769079, [percent: local period] do_lookup_x() /usr/lib64/ld-linux-x86-64.so.2 Percent 0x9900 : pushq %rbp movq %rsp,%rbp pushq %r15 pushq %r14 pushq %r13 pushq %r12 pushq %rbx subq $0x88,%rsp movq %rdi,-0x50(%rbp) movl 8(%r9),%edi movq 0x10(%rbp),%r12 movq 0x28(%rbp),%r10 movq %rdx,-0x70(%rbp) movq %rcx,-0x58(%rbp) movq %rdi,%r11 0.00 5.73 0.00 movq %r8,-0x68(%rbp) movq (%r9),%r8 movl %esi,%eax 8.30 0.00 0.00 movl 0x30(%rbp),%r9d movl %esi,%r15d shrl $6, %eax movq %r8,%r13 root@x1:~# After: root@x1:~# perf annotate --group --skip-empty --stdio2 do_lookup_x | head -25 Samples: 20 of events 'cpu_atom/mem-loads,ldlat=30/P, cpu_atom/mem-stores/P', 4000 Hz, Event count (approx.): 769079, [percent: local period] do_lookup_x() /usr/lib64/ld-linux-x86-64.so.2 Percent 0x9900 : pushq %rbp movq %rsp,%rbp pushq %r15 pushq %r14 pushq %r13 pushq %r12 pushq %rbx subq $0x88,%rsp movq %rdi,-0x50(%rbp) movl 8(%r9),%edi movq 0x10(%rbp),%r12 movq 0x28(%rbp),%r10 movq %rdx,-0x70(%rbp) movq %rcx,-0x58(%rbp) movq %rdi,%r11 0.00 5.73 movq %r8,-0x68(%rbp) movq (%r9),%r8 movl %esi,%eax 8.30 0.00 movl 0x30(%rbp),%r9d movl %esi,%r15d shrl $6, %eax movq %r8,%r13 root@x1:~# Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240803211332.1107222-6-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-annotate.txt | 3 +++ tools/perf/builtin-annotate.c | 2 ++ tools/perf/util/annotate.c | 22 +++++++++++++++++----- 3 files changed, 22 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt index b95524bea021..156c5f37b051 100644 --- a/tools/perf/Documentation/perf-annotate.txt +++ b/tools/perf/Documentation/perf-annotate.txt @@ -165,6 +165,9 @@ include::itrace.txt[] --type-stat:: Show stats for the data type annotation. +--skip-empty:: + Do not display empty (or dummy) events. 
+ SEE ALSO -------- diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index cf60392b1c19..efcadb7620b8 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -795,6 +795,8 @@ int cmd_annotate(int argc, const char **argv) "Show stats for the data type annotation"), OPT_BOOLEAN(0, "insn-stat", &annotate.insn_stat, "Show instruction stats for the data type annotation"), + OPT_BOOLEAN(0, "skip-empty", &symbol_conf.skip_empty, + "Do not display empty (or dummy) events in the output"), OPT_END() }; int ret; diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 917897fe44a2..eafe8d65052e 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -848,6 +848,10 @@ static void annotation__calc_percent(struct annotation *notes, BUG_ON(i >= al->data_nr); + if (symbol_conf.skip_empty && + evsel__hists(evsel)->stats.nr_samples == 0) + continue; + data = &al->data[i++]; calc_percent(notes, evsel, data, al->offset, end); @@ -901,7 +905,7 @@ int symbol__annotate(struct map_symbol *ms, struct evsel *evsel, .options = &annotate_opts, }; struct arch *arch = NULL; - int err; + int err, nr; err = evsel__get_arch(evsel, &arch); if (err < 0) @@ -922,10 +926,18 @@ int symbol__annotate(struct map_symbol *ms, struct evsel *evsel, return -1; } - if (evsel__is_group_event(evsel)) - notes->src->nr_events = evsel->core.nr_members; - else - notes->src->nr_events = 1; + nr = 0; + if (evsel__is_group_event(evsel)) { + struct evsel *pos; + + for_each_group_evsel(pos, evsel) { + if (symbol_conf.skip_empty && + evsel__hists(pos)->stats.nr_samples == 0) + continue; + nr++; + } + } + notes->src->nr_events = nr ? nr : 1; if (annotate_opts.full_addr) notes->src->start = map__objdump_2mem(ms->map, ms->sym->start); -- cgit v1.2.3 From 92841d6e23de09f180f948f99ddb40135775cedc Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sun, 4 Aug 2024 21:30:19 -0400 Subject: selftest/cgroup: Add new test cases to test_cpuset_prs.sh Add new test cases to test_cpuset_prs.sh to cover corner cases reported in previous fix commits. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_cpuset_prs.sh | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh index 7c08cc153367..7295424502b9 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh @@ -321,7 +321,7 @@ TEST_MATRIX=( # old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS # ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ -------- # - # Incorrect change to cpuset.cpus invalidates partition root + # Incorrect change to cpuset.cpus[.exclusive] invalidates partition root # # Adding CPUs to partition root that are not in parent's # cpuset.cpus is allowed, but those extra CPUs are ignored. @@ -365,6 +365,16 @@ TEST_MATRIX=( # cpuset.cpus can overlap with sibling cpuset.cpus.exclusive but not subsumed by it " C0-3 . . C4-5 X5 . . . 0 A1:0-3,B1:4-5" + # Child partition root that try to take all CPUs from parent partition + # with tasks will remain invalid. + " C1-4:P1:S+ P1 . . . . . . 0 A1:1-4,A2:1-4 A1:P1,A2:P-1" + " C1-4:P1:S+ P1 . . . C1-4 . . 0 A1,A2:1-4 A1:P1,A2:P1" + " C1-4:P1:S+ P1 . . T C1-4 . . 
0 A1:1-4,A2:1-4 A1:P1,A2:P-1" + + # Clearing of cpuset.cpus with a preset cpuset.cpus.exclusive shouldn't + # affect cpuset.cpus.exclusive.effective. + " C1-4:X3:S+ C1:X3 . . . C . . 0 A2:1-4,XA2:3" + # old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS # ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ -------- # Failure cases: -- cgit v1.2.3 From e8fc78eb658a1ebcb871e93bbe4f18e9d51fdd3a Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 2 Aug 2024 12:34:36 +0100 Subject: tools: ynl: remove extraneous ; after statements There are a couple of statements with two following semicolons, replace these with just one semicolon. Signed-off-by: Colin Ian King Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20240802113436.448939-1-colin.i.king@gmail.com Signed-off-by: Jakub Kicinski --- tools/net/ynl/lib/ynl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/net/ynl/lib/ynl.c b/tools/net/ynl/lib/ynl.c index fcb18a5a6d70..e16cef160bc2 100644 --- a/tools/net/ynl/lib/ynl.c +++ b/tools/net/ynl/lib/ynl.c @@ -696,14 +696,14 @@ ynl_sock_create(const struct ynl_family *yf, struct ynl_error *yse) addr.nl_family = AF_NETLINK; if (bind(ys->socket, (struct sockaddr *)&addr, sizeof(addr)) < 0) { __perr(yse, "unable to bind to a socket address"); - goto err_close_sock;; + goto err_close_sock; } memset(&addr, 0, sizeof(addr)); addrlen = sizeof(addr); if (getsockname(ys->socket, (struct sockaddr *)&addr, &addrlen) < 0) { __perr(yse, "unable to read socket address"); - goto err_close_sock;; + goto err_close_sock; } ys->portid = addr.nl_pid; ys->seq = random(); -- cgit v1.2.3 From 19c91bd8932a2e00f8de76022c780951166be9a0 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 12 Jul 2024 03:51:36 +0000 Subject: memblock tests: include memory_hotplug.h in mmzone.h as kernel dose In kernel code, memory_hotplug.h is included in mmzone.h instead of in init.h. Let's sync with kernel. This is a preparation for move init.h in common include directory. Signed-off-by: Wei Yang CC: Mike Rapoport Link: https://lore.kernel.org/r/20240712035138.24674-1-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (Microsoft) --- tools/testing/memblock/linux/init.h | 1 - tools/testing/memblock/linux/mmzone.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/memblock/linux/init.h b/tools/testing/memblock/linux/init.h index 828e0ee0bc6c..4aeddce53310 100644 --- a/tools/testing/memblock/linux/init.h +++ b/tools/testing/memblock/linux/init.h @@ -4,7 +4,6 @@ #include #include -#include #define __section(section) __attribute__((__section__(section))) diff --git a/tools/testing/memblock/linux/mmzone.h b/tools/testing/memblock/linux/mmzone.h index 71546e15bdd3..bb682659a12d 100644 --- a/tools/testing/memblock/linux/mmzone.h +++ b/tools/testing/memblock/linux/mmzone.h @@ -3,6 +3,7 @@ #define _TOOLS_MMZONE_H #include +#include struct pglist_data *first_online_pgdat(void); struct pglist_data *next_online_pgdat(struct pglist_data *pgdat); -- cgit v1.2.3 From d68c08173b70952cd74bb45075b843f4a637a43b Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 12 Jul 2024 03:51:37 +0000 Subject: memblock tests: include export.h in linkage.h as kernel dose In kernel code, linkage.h includes export.h. Let's sync with kernel. This is a preparation for move init.h in common include directory. 
Signed-off-by: Wei Yang CC: Mike Rapoport Link: https://lore.kernel.org/r/20240712035138.24674-2-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (Microsoft) --- tools/include/linux/linkage.h | 2 ++ tools/testing/memblock/linux/init.h | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/include/linux/linkage.h b/tools/include/linux/linkage.h index bc763d500262..20dee24d7e1b 100644 --- a/tools/include/linux/linkage.h +++ b/tools/include/linux/linkage.h @@ -1,4 +1,6 @@ #ifndef _TOOLS_INCLUDE_LINUX_LINKAGE_H #define _TOOLS_INCLUDE_LINUX_LINKAGE_H +#include <linux/export.h> + #endif /* _TOOLS_INCLUDE_LINUX_LINKAGE_H */ diff --git a/tools/testing/memblock/linux/init.h b/tools/testing/memblock/linux/init.h index 4aeddce53310..bd74abc5cba6 100644 --- a/tools/testing/memblock/linux/init.h +++ b/tools/testing/memblock/linux/init.h @@ -3,7 +3,6 @@ #define _LINUX_INIT_H #include -#include <linux/export.h> #define __section(section) __attribute__((__section__(section))) -- cgit v1.2.3 From e2ae9cf39f8806d2d8f5eb0a22ba511804a804ec Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 12 Jul 2024 03:51:38 +0000 Subject: tools/testing: abstract two init.h into common include directory Currently we have two test suites that each define their own init.h, which is a little redundant. Let's create an init.h in the common include directory and merge the two into it. Signed-off-by: Wei Yang CC: Mike Rapoport CC: Liam R. Howlett Link: https://lore.kernel.org/r/20240712035138.24674-3-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (Microsoft) --- tools/include/linux/compiler.h | 4 ---- tools/include/linux/init.h | 40 +++++++++++++++++++++++++++++++++ tools/testing/memblock/linux/init.h | 32 ---------------------------- tools/testing/radix-tree/linux/init.h | 2 -- tools/testing/radix-tree/maple.c | 2 +- 5 files changed, 41 insertions(+), 39 deletions(-) create mode 100644 tools/include/linux/init.h delete mode 100644 tools/testing/memblock/linux/init.h delete mode 100644 tools/testing/radix-tree/linux/init.h (limited to 'tools') diff --git a/tools/include/linux/compiler.h b/tools/include/linux/compiler.h index 6f7f22ac9da5..4b5a45919ff8 100644 --- a/tools/include/linux/compiler.h +++ b/tools/include/linux/compiler.h @@ -126,10 +126,6 @@ # define unlikely(x) __builtin_expect(!!(x), 0) #endif -#ifndef __init -# define __init -#endif - #include /* diff --git a/tools/include/linux/init.h b/tools/include/linux/init.h new file mode 100644 index 000000000000..7ed407976dda --- /dev/null +++ b/tools/include/linux/init.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _TOOLS_LINUX_INIT_H_ +#define _TOOLS_LINUX_INIT_H_ + +#include + +#ifndef __init +# define __init +#endif + +#ifndef __exit +# define __exit +#endif + +#define __section(section) __attribute__((__section__(section))) + +#define __initconst +#define __meminit +#define __meminitdata +#define __refdata +#define __initdata + +struct obs_kernel_param { + const char *str; + int (*setup_func)(char *st); + int early; +}; + +#define __setup_param(str, unique_id, fn, early) \ + static const char __setup_str_##unique_id[] __initconst \ + __aligned(1) = str; \ + static struct obs_kernel_param __setup_##unique_id \ + __used __section(".init.setup") \ + __aligned(__alignof__(struct obs_kernel_param)) = \ + { __setup_str_##unique_id, fn, early } + +#define early_param(str, fn) \ + __setup_param(str, fn, fn, 1) + +#endif /* _TOOLS_LINUX_INIT_H_ */ diff --git a/tools/testing/memblock/linux/init.h b/tools/testing/memblock/linux/init.h deleted file mode
100644 index bd74abc5cba6..000000000000 --- a/tools/testing/memblock/linux/init.h +++ /dev/null @@ -1,32 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_INIT_H -#define _LINUX_INIT_H - -#include - -#define __section(section) __attribute__((__section__(section))) - -#define __initconst -#define __meminit -#define __meminitdata -#define __refdata -#define __initdata - -struct obs_kernel_param { - const char *str; - int (*setup_func)(char *st); - int early; -}; - -#define __setup_param(str, unique_id, fn, early) \ - static const char __setup_str_##unique_id[] __initconst \ - __aligned(1) = str; \ - static struct obs_kernel_param __setup_##unique_id \ - __used __section(".init.setup") \ - __aligned(__alignof__(struct obs_kernel_param)) = \ - { __setup_str_##unique_id, fn, early } - -#define early_param(str, fn) \ - __setup_param(str, fn, fn, 1) - -#endif diff --git a/tools/testing/radix-tree/linux/init.h b/tools/testing/radix-tree/linux/init.h deleted file mode 100644 index 81563c3dfce7..000000000000 --- a/tools/testing/radix-tree/linux/init.h +++ /dev/null @@ -1,2 +0,0 @@ -#define __init -#define __exit diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index cd1cf05503b4..3437292babff 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -14,7 +14,7 @@ #include "test.h" #include #include -#include "linux/init.h" +#include #define module_init(x) #define module_exit(x) -- cgit v1.2.3 From 39f64e402f659890a99d415eaf63a01f3b80a9a8 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 6 Aug 2024 01:03:15 +0000 Subject: memblock test: fix implicit declaration of function 'virt_to_phys' Commit 94ff46de4a73 ("memblock: Move late alloc warning down to phys alloc") introduce the usage of virt_to_phys(), which is not defined in memblock tests. Define it in mm.h to fix the build error. Signed-off-by: Wei Yang Link: https://lore.kernel.org/r/20240806010319.29194-1-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (Microsoft) --- tools/include/linux/mm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'tools') diff --git a/tools/include/linux/mm.h b/tools/include/linux/mm.h index cad4f2927983..677c37e4a18c 100644 --- a/tools/include/linux/mm.h +++ b/tools/include/linux/mm.h @@ -25,6 +25,12 @@ static inline void *phys_to_virt(unsigned long address) return __va(address); } +#define virt_to_phys virt_to_phys +static inline phys_addr_t virt_to_phys(volatile void *address) +{ + return (phys_addr_t)address; +} + void reserve_bootmem_region(phys_addr_t start, phys_addr_t end, int nid); static inline void totalram_pages_inc(void) -- cgit v1.2.3 From 9f76c2ade323121f9006f6a529e0795317e16b5c Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 6 Aug 2024 01:03:16 +0000 Subject: memblock test: add the definition of __setup() Commit 1e4c64b71c9b ("mm/memblock: Add "reserve_mem" to reserved named memory at boot up") introduce usage of __setup(), which is not defined in memblock test. Define it in init.h to fix the build error. 
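The userspace test build has no real physical address space, so the stub is a plain identity cast; a tiny usage sketch under that assumption (illustrative only, not part of this patch):

	/* in a memblock test, virt/phys conversion round-trips the pointer */
	void *p = malloc(16);
	phys_addr_t pa = virt_to_phys(p);

	assert(phys_to_virt(pa) == p);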
Signed-off-by: Wei Yang Link: https://lore.kernel.org/r/20240806010319.29194-2-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (Microsoft) --- tools/include/linux/init.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/include/linux/init.h b/tools/include/linux/init.h index 7ed407976dda..51b5cde28639 100644 --- a/tools/include/linux/init.h +++ b/tools/include/linux/init.h @@ -34,6 +34,9 @@ struct obs_kernel_param { __aligned(__alignof__(struct obs_kernel_param)) = \ { __setup_str_##unique_id, fn, early } +#define __setup(str, fn) \ + __setup_param(str, fn, fn, 0) + #define early_param(str, fn) \ __setup_param(str, fn, fn, 1) -- cgit v1.2.3 From a88cde5769d523e4ae6aad61237e4a5f6bd2309a Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 6 Aug 2024 01:03:17 +0000 Subject: memblock test: fix implicit declaration of function 'memparse' Commit 1e4c64b71c9b ("mm/memblock: Add "reserve_mem" to reserved named memory at boot up") introduce the usage of memparse(), which is not defined in memblock test. Add the definition and link it to fix the build. Signed-off-by: Wei Yang Link: https://lore.kernel.org/r/20240806010319.29194-3-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (Microsoft) --- tools/include/linux/string.h | 1 + tools/lib/cmdline.c | 53 +++++++++++++++++++++++++++++++++++ tools/testing/memblock/Makefile | 2 +- tools/testing/memblock/linux/kernel.h | 1 + 4 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 tools/lib/cmdline.c (limited to 'tools') diff --git a/tools/include/linux/string.h b/tools/include/linux/string.h index db5c99318c79..fb8eda3019b5 100644 --- a/tools/include/linux/string.h +++ b/tools/include/linux/string.h @@ -47,4 +47,5 @@ extern char * __must_check skip_spaces(const char *); extern char *strim(char *); extern void *memchr_inv(const void *start, int c, size_t bytes); +extern unsigned long long memparse(const char *ptr, char **retptr); #endif /* _TOOLS_LINUX_STRING_H_ */ diff --git a/tools/lib/cmdline.c b/tools/lib/cmdline.c new file mode 100644 index 000000000000..c85f00f43c5e --- /dev/null +++ b/tools/lib/cmdline.c @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * From lib/cmdline.c + */ +#include + +#if __has_attribute(__fallthrough__) +# define fallthrough __attribute__((__fallthrough__)) +#else +# define fallthrough do {} while (0) /* fallthrough */ +#endif + +unsigned long long memparse(const char *ptr, char **retptr) +{ + char *endptr; /* local pointer to end of parsed string */ + + unsigned long long ret = strtoll(ptr, &endptr, 0); + + switch (*endptr) { + case 'E': + case 'e': + ret <<= 10; + fallthrough; + case 'P': + case 'p': + ret <<= 10; + fallthrough; + case 'T': + case 't': + ret <<= 10; + fallthrough; + case 'G': + case 'g': + ret <<= 10; + fallthrough; + case 'M': + case 'm': + ret <<= 10; + fallthrough; + case 'K': + case 'k': + ret <<= 10; + endptr++; + fallthrough; + default: + break; + } + + if (retptr) + *retptr = endptr; + + return ret; +} diff --git a/tools/testing/memblock/Makefile b/tools/testing/memblock/Makefile index 7a1ca694a982..d80982ccdc20 100644 --- a/tools/testing/memblock/Makefile +++ b/tools/testing/memblock/Makefile @@ -8,7 +8,7 @@ LDFLAGS += -fsanitize=address -fsanitize=undefined TARGETS = main TEST_OFILES = tests/alloc_nid_api.o tests/alloc_helpers_api.o tests/alloc_api.o \ tests/basic_api.o tests/common.o tests/alloc_exact_nid_api.o -DEP_OFILES = memblock.o lib/slab.o mmzone.o slab.o +DEP_OFILES = memblock.o lib/slab.o mmzone.o slab.o cmdline.o 
OFILES = main.o $(DEP_OFILES) $(TEST_OFILES) EXTR_SRC = ../../../mm/memblock.c diff --git a/tools/testing/memblock/linux/kernel.h b/tools/testing/memblock/linux/kernel.h index d2f148bd8902..c16d9cd2d1ae 100644 --- a/tools/testing/memblock/linux/kernel.h +++ b/tools/testing/memblock/linux/kernel.h @@ -8,5 +8,6 @@ #include #include #include +#include #endif -- cgit v1.2.3 From 8ac13bc7c266102f1068faafa5314522b68ebe65 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 6 Aug 2024 01:03:18 +0000 Subject: memblock test: fix implicit declaration of function 'isspace' Commit 1e4c64b71c9b ("mm/memblock: Add "reserve_mem" to reserved named memory at boot up") introduced the usage of isspace(). Let's include <ctype.h> in kernel.h to fix this. Signed-off-by: Wei Yang Link: https://lore.kernel.org/r/20240806010319.29194-4-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (Microsoft) --- tools/testing/memblock/linux/kernel.h | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/memblock/linux/kernel.h b/tools/testing/memblock/linux/kernel.h index c16d9cd2d1ae..4d1012d5be6e 100644 --- a/tools/testing/memblock/linux/kernel.h +++ b/tools/testing/memblock/linux/kernel.h @@ -9,5 +9,6 @@ #include #include #include +#include <ctype.h> #endif -- cgit v1.2.3 From 9e3d665384fca2a1c56283c7a79a968243ef4614 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 6 Aug 2024 01:03:19 +0000 Subject: memblock test: fix implicit declaration of function 'strscpy' Commit 1e4c64b71c9b ("mm/memblock: Add "reserve_mem" to reserved named memory at boot up") introduced the usage of strscpy(), which breaks the memblock test. Let's define it as strcpy in userspace to fix it. Signed-off-by: Wei Yang Link: https://lore.kernel.org/r/20240806010319.29194-5-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (Microsoft) --- tools/include/linux/string.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/include/linux/string.h b/tools/include/linux/string.h index fb8eda3019b5..41e7fa734922 100644 --- a/tools/include/linux/string.h +++ b/tools/include/linux/string.h @@ -12,6 +12,8 @@ void argv_free(char **argv); int strtobool(const char *s, bool *res); +#define strscpy strcpy + /* * glibc based builds needs the extern while uClibc doesn't. * However uClibc headers also define __GLIBC__ hence the hack below -- cgit v1.2.3 From 2576b20abdb18d80c22cea783fda80e2235f36dd Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 5 Aug 2024 12:44:20 -0700 Subject: perf test: Add build test for JEVENTS_ARCH=all Building with JEVENTS_ARCH=all builds all CPU types and allows things like assertions to check the validity of the input JSON.
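The same configuration can also be exercised by hand with something like the following (illustrative invocation, assuming an in-tree perf build):

	$ make -C tools/perf JEVENTS_ARCH=all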
Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Albert Ou Cc: Alexander Shishkin Cc: Atish Patra Cc: Changbin Du Cc: Charles Ci-Jyun Wu Cc: Eric Lin Cc: Greentime Hu Cc: Guilherme Amadio Cc: Ingo Molnar Cc: Inochi Amaoto Cc: James Clark Cc: Ji Sheng Teoh Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kan Liang Cc: Leo Yan Cc: Locus Wei-Han Chen Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Samuel Holland Cc: Sandipan Das Cc: Vincent Chen Cc: Will Deacon Cc: Xu Yang Link: https://lore.kernel.org/r/20240805194424.597244-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/make | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/perf/tests/make b/tools/perf/tests/make index a1f8adf85367..8a9edf758f10 100644 --- a/tools/perf/tests/make +++ b/tools/perf/tests/make @@ -70,6 +70,7 @@ make_python_perf_so := $(python_perf_so) make_debug := DEBUG=1 make_nondistro := BUILD_NONDISTRO=1 make_extra_tests := EXTRA_TESTS=1 +make_jevents_all := JEVENTS_ARCH=all make_no_bpf_skel := BUILD_BPF_SKEL=0 make_gen_vmlinux_h := GEN_VMLINUX_H=1 make_no_libperl := NO_LIBPERL=1 @@ -140,6 +141,7 @@ run += make_python_perf_so run += make_debug run += make_nondistro run += make_extra_tests +run += make_jevents_all run += make_no_bpf_skel run += make_gen_vmlinux_h run += make_no_libperl -- cgit v1.2.3 From b79f9a437a5758c6dee29c0dbcce8db293f088c0 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 5 Aug 2024 12:44:21 -0700 Subject: perf pmu-events: Change dependencies for empty-pmu-events.c test Switch from $? (all the prerequisites that are newer than the target) to $^ (all the prerequisites) as touching jevents.py will mean that empty-pmu-events.c won't be passed to the diff command breaking the build. Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Albert Ou Cc: Alexander Shishkin Cc: Atish Patra Cc: Changbin Du Cc: Charles Ci-Jyun Wu Cc: Eric Lin Cc: Greentime Hu Cc: Guilherme Amadio Cc: Ingo Molnar Cc: Inochi Amaoto Cc: James Clark Cc: Ji Sheng Teoh Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kan Liang Cc: Leo Yan Cc: Locus Wei-Han Chen Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Samuel Holland Cc: Sandipan Das Cc: Vincent Chen Cc: Will Deacon Cc: Xu Yang Link: https://lore.kernel.org/r/20240805194424.597244-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/Build | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/Build b/tools/perf/pmu-events/Build index c3fa43c49706..d941bc9d16e9 100644 --- a/tools/perf/pmu-events/Build +++ b/tools/perf/pmu-events/Build @@ -39,7 +39,7 @@ $(TEST_EMPTY_PMU_EVENTS_C): $(JSON) $(JSON_TEST) $(JEVENTS_PY) $(METRIC_PY) $(ME $(EMPTY_PMU_EVENTS_TEST_LOG): $(EMPTY_PMU_EVENTS_C) $(TEST_EMPTY_PMU_EVENTS_C) $(call rule_mkdir) - $(Q)$(call echo-cmd,test)diff -u $? 
2> $@ || (cat $@ && false) + $(Q)$(call echo-cmd,test)diff -u $^ 2> $@ || (cat $@ && false) $(PMU_EVENTS_C): $(JSON) $(JSON_TEST) $(JEVENTS_PY) $(METRIC_PY) $(METRIC_TEST_LOG) $(EMPTY_PMU_EVENTS_TEST_LOG) $(call rule_mkdir) -- cgit v1.2.3 From c4f74bb61ae01b30067ae3878ca8ba7d677283ae Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 5 Aug 2024 12:44:22 -0700 Subject: perf pmu-events: Remove duplicated ampereone event OP_SPEC is repeated twice in the file which will break invariants in 'perf list' as discussed in this thread: https://lore.kernel.org/linux-perf-users/20240719081651.24853-1-eric.lin@sifive.com/ Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Albert Ou Cc: Alexander Shishkin Cc: Atish Patra Cc: Changbin Du Cc: Charles Ci-Jyun Wu Cc: Eric Lin Cc: Greentime Hu Cc: Guilherme Amadio Cc: Ingo Molnar Cc: Inochi Amaoto Cc: James Clark Cc: Ji Sheng Teoh Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kan Liang Cc: Leo Yan Cc: Locus Wei-Han Chen Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Samuel Holland Cc: Sandipan Das Cc: Vincent Chen Cc: Will Deacon Cc: Xu Yang Link: https://lore.kernel.org/r/20240805194424.597244-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/arm64/ampere/ampereone/instruction.json | 3 --- 1 file changed, 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/arm64/ampere/ampereone/instruction.json b/tools/perf/pmu-events/arch/arm64/ampere/ampereone/instruction.json index 18d1f2f76a23..9fe697d12fe0 100644 --- a/tools/perf/pmu-events/arch/arm64/ampere/ampereone/instruction.json +++ b/tools/perf/pmu-events/arch/arm64/ampere/ampereone/instruction.json @@ -77,9 +77,6 @@ { "ArchStdEvent": "OP_RETIRED" }, - { - "ArchStdEvent": "OP_SPEC" - }, { "PublicDescription": "Operation speculatively executed, NOP", "EventCode": "0x100", -- cgit v1.2.3 From 4bd380390fcce5c77610bc9067a9a97cfd999402 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 5 Aug 2024 12:44:24 -0700 Subject: perf jevents.py: Ensure event names aren't duplicated Duplicate event names break invariants in 'perf list'. Assert that an event name isn't duplicated so that broken JSON won't build. 
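With a duplicated entry, such as the AmpereOne OP_SPEC event removed in the previous patch, the build now stops with an assertion of the following shape (illustrative; the actual PMU, event and table names come from the input JSON, per the format string added below):

	AssertionError: Duplicate event: <pmu>/<event name>/ in <pending events table name>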
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Albert Ou Cc: Alexander Shishkin Cc: Atish Patra Cc: Changbin Du Cc: Charles Ci-Jyun Wu Cc: Eric Lin Cc: Greentime Hu Cc: Guilherme Amadio Cc: Ingo Molnar Cc: Inochi Amaoto Cc: James Clark Cc: Ji Sheng Teoh Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kan Liang Cc: Leo Yan Cc: Locus Wei-Han Chen Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Samuel Holland Cc: Sandipan Das Cc: Vincent Chen Cc: Will Deacon Cc: Xu Yang Link: https://lore.kernel.org/r/20240805194424.597244-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/jevents.py | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/perf/pmu-events/jevents.py b/tools/perf/pmu-events/jevents.py index fcf0158438b5..1d96b2204e52 100755 --- a/tools/perf/pmu-events/jevents.py +++ b/tools/perf/pmu-events/jevents.py @@ -503,8 +503,11 @@ def print_pending_events() -> None: first = True last_pmu = None + last_name = None pmus = set() for event in sorted(_pending_events, key=event_cmp_key): + if last_pmu and last_pmu == event.pmu: + assert event.name != last_name, f"Duplicate event: {last_pmu}/{last_name}/ in {_pending_events_tblname}" if event.pmu != last_pmu: if not first: _args.output_file.write('};\n') @@ -516,6 +519,7 @@ def print_pending_events() -> None: pmus.add((event.pmu, pmu_name)) _args.output_file.write(event.to_c_string(metric=False)) + last_name = event.name _pending_events = [] _args.output_file.write(f""" -- cgit v1.2.3 From ff358ada070fa1b64a6368147b2b56cc6a921847 Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Wed, 31 Jul 2024 11:08:32 +0000 Subject: selftests/bpf: add negative tests for new VFS based BPF kfuncs Add a bunch of negative selftests responsible for asserting that the BPF verifier successfully rejects a BPF program load when the underlying BPF program misuses one of the newly introduced VFS based BPF kfuncs. The following VFS based BPF kfuncs are extensively tested within this new selftest: * struct file *bpf_get_task_exe_file(struct task_struct *); * void bpf_put_file(struct file *); * int bpf_path_d_path(struct path *, char *, size_t); Acked-by: Christian Brauner Acked-by: Song Liu Signed-off-by: Matt Bobrowski Link: https://lore.kernel.org/r/20240731110833.1834742-3-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bpf_experimental.h | 26 ++++ tools/testing/selftests/bpf/prog_tests/verifier.c | 2 + .../selftests/bpf/progs/verifier_vfs_reject.c | 161 +++++++++++++++++++++ 3 files changed, 189 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_vfs_reject.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index 828556cdc2f0..b0668f29f7b3 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -195,6 +195,32 @@ extern void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it) __ksym; */ extern void bpf_throw(u64 cookie) __ksym; +/* Description + * Acquire a reference on the exe_file member field belonging to the + * mm_struct that is nested within the supplied task_struct. The supplied + * task_struct must be trusted/referenced. + * Returns + * A referenced file pointer pointing to the exe_file member field of the + * mm_struct nested in the supplied task_struct, or NULL. 
+ */ +extern struct file *bpf_get_task_exe_file(struct task_struct *task) __ksym; + +/* Description + * Release a reference on the supplied file. The supplied file must be + * acquired. + */ +extern void bpf_put_file(struct file *file) __ksym; + +/* Description + * Resolve a pathname for the supplied path and store it in the supplied + * buffer. The supplied path must be trusted/referenced. + * Returns + * A positive integer corresponding to the length of the resolved pathname, + * including the NULL termination character, stored in the supplied + * buffer. On error, a negative integer is returned. + */ +extern int bpf_path_d_path(struct path *path, char *buf, size_t buf__sz) __ksym; + /* This macro must be used to mark the exception callback corresponding to the * main program. For example: * diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 67a49d12472c..14d74ba2188e 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -85,6 +85,7 @@ #include "verifier_value_or_null.skel.h" #include "verifier_value_ptr_arith.skel.h" #include "verifier_var_off.skel.h" +#include "verifier_vfs_reject.skel.h" #include "verifier_xadd.skel.h" #include "verifier_xdp.skel.h" #include "verifier_xdp_direct_packet_access.skel.h" @@ -205,6 +206,7 @@ void test_verifier_value(void) { RUN(verifier_value); } void test_verifier_value_illegal_alu(void) { RUN(verifier_value_illegal_alu); } void test_verifier_value_or_null(void) { RUN(verifier_value_or_null); } void test_verifier_var_off(void) { RUN(verifier_var_off); } +void test_verifier_vfs_reject(void) { RUN(verifier_vfs_reject); } void test_verifier_xadd(void) { RUN(verifier_xadd); } void test_verifier_xdp(void) { RUN(verifier_xdp); } void test_verifier_xdp_direct_packet_access(void) { RUN(verifier_xdp_direct_packet_access); } diff --git a/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c b/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c new file mode 100644 index 000000000000..d6d3f4fcb24c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c @@ -0,0 +1,161 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Google LLC. */ + +#include +#include +#include +#include + +#include "bpf_misc.h" +#include "bpf_experimental.h" + +static char buf[PATH_MAX]; + +SEC("lsm.s/file_open") +__failure __msg("Possibly NULL pointer passed to trusted arg0") +int BPF_PROG(get_task_exe_file_kfunc_null) +{ + struct file *acquired; + + /* Can't pass a NULL pointer to bpf_get_task_exe_file(). */ + acquired = bpf_get_task_exe_file(NULL); + if (!acquired) + return 0; + + bpf_put_file(acquired); + return 0; +} + +SEC("lsm.s/inode_getxattr") +__failure __msg("arg#0 pointer type STRUCT task_struct must point to scalar, or struct with scalar") +int BPF_PROG(get_task_exe_file_kfunc_fp) +{ + u64 x; + struct file *acquired; + struct task_struct *task; + + task = (struct task_struct *)&x; + /* Can't pass random frame pointer to bpf_get_task_exe_file(). */ + acquired = bpf_get_task_exe_file(task); + if (!acquired) + return 0; + + bpf_put_file(acquired); + return 0; +} + +SEC("lsm.s/file_open") +__failure __msg("R1 must be referenced or trusted") +int BPF_PROG(get_task_exe_file_kfunc_untrusted) +{ + struct file *acquired; + struct task_struct *parent; + + /* Walking a trusted struct task_struct returned from + * bpf_get_current_task_btf() yields an untrusted pointer. 
+ */ + parent = bpf_get_current_task_btf()->parent; + /* Can't pass untrusted pointer to bpf_get_task_exe_file(). */ + acquired = bpf_get_task_exe_file(parent); + if (!acquired) + return 0; + + bpf_put_file(acquired); + return 0; +} + +SEC("lsm.s/file_open") +__failure __msg("Unreleased reference") +int BPF_PROG(get_task_exe_file_kfunc_unreleased) +{ + struct file *acquired; + + acquired = bpf_get_task_exe_file(bpf_get_current_task_btf()); + if (!acquired) + return 0; + + /* Acquired but never released. */ + return 0; +} + +SEC("lsm.s/file_open") +__failure __msg("release kernel function bpf_put_file expects") +int BPF_PROG(put_file_kfunc_unacquired, struct file *file) +{ + /* Can't release an unacquired pointer. */ + bpf_put_file(file); + return 0; +} + +SEC("lsm.s/file_open") +__failure __msg("Possibly NULL pointer passed to trusted arg0") +int BPF_PROG(path_d_path_kfunc_null) +{ + /* Can't pass NULL value to bpf_path_d_path() kfunc. */ + bpf_path_d_path(NULL, buf, sizeof(buf)); + return 0; +} + +SEC("lsm.s/task_alloc") +__failure __msg("R1 must be referenced or trusted") +int BPF_PROG(path_d_path_kfunc_untrusted_from_argument, struct task_struct *task) +{ + struct path *root; + + /* Walking a trusted argument typically yields an untrusted + * pointer. This is one example of that. + */ + root = &task->fs->root; + bpf_path_d_path(root, buf, sizeof(buf)); + return 0; +} + +SEC("lsm.s/file_open") +__failure __msg("R1 must be referenced or trusted") +int BPF_PROG(path_d_path_kfunc_untrusted_from_current) +{ + struct path *pwd; + struct task_struct *current; + + current = bpf_get_current_task_btf(); + /* Walking a trusted pointer returned from bpf_get_current_task_btf() + * yields an untrusted pointer. + */ + pwd = ¤t->fs->pwd; + bpf_path_d_path(pwd, buf, sizeof(buf)); + return 0; +} + +SEC("lsm.s/file_open") +__failure __msg("kernel function bpf_path_d_path args#0 expected pointer to STRUCT path but R1 has a pointer to STRUCT file") +int BPF_PROG(path_d_path_kfunc_type_mismatch, struct file *file) +{ + bpf_path_d_path((struct path *)&file->f_task_work, buf, sizeof(buf)); + return 0; +} + +SEC("lsm.s/file_open") +__failure __msg("invalid access to map value, value_size=4096 off=0 size=8192") +int BPF_PROG(path_d_path_kfunc_invalid_buf_sz, struct file *file) +{ + /* bpf_path_d_path() enforces a constraint on the buffer size supplied + * by the BPF LSM program via the __sz annotation. buf here is set to + * PATH_MAX, so let's ensure that the BPF verifier rejects BPF_PROG_LOAD + * attempts if the supplied size and the actual size of the buffer + * mismatches. + */ + bpf_path_d_path(&file->f_path, buf, PATH_MAX * 2); + return 0; +} + +SEC("fentry/vfs_open") +__failure __msg("calling kernel function bpf_path_d_path is not allowed") +int BPF_PROG(path_d_path_kfunc_non_lsm, struct path *path, struct file *f) +{ + /* Calling bpf_path_d_path() from a non-LSM BPF program isn't permitted. + */ + bpf_path_d_path(path, buf, sizeof(buf)); + return 0; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 2b399b9b1f995d71f53e94a41e5717db46c39459 Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Wed, 31 Jul 2024 11:08:33 +0000 Subject: selftests/bpf: add positive tests for new VFS based BPF kfuncs Add a bunch of positive selftests which extensively cover the various contexts and parameters in which the new VFS based BPF kfuncs may be used from. 
Again, the following VFS based BPF kfuncs are thoroughly tested within this new selftest: * struct file *bpf_get_task_exe_file(struct task_struct *); * void bpf_put_file(struct file *); * int bpf_path_d_path(struct path *, char *, size_t); Acked-by: Christian Brauner Acked-by: Song Liu Signed-off-by: Matt Bobrowski Link: https://lore.kernel.org/r/20240731110833.1834742-4-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/verifier.c | 2 + .../selftests/bpf/progs/verifier_vfs_accept.c | 85 ++++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_vfs_accept.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 14d74ba2188e..f8f546eba488 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -85,6 +85,7 @@ #include "verifier_value_or_null.skel.h" #include "verifier_value_ptr_arith.skel.h" #include "verifier_var_off.skel.h" +#include "verifier_vfs_accept.skel.h" #include "verifier_vfs_reject.skel.h" #include "verifier_xadd.skel.h" #include "verifier_xdp.skel.h" @@ -206,6 +207,7 @@ void test_verifier_value(void) { RUN(verifier_value); } void test_verifier_value_illegal_alu(void) { RUN(verifier_value_illegal_alu); } void test_verifier_value_or_null(void) { RUN(verifier_value_or_null); } void test_verifier_var_off(void) { RUN(verifier_var_off); } +void test_verifier_vfs_accept(void) { RUN(verifier_vfs_accept); } void test_verifier_vfs_reject(void) { RUN(verifier_vfs_reject); } void test_verifier_xadd(void) { RUN(verifier_xadd); } void test_verifier_xdp(void) { RUN(verifier_xdp); } diff --git a/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c b/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c new file mode 100644 index 000000000000..a7c0a553aa50 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Google LLC. */ + +#include +#include +#include + +#include "bpf_misc.h" +#include "bpf_experimental.h" + +static char buf[64]; + +SEC("lsm.s/file_open") +__success +int BPF_PROG(get_task_exe_file_and_put_kfunc_from_current_sleepable) +{ + struct file *acquired; + + acquired = bpf_get_task_exe_file(bpf_get_current_task_btf()); + if (!acquired) + return 0; + + bpf_put_file(acquired); + return 0; +} + +SEC("lsm/file_open") +__success +int BPF_PROG(get_task_exe_file_and_put_kfunc_from_current_non_sleepable, struct file *file) +{ + struct file *acquired; + + acquired = bpf_get_task_exe_file(bpf_get_current_task_btf()); + if (!acquired) + return 0; + + bpf_put_file(acquired); + return 0; +} + +SEC("lsm.s/task_alloc") +__success +int BPF_PROG(get_task_exe_file_and_put_kfunc_from_argument, + struct task_struct *task) +{ + struct file *acquired; + + acquired = bpf_get_task_exe_file(task); + if (!acquired) + return 0; + + bpf_put_file(acquired); + return 0; +} + +SEC("lsm.s/inode_getattr") +__success +int BPF_PROG(path_d_path_from_path_argument, struct path *path) +{ + int ret; + + ret = bpf_path_d_path(path, buf, sizeof(buf)); + __sink(ret); + return 0; +} + +SEC("lsm.s/file_open") +__success +int BPF_PROG(path_d_path_from_file_argument, struct file *file) +{ + int ret; + struct path *path; + + /* The f_path member is a path which is embedded directly within a + * file. 
Therefore, a pointer to such embedded members are still + * recognized by the BPF verifier as being PTR_TRUSTED as it's + * essentially PTR_TRUSTED w/ a non-zero fixed offset. + */ + path = &file->f_path; + ret = bpf_path_d_path(path, buf, sizeof(buf)); + __sink(ret); + return 0; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 2a6b6c9a226279b4f6668450ddb21ae655558087 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 27 Jul 2024 23:37:36 +0900 Subject: selftests: harness: remove unneeded __constructor_order_last() __constructor_order_last() is unneeded. If __constructor_order_last() is not called on backward-order systems, __constructor_order will remain 0 instead of being set to _CONSTRUCTOR_ORDER_BACKWARD (= -1). __LIST_APPEND() will still take the 'else' branch, so there is no difference in the behavior. Signed-off-by: Masahiro Yamada Signed-off-by: Shuah Khan --- tools/testing/selftests/drivers/s390x/uvdevice/test_uvdevice.c | 6 ------ tools/testing/selftests/hid/hid_bpf.c | 6 ------ tools/testing/selftests/kselftest_harness.h | 10 +--------- tools/testing/selftests/rtc/rtctest.c | 7 ------- 4 files changed, 1 insertion(+), 28 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/s390x/uvdevice/test_uvdevice.c b/tools/testing/selftests/drivers/s390x/uvdevice/test_uvdevice.c index ea0cdc37b44f..7ee7492138c6 100644 --- a/tools/testing/selftests/drivers/s390x/uvdevice/test_uvdevice.c +++ b/tools/testing/selftests/drivers/s390x/uvdevice/test_uvdevice.c @@ -257,12 +257,6 @@ TEST_F(attest_fixture, att_inval_addr) att_inval_addr_test(&self->uvio_attest.meas_addr, _metadata, self); } -static void __attribute__((constructor)) __constructor_order_last(void) -{ - if (!__constructor_order) - __constructor_order = _CONSTRUCTOR_ORDER_BACKWARD; -} - int main(int argc, char **argv) { int fd = open(UV_PATH, O_ACCMODE); diff --git a/tools/testing/selftests/hid/hid_bpf.c b/tools/testing/selftests/hid/hid_bpf.c index dc0408a831d0..f2bd20ca88bb 100644 --- a/tools/testing/selftests/hid/hid_bpf.c +++ b/tools/testing/selftests/hid/hid_bpf.c @@ -1331,12 +1331,6 @@ static int libbpf_print_fn(enum libbpf_print_level level, return 0; } -static void __attribute__((constructor)) __constructor_order_last(void) -{ - if (!__constructor_order) - __constructor_order = _CONSTRUCTOR_ORDER_BACKWARD; -} - int main(int argc, char **argv) { /* Use libbpf 1.0 API mode */ diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 40723a6a083f..71648d2102cb 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -488,12 +488,6 @@ * Use once to append a main() to the test file. 
*/ #define TEST_HARNESS_MAIN \ - static void __attribute__((constructor)) \ - __constructor_order_last(void) \ - { \ - if (!__constructor_order) \ - __constructor_order = _CONSTRUCTOR_ORDER_BACKWARD; \ - } \ int main(int argc, char **argv) { \ return test_harness_run(argc, argv); \ } @@ -891,7 +885,6 @@ static struct __fixture_metadata *__fixture_list = &_fixture_global; static int __constructor_order; #define _CONSTRUCTOR_ORDER_FORWARD 1 -#define _CONSTRUCTOR_ORDER_BACKWARD -1 static inline void __register_fixture(struct __fixture_metadata *f) { @@ -1337,8 +1330,7 @@ static int test_harness_run(int argc, char **argv) static void __attribute__((constructor)) __constructor_order_first(void) { - if (!__constructor_order) - __constructor_order = _CONSTRUCTOR_ORDER_FORWARD; + __constructor_order = _CONSTRUCTOR_ORDER_FORWARD; } #endif /* __KSELFTEST_HARNESS_H */ diff --git a/tools/testing/selftests/rtc/rtctest.c b/tools/testing/selftests/rtc/rtctest.c index 63ce02d1d5cc..9647b14b47c5 100644 --- a/tools/testing/selftests/rtc/rtctest.c +++ b/tools/testing/selftests/rtc/rtctest.c @@ -410,13 +410,6 @@ TEST_F_TIMEOUT(rtc, alarm_wkalm_set_minute, 65) { ASSERT_EQ(new, secs); } -static void __attribute__((constructor)) -__constructor_order_last(void) -{ - if (!__constructor_order) - __constructor_order = _CONSTRUCTOR_ORDER_BACKWARD; -} - int main(int argc, char **argv) { switch (argc) { -- cgit v1.2.3 From 2c082b62aeb5e818d9e9588253fc7b73fc5ab81b Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 27 Jul 2024 23:37:37 +0900 Subject: selftests: harness: rename __constructor_order for clarification Now, __constructor_order is boolean; 1 for forward-order systems, 0 for backward-order systems while parsing __LIST_APPEND(). Change it into a bool variable, and rename it for clarification. Signed-off-by: Masahiro Yamada Signed-off-by: Shuah Khan --- tools/testing/selftests/kselftest_harness.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 71648d2102cb..a5a72415e37b 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -818,7 +818,7 @@ item->prev = item; \ return; \ } \ - if (__constructor_order == _CONSTRUCTOR_ORDER_FORWARD) { \ + if (__constructor_order_forward) { \ item->next = NULL; \ item->prev = head->prev; \ item->prev->next = item; \ @@ -882,9 +882,7 @@ struct __test_xfail { } static struct __fixture_metadata *__fixture_list = &_fixture_global; -static int __constructor_order; - -#define _CONSTRUCTOR_ORDER_FORWARD 1 +static bool __constructor_order_forward; static inline void __register_fixture(struct __fixture_metadata *f) { @@ -935,7 +933,7 @@ static inline bool __test_passed(struct __test_metadata *metadata) * list so tests are run in source declaration order. * https://gcc.gnu.org/onlinedocs/gccint/Initialization.html * However, it seems not all toolchains do this correctly, so use - * __constructor_order to detect which direction is called first + * __constructor_order_foward to detect which direction is called first * and adjust list building logic to get things running in the right * direction. 
*/ @@ -1330,7 +1328,7 @@ static int test_harness_run(int argc, char **argv) static void __attribute__((constructor)) __constructor_order_first(void) { - __constructor_order = _CONSTRUCTOR_ORDER_FORWARD; + __constructor_order_forward = true; } #endif /* __KSELFTEST_HARNESS_H */ -- cgit v1.2.3 From cfdbfb94b3824b8da3a6b21252d24754d0162ab6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:37:48 -0700 Subject: rcutorture: Add rcutree.nohz_full_patience_delay to TREE07 This commit adds the rcutree.nohz_full_patience_delay=1000 kernel boot parameter to the TREE07 scenario, on the observation that "if it ain't tested, it don't work". Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- tools/testing/selftests/rcutorture/configs/rcu/TREE07.boot | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE07.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE07.boot index 979edbf4c820..55ce305b2a3d 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE07.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE07.boot @@ -2,3 +2,4 @@ nohz_full=2-9 rcutorture.stall_cpu=14 rcutorture.stall_cpu_holdoff=90 rcutorture.fwd_progress=0 +rcutree.nohz_full_patience_delay=1000 -- cgit v1.2.3 From 8681156c0939a7511c47b9cc462390a83f0e846a Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 6 Aug 2024 16:09:04 -0700 Subject: selftests/bpf: Add tests for bpf_get_dentry_xattr Add test for bpf_get_dentry_xattr on hook security_inode_getxattr. Verify that the kfunc can read the xattr. Also test failing getxattr from user space by returning non-zero from the LSM bpf program. Acked-by: Christian Brauner Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240806230904.71194-4-song@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bpf_kfuncs.h | 11 ++++++- tools/testing/selftests/bpf/prog_tests/fs_kfuncs.c | 9 +++++- tools/testing/selftests/bpf/progs/test_get_xattr.c | 37 +++++++++++++++++++--- 3 files changed, 50 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h index 3b6675ab4086..2eb3483f2fb0 100644 --- a/tools/testing/selftests/bpf/bpf_kfuncs.h +++ b/tools/testing/selftests/bpf/bpf_kfuncs.h @@ -45,7 +45,7 @@ extern int bpf_dynptr_clone(const struct bpf_dynptr *ptr, struct bpf_dynptr *clo /* Description * Modify the address of a AF_UNIX sockaddr. - * Returns__bpf_kfunc + * Returns * -EINVAL if the address size is too big or, 0 if the sockaddr was successfully modified. 
*/ extern int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern, @@ -78,4 +78,13 @@ extern int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_ptr, extern bool bpf_session_is_return(void) __ksym __weak; extern __u64 *bpf_session_cookie(void) __ksym __weak; + +struct dentry; +/* Description + * Returns xattr of a dentry + * Returns + * Error code + */ +extern int bpf_get_dentry_xattr(struct dentry *dentry, const char *name, + struct bpf_dynptr *value_ptr) __ksym __weak; #endif diff --git a/tools/testing/selftests/bpf/prog_tests/fs_kfuncs.c b/tools/testing/selftests/bpf/prog_tests/fs_kfuncs.c index 37056ba73847..5a0b51157451 100644 --- a/tools/testing/selftests/bpf/prog_tests/fs_kfuncs.c +++ b/tools/testing/selftests/bpf/prog_tests/fs_kfuncs.c @@ -16,6 +16,7 @@ static void test_xattr(void) { struct test_get_xattr *skel = NULL; int fd = -1, err; + int v[32]; fd = open(testfile, O_CREAT | O_RDONLY, 0644); if (!ASSERT_GE(fd, 0, "create_file")) @@ -50,7 +51,13 @@ static void test_xattr(void) if (!ASSERT_GE(fd, 0, "open_file")) goto out; - ASSERT_EQ(skel->bss->found_xattr, 1, "found_xattr"); + ASSERT_EQ(skel->bss->found_xattr_from_file, 1, "found_xattr_from_file"); + + /* Trigger security_inode_getxattr */ + err = getxattr(testfile, "user.kfuncs", v, sizeof(v)); + ASSERT_EQ(err, -1, "getxattr_return"); + ASSERT_EQ(errno, EINVAL, "getxattr_errno"); + ASSERT_EQ(skel->bss->found_xattr_from_dentry, 1, "found_xattr_from_dentry"); out: close(fd); diff --git a/tools/testing/selftests/bpf/progs/test_get_xattr.c b/tools/testing/selftests/bpf/progs/test_get_xattr.c index 7eb2a4e5a3e5..66e737720f7c 100644 --- a/tools/testing/selftests/bpf/progs/test_get_xattr.c +++ b/tools/testing/selftests/bpf/progs/test_get_xattr.c @@ -2,6 +2,7 @@ /* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
*/ #include "vmlinux.h" +#include #include #include #include "bpf_kfuncs.h" @@ -9,10 +10,12 @@ char _license[] SEC("license") = "GPL"; __u32 monitored_pid; -__u32 found_xattr; +__u32 found_xattr_from_file; +__u32 found_xattr_from_dentry; static const char expected_value[] = "hello"; -char value[32]; +char value1[32]; +char value2[32]; SEC("lsm.s/file_open") int BPF_PROG(test_file_open, struct file *f) @@ -25,13 +28,37 @@ int BPF_PROG(test_file_open, struct file *f) if (pid != monitored_pid) return 0; - bpf_dynptr_from_mem(value, sizeof(value), 0, &value_ptr); + bpf_dynptr_from_mem(value1, sizeof(value1), 0, &value_ptr); ret = bpf_get_file_xattr(f, "user.kfuncs", &value_ptr); if (ret != sizeof(expected_value)) return 0; - if (bpf_strncmp(value, ret, expected_value)) + if (bpf_strncmp(value1, ret, expected_value)) return 0; - found_xattr = 1; + found_xattr_from_file = 1; return 0; } + +SEC("lsm.s/inode_getxattr") +int BPF_PROG(test_inode_getxattr, struct dentry *dentry, char *name) +{ + struct bpf_dynptr value_ptr; + __u32 pid; + int ret; + + pid = bpf_get_current_pid_tgid() >> 32; + if (pid != monitored_pid) + return 0; + + bpf_dynptr_from_mem(value2, sizeof(value2), 0, &value_ptr); + + ret = bpf_get_dentry_xattr(dentry, "user.kfuncs", &value_ptr); + if (ret != sizeof(expected_value)) + return 0; + if (bpf_strncmp(value2, ret, expected_value)) + return 0; + found_xattr_from_dentry = 1; + + /* return non-zero to fail getxattr from user space */ + return -EINVAL; +} -- cgit v1.2.3 From 98f8faea4b6379de2c1483125253b73f7449e6b6 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Fri, 24 May 2024 09:49:55 +0900 Subject: selftests/uprobes: Add a basic uprobe testcase Add a basic uprobe testcase which checks whether add/remove/trace operations works on /bin/sh. Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Steven Rostedt (Google) Signed-off-by: Shuah Khan --- .../ftrace/test.d/dynevent/add_remove_uprobe.tc | 26 ++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 tools/testing/selftests/ftrace/test.d/dynevent/add_remove_uprobe.tc (limited to 'tools') diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_uprobe.tc b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_uprobe.tc new file mode 100644 index 000000000000..a275decdc880 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_uprobe.tc @@ -0,0 +1,26 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Generic dynamic event - add/remove/test uprobe events +# requires: uprobe_events + +echo 0 > events/enable +echo > dynamic_events + +echo 'cat /proc/$$/maps' | /bin/sh | \ + grep "r-xp .*/bin/.*sh$" | \ + awk '{printf "p:myevent %s:0x%s\n", $6,$3 }' >> uprobe_events + +grep -q myevent uprobe_events +test -d events/uprobes/myevent + +echo 1 > events/uprobes/myevent/enable +echo 'ls' | /bin/sh > /dev/null +echo 0 > events/uprobes/myevent/enable +grep -q myevent trace + +echo "-:myevent" >> uprobe_events +! grep -q myevent uprobe_events + +echo > uprobe_events + +clear_trace -- cgit v1.2.3 From 53af1a4b6a55b3910a253fce7a0893e6d51952be Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 23 May 2024 12:45:41 -0400 Subject: tracing/selftests: Run the ownership test twice A regression happened where running the ownership test passes on the first iteration but fails running it a second time. This was caught and fixed, but a later change brought it back. 
The regression was missed because the automated tests only run the tests once per boot. Change the ownership test to iterate through the tests twice, as this will catch the regression with a single run. Signed-off-by: Steven Rostedt (Google) Acked-by: Masami Hiramatsu (Google) Signed-off-by: Shuah Khan --- .../ftrace/test.d/00basic/test_ownership.tc | 34 +++++++++++++--------- 1 file changed, 20 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/ftrace/test.d/00basic/test_ownership.tc b/tools/testing/selftests/ftrace/test.d/00basic/test_ownership.tc index c45094d1e1d2..71e43a92352a 100644 --- a/tools/testing/selftests/ftrace/test.d/00basic/test_ownership.tc +++ b/tools/testing/selftests/ftrace/test.d/00basic/test_ownership.tc @@ -83,32 +83,38 @@ run_tests() { done } -mount -o remount,"$new_options" . +# Run the tests twice as leftovers can cause issues +for loop in 1 2 ; do -run_tests + echo "Running iteration $loop" -mount -o remount,"$mount_options" . + mount -o remount,"$new_options" . -for d in "." "events" "events/sched" "events/sched/sched_switch" "events/sched/sched_switch/enable" $canary; do - test "$d" $original_group -done + run_tests + + mount -o remount,"$mount_options" . + + for d in "." "events" "events/sched" "events/sched/sched_switch" "events/sched/sched_switch/enable" $canary; do + test "$d" $original_group + done # check instances as well -chgrp $other_group instances + chgrp $other_group instances -instance="$(mktemp -u test-XXXXXX)" + instance="$(mktemp -u test-XXXXXX)" -mkdir instances/$instance + mkdir instances/$instance -cd instances/$instance + cd instances/$instance -run_tests + run_tests -cd ../.. + cd ../.. -rmdir instances/$instance + rmdir instances/$instance -chgrp $original_group instances + chgrp $original_group instances +done exit 0 -- cgit v1.2.3 From b2f70c99edc7e4dcd201abbdcc4b75030973edce Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 6 Aug 2024 23:51:36 -0700 Subject: perf hist: Fix reference counting of branch_info iter_finish_branch_entry() doesn't put the branch_info from/to map elements creating memory leaks. This can be seen with: ``` $ perf record -e cycles -b perf test -w noploop $ perf report -D ... 
Direct leak of 984344 byte(s) in 123043 object(s) allocated from: #0 0x7fb2654f3bd7 in malloc libsanitizer/asan/asan_malloc_linux.cpp:69 #1 0x564d3400d10b in map__get util/map.h:186 #2 0x564d3400d10b in ip__resolve_ams util/machine.c:1981 #3 0x564d34014d81 in sample__resolve_bstack util/machine.c:2151 #4 0x564d34094790 in iter_prepare_branch_entry util/hist.c:898 #5 0x564d34098fa4 in hist_entry_iter__add util/hist.c:1238 #6 0x564d33d1f0c7 in process_sample_event tools/perf/builtin-report.c:334 #7 0x564d34031eb7 in perf_session__deliver_event util/session.c:1655 #8 0x564d3403ba52 in do_flush util/ordered-events.c:245 #9 0x564d3403ba52 in __ordered_events__flush util/ordered-events.c:324 #10 0x564d3402d32e in perf_session__process_user_event util/session.c:1708 #11 0x564d34032480 in perf_session__process_event util/session.c:1877 #12 0x564d340336ad in reader__read_event util/session.c:2399 #13 0x564d34033fdc in reader__process_events util/session.c:2448 #14 0x564d34033fdc in __perf_session__process_events util/session.c:2495 #15 0x564d34033fdc in perf_session__process_events util/session.c:2661 #16 0x564d33d27113 in __cmd_report tools/perf/builtin-report.c:1065 #17 0x564d33d27113 in cmd_report tools/perf/builtin-report.c:1805 #18 0x564d33e0ccb7 in run_builtin tools/perf/perf.c:350 #19 0x564d33e0d45e in handle_internal_command tools/perf/perf.c:403 #20 0x564d33cdd827 in run_argv tools/perf/perf.c:447 #21 0x564d33cdd827 in main tools/perf/perf.c:561 ... ``` Clearing up the map_symbols properly creates maps reference count issues so resolve those. Resolving this issue doesn't improve peak heap consumption for the test above. Committer testing: $ sudo dnf install libasan $ make -k CORESIGHT=1 EXTRA_CFLAGS="-fsanitize=address" CC=clang O=/tmp/build/$(basename $PWD)/ -C tools/perf install-bin Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sun Haiyong Cc: Yanteng Si Link: https://lore.kernel.org/r/20240807065136.1039977-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/hist.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index f8ee1cd6929d..c8c1b511f8a7 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -472,7 +472,9 @@ static int hist_entry__init(struct hist_entry *he, memcpy(he->branch_info, template->branch_info, sizeof(*he->branch_info)); + he->branch_info->from.ms.maps = maps__get(he->branch_info->from.ms.maps); he->branch_info->from.ms.map = map__get(he->branch_info->from.ms.map); + he->branch_info->to.ms.maps = maps__get(he->branch_info->to.ms.maps); he->branch_info->to.ms.map = map__get(he->branch_info->to.ms.map); } @@ -970,10 +972,21 @@ out: return err; } +static void branch_info__exit(struct branch_info *bi) +{ + map_symbol__exit(&bi->from.ms); + map_symbol__exit(&bi->to.ms); + zfree_srcline(&bi->srcline_from); + zfree_srcline(&bi->srcline_to); +} + static int iter_finish_branch_entry(struct hist_entry_iter *iter, struct addr_location *al __maybe_unused) { + for (int i = 0; i < iter->total; i++) + branch_info__exit(&iter->bi[i]); + zfree(&iter->bi); iter->he = NULL; @@ -1319,10 +1332,7 @@ void hist_entry__delete(struct hist_entry *he) map_symbol__exit(&he->ms); if (he->branch_info) { - map_symbol__exit(&he->branch_info->from.ms); - 
map_symbol__exit(&he->branch_info->to.ms); - zfree_srcline(&he->branch_info->srcline_from); - zfree_srcline(&he->branch_info->srcline_to); + branch_info__exit(he->branch_info); zfree(&he->branch_info); } -- cgit v1.2.3 From 037f1b67e81c2500c94625b089cfd1ecc76a18b5 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 5 Aug 2024 16:46:48 -0700 Subject: perf annotate: Cache debuginfo for data type profiling In find_data_type(), it creates and deletes a debug info whenver it tries to find data type for a sample. This is inefficient and it most likely accesses the same binary again and again. Let's add a single entry cache the debug info structure for the last DSO. Depending on sample data, it usually gives me 2~3x (and sometimes more) speed ups. Note that this will introduce a little difference in the output due to the order of checking stack operations. It used to check the stack ops before checking the availability of debug info but I moved it after the symbol check. So it'll report stack operations in DSOs without debug info as unknown. But I think it's ok and better to have the checking near the caching logic. Committer testing: root@x1:~# perf mem record -a sleep 5s root@x1:~# perf evlist cpu_atom/mem-loads,ldlat=30/P cpu_atom/mem-stores/P dummy:u root@x1:~# diff -u before after --- before 2024-08-08 09:33:53.880780784 -0300 +++ after 2024-08-08 09:35:13.917325041 -0300 @@ -81,8 +81,8 @@ # Overhead Data Type # ........ ......... # - 55.43% (unknown) - 11.61% (stack operation) + 55.56% (unknown) + 11.48% (stack operation) 4.93% struct pcpu_hot 3.26% unsigned int 2.48% struct Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240805234648.1453689-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 15 ++------------- tools/perf/util/annotate-data.h | 2 +- tools/perf/util/annotate.c | 37 +++++++++++++++++++++++++++++++++++++ tools/perf/util/annotate.h | 2 ++ tools/perf/util/session.c | 2 ++ 5 files changed, 44 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 734acdd8c4b7..f125ac5f0bda 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -1345,16 +1345,9 @@ out: */ struct annotated_data_type *find_data_type(struct data_loc_info *dloc) { - struct annotated_data_type *result = NULL; struct dso *dso = map__dso(dloc->ms->map); Dwarf_Die type_die; - dloc->di = debuginfo__new(dso__long_name(dso)); - if (dloc->di == NULL) { - pr_debug_dtp("cannot get the debug info\n"); - return NULL; - } - /* * The type offset is the same as instruction offset by default. * But when finding a global variable, the offset won't be valid. 
@@ -1364,13 +1357,9 @@ struct annotated_data_type *find_data_type(struct data_loc_info *dloc) dloc->fbreg = -1; if (find_data_type_die(dloc, &type_die) < 0) - goto out; - - result = dso__findnew_data_type(dso, &type_die); + return NULL; -out: - debuginfo__delete(dloc->di); - return result; + return dso__findnew_data_type(dso, &type_die); } static int alloc_data_type_histograms(struct annotated_data_type *adt, int nr_entries) diff --git a/tools/perf/util/annotate-data.h b/tools/perf/util/annotate-data.h index 992b7ce4bd11..37a1a3b68e0b 100644 --- a/tools/perf/util/annotate-data.h +++ b/tools/perf/util/annotate-data.h @@ -123,9 +123,9 @@ struct data_loc_info { u64 var_addr; u8 cpumode; struct annotated_op_loc *op; + struct debuginfo *di; /* These are used internally */ - struct debuginfo *di; int fbreg; bool fb_cfa; diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index eafe8d65052e..a87d2e97e10d 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -25,6 +25,7 @@ #include "srcline.h" #include "units.h" #include "debug.h" +#include "debuginfo.h" #include "annotate.h" #include "annotate-data.h" #include "evsel.h" @@ -2333,6 +2334,20 @@ u64 annotate_calc_pcrel(struct map_symbol *ms, u64 ip, int offset, return map__rip_2objdump(ms->map, addr); } +static struct debuginfo_cache { + struct dso *dso; + struct debuginfo *dbg; +} di_cache; + +void debuginfo_cache__delete(void) +{ + dso__put(di_cache.dso); + di_cache.dso = NULL; + + debuginfo__delete(di_cache.dbg); + di_cache.dbg = NULL; +} + /** * hist_entry__get_data_type - find data type for given hist entry * @he: hist entry @@ -2367,6 +2382,27 @@ struct annotated_data_type *hist_entry__get_data_type(struct hist_entry *he) return NULL; } + /* + * di_cache holds a pair of values, but code below assumes + * di_cache.dso can be compared/updated and di_cache.dbg can be + * read/updated independently from each other. That assumption only + * holds in single threaded code. 
+ */ + assert(perf_singlethreaded); + + if (map__dso(ms->map) != di_cache.dso) { + dso__put(di_cache.dso); + di_cache.dso = dso__get(map__dso(ms->map)); + + debuginfo__delete(di_cache.dbg); + di_cache.dbg = debuginfo__new(dso__long_name(di_cache.dso)); + } + + if (di_cache.dbg == NULL) { + ann_data_stat.no_dbginfo++; + return NULL; + } + /* Make sure it has the disasm of the function */ if (symbol__annotate(ms, evsel, &arch) < 0) { ann_data_stat.no_insn++; @@ -2411,6 +2447,7 @@ retry: .ip = ms->sym->start + dl->al.offset, .cpumode = he->cpumode, .op = op_loc, + .di = di_cache.dbg, }; if (!op_loc->mem_ref && op_loc->segment == INSN_SEG_NONE) diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 64e70d716ff1..0e52558baa01 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -543,4 +543,6 @@ struct annotated_basic_block { int annotate_get_basic_blocks(struct symbol *sym, s64 src, s64 dst, struct list_head *head); +void debuginfo_cache__delete(void); + #endif /* __PERF_ANNOTATE_H */ diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 5596bed1b8c8..f9072e003367 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -36,6 +36,7 @@ #include "util.h" #include "arch/common.h" #include "units.h" +#include "annotate.h" #include #ifdef HAVE_ZSTD_SUPPORT @@ -304,6 +305,7 @@ void perf_session__delete(struct perf_session *session) return; auxtrace__free(session); auxtrace_index__free(&session->auxtrace_index); + debuginfo_cache__delete(); perf_session__destroy_kernel_maps(session); perf_decomp__release_events(session->decomp_data.decomp); perf_env__exit(&session->header.env); -- cgit v1.2.3 From 90d78e7b8e57b064f700545081fc9288c1564e92 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 7 Aug 2024 15:31:29 -0700 Subject: perf annotate-data: Show typedef names properly The die_get_typename() would resolve typedef and get to the original type. But sometimes the original type is a struct without name and it makes the output confusing and hard to read. This is a diff of perf report -s type before and after the change. New types such as atomic{,64}_t and sigset_t appeared and the portion of unnamed struct was reduced. Also u32, u64 and size_t were splitted from the base types. --- b 2024-08-01 17:02:34.307809952 -0700 +++ a 2024-08-07 14:17:05.245853999 -0700 - 2.40% long unsigned int + 2.26% long unsigned int - 1.56% unsigned int + 1.27% unsigned int - 0.98% struct - 0.79% long long unsigned int + 0.58% long long unsigned int + 0.36% struct + 0.27% atomic64_t + 0.22% u32 + 0.21% u64 + 0.19% atomic_t + 0.13% size_t - 0.08% struct seqcount_spinlock + 0.08% seqcount_spinlock_t + 0.08% sigset_t + 0.08% __poll_t Let's use the typedef name directly and the resolved to get the size of the type. Committer testing: root@x1:~# diff -u before after | head -30 --- before 2024-08-08 09:35:13.917325041 -0300 +++ after 2024-08-08 09:37:35.312257905 -0300 @@ -10,25 +10,27 @@ # ........ ......... 
# 79.40% (unknown) - 2.28% union 1.96% (stack operation) - 1.24% struct + 1.87% pthread_mutex_t 0.99% u32[] - 0.92% unsigned int 0.77% struct task_struct + 0.75% U32 0.75% struct pcpu_hot 0.63% struct qspinlock + 0.61% atomic_t 0.59% struct list_head - 0.58% int 0.53% struct cfs_rq 0.51% BYTE* - 0.48% unsigned char + 0.48% BYTE 0.48% long unsigned int 0.46% struct rq 0.41% struct worker 0.41% struct memcg_vmstats_percpu + 0.41% pthread_cond_t 0.37% _Bool + 0.36% int root@x1:~# Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240807223129.1738004-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 39 ++++++++++++++++++++++++++++++--------- tools/perf/util/dwarf-aux.c | 2 +- tools/perf/util/dwarf-aux.h | 2 ++ 3 files changed, 33 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index f125ac5f0bda..c18272092a6b 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -217,8 +217,13 @@ static int __add_member_cb(Dwarf_Die *die, void *arg) strbuf_init(&sb, 32); die_get_typename(die, &sb); - die_get_real_type(die, &member_type); - if (dwarf_aggregate_size(&member_type, &size) < 0) + __die_get_real_type(die, &member_type); + if (dwarf_tag(&member_type) == DW_TAG_typedef) + die_get_real_type(&member_type, &die_mem); + else + die_mem = member_type; + + if (dwarf_aggregate_size(&die_mem, &size) < 0) size = 0; if (!dwarf_attr_integrate(die, DW_AT_data_member_location, &attr)) @@ -235,11 +240,11 @@ static int __add_member_cb(Dwarf_Die *die, void *arg) INIT_LIST_HEAD(&member->children); list_add_tail(&member->node, &parent->children); - tag = dwarf_tag(&member_type); + tag = dwarf_tag(&die_mem); switch (tag) { case DW_TAG_structure_type: case DW_TAG_union_type: - die_find_child(&member_type, __add_member_cb, member, &die_mem); + die_find_child(&die_mem, __add_member_cb, member, &die_mem); break; default: break; @@ -281,6 +286,10 @@ static struct annotated_data_type *dso__findnew_data_type(struct dso *dso, if (die_get_typename_from_type(type_die, &sb) < 0) strbuf_add(&sb, "(unknown type)", 14); type_name = strbuf_detach(&sb, NULL); + + if (dwarf_tag(type_die) == DW_TAG_typedef) + die_get_real_type(type_die, type_die); + dwarf_aggregate_size(type_die, &size); /* Check existing nodes in dso->data_types tree */ @@ -342,6 +351,7 @@ static int check_variable(struct data_loc_info *dloc, Dwarf_Die *var_die, { Dwarf_Word size; bool is_pointer = true; + Dwarf_Die sized_type; if (reg == DWARF_REG_PC) is_pointer = false; @@ -351,7 +361,7 @@ static int check_variable(struct data_loc_info *dloc, Dwarf_Die *var_die, is_pointer = false; /* Get the type of the variable */ - if (die_get_real_type(var_die, type_die) == NULL) { + if (__die_get_real_type(var_die, type_die) == NULL) { pr_debug_dtp("variable has no type\n"); ann_data_stat.no_typeinfo++; return -1; @@ -365,15 +375,20 @@ static int check_variable(struct data_loc_info *dloc, Dwarf_Die *var_die, if (is_pointer) { if ((dwarf_tag(type_die) != DW_TAG_pointer_type && dwarf_tag(type_die) != DW_TAG_array_type) || - die_get_real_type(type_die, type_die) == NULL) { + __die_get_real_type(type_die, type_die) == NULL) { pr_debug_dtp("no pointer or no type\n"); ann_data_stat.no_typeinfo++; return -1; } } + if (dwarf_tag(type_die) == 
DW_TAG_typedef) + die_get_real_type(type_die, &sized_type); + else + sized_type = *type_die; + /* Get the size of the actual type */ - if (dwarf_aggregate_size(type_die, &size) < 0) { + if (dwarf_aggregate_size(&sized_type, &size) < 0) { pr_debug_dtp("type size is unknown\n"); ann_data_stat.invalid_size++; return -1; @@ -846,6 +861,7 @@ static int check_matching_type(struct type_state *state, if (state->regs[reg].ok && state->regs[reg].kind == TSR_KIND_TYPE) { int tag = dwarf_tag(&state->regs[reg].type); + Dwarf_Die sized_type; /* * Normal registers should hold a pointer (or array) to @@ -862,13 +878,18 @@ static int check_matching_type(struct type_state *state, pr_debug_dtp("\n"); /* Remove the pointer and get the target type */ - if (die_get_real_type(&state->regs[reg].type, type_die) == NULL) + if (__die_get_real_type(&state->regs[reg].type, type_die) == NULL) return -1; dloc->type_offset = dloc->op->offset; + if (dwarf_tag(type_die) == DW_TAG_typedef) + die_get_real_type(type_die, &sized_type); + else + sized_type = *type_die; + /* Get the size of the actual type */ - if (dwarf_aggregate_size(type_die, &size) < 0 || + if (dwarf_aggregate_size(&sized_type, &size) < 0 || (unsigned)dloc->type_offset >= size) return -1; diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c index 44ef968a7ad3..5e080d7e22c2 100644 --- a/tools/perf/util/dwarf-aux.c +++ b/tools/perf/util/dwarf-aux.c @@ -267,7 +267,7 @@ Dwarf_Die *die_get_type(Dwarf_Die *vr_die, Dwarf_Die *die_mem) } /* Get a type die, but skip qualifiers */ -static Dwarf_Die *__die_get_real_type(Dwarf_Die *vr_die, Dwarf_Die *die_mem) +Dwarf_Die *__die_get_real_type(Dwarf_Die *vr_die, Dwarf_Die *die_mem) { int tag; diff --git a/tools/perf/util/dwarf-aux.h b/tools/perf/util/dwarf-aux.h index 24446412b869..336a3a183a78 100644 --- a/tools/perf/util/dwarf-aux.h +++ b/tools/perf/util/dwarf-aux.h @@ -56,6 +56,8 @@ const char *die_get_decl_file(Dwarf_Die *dw_die); /* Get type die */ Dwarf_Die *die_get_type(Dwarf_Die *vr_die, Dwarf_Die *die_mem); +/* Get a type die, but skip qualifiers */ +Dwarf_Die *__die_get_real_type(Dwarf_Die *vr_die, Dwarf_Die *die_mem); /* Get a type die, but skip qualifiers and typedef */ Dwarf_Die *die_get_real_type(Dwarf_Die *vr_die, Dwarf_Die *die_mem); -- cgit v1.2.3 From ed5bb548cc648de6dbb5894cf95955998e466589 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 7 Aug 2024 21:49:54 -0700 Subject: perf test: Add a new shell test for perf ftrace $ sudo ./perf test ftrace -vv 86: perf ftrace tests: --- start --- test child forked, pid 1772223 perf ftrace list test syscalls for sleep: __x64_sys_nanosleep __ia32_sys_nanosleep __x64_sys_clock_nanosleep __ia32_sys_clock_nanosleep perf ftrace list test [Success] perf ftrace trace test # tracer: function_graph # # CPU DURATION FUNCTION CALLS # | | | | | | | 0) | __x64_sys_clock_nanosleep() { 0) | common_nsleep() { 0) | hrtimer_nanosleep() { 0) | do_nanosleep() { perf ftrace trace test [Success] perf ftrace latency test target function: __x64_sys_clock_nanosleep # DURATION | COUNT | GRAPH | 32 - 64 ms | 1 | ############################################## | perf ftrace latency test [Success] perf ftrace profile test # Total (us) Avg (us) Max (us) Count Function 100136.400 100136.400 100136.400 1 __x64_sys_clock_nanosleep 100135.200 100135.200 100135.200 1 common_nsleep 100134.700 100134.700 100134.700 1 hrtimer_nanosleep 100133.700 100133.700 100133.700 1 do_nanosleep 100130.600 100130.600 100130.600 1 schedule 166.868 55.623 80.299 3 scheduler_tick 5.926 5.926 
5.926 1 native_smp_send_reschedule 301.941 301.941 301.941 1 __x64_sys_execve 295.786 295.786 295.786 1 do_execveat_common.isra.0 71.397 35.699 46.403 2 bprm_execve 2.519 1.260 1.547 2 sched_mm_cid_before_execve 1.098 0.549 0.686 2 sched_mm_cid_after_execve perf ftrace profile test [Success] ---- end(0) ---- 86: perf ftrace tests : Ok Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Steven Rostedt (VMware) Link: https://lore.kernel.org/r/20240808044954.1775333-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/ftrace.sh | 84 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100755 tools/perf/tests/shell/ftrace.sh (limited to 'tools') diff --git a/tools/perf/tests/shell/ftrace.sh b/tools/perf/tests/shell/ftrace.sh new file mode 100755 index 000000000000..b1c36d30559a --- /dev/null +++ b/tools/perf/tests/shell/ftrace.sh @@ -0,0 +1,84 @@ +#!/bin/sh +# perf ftrace tests +# SPDX-License-Identifier: GPL-2.0 + +set -e + +# perf ftrace commands only works for root +if [ "$(id -u)" != 0 ]; then + echo "perf ftrace test [Skipped: no permission]" + exit 2 +fi + +output=$(mktemp /tmp/__perf_test.ftrace.XXXXXX) + +cleanup() { + rm -f "${output}" + + trap - EXIT TERM INT +} + +trap_cleanup() { + cleanup + exit 1 +} +trap trap_cleanup EXIT TERM INT + +# this will be set in test_ftrace_trace() +target_function= + +test_ftrace_list() { + echo "perf ftrace list test" + perf ftrace -F > "${output}" + # this will be used in test_ftrace_trace() + sleep_functions=$(grep 'sys_.*sleep$' "${output}") + echo "syscalls for sleep:" + echo "${sleep_functions}" + echo "perf ftrace list test [Success]" +} + +test_ftrace_trace() { + echo "perf ftrace trace test" + perf ftrace trace --graph-opts depth=5 sleep 0.1 > "${output}" + # it should have some function name contains 'sleep' + grep "^#" "${output}" + grep -F 'sleep()' "${output}" + # find actual syscall function name + for FN in ${sleep_functions}; do + if grep -q "${FN}" "${output}"; then + target_function="${FN}" + echo "perf ftrace trace test [Success]" + return + fi + done + + echo "perf ftrace trace test [Failure: sleep syscall not found]" + exit 1 +} + +test_ftrace_latency() { + echo "perf ftrace latency test" + echo "target function: ${target_function}" + perf ftrace latency -T "${target_function}" sleep 0.1 > "${output}" + grep "^#" "${output}" + grep "###" "${output}" + echo "perf ftrace latency test [Success]" +} + +test_ftrace_profile() { + echo "perf ftrace profile test" + perf ftrace profile sleep 0.1 > "${output}" + grep ^# "${output}" + grep sleep "${output}" + grep schedule "${output}" + grep execve "${output}" + echo "perf ftrace profile test [Success]" +} + +test_ftrace_list +test_ftrace_trace +test_ftrace_latency +test_ftrace_profile + +cleanup +exit 0 -- cgit v1.2.3 From 2df5484bbf2f1fa3efa2b6385efc1350d89b7783 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 8 Aug 2024 09:59:40 -0300 Subject: perf tests ftrace: Add pattern check for time, count In 'perf ftrace profile sleep 0.1' we know that we'll have an specific kernel function that will take a bit more than 0.1 seconds and will take place just one time, so we can add a check for that so that we validate more than just the presence of some functions in the profile. 
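For reference, a self-contained sketch of the pattern check (the profile line is illustrative sample output, not captured data; the regex itself is the one the hunk below adds):

```
#!/bin/sh
# Illustrative sketch: three repeated time fields (Total/Avg/Max, each a
# bit over 0.1s), then a Count of exactly 1, then the nanosleep function.
time_re="[[:space:]]+10[[:digit:]]{4}\.[[:digit:]]{3}"
# Sample profile line (made up for illustration, not captured output):
line="    100283.000  100283.000  100283.000     1  __x64_sys_clock_nanosleep"
echo "$line" | grep -E "^${time_re}${time_re}${time_re}[[:space:]]+1[[:space:]]+.*clock_nanosleep"
```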
Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Steven Rostedt (VMware) Link: https://lore.kernel.org/lkml/ZrTBo7KACZeuCyLj@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/ftrace.sh | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'tools') diff --git a/tools/perf/tests/shell/ftrace.sh b/tools/perf/tests/shell/ftrace.sh index b1c36d30559a..a6ee740f0d7e 100755 --- a/tools/perf/tests/shell/ftrace.sh +++ b/tools/perf/tests/shell/ftrace.sh @@ -72,6 +72,11 @@ test_ftrace_profile() { grep sleep "${output}" grep schedule "${output}" grep execve "${output}" + time_re="[[:space:]]+10[[:digit:]]{4}\.[[:digit:]]{3}" + # 100283.000 100283.000 100283.000 1 __x64_sys_clock_nanosleep + # Check for one *clock_nanosleep line with a Count of just 1 that takes a bit more than 0.1 seconds + # Strip the _x64_sys part to work with other architectures + grep -E "^${time_re}${time_re}${time_re}[[:space:]]+1[[:space:]]+.*clock_nanosleep" "${output}" echo "perf ftrace profile test [Success]" } -- cgit v1.2.3 From 37e2a19c98bf99747ca997be876dfc13f9165e0a Mon Sep 17 00:00:00 2001 From: Veronika Molnarova Date: Thu, 8 Aug 2024 12:37:49 +0200 Subject: perf test pmu: Set uninitialized PMU alias to null Commit 3e0bf9fde2984469 ("perf pmu: Restore full PMU name wildcard support") adds a test case "PMU cmdline match" that covers PMU name wildcard support provided by function perf_pmu__match(). The test works with a wide range of supported combinations of PMU name matching but omits the case that if the perf_pmu__match() cannot match the PMU name to the wildcard, it tries to match its alias. However, this variable is not set up, causing the test case to fail when run with subprocesses or to segfault if run as a single process. ./perf test -vv 9 9: Sysfs PMU tests : 9.1: Parsing with PMU format directory : Ok 9.2: Parsing with PMU event : Ok 9.3: PMU event names : Ok 9.4: PMU name combining : Ok 9.5: PMU name comparison : Ok 9.6: PMU cmdline match : FAILED! ./perf test -F 9 9.1: Parsing with PMU format directory : Ok 9.2: Parsing with PMU event : Ok 9.3: PMU event names : Ok 9.4: PMU name combining : Ok 9.5: PMU name comparison : Ok Segmentation fault (core dumped) Initialize the PMU alias to null for all tests of perf_pmu__match() as this functionality is not being tested and the alias matching works exactly the same as the matching of the PMU name. ./perf test -F 9 9.1: Parsing with PMU format directory : Ok 9.2: Parsing with PMU event : Ok 9.3: PMU event names : Ok 9.4: PMU name combining : Ok 9.5: PMU name comparison : Ok 9.6: PMU cmdline match : Ok Fixes: 3e0bf9fde2984469 ("perf pmu: Restore full PMU name wildcard support") Signed-off-by: Veronika Molnarova Cc: James Clark Cc: Michael Petlan Cc: Namhyung Kim Cc: Radostin Stoyanov Link: https://lore.kernel.org/r/20240808103749.9356-1-vmolnaro@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/pmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/tests/pmu.c b/tools/perf/tests/pmu.c index 0b2f04a55d7b..a4730b5dc0d9 100644 --- a/tools/perf/tests/pmu.c +++ b/tools/perf/tests/pmu.c @@ -453,11 +453,13 @@ static int test__name_cmp(struct test_suite *test __maybe_unused, int subtest __ /** * Test perf_pmu__match() that's used to search for a PMU given a name passed * on the command line. The name that's passed may also be a filename type glob - * match. + * match. 
If the name does not match, perf_pmu__match() attempts to match the + * alias of the PMU, if provided. */ static int test__pmu_match(struct test_suite *test __maybe_unused, int subtest __maybe_unused) { struct perf_pmu test_pmu; + test_pmu.alias_name = NULL; test_pmu.name = "pmuname"; TEST_ASSERT_EQUAL("Exact match", perf_pmu__match(&test_pmu, "pmuname"), true); -- cgit v1.2.3 From 599c19397b17d197fc1184bbc950f163a292efc9 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 7 Aug 2024 22:46:43 -0700 Subject: perf callchain: Fix stitch LBR memory leaks The 'struct callchain_cursor_node' has a 'struct map_symbol' whose maps and map members are reference counted. Ensure these values use a _get routine to increment the reference counts and use map_symbol__exit() to release the reference counts. Do similar for 'struct thread's prev_lbr_cursor, but save the size of the prev_lbr_cursor array so that it may be iterated. Ensure that when stitch_nodes are placed on the free list the map_symbols are exited. Fix resolve_lbr_callchain_sample() by replacing list_replace_init() to list_splice_init(), so the whole list is moved and nodes aren't leaked. A reproduction of the memory leaks is possible with a leak sanitizer build in the perf report command of: ``` $ perf record -e cycles --call-graph lbr perf test -w thloop $ perf report --stitch-lbr ``` Reviewed-by: Kan Liang Fixes: ff165628d72644e3 ("perf callchain: Stitch LBR call stack") Signed-off-by: Ian Rogers [ Basic tests after applying the patch, repeating the example above ] Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Anne Macedo Cc: Changbin Du Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240808054644.1286065-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 17 +++++++++++++++-- tools/perf/util/thread.c | 4 ++++ tools/perf/util/thread.h | 1 + 3 files changed, 20 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 8477edefc299..706be5e4a076 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -2270,8 +2270,12 @@ static void save_lbr_cursor_node(struct thread *thread, cursor->curr = cursor->first; else cursor->curr = cursor->curr->next; + + map_symbol__exit(&lbr_stitch->prev_lbr_cursor[idx].ms); memcpy(&lbr_stitch->prev_lbr_cursor[idx], cursor->curr, sizeof(struct callchain_cursor_node)); + lbr_stitch->prev_lbr_cursor[idx].ms.maps = maps__get(cursor->curr->ms.maps); + lbr_stitch->prev_lbr_cursor[idx].ms.map = map__get(cursor->curr->ms.map); lbr_stitch->prev_lbr_cursor[idx].valid = true; cursor->pos++; @@ -2482,6 +2486,9 @@ static bool has_stitched_lbr(struct thread *thread, memcpy(&stitch_node->cursor, &lbr_stitch->prev_lbr_cursor[i], sizeof(struct callchain_cursor_node)); + stitch_node->cursor.ms.maps = maps__get(lbr_stitch->prev_lbr_cursor[i].ms.maps); + stitch_node->cursor.ms.map = map__get(lbr_stitch->prev_lbr_cursor[i].ms.map); + if (callee) list_add(&stitch_node->node, &lbr_stitch->lists); else @@ -2505,6 +2512,8 @@ static bool alloc_lbr_stitch(struct thread *thread, unsigned int max_lbr) if (!thread__lbr_stitch(thread)->prev_lbr_cursor) goto free_lbr_stitch; + thread__lbr_stitch(thread)->prev_lbr_cursor_size = max_lbr + 1; + INIT_LIST_HEAD(&thread__lbr_stitch(thread)->lists); INIT_LIST_HEAD(&thread__lbr_stitch(thread)->free_lists); @@ -2560,8 +2569,12 @@ static int 
resolve_lbr_callchain_sample(struct thread *thread, max_lbr, callee); if (!stitched_lbr && !list_empty(&lbr_stitch->lists)) { - list_replace_init(&lbr_stitch->lists, - &lbr_stitch->free_lists); + struct stitch_list *stitch_node; + + list_for_each_entry(stitch_node, &lbr_stitch->lists, node) + map_symbol__exit(&stitch_node->cursor.ms); + + list_splice_init(&lbr_stitch->lists, &lbr_stitch->free_lists); } memcpy(&lbr_stitch->prev_sample, sample, sizeof(*sample)); } diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index 87c59aa9fe38..0ffdd52d86d7 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -476,6 +476,7 @@ void thread__free_stitch_list(struct thread *thread) return; list_for_each_entry_safe(pos, tmp, &lbr_stitch->lists, node) { + map_symbol__exit(&pos->cursor.ms); list_del_init(&pos->node); free(pos); } @@ -485,6 +486,9 @@ void thread__free_stitch_list(struct thread *thread) free(pos); } + for (unsigned int i = 0 ; i < lbr_stitch->prev_lbr_cursor_size; i++) + map_symbol__exit(&lbr_stitch->prev_lbr_cursor[i].ms); + zfree(&lbr_stitch->prev_lbr_cursor); free(thread__lbr_stitch(thread)); thread__set_lbr_stitch(thread, NULL); diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h index 8b4a3c69bad1..6cbf6eb2812e 100644 --- a/tools/perf/util/thread.h +++ b/tools/perf/util/thread.h @@ -26,6 +26,7 @@ struct lbr_stitch { struct list_head free_lists; struct perf_sample prev_sample; struct callchain_cursor_node *prev_lbr_cursor; + unsigned int prev_lbr_cursor_size; }; DECLARE_RC_STRUCT(thread) { -- cgit v1.2.3 From 32559b99e0f590700dcc2522d97bc637979bcf61 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 7 Aug 2024 22:46:44 -0700 Subject: perf test: Add set of perf record LBR tests Adds coverage for LBR operations and LBR callgraph. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Anne Macedo Cc: Changbin Du Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240808054644.1286065-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/record_lbr.sh | 161 +++++++++++++++++++++++++++++++++++ 1 file changed, 161 insertions(+) create mode 100755 tools/perf/tests/shell/record_lbr.sh (limited to 'tools') diff --git a/tools/perf/tests/shell/record_lbr.sh b/tools/perf/tests/shell/record_lbr.sh new file mode 100755 index 000000000000..baf168d0ddbb --- /dev/null +++ b/tools/perf/tests/shell/record_lbr.sh @@ -0,0 +1,161 @@ +#!/bin/bash +# perf record LBR tests +# SPDX-License-Identifier: GPL-2.0 + +set -e + +if [ ! -f /sys/devices/cpu/caps/branches ] +then + echo "Skip: only x86 CPUs support LBR" + exit 2 +fi + +err=0 +perfdata=$(mktemp /tmp/__perf_test.perf.data.XXXXX) + +cleanup() { + rm -rf "${perfdata}" + rm -rf "${perfdata}".old + rm -rf "${perfdata}".txt + + trap - EXIT TERM INT +} + +trap_cleanup() { + cleanup + exit 1 +} +trap trap_cleanup EXIT TERM INT + + +lbr_callgraph_test() { + test="LBR callgraph" + + echo "$test" + if ! perf record -e cycles --call-graph lbr -o "${perfdata}" perf test -w thloop + then + echo "$test [Failed support missing]" + if [ $err -eq 0 ] + then + err=2 + fi + return + fi + + if ! 
perf report --stitch-lbr -i "${perfdata}" > "${perfdata}".txt + then + cat "${perfdata}".txt + echo "$test [Failed in perf report]" + err=1 + return + fi + + echo "$test [Success]" +} + +lbr_test() { + local branch_flags=$1 + local test="LBR $2 test" + local threshold=$3 + local out + local sam_nr + local bs_nr + local zero_nr + local r + + echo "$test" + if ! perf record -e cycles $branch_flags -o "${perfdata}" perf test -w thloop + then + echo "$test [Failed support missing]" + perf record -e cycles $branch_flags -o "${perfdata}" perf test -w thloop || true + if [ $err -eq 0 ] + then + err=2 + fi + return + fi + + out=$(perf report -D -i "${perfdata}" 2> /dev/null | grep -A1 'PERF_RECORD_SAMPLE') + sam_nr=$(echo "$out" | grep -c 'PERF_RECORD_SAMPLE' || true) + if [ $sam_nr -eq 0 ] + then + echo "$test [Failed no samples captured]" + err=1 + return + fi + echo "$test: $sam_nr samples" + + bs_nr=$(echo "$out" | grep -c 'branch stack: nr:' || true) + if [ $sam_nr -ne $bs_nr ] + then + echo "$test [Failed samples missing branch stacks]" + err=1 + return + fi + + zero_nr=$(echo "$out" | grep -c 'branch stack: nr:0' || true) + r=$(($zero_nr * 100 / $bs_nr)) + if [ $r -gt $threshold ]; then + echo "$test [Failed empty br stack ratio exceed $threshold%: $r%]" + err=1 + return + fi + + echo "$test [Success]" +} + +parallel_lbr_test() { + err=0 + perfdata=$(mktemp /tmp/__perf_test.perf.data.XXXXX) + lbr_test "$1" "$2" "$3" + cleanup + exit $err +} + +lbr_callgraph_test + +# Sequential +lbr_test "-b" "any branch" 2 +lbr_test "-j any_call" "any call" 2 +lbr_test "-j any_ret" "any ret" 2 +lbr_test "-j ind_call" "any indirect call" 2 +lbr_test "-j ind_jmp" "any indirect jump" 100 +lbr_test "-j call" "direct calls" 2 +lbr_test "-j ind_call,u" "any indirect user call" 100 +lbr_test "-a -b" "system wide any branch" 2 +lbr_test "-a -j any_call" "system wide any call" 2 + +# Parallel +parallel_lbr_test "-b" "parallel any branch" 100 & +pid1=$! +parallel_lbr_test "-j any_call" "parallel any call" 100 & +pid2=$! +parallel_lbr_test "-j any_ret" "parallel any ret" 100 & +pid3=$! +parallel_lbr_test "-j ind_call" "parallel any indirect call" 100 & +pid4=$! +parallel_lbr_test "-j ind_jmp" "parallel any indirect jump" 100 & +pid5=$! +parallel_lbr_test "-j call" "parallel direct calls" 100 & +pid6=$! +parallel_lbr_test "-j ind_call,u" "parallel any indirect user call" 100 & +pid7=$! +parallel_lbr_test "-a -b" "parallel system wide any branch" 100 & +pid8=$! +parallel_lbr_test "-a -j any_call" "parallel system wide any call" 100 & +pid9=$! + +for pid in $pid1 $pid2 $pid3 $pid4 $pid5 $pid6 $pid7 $pid8 $pid9 +do + set +e + wait $pid + child_err=$? + set -e + if ([ $err -eq 2 ] && [ $child_err -eq 1 ]) || [ $err -eq 0 ] + then + err=$child_err + fi +done + +cleanup +exit $err -- cgit v1.2.3 From 9e9d0a79d34716f8e2590ccab0eee08f4fdfc7f4 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 8 Aug 2024 11:26:13 -0300 Subject: perf test shell lbr: Support hybrid x86 systems too Running on a: root@x1:~# grep 'model name' -m1 /proc/cpuinfo model name : 13th Gen Intel(R) Core(TM) i7-1365U root@x1:~# It skips all the tests with: root@x1:~# perf test -vvvv LBR 97: perf record LBR tests: --- start --- test child forked, pid 2033388 Skip: only x86 CPUs support LBR ---- end(-2) ---- 97: perf record LBR tests : Skip root@x1:~# Because the test checks for the /sys/devices/cpu/caps/branches file, that isn't present as we have instead: root@x1:~# ls -la /sys/devices/cpu*/caps/branches -r--r--r--. 
1 root root 4096 Aug 8 11:22 /sys/devices/cpu_atom/caps/branches -r--r--r--. 1 root root 4096 Aug 8 11:21 /sys/devices/cpu_core/caps/branches root@x1:~# If we check as well for one of those, /sys/devices/cpu_core/caps/branches, then we don't skip the tests and all are run on these x86 Intel Hybrid systems as well, passing all of them: root@x1:~# perf test -vvvv LBR 97: perf record LBR tests: --- start --- test child forked, pid 2034956 LBR callgraph [ perf record: Woken up 5 times to write data ] [ perf record: Captured and wrote 1.812 MB /tmp/__perf_test.perf.data.B2HvQ (8114 samples) ] LBR callgraph [Success] LBR any branch test [ perf record: Woken up 25 times to write data ] [ perf record: Captured and wrote 6.382 MB /tmp/__perf_test.perf.data.B2HvQ (8071 samples) ] LBR any branch test: 8071 samples LBR any branch test [Success] LBR any call test [ perf record: Woken up 23 times to write data ] [ perf record: Captured and wrote 6.208 MB /tmp/__perf_test.perf.data.B2HvQ (8092 samples) ] LBR any call test: 8092 samples LBR any call test [Success] LBR any ret test [ perf record: Woken up 24 times to write data ] [ perf record: Captured and wrote 6.396 MB /tmp/__perf_test.perf.data.B2HvQ (8093 samples) ] LBR any ret test: 8093 samples LBR any ret test [Success] LBR any indirect call test [ perf record: Woken up 25 times to write data ] [ perf record: Captured and wrote 6.344 MB /tmp/__perf_test.perf.data.B2HvQ (8067 samples) ] LBR any indirect call test: 8067 samples LBR any indirect call test [Success] LBR any indirect jump test [ perf record: Woken up 12 times to write data ] [ perf record: Captured and wrote 3.073 MB /tmp/__perf_test.perf.data.B2HvQ (8061 samples) ] LBR any indirect jump test: 8061 samples LBR any indirect jump test [Success] LBR direct calls test [ perf record: Woken up 25 times to write data ] [ perf record: Captured and wrote 6.380 MB /tmp/__perf_test.perf.data.B2HvQ (8076 samples) ] LBR direct calls test: 8076 samples LBR direct calls test [Success] LBR any indirect user call test [ perf record: Woken up 5 times to write data ] [ perf record: Captured and wrote 1.597 MB /tmp/__perf_test.perf.data.B2HvQ (8079 samples) ] LBR any indirect user call test: 8079 samples LBR any indirect user call test [Success] LBR system wide any branch test [ perf record: Woken up 26 times to write data ] [ perf record: Captured and wrote 9.088 MB /tmp/__perf_test.perf.data.B2HvQ (9209 samples) ] LBR system wide any branch test: 9209 samples LBR system wide any branch test [Success] LBR system wide any call test [ perf record: Woken up 25 times to write data ] [ perf record: Captured and wrote 8.945 MB /tmp/__perf_test.perf.data.B2HvQ (9333 samples) ] LBR system wide any call test: 9333 samples LBR system wide any call test [Success] LBR parallel any branch test LBR parallel any call test LBR parallel any ret test LBR parallel any indirect call test LBR parallel any indirect jump test LBR parallel direct calls test LBR parallel system wide any branch test LBR parallel any indirect user call test LBR parallel system wide any call test [ perf record: Woken up 9 times to write data ] [ perf record: Woken up 51 times to write data ] [ perf record: Woken up 1 times to write data ] [ perf record: Woken up 5 times to write data ] [ perf record: Woken up 559 times to write data ] [ perf record: Woken up 14 times to write data ] [ perf record: Woken up 17 times to write data ] [ perf record: Woken up 1 times to write data ] [ perf record: Woken up 11 times to write data ] [ perf record: Captured 
and wrote 0.150 MB /tmp/__perf_test.perf.data.lANpR (1909 samples) ] [ perf record: Captured and wrote 2.371 MB /tmp/__perf_test.perf.data.Olum8 (3033 samples) ] [ perf record: Captured and wrote 1.230 MB /tmp/__perf_test.perf.data.njfJ8 (1742 samples) ] [ perf record: Captured and wrote 5.554 MB /tmp/__perf_test.perf.data.4ZTrj (29662 samples) ] [ perf record: Captured and wrote 19.906 MB /tmp/__perf_test.perf.data.dlGQt (29576 samples) ] [ perf record: Captured and wrote 0.289 MB /tmp/__perf_test.perf.data.CAT7y (4311 samples) ] [ perf record: Captured and wrote 3.129 MB /tmp/__perf_test.perf.data.diuKG (3971 samples) ] LBR parallel any indirect user call test: 1909 samples [ perf record: Captured and wrote 4.858 MB /tmp/__perf_test.perf.data.sVjtN (6130 samples) ] LBR parallel any indirect user call test [Success] [ perf record: Captured and wrote 3.669 MB /tmp/__perf_test.perf.data.AJtNI (4827 samples) ] LBR parallel any indirect jump test: 4311 samples LBR parallel any indirect jump test [Success] LBR parallel direct calls test: 3033 samples LBR parallel direct calls test [Success] LBR parallel any indirect call test: 1742 samples LBR parallel any indirect call test [Success] LBR parallel any call test: 4827 samples LBR parallel any call test [Success] LBR parallel any branch test: 6130 samples LBR parallel any branch test [Success] LBR parallel system wide any branch test: 29662 samples LBR parallel any ret test: 3971 samples LBR parallel any ret test [Success] LBR parallel system wide any branch test [Success] LBR parallel system wide any call test: 29576 samples LBR parallel system wide any call test [Success] ---- end(0) ---- 97: perf record LBR tests : Ok root@x1:~# Reviewed-by: Kan Liang Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/ZrTXftup0H46R8WK@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/record_lbr.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/record_lbr.sh b/tools/perf/tests/shell/record_lbr.sh index baf168d0ddbb..32314641217e 100755 --- a/tools/perf/tests/shell/record_lbr.sh +++ b/tools/perf/tests/shell/record_lbr.sh @@ -4,7 +4,7 @@ set -e -if [ ! -f /sys/devices/cpu/caps/branches ] +if [ ! -f /sys/devices/cpu/caps/branches ] && [ ! -f /sys/devices/cpu_core/caps/branches ] then echo "Skip: only x86 CPUs support LBR" exit 2 -- cgit v1.2.3 From 3882dccf48f9fbe787b5df3187d708ef348ac860 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Thu, 8 Aug 2024 16:05:57 +0100 Subject: bpf/bpf_get,set_sockopt: add option to set TCP-BPF sock ops flags Currently the only opportunity to set sock ops flags dictating which callbacks fire for a socket is from within a TCP-BPF sockops program. This is problematic if the connection is already set up as there is no further chance to specify callbacks for that socket. Add TCP_BPF_SOCK_OPS_CB_FLAGS to bpf_setsockopt() and bpf_getsockopt() to allow users to specify callbacks later, either via an iterator over sockets or via a socket-specific program triggered by a setsockopt() on the socket. Previous discussion on this here [1]. 
[1] https://lore.kernel.org/bpf/f42f157b-6e52-dd4d-3d97-9b86c84c0b00@oracle.com/ Signed-off-by: Alan Maguire Link: https://lore.kernel.org/r/20240808150558.1035626-2-alan.maguire@oracle.com Signed-off-by: Martin KaFai Lau --- include/uapi/linux/bpf.h | 3 ++- net/core/filter.c | 16 ++++++++++++++++ tools/include/uapi/linux/bpf.h | 3 ++- 3 files changed, 20 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 35bcf52dbc65..e05b39e39c3f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2851,7 +2851,7 @@ union bpf_attr { * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**, * **TCP_NODELAY**, **TCP_MAXSEG**, **TCP_WINDOW_CLAMP**, * **TCP_THIN_LINEAR_TIMEOUTS**, **TCP_BPF_DELACK_MAX**, - * **TCP_BPF_RTO_MIN**. + * **TCP_BPF_RTO_MIN**, **TCP_BPF_SOCK_OPS_CB_FLAGS**. * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. * * **IPPROTO_IPV6**, which supports the following *optname*\ s: * **IPV6_TCLASS**, **IPV6_AUTOFLOWLABEL**. @@ -7080,6 +7080,7 @@ enum { TCP_BPF_SYN = 1005, /* Copy the TCP header */ TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */ TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */ + TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */ }; enum { diff --git a/net/core/filter.c b/net/core/filter.c index f3c72cf86099..d96a50f3f016 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5278,6 +5278,11 @@ static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname, return -EINVAL; inet_csk(sk)->icsk_rto_min = timeout; break; + case TCP_BPF_SOCK_OPS_CB_FLAGS: + if (val & ~(BPF_SOCK_OPS_ALL_CB_FLAGS)) + return -EINVAL; + tp->bpf_sock_ops_cb_flags = val; + break; default: return -EINVAL; } @@ -5366,6 +5371,17 @@ static int sol_tcp_sockopt(struct sock *sk, int optname, if (*optlen < 1) return -EINVAL; break; + case TCP_BPF_SOCK_OPS_CB_FLAGS: + if (*optlen != sizeof(int)) + return -EINVAL; + if (getopt) { + struct tcp_sock *tp = tcp_sk(sk); + int cb_flags = tp->bpf_sock_ops_cb_flags; + + memcpy(optval, &cb_flags, *optlen); + return 0; + } + return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen); default: if (getopt) return -EINVAL; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 35bcf52dbc65..e05b39e39c3f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2851,7 +2851,7 @@ union bpf_attr { * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**, * **TCP_NODELAY**, **TCP_MAXSEG**, **TCP_WINDOW_CLAMP**, * **TCP_THIN_LINEAR_TIMEOUTS**, **TCP_BPF_DELACK_MAX**, - * **TCP_BPF_RTO_MIN**. + * **TCP_BPF_RTO_MIN**, **TCP_BPF_SOCK_OPS_CB_FLAGS**. * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. * * **IPPROTO_IPV6**, which supports the following *optname*\ s: * **IPV6_TCLASS**, **IPV6_AUTOFLOWLABEL**. @@ -7080,6 +7080,7 @@ enum { TCP_BPF_SYN = 1005, /* Copy the TCP header */ TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */ TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */ + TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */ }; enum { -- cgit v1.2.3 From d53050934e66dbee64caed1309cef963a416c52f Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Thu, 8 Aug 2024 16:05:58 +0100 Subject: selftests/bpf: add sockopt tests for TCP_BPF_SOCK_OPS_CB_FLAGS Add tests to set TCP sockopt TCP_BPF_SOCK_OPS_CB_FLAGS via bpf_setsockopt() and use a cgroup/getsockopt program to retrieve the value to verify it was set. 
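For illustration, a minimal sockops sketch of the new flow, distilled from the selftest diff below rather than copied from it; the SOL_TCP fallback define and the program/section names are assumptions, and the usual BPF selftest build environment (vmlinux.h plus libbpf) is presumed:

```c
/* Minimal sketch only; mirrors the skops_sockopt() change in the diff
 * below. Assumes vmlinux.h carries the TCP_BPF_SOCK_OPS_CB_FLAGS and
 * BPF_SOCK_OPS_* enums from the updated uapi header.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

#ifndef SOL_TCP
#define SOL_TCP 6 /* normally provided by bpf_tracing_net.h in selftests */
#endif

char _license[] SEC("license") = "GPL";

SEC("sockops")
int enable_state_cb(struct bpf_sock_ops *skops)
{
	int flags;

	if (skops->op != BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB)
		return 1;

	/* Same effect as bpf_sock_ops_cb_flags_set(), but expressed
	 * through the generic bpf_setsockopt() path added by this series,
	 * which also works from socket iterators and sockopt programs.
	 */
	flags = skops->bpf_sock_ops_cb_flags | BPF_SOCK_OPS_STATE_CB_FLAG;
	bpf_setsockopt(skops, SOL_TCP, TCP_BPF_SOCK_OPS_CB_FLAGS,
		       &flags, sizeof(flags));
	return 1;
}
```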
Signed-off-by: Alan Maguire Link: https://lore.kernel.org/r/20240808150558.1035626-3-alan.maguire@oracle.com Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/prog_tests/setget_sockopt.c | 47 ++++++++++++++++++++++ tools/testing/selftests/bpf/progs/setget_sockopt.c | 26 ++++++++++-- 2 files changed, 70 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c b/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c index 7d4a9b3d3722..e12255121c15 100644 --- a/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c +++ b/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c @@ -154,6 +154,51 @@ err_out: close(sfd); } +static void test_nonstandard_opt(int family) +{ + struct setget_sockopt__bss *bss = skel->bss; + struct bpf_link *getsockopt_link = NULL; + int sfd = -1, fd = -1, cfd = -1, flags; + socklen_t flagslen = sizeof(flags); + + memset(bss, 0, sizeof(*bss)); + + sfd = start_server(family, SOCK_STREAM, + family == AF_INET6 ? addr6_str : addr4_str, 0, 0); + if (!ASSERT_GE(sfd, 0, "start_server")) + return; + + fd = connect_to_fd(sfd, 0); + if (!ASSERT_GE(fd, 0, "connect_to_fd_server")) + goto err_out; + + /* cgroup/getsockopt prog will intercept getsockopt() below and + * retrieve the tcp socket bpf_sock_ops_cb_flags value for the + * accept()ed socket; this was set earlier in the passive established + * callback for the accept()ed socket via bpf_setsockopt(). + */ + getsockopt_link = bpf_program__attach_cgroup(skel->progs._getsockopt, cg_fd); + if (!ASSERT_OK_PTR(getsockopt_link, "getsockopt prog")) + goto err_out; + + cfd = accept(sfd, NULL, 0); + if (!ASSERT_GE(cfd, 0, "accept")) + goto err_out; + + if (!ASSERT_OK(getsockopt(cfd, SOL_TCP, TCP_BPF_SOCK_OPS_CB_FLAGS, &flags, &flagslen), + "getsockopt_flags")) + goto err_out; + ASSERT_EQ(flags & BPF_SOCK_OPS_STATE_CB_FLAG, BPF_SOCK_OPS_STATE_CB_FLAG, + "cb_flags_set"); +err_out: + close(sfd); + if (fd != -1) + close(fd); + if (cfd != -1) + close(cfd); + bpf_link__destroy(getsockopt_link); +} + void test_setget_sockopt(void) { cg_fd = test__join_cgroup(CG_NAME); @@ -191,6 +236,8 @@ void test_setget_sockopt(void) test_udp(AF_INET); test_ktls(AF_INET6); test_ktls(AF_INET); + test_nonstandard_opt(AF_INET); + test_nonstandard_opt(AF_INET6); done: setget_sockopt__destroy(skel); diff --git a/tools/testing/selftests/bpf/progs/setget_sockopt.c b/tools/testing/selftests/bpf/progs/setget_sockopt.c index 60518aed1ffc..6dd4318debbf 100644 --- a/tools/testing/selftests/bpf/progs/setget_sockopt.c +++ b/tools/testing/selftests/bpf/progs/setget_sockopt.c @@ -59,6 +59,8 @@ static const struct sockopt_test sol_tcp_tests[] = { { .opt = TCP_THIN_LINEAR_TIMEOUTS, .flip = 1, }, { .opt = TCP_USER_TIMEOUT, .new = 123400, .expected = 123400, }, { .opt = TCP_NOTSENT_LOWAT, .new = 1314, .expected = 1314, }, + { .opt = TCP_BPF_SOCK_OPS_CB_FLAGS, .new = BPF_SOCK_OPS_ALL_CB_FLAGS, + .expected = BPF_SOCK_OPS_ALL_CB_FLAGS, }, { .opt = 0, }, }; @@ -353,11 +355,30 @@ int BPF_PROG(socket_post_create, struct socket *sock, int family, return 1; } +SEC("cgroup/getsockopt") +int _getsockopt(struct bpf_sockopt *ctx) +{ + struct bpf_sock *sk = ctx->sk; + int *optval = ctx->optval; + struct tcp_sock *tp; + + if (!sk || ctx->level != SOL_TCP || ctx->optname != TCP_BPF_SOCK_OPS_CB_FLAGS) + return 1; + + tp = bpf_core_cast(sk, struct tcp_sock); + if (ctx->optval + sizeof(int) <= ctx->optval_end) { + *optval = tp->bpf_sock_ops_cb_flags; + ctx->retval = 0; + } + return 1; +} + SEC("sockops") int 
skops_sockopt(struct bpf_sock_ops *skops) { struct bpf_sock *bpf_sk = skops->sk; struct sock *sk; + int flags; if (!bpf_sk) return 1; @@ -384,9 +405,8 @@ int skops_sockopt(struct bpf_sock_ops *skops) nr_passive += !(bpf_test_sockopt(skops, sk) || test_tcp_maxseg(skops, sk) || test_tcp_saved_syn(skops, sk)); - bpf_sock_ops_cb_flags_set(skops, - skops->bpf_sock_ops_cb_flags | - BPF_SOCK_OPS_STATE_CB_FLAG); + flags = skops->bpf_sock_ops_cb_flags | BPF_SOCK_OPS_STATE_CB_FLAG; + bpf_setsockopt(skops, SOL_TCP, TCP_BPF_SOCK_OPS_CB_FLAGS, &flags, sizeof(flags)); break; case BPF_SOCK_OPS_STATE_CB: if (skops->args[1] == BPF_TCP_CLOSE_WAIT) -- cgit v1.2.3 From 754283ce8326fdce75d589c99cc58456199c123d Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Sun, 28 Jul 2024 22:34:11 +0200 Subject: tools/nolibc: pass argc, argv and envp to constructors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since 2005 glibc has passed argc, argv, and envp to all constructors. As it is cheap and easy to do so, mirror that behaviour in nolibc. This makes it easier to migrate applications to nolibc. Link: https://lore.kernel.org/r/20240728-nolibc-constructor-args-v1-1-36d0bf5cd4c0@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/crt.h | 23 ++++++++++++----------- tools/testing/selftests/nolibc/nolibc-test.c | 5 +++-- 2 files changed, 15 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/tools/include/nolibc/crt.h b/tools/include/nolibc/crt.h index 43b551468c2a..ac291574f6c0 100644 --- a/tools/include/nolibc/crt.h +++ b/tools/include/nolibc/crt.h @@ -13,11 +13,11 @@ const unsigned long *_auxv __attribute__((weak)); static void __stack_chk_init(void); static void exit(int); -extern void (*const __preinit_array_start[])(void) __attribute__((weak)); -extern void (*const __preinit_array_end[])(void) __attribute__((weak)); +extern void (*const __preinit_array_start[])(int, char **, char**) __attribute__((weak)); +extern void (*const __preinit_array_end[])(int, char **, char**) __attribute__((weak)); -extern void (*const __init_array_start[])(void) __attribute__((weak)); -extern void (*const __init_array_end[])(void) __attribute__((weak)); +extern void (*const __init_array_start[])(int, char **, char**) __attribute__((weak)); +extern void (*const __init_array_end[])(int, char **, char**) __attribute__((weak)); extern void (*const __fini_array_start[])(void) __attribute__((weak)); extern void (*const __fini_array_end[])(void) __attribute__((weak)); @@ -29,7 +29,8 @@ void _start_c(long *sp) char **argv; char **envp; int exitcode; - void (* const *func)(void); + void (* const *ctor_func)(int, char **, char **); + void (* const *dtor_func)(void); const unsigned long *auxv; /* silence potential warning: conflicting types for 'main' */ int _nolibc_main(int, char **, char **) __asm__ ("main"); @@ -66,16 +67,16 @@ void _start_c(long *sp) ; _auxv = auxv; - for (func = __preinit_array_start; func < __preinit_array_end; func++) - (*func)(); - for (func = __init_array_start; func < __init_array_end; func++) - (*func)(); + for (ctor_func = __preinit_array_start; ctor_func < __preinit_array_end; ctor_func++) + (*ctor_func)(argc, argv, envp); + for (ctor_func = __init_array_start; ctor_func < __init_array_end; ctor_func++) + (*ctor_func)(argc, argv, envp); /* go to application */ exitcode = _nolibc_main(argc, argv, envp); - for (func = __fini_array_end; func > __fini_array_start;) - (*--func)(); + for (dtor_func = __fini_array_end; dtor_func > 
__fini_array_start;) + (*--dtor_func)(); exit(exitcode); } diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index 093d0512f4c5..0800b10fc3f7 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -686,9 +686,10 @@ static void constructor1(void) } __attribute__((constructor)) -static void constructor2(void) +static void constructor2(int argc, char **argv, char **envp) { - constructor_test_value *= 2; + if (argc && argv && envp) + constructor_test_value *= 2; } int run_startup(int min, int max) -- cgit v1.2.3 From 3ade6ce1255e6e97f91b8ba77408dce9d2292df2 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Tue, 6 Aug 2024 08:38:09 -0700 Subject: selftests: rds: add testing infrastructure This adds some basic self-testing infrastructure for RDS-TCP. Signed-off-by: Vegard Nossum Signed-off-by: Chuck Lever Signed-off-by: Allison Henderson Signed-off-by: David S. Miller --- Documentation/dev-tools/gcov.rst | 11 ++ MAINTAINERS | 1 + tools/testing/selftests/Makefile | 1 + tools/testing/selftests/net/rds/Makefile | 12 ++ tools/testing/selftests/net/rds/README.txt | 41 +++++ tools/testing/selftests/net/rds/config.sh | 53 ++++++ tools/testing/selftests/net/rds/run.sh | 224 ++++++++++++++++++++++++ tools/testing/selftests/net/rds/test.py | 262 +++++++++++++++++++++++++++++ 8 files changed, 605 insertions(+) create mode 100644 tools/testing/selftests/net/rds/Makefile create mode 100644 tools/testing/selftests/net/rds/README.txt create mode 100755 tools/testing/selftests/net/rds/config.sh create mode 100755 tools/testing/selftests/net/rds/run.sh create mode 100644 tools/testing/selftests/net/rds/test.py (limited to 'tools') diff --git a/Documentation/dev-tools/gcov.rst b/Documentation/dev-tools/gcov.rst index 5fce2b06f229..dbd26b02ff3c 100644 --- a/Documentation/dev-tools/gcov.rst +++ b/Documentation/dev-tools/gcov.rst @@ -75,6 +75,17 @@ Only files which are linked to the main kernel image or are compiled as kernel modules are supported by this mechanism. +Module specific configs +----------------------- + +Gcov kernel configs for specific modules are described below: + +CONFIG_GCOV_PROFILE_RDS: + Enables GCOV profiling on RDS for checking which functions or + lines are executed. This config is used by the rds selftest to + generate coverage reports. If left unset the report is omitted. 
+ + Files ----- diff --git a/MAINTAINERS b/MAINTAINERS index 1a7ea5c7c710..7b291c3a9aa4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -19197,6 +19197,7 @@ S: Supported W: https://oss.oracle.com/projects/rds/ F: Documentation/networking/rds.rst F: net/rds/ +F: tools/testing/selftests/net/rds/ RDT - RESOURCE ALLOCATION M: Fenghua Yu diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index bc8fe9e8f7f2..a5f1c0c27dff 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -68,6 +68,7 @@ TARGETS += net/mptcp TARGETS += net/openvswitch TARGETS += net/tcp_ao TARGETS += net/netfilter +TARGETS += net/rds TARGETS += nsfs TARGETS += perf_events TARGETS += pidfd diff --git a/tools/testing/selftests/net/rds/Makefile b/tools/testing/selftests/net/rds/Makefile new file mode 100644 index 000000000000..da9714bc7aad --- /dev/null +++ b/tools/testing/selftests/net/rds/Makefile @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0 + +all: + @echo mk_build_dir="$(shell pwd)" > include.sh + +TEST_PROGS := run.sh \ + include.sh \ + test.py + +EXTRA_CLEAN := /tmp/rds_logs + +include ../../lib.mk diff --git a/tools/testing/selftests/net/rds/README.txt b/tools/testing/selftests/net/rds/README.txt new file mode 100644 index 000000000000..cbde2951ab13 --- /dev/null +++ b/tools/testing/selftests/net/rds/README.txt @@ -0,0 +1,41 @@ +RDS self-tests +============== + +These scripts provide a coverage test for RDS-TCP by creating two +network namespaces and running RDS packets between them. A loopback +network is provisioned with an optional probability of packet loss or +corruption. A workload of 50000 hashes, each 64 characters in size, +is passed over an RDS socket on this test network. A passing test means +the RDS-TCP stack was able to recover properly. The provided config.sh +can be used to compile the kernel with the necessary gcov options. The +kernel may optionally be configured to omit the coverage report as well. + +USAGE: + run.sh [-d logdir] [-l packet_loss] [-c packet_corruption] + [-u packet_duplicate] + +OPTIONS: + -d Log directory. Defaults to tools/testing/selftests/net/rds/rds_logs + + -l Simulates a percentage of packet loss + + -c Simulates a percentage of packet corruption + + -u Simulates a percentage of packet duplication. + +EXAMPLE: + + # Create a suitable gcov-enabled .config + tools/testing/selftests/net/rds/config.sh -g + + # Alternately, create a gcov-disabled .config + tools/testing/selftests/net/rds/config.sh + + # build the kernel + vng --build --config tools/testing/selftests/net/config + + # launch the tests in a VM + vng -v --rwdir ./ --run . --user root --cpus 4 -- \ + "export PYTHONPATH=tools/testing/selftests/net/; tools/testing/selftests/net/rds/run.sh" + +An HTML coverage report will be output in tools/testing/selftests/net/rds/rds_logs/coverage/. diff --git a/tools/testing/selftests/net/rds/config.sh b/tools/testing/selftests/net/rds/config.sh new file mode 100755 index 000000000000..791c8dbe1095 --- /dev/null +++ b/tools/testing/selftests/net/rds/config.sh @@ -0,0 +1,53 @@ +#! /bin/bash +# SPDX-License-Identifier: GPL-2.0 + +set -e +set -u +set -x + +unset KBUILD_OUTPUT + +GENERATE_GCOV_REPORT=0 +while getopts "g" opt; do + case ${opt} in + g) + GENERATE_GCOV_REPORT=1 + ;; + :) + echo "USAGE: config.sh [-g]" + exit 1 + ;; + ?) + echo "Invalid option: -${OPTARG}."
+ exit 1 + ;; + esac +done + +CONF_FILE="tools/testing/selftests/net/config" + +# no modules +scripts/config --file "$CONF_FILE" --disable CONFIG_MODULES + +# enable RDS +scripts/config --file "$CONF_FILE" --enable CONFIG_RDS +scripts/config --file "$CONF_FILE" --enable CONFIG_RDS_TCP + +if [ "$GENERATE_GCOV_REPORT" -eq 1 ]; then + # instrument RDS and only RDS + scripts/config --file "$CONF_FILE" --enable CONFIG_GCOV_KERNEL + scripts/config --file "$CONF_FILE" --disable GCOV_PROFILE_ALL + scripts/config --file "$CONF_FILE" --enable GCOV_PROFILE_RDS +else + scripts/config --file "$CONF_FILE" --disable CONFIG_GCOV_KERNEL + scripts/config --file "$CONF_FILE" --disable GCOV_PROFILE_ALL + scripts/config --file "$CONF_FILE" --disable GCOV_PROFILE_RDS +fi + +# need network namespaces to run tests with veth network interfaces +scripts/config --file "$CONF_FILE" --enable CONFIG_NET_NS +scripts/config --file "$CONF_FILE" --enable CONFIG_VETH + +# simulate packet loss +scripts/config --file "$CONF_FILE" --enable CONFIG_NET_SCH_NETEM + diff --git a/tools/testing/selftests/net/rds/run.sh b/tools/testing/selftests/net/rds/run.sh new file mode 100755 index 000000000000..8aee244f582a --- /dev/null +++ b/tools/testing/selftests/net/rds/run.sh @@ -0,0 +1,224 @@ +#! /bin/bash +# SPDX-License-Identifier: GPL-2.0 + +set -e +set -u + +unset KBUILD_OUTPUT + +current_dir="$(realpath "$(dirname "$0")")" +build_dir="$current_dir" + +build_include="$current_dir/include.sh" +if test -f "$build_include"; then + # this include will define "$mk_build_dir" as the location the test was + # built. We will need this if the tests are installed in a location + # other than the kernel source + + source "$build_include" + build_dir="$mk_build_dir" +fi + +# This test requires kernel source and the *.gcda data therein +# Locate the top level of the kernel source, and the net/rds +# subfolder with the appropriate *.gcno object files +ksrc_dir="$(realpath "$build_dir"/../../../../../)" +kconfig="$ksrc_dir/.config" +obj_dir="$ksrc_dir/net/rds" + +GCOV_CMD=gcov + +#check to see if the host has the required packages to generate a gcov report +check_gcov_env() +{ + if ! which "$GCOV_CMD" > /dev/null 2>&1; then + echo "Warning: Could not find gcov. " + GENERATE_GCOV_REPORT=0 + return + fi + + # the gcov version must match the gcc version + GCC_VER=$(gcc -dumpfullversion) + GCOV_VER=$($GCOV_CMD -v | grep gcov | awk '{print $3}'| awk 'BEGIN {FS="-"}{print $1}') + if [ "$GCOV_VER" != "$GCC_VER" ]; then + #attempt to find a matching gcov version + GCOV_CMD=gcov-$(gcc -dumpversion) + + if ! which "$GCOV_CMD" > /dev/null 2>&1; then + echo "Warning: Could not find an appropriate gcov installation. \ + gcov version must match gcc version" + GENERATE_GCOV_REPORT=0 + return + fi + + #recheck version number of found gcov executable + GCOV_VER=$($GCOV_CMD -v | grep gcov | awk '{print $3}'| \ + awk 'BEGIN {FS="-"}{print $1}') + if [ "$GCOV_VER" != "$GCC_VER" ]; then + echo "Warning: Could not find an appropriate gcov installation. \ + gcov version must match gcc version" + GENERATE_GCOV_REPORT=0 + else + echo "Warning: Mismatched gcc and gcov detected. Using $GCOV_CMD" + fi + fi +} + +# Check to see if the kconfig has the required configs to generate a coverage report +check_gcov_conf() +{ + if ! grep -x "CONFIG_GCOV_PROFILE_RDS=y" "$kconfig" > /dev/null 2>&1; then + echo "INFO: CONFIG_GCOV_PROFILE_RDS should be enabled for coverage reports" + GENERATE_GCOV_REPORT=0 + fi + if ! 
grep -x "CONFIG_GCOV_KERNEL=y" "$kconfig" > /dev/null 2>&1; then + echo "INFO: CONFIG_GCOV_KERNEL should be enabled for coverage reports" + GENERATE_GCOV_REPORT=0 + fi + if grep -x "CONFIG_GCOV_PROFILE_ALL=y" "$kconfig" > /dev/null 2>&1; then + echo "INFO: CONFIG_GCOV_PROFILE_ALL should be disabled for coverage reports" + GENERATE_GCOV_REPORT=0 + fi + + if [ "$GENERATE_GCOV_REPORT" -eq 0 ]; then + echo "To enable gcov reports, please run "\ + "\"tools/testing/selftests/net/rds/config.sh -g\" and rebuild the kernel" + else + # if we have the required kernel configs, proceed to check the environment to + # ensure we have the required gcov packages + check_gcov_env + fi +} + +# Kselftest framework requirement - SKIP code is 4. +check_conf_enabled() { + if ! grep -x "$1=y" "$kconfig" > /dev/null 2>&1; then + echo "selftests: [SKIP] This test requires $1 enabled" + echo "Please run tools/testing/selftests/net/rds/config.sh and rebuild the kernel" + exit 4 + fi +} +check_conf_disabled() { + if grep -x "$1=y" "$kconfig" > /dev/null 2>&1; then + echo "selftests: [SKIP] This test requires $1 disabled" + echo "Please run tools/testing/selftests/net/rds/config.sh and rebuild the kernel" + exit 4 + fi +} +check_conf() { + check_conf_enabled CONFIG_NET_SCH_NETEM + check_conf_enabled CONFIG_VETH + check_conf_enabled CONFIG_NET_NS + check_conf_enabled CONFIG_RDS_TCP + check_conf_enabled CONFIG_RDS + check_conf_disabled CONFIG_MODULES +} + +check_env() +{ + if ! test -d "$obj_dir"; then + echo "selftests: [SKIP] This test requires a kernel source tree" + exit 4 + fi + if ! test -e "$kconfig"; then + echo "selftests: [SKIP] This test requires a configured kernel source tree" + exit 4 + fi + if ! which strace > /dev/null 2>&1; then + echo "selftests: [SKIP] Could not run test without strace" + exit 4 + fi + if ! which tcpdump > /dev/null 2>&1; then + echo "selftests: [SKIP] Could not run test without tcpdump" + exit 4 + fi + + if ! which python3 > /dev/null 2>&1; then + echo "selftests: [SKIP] Could not run test without python3" + exit 4 + fi + + python_major=$(python3 -c "import sys; print(sys.version_info[0])") + python_minor=$(python3 -c "import sys; print(sys.version_info[1])") + if [[ python_major -lt 3 || ( python_major -eq 3 && python_minor -lt 9 ) ]] ; then + echo "selftests: [SKIP] Could not run test without at least python3.9" + python3 -V + exit 4 + fi +} + +LOG_DIR="$current_dir"/rds_logs +PLOSS=0 +PCORRUPT=0 +PDUP=0 +GENERATE_GCOV_REPORT=1 +while getopts "d:l:c:u:" opt; do + case ${opt} in + d) + LOG_DIR=${OPTARG} + ;; + l) + PLOSS=${OPTARG} + ;; + c) + PCORRUPT=${OPTARG} + ;; + u) + PDUP=${OPTARG} + ;; + :) + echo "USAGE: run.sh [-d logdir] [-l packet_loss] [-c packet_corruption]" \ + "[-u packet_duplcate] [-g]" + exit 1 + ;; + ?) + echo "Invalid option: -${OPTARG}." + exit 1 + ;; + esac +done + + +check_env +check_conf +check_gcov_conf + + +rm -fr "$LOG_DIR" +TRACE_FILE="${LOG_DIR}/rds-strace.txt" +COVR_DIR="${LOG_DIR}/coverage/" +mkdir -p "$LOG_DIR" +mkdir -p "$COVR_DIR" + +set +e +echo running RDS tests... +echo Traces will be logged to "$TRACE_FILE" +rm -f "$TRACE_FILE" +strace -T -tt -o "$TRACE_FILE" python3 "$(dirname "$0")/test.py" --timeout 400 -d "$LOG_DIR" \ + -l "$PLOSS" -c "$PCORRUPT" -u "$PDUP" + +test_rc=$? +dmesg > "${LOG_DIR}/dmesg.out" + +if [ "$GENERATE_GCOV_REPORT" -eq 1 ]; then + echo saving coverage data... + (set +x; cd /sys/kernel/debug/gcov; find ./* -name '*.gcda' | \ + while read -r f + do + cat < "/sys/kernel/debug/gcov/$f" > "/$f" + done) + + echo running gcovr... 
+ gcovr -s --html-details --gcov-executable "$GCOV_CMD" --gcov-ignore-parse-errors \ + -o "${COVR_DIR}/gcovr" "${ksrc_dir}/net/rds/" +else + echo "Coverage report will be skipped" +fi + +if [ "$test_rc" -eq 0 ]; then + echo "PASS: Test completed successfully" +else + echo "FAIL: Test failed" +fi + +exit "$test_rc" diff --git a/tools/testing/selftests/net/rds/test.py b/tools/testing/selftests/net/rds/test.py new file mode 100644 index 000000000000..e6bb109bcead --- /dev/null +++ b/tools/testing/selftests/net/rds/test.py @@ -0,0 +1,262 @@ +#! /usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import argparse +import ctypes +import errno +import hashlib +import os +import select +import signal +import socket +import subprocess +import sys +import atexit +from pwd import getpwuid +from os import stat +from lib.py import ip + + +libc = ctypes.cdll.LoadLibrary('libc.so.6') +setns = libc.setns + +net0 = 'net0' +net1 = 'net1' + +veth0 = 'veth0' +veth1 = 'veth1' + +# Helper function for creating a socket inside a network namespace. +# We need this because otherwise RDS will detect that the two TCP +# sockets are on the same interface and use the loop transport instead +# of the TCP transport. +def netns_socket(netns, *args): + u0, u1 = socket.socketpair(socket.AF_UNIX, socket.SOCK_SEQPACKET) + + child = os.fork() + if child == 0: + # change network namespace + with open(f'/var/run/netns/{netns}') as f: + try: + ret = setns(f.fileno(), 0) + except IOError as e: + print(e.errno) + print(e) + + # create socket in target namespace + s = socket.socket(*args) + + # send resulting socket to parent + socket.send_fds(u0, [], [s.fileno()]) + + sys.exit(0) + + # receive socket from child + _, s, _, _ = socket.recv_fds(u1, 0, 1) + os.waitpid(child, 0) + u0.close() + u1.close() + return socket.fromfd(s[0], *args) + +def signal_handler(sig, frame): + print('Test timed out') + sys.exit(1) + +#Parse out command line arguments. We take an optional +# timeout parameter and an optional log output folder +parser = argparse.ArgumentParser(description="init script args", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument("-d", "--logdir", action="store", + help="directory to store logs", default="/tmp") +parser.add_argument('--timeout', help="timeout to terminate hung test", + type=int, default=0) +parser.add_argument('-l', '--loss', help="Simulate tcp packet loss", + type=int, default=0) +parser.add_argument('-c', '--corruption', help="Simulate tcp packet corruption", + type=int, default=0) +parser.add_argument('-u', '--duplicate', help="Simulate tcp packet duplication", + type=int, default=0) +args = parser.parse_args() +logdir=args.logdir +packet_loss=str(args.loss)+'%' +packet_corruption=str(args.corruption)+'%' +packet_duplicate=str(args.duplicate)+'%' + +ip(f"netns add {net0}") +ip(f"netns add {net1}") +ip(f"link add type veth") + +addrs = [ + # we technically don't need different port numbers, but this will + # help identify traffic in the network analyzer + ('10.0.0.1', 10000), + ('10.0.0.2', 20000), +] + +# move interfaces to separate namespaces so they can no longer be +# bound directly; this prevents rds from switching over from the tcp +# transport to the loop transport. 
+ip(f"link set {veth0} netns {net0} up") +ip(f"link set {veth1} netns {net1} up") + + + +# add addresses +ip(f"-n {net0} addr add {addrs[0][0]}/32 dev {veth0}") +ip(f"-n {net1} addr add {addrs[1][0]}/32 dev {veth1}") + +# add routes +ip(f"-n {net0} route add {addrs[1][0]}/32 dev {veth0}") +ip(f"-n {net1} route add {addrs[0][0]}/32 dev {veth1}") + +# sanity check that our two interfaces/addresses are correctly set up +# and communicating by doing a single ping +ip(f"netns exec {net0} ping -c 1 {addrs[1][0]}") + +# Start a packet capture on each network +for net in [net0, net1]: + tcpdump_pid = os.fork() + if tcpdump_pid == 0: + pcap = logdir+'/'+net+'.pcap' + subprocess.check_call(['touch', pcap]) + user = getpwuid(stat(pcap).st_uid).pw_name + ip(f"netns exec {net} /usr/sbin/tcpdump -Z {user} -i any -w {pcap}") + sys.exit(0) + +# simulate packet loss, duplication and corruption +for net, iface in [(net0, veth0), (net1, veth1)]: + ip(f"netns exec {net} /usr/sbin/tc qdisc add dev {iface} root netem \ + corrupt {packet_corruption} loss {packet_loss} duplicate \ + {packet_duplicate}") + +# add a timeout +if args.timeout > 0: + signal.alarm(args.timeout) + signal.signal(signal.SIGALRM, signal_handler) + +sockets = [ + netns_socket(net0, socket.AF_RDS, socket.SOCK_SEQPACKET), + netns_socket(net1, socket.AF_RDS, socket.SOCK_SEQPACKET), +] + +for s, addr in zip(sockets, addrs): + s.bind(addr) + s.setblocking(0) + +fileno_to_socket = { + s.fileno(): s for s in sockets +} + +addr_to_socket = { + addr: s for addr, s in zip(addrs, sockets) +} + +socket_to_addr = { + s: addr for addr, s in zip(addrs, sockets) +} + +send_hashes = {} +recv_hashes = {} + +ep = select.epoll() + +for s in sockets: + ep.register(s, select.EPOLLRDNORM) + +n = 50000 +nr_send = 0 +nr_recv = 0 + +while nr_send < n: + # Send as much as we can without blocking + print("sending...", nr_send, nr_recv) + while nr_send < n: + send_data = hashlib.sha256( + f'packet {nr_send}'.encode('utf-8')).hexdigest().encode('utf-8') + + # pseudo-random send/receive pattern + sender = sockets[nr_send % 2] + receiver = sockets[1 - (nr_send % 3) % 2] + + try: + sender.sendto(send_data, socket_to_addr[receiver]) + send_hashes.setdefault((sender.fileno(), receiver.fileno()), + hashlib.sha256()).update(f'<{send_data}>'.encode('utf-8')) + nr_send = nr_send + 1 + except BlockingIOError as e: + break + except OSError as e: + if e.errno in [errno.ENOBUFS, errno.ECONNRESET, errno.EPIPE]: + break + raise + + # Receive as much as we can without blocking + print("receiving...", nr_send, nr_recv) + while nr_recv < nr_send: + for fileno, eventmask in ep.poll(): + receiver = fileno_to_socket[fileno] + + if eventmask & select.EPOLLRDNORM: + while True: + try: + recv_data, address = receiver.recvfrom(1024) + sender = addr_to_socket[address] + recv_hashes.setdefault((sender.fileno(), + receiver.fileno()), hashlib.sha256()).update( + f'<{recv_data}>'.encode('utf-8')) + nr_recv = nr_recv + 1 + except BlockingIOError as e: + break + + # exercise net/rds/tcp.c:rds_tcp_sysctl_reset() + for net in [net0, net1]: + ip(f"netns exec {net} /usr/sbin/sysctl net.rds.tcp.rds_tcp_rcvbuf=10000") + ip(f"netns exec {net} /usr/sbin/sysctl net.rds.tcp.rds_tcp_sndbuf=10000") + +print("done", nr_send, nr_recv) + +# the Python socket module doesn't know these +RDS_INFO_FIRST = 10000 +RDS_INFO_LAST = 10017 + +nr_success = 0 +nr_error = 0 + +for s in sockets: + for optname in range(RDS_INFO_FIRST, RDS_INFO_LAST + 1): + # Sigh, the Python socket module doesn't allow us to pass + # buffer 
lengths greater than 1024 for some reason. RDS + # wants multiple pages. + try: + s.getsockopt(socket.SOL_RDS, optname, 1024) + nr_success = nr_success + 1 + except OSError as e: + nr_error = nr_error + 1 + if e.errno == errno.ENOSPC: + # ignore + pass + +print(f"getsockopt(): {nr_success}/{nr_error}") + +print("Stopping network packet captures") +subprocess.check_call(['killall', '-q', 'tcpdump']) + +# We're done sending and receiving stuff, now let's check if what +# we received is what we sent. +for (sender, receiver), send_hash in send_hashes.items(): + recv_hash = recv_hashes.get((sender, receiver)) + + if recv_hash is None: + print("FAIL: No data received") + sys.exit(1) + + if send_hash.hexdigest() != recv_hash.hexdigest(): + print("FAIL: Send/recv mismatch") + print("hash expected:", send_hash.hexdigest()) + print("hash received:", recv_hash.hexdigest()) + sys.exit(1) + + print(f"{sender}/{receiver}: ok") + +print("Success") +sys.exit(0) -- cgit v1.2.3 From 05673c42f73965c6381ab986b4dd2145700e6a0c Mon Sep 17 00:00:00 2001 From: Zixian Cai Date: Fri, 9 Aug 2024 08:01:36 +0000 Subject: perf script python: Add the 'ins_lat' field to event handler For example, when using the Alder Lake PMU memory load event, the instruction latency is stored in 'ins_lat', while the cache latency is stored in 'weight'. This patch reports the 'ins_lat' field for Python scripting. Committer testing: On a Rocket Lake Refresh Intel machine (14th gen): root@number:~# grep -m1 'model name' /proc/cpuinfo model name : Intel(R) Core(TM) i7-14700K root@number:~# perf mem record -a sleep 5 Memory events are enabled on a subset of CPUs: 16-27 [ perf record: Woken up 85 times to write data ] [ perf record: Captured and wrote 41.236 MB perf.data (191390 samples) ] root@number:~# perf evlist -v cpu_atom/mem-loads,ldlat=30/P: type: 10 (cpu_atom), size: 136, config: 0x5d0 (mem-loads), { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|ADDR|CPU|PERIOD|IDENTIFIER|DATA_SRC|WEIGHT_STRUCT, read_format: ID|LOST, disabled: 1, inherit: 1, freq: 1, precise_ip: 3, sample_id_all: 1, { bp_addr, config1 }: 0x1f cpu_atom/mem-stores/P: type: 10 (cpu_atom), size: 136, config: 0x6d0 (mem-stores), { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|ADDR|CPU|PERIOD|IDENTIFIER|DATA_SRC|WEIGHT_STRUCT, read_format: ID|LOST, disabled: 1, inherit: 1, freq: 1, precise_ip: 3, sample_id_all: 1 dummy:u: type: 1 (software), size: 136, config: 0x9 (PERF_COUNT_SW_DUMMY), { sample_period, sample_freq }: 1, sample_type: IP|TID|TIME|ADDR|CPU|IDENTIFIER|DATA_SRC|WEIGHT_STRUCT, read_format: ID|LOST, inherit: 1, exclude_kernel: 1, exclude_hv: 1, mmap: 1, comm: 1, task: 1, mmap_data: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1 root@number:~# Now generate a python script to then dump the dictionary that now needs to have that 'ins_lat' field: root@number:~# perf script --gen python generated Python script: perf-script.py root@number:~# vim perf-script.py root@number:~# perf script -s perf-script.py | head -40 in trace_begin in trace_end root@number:~# vim perf-script.py Reviewed-by: Adrian Hunter Signed-off-by: Zixian Cai Cc: Alexander Shishkin Cc: Ben Gainey Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Paran Lee Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240809080137.3590148-1-fzczx123@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/scripting-engines/trace-event-python.c | 5 ++++- 1 file changed, 4 
insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index fb00f3ad6815..6971dd6c231f 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -888,6 +888,8 @@ static PyObject *get_perf_sample_dict(struct perf_sample *sample, set_sample_read_in_dict(dict_sample, sample, evsel); pydict_set_item_string_decref(dict_sample, "weight", PyLong_FromUnsignedLongLong(sample->weight)); + pydict_set_item_string_decref(dict_sample, "ins_lat", + PyLong_FromUnsignedLong(sample->ins_lat)); pydict_set_item_string_decref(dict_sample, "transaction", PyLong_FromUnsignedLongLong(sample->transaction)); set_sample_datasrc_in_dict(dict_sample, sample); @@ -1317,7 +1319,7 @@ static void python_export_sample_table(struct db_export *dbe, struct tables *tables = container_of(dbe, struct tables, dbe); PyObject *t; - t = tuple_new(27); + t = tuple_new(28); tuple_set_d64(t, 0, es->db_id); tuple_set_d64(t, 1, es->evsel->db_id); @@ -1346,6 +1348,7 @@ static void python_export_sample_table(struct db_export *dbe, tuple_set_s32(t, 24, es->sample->flags); tuple_set_d64(t, 25, es->sample->id); tuple_set_d64(t, 26, es->sample->stream_id); + tuple_set_u32(t, 27, es->sample->ins_lat); call_object(tables->sample_handler, t, "sample_table"); -- cgit v1.2.3 From 76fb981ad6774b82f06703c896b492c8659b543b Mon Sep 17 00:00:00 2001 From: Aboorva Devarajan Date: Fri, 9 Aug 2024 14:07:28 +0530 Subject: tools/cpupower: display residency value in idle-info Update cpuidle tool to display the residency value of cpuidle states. This addition provides a clearer and more detailed view of idle state information when using cpuidle-info. 
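As a rough usage sketch (hypothetical snippet, assuming the patched tools/power/cpupower/lib/cpuidle.h is on the include path), a libcpupower consumer reads the new value the same way it reads latency:

  #include <stdio.h>
  #include "cpuidle.h"    /* tools/power/cpupower/lib */

  static void show_residency(unsigned int cpu, unsigned int state)
  {
          /* sysfs 'residency' attribute of the idle state, in microseconds */
          printf("Residency: %lu\n", cpuidle_state_residency(cpu, state));
  }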
-------------------------------- Before Patch: -------------------------------- $ cpupower idle-info CPUidle driver: intel_idle CPUidle governor: menu analyzing CPU 28: Number of idle states: 3 Available idle states: POLL C1 C1E POLL: Flags/Description: CPUIDLE CORE POLL IDLE Latency: 0 Usage: 7448 Duration: 207170 C1: Flags/Description: MWAIT 0x00 Latency: 2 Usage: 7023 Duration: 3736853 C1E: Flags/Description: MWAIT 0x01 Latency: 10 Usage: 18468 Duration: 11396212 -------------------------------- After Patch: -------------------------------- $ cpupower idle-info CPUidle driver: intel_idle CPUidle governor: menu analyzing CPU 12: Number of idle states: 3 Available idle states: POLL C1 C1E POLL: Flags/Description: CPUIDLE CORE POLL IDLE Latency: 0 Residency: 0 Usage: 1950 Duration: 38458 C1: Flags/Description: MWAIT 0x00 Latency: 2 Residency: 2 Usage: 10688 Duration: 7133020 C1E: Flags/Description: MWAIT 0x01 Latency: 10 Residency: 20 Usage: 22356 Duration: 15687259 -------------------------------- Signed-off-by: Aboorva Devarajan Signed-off-by: Shuah Khan --- tools/power/cpupower/lib/cpuidle.c | 8 ++++++++ tools/power/cpupower/lib/cpuidle.h | 2 ++ tools/power/cpupower/utils/cpuidle-info.c | 4 ++++ 3 files changed, 14 insertions(+) (limited to 'tools') diff --git a/tools/power/cpupower/lib/cpuidle.c b/tools/power/cpupower/lib/cpuidle.c index 479c5971aa6d..0ecac009273c 100644 --- a/tools/power/cpupower/lib/cpuidle.c +++ b/tools/power/cpupower/lib/cpuidle.c @@ -116,6 +116,7 @@ enum idlestate_value { IDLESTATE_USAGE, IDLESTATE_POWER, IDLESTATE_LATENCY, + IDLESTATE_RESIDENCY, IDLESTATE_TIME, IDLESTATE_DISABLE, MAX_IDLESTATE_VALUE_FILES @@ -125,6 +126,7 @@ static const char *idlestate_value_files[MAX_IDLESTATE_VALUE_FILES] = { [IDLESTATE_USAGE] = "usage", [IDLESTATE_POWER] = "power", [IDLESTATE_LATENCY] = "latency", + [IDLESTATE_RESIDENCY] = "residency", [IDLESTATE_TIME] = "time", [IDLESTATE_DISABLE] = "disable", }; @@ -254,6 +256,12 @@ unsigned long cpuidle_state_latency(unsigned int cpu, return cpuidle_state_get_one_value(cpu, idlestate, IDLESTATE_LATENCY); } +unsigned long cpuidle_state_residency(unsigned int cpu, + unsigned int idlestate) +{ + return cpuidle_state_get_one_value(cpu, idlestate, IDLESTATE_RESIDENCY); +} + unsigned long cpuidle_state_usage(unsigned int cpu, unsigned int idlestate) { diff --git a/tools/power/cpupower/lib/cpuidle.h b/tools/power/cpupower/lib/cpuidle.h index 2e10fead2e1e..2ab404d40259 100644 --- a/tools/power/cpupower/lib/cpuidle.h +++ b/tools/power/cpupower/lib/cpuidle.h @@ -8,6 +8,8 @@ int cpuidle_state_disable(unsigned int cpu, unsigned int idlestate, unsigned int disable); unsigned long cpuidle_state_latency(unsigned int cpu, unsigned int idlestate); +unsigned long cpuidle_state_residency(unsigned int cpu, + unsigned int idlestate); unsigned long cpuidle_state_usage(unsigned int cpu, unsigned int idlestate); unsigned long long cpuidle_state_time(unsigned int cpu, diff --git a/tools/power/cpupower/utils/cpuidle-info.c b/tools/power/cpupower/utils/cpuidle-info.c index 44126a87fa7a..e0d17f0de3fe 100644 --- a/tools/power/cpupower/utils/cpuidle-info.c +++ b/tools/power/cpupower/utils/cpuidle-info.c @@ -64,6 +64,8 @@ static void cpuidle_cpu_output(unsigned int cpu, int verbose) printf(_("Latency: %lu\n"), cpuidle_state_latency(cpu, idlestate)); + printf(_("Residency: %lu\n"), + cpuidle_state_residency(cpu, idlestate)); printf(_("Usage: %lu\n"), cpuidle_state_usage(cpu, idlestate)); printf(_("Duration: %llu\n"), @@ -115,6 +117,8 @@ static void 
proc_cpuidle_cpu_output(unsigned int cpu) printf(_("promotion[--] demotion[--] ")); printf(_("latency[%03lu] "), cpuidle_state_latency(cpu, cstate)); + printf(_("residency[%05lu] "), + cpuidle_state_residency(cpu, cstate)); printf(_("usage[%08lu] "), cpuidle_state_usage(cpu, cstate)); printf(_("duration[%020Lu] \n"), -- cgit v1.2.3 From 13d675aea6cac5bb4619ef9e3fdd90129fe6d139 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 9 Aug 2024 11:32:45 -0300 Subject: perf debuginfo: Fix the build with !HAVE_DWARF_SUPPORT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In that case we have a set of placeholder functions, one of them uses a 'Dwarf_Addr' type that is not present as it is defined in the missing DWARF libraries, so provide a placeholder typedef for that as well. The build error before this patch: In file included from util/annotate.c:28: util/debuginfo.h:44:46: error: unknown type name ‘Dwarf_Addr’ 44 | Dwarf_Addr *offs __maybe_unused, | ^~~~~~~~~~ make[6]: *** [/home/acme/git/perf-tools-next/tools/build/Makefile.build:106: util/annotate.o] Error 1 make[6]: *** Waiting for unfinished jobs.... Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Link: https://lore.kernel.org/lkml/CAM9d7ciushSwEfj7yW4rtDEJBTcCB991V4cswwFEL+cv6QF2pg@mail.gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/debuginfo.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/debuginfo.h b/tools/perf/util/debuginfo.h index 4d65b8c605fc..ad6422c3f8ca 100644 --- a/tools/perf/util/debuginfo.h +++ b/tools/perf/util/debuginfo.h @@ -40,6 +40,8 @@ static inline void debuginfo__delete(struct debuginfo *dbg __maybe_unused) { } +typedef void Dwarf_Addr; + static inline int debuginfo__get_text_offset(struct debuginfo *dbg __maybe_unused, Dwarf_Addr *offs __maybe_unused, bool adjust_offset __maybe_unused) -- cgit v1.2.3 From 890a1961c812db801e0f9dfaa4af233aa3c6ab63 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 7 Aug 2024 16:18:20 -0700 Subject: perf tools: Create source symlink in perf object dir MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Create a source symlink to the original source in the objdir. This is similar to what the main kernel build script does. Committer testing: ⬢[acme@toolbox perf-tools-next]$ make O=/tmp/build/$(basename $PWD)/ -C tools/perf install-bin ⬢[acme@toolbox perf-tools-next]$ ls -la /tmp/build/perf-tools-next/source lrwxrwxrwx. 1 acme acme 41 Aug 9 16:26 /tmp/build/perf-tools-next/source -> /home/acme/git/perf-tools-next/tools/perf ⬢[acme@toolbox perf-tools-next]$ Signed-off-by: Andi Kleen Tested-by: Arnaldo Carvalho de Melo Acked-by: Ian Rogers Acked-by: Namhyung Kim Link: https://lore.kernel.org/r/20240807231823.898979-1-ak@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.perf | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 72534bb72d43..9731f5e84131 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -163,6 +163,8 @@ ifneq ($(OUTPUT),) # for flex/bison parsers. 
VPATH += $(OUTPUT) export VPATH +# create symlink to the original source +SOURCE := $(shell ln -sf $(srctree)/tools/perf $(OUTPUT)/source) endif ifeq ($(V),1) -- cgit v1.2.3 From 336989d00f2140c7524b76126aa62c8a47a5a1d3 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 6 Aug 2024 23:15:55 -0700 Subject: perf annotate: Fix --group behavior when leader has no samples MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the --group option is used, it should display all events together. But the current logic only checks whether the first (leader) event has samples. Let's check the member events as well. It also failed to add the linked samples from member evsels to the output RB-tree, so they could not be displayed in the output. Take a look at this example. $ ./perf evlist cpu/mem-loads,ldlat=30/P cpu/mem-stores/P dummy:u It has three events, but the 'path_put' function has samples only for the mem-stores (second) event. $ sudo ./perf annotate --stdio -f path_put Percent | Source code & Disassembly of kcore for cpu/mem-stores/P (2 samples, percent: local period) ---------------------------------------------------------------------------------------------------------- : 0 0xffffffffae600020 : 0.00 : ffffffffae600020: endbr64 0.00 : ffffffffae600024: nopl (%rax, %rax) 91.22 : ffffffffae600029: pushq %rbx 0.00 : ffffffffae60002a: movq %rdi, %rbx 0.00 : ffffffffae60002d: movq 8(%rdi), %rdi 8.78 : ffffffffae600031: callq 0xffffffffae614aa0 0.00 : ffffffffae600036: movq (%rbx), %rdi 0.00 : ffffffffae600039: popq %rbx 0.00 : ffffffffae60003a: jmp 0xffffffffae620670 0.00 : ffffffffae60003f: nop Therefore, it didn't show up when the --group option was used, since the leader ("mem-loads") event has no samples. But now it checks both events.
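In outline, the per-event loop in builtin-annotate.c now aggregates samples across the whole group before deciding whether to skip (condensed sketch of the logic in the diff below):

  u32 nr_samples = hists->stats.nr_samples;

  for_each_group_member(evsel, pos)
          nr_samples += evsel__hists(evsel)->stats.nr_samples;

  if (nr_samples == 0)
          continue;       /* skip only when the entire group is empty */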
Before: $ sudo ./perf annotate --stdio -f --group path_put (no output) After: $ sudo ./perf annotate --stdio -f --group path_put Percent | Source code & Disassembly of kcore for cpu/mem-loads,ldlat=30/P, cpu/mem-stores/P, dummy:u (0 samples, percent: local period) ------------------------------------------------------------------------------------------------------------------------------------------------------------- : 0 0xffffffffae600020 : 0.00 0.00 0.00 : ffffffffae600020: endbr64 0.00 0.00 0.00 : ffffffffae600024: nopl (%rax, %rax) 0.00 91.22 0.00 : ffffffffae600029: pushq %rbx 0.00 0.00 0.00 : ffffffffae60002a: movq %rdi, %rbx 0.00 0.00 0.00 : ffffffffae60002d: movq 8(%rdi), %rdi 0.00 8.78 0.00 : ffffffffae600031: callq 0xffffffffae614aa0 0.00 0.00 0.00 : ffffffffae600036: movq (%rbx), %rdi 0.00 0.00 0.00 : ffffffffae600039: popq %rbx 0.00 0.00 0.00 : ffffffffae60003a: jmp 0xffffffffae620670 0.00 0.00 0.00 : ffffffffae60003f: nop Committer testing: Before: root@number:~# perf annotate --group --stdio2 clear_page_erms root@number:~# After: root@number:~# perf annotate --group --stdio2 clear_page_erms Samples: 125 of events 'cpu_atom/mem-loads,ldlat=30/P, cpu_atom/mem-stores/P, dummy:u', 4000 Hz, Event count (approx.): 13198416, [percent: local period] clear_page_erms() /proc/kcore Percent 0xffffffff990c6cc0 : endbr64 movl $0x1000,%ecx xorl %eax,%eax 0.00 100.00 0.00 rep stosb %al, (%rdi) ← retq int3 int3 int3 int3 nop nop root@number:~# Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20240807061555.1642669-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index efcadb7620b8..1bfe41783a7c 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -632,13 +632,23 @@ static int __cmd_annotate(struct perf_annotate *ann) evlist__for_each_entry(session->evlist, pos) { struct hists *hists = evsel__hists(pos); u32 nr_samples = hists->stats.nr_samples; + struct ui_progress prog; + struct evsel *evsel; - if (nr_samples == 0) + if (!symbol_conf.event_group || !evsel__is_group_leader(pos)) continue; - if (!symbol_conf.event_group || !evsel__is_group_leader(pos)) + for_each_group_member(evsel, pos) + nr_samples += evsel__hists(evsel)->stats.nr_samples; + + if (nr_samples == 0) continue; + ui_progress__init(&prog, nr_samples, + "Sorting group events for output..."); + evsel__output_resort(pos, &prog); + ui_progress__finish(); + hists__find_annotations(hists, pos, ann); } -- cgit v1.2.3 From cb1898f58e0f175d168a5a8c5a269a4d24cbbef5 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 6 Aug 2024 23:17:13 -0700 Subject: perf annotate-data: Support --skip-empty option The --skip-empty option is to hide dummy events in a group. Like other output mode in 'perf report' and 'perf annotate', the data-type profiling output should support the option. 
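Each output path applies the same filter over the group members; a condensed sketch of the pattern repeated throughout the diff below:

  struct evsel *pos;

  for_each_group_evsel(pos, evsel) {
          if (symbol_conf.skip_empty &&
              evsel__hists(pos)->stats.nr_samples == 0)
                  continue;       /* hide dummy events without samples */
          /* ... count or print this event ... */
  }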
Committer testing: With dummy: root@number:~# perf annotate --stdio --group --data-type --skip-empty | head -24 Annotate type: 'pthread_mutex_t' in /usr/lib64/libc.so.6 (50 samples): event[0] = cpu_atom/mem-loads,ldlat=30/P event[1] = cpu_atom/mem-stores/P event[2] = dummy:u ============================================================================ Percent offset size field 100.00 100.00 0.00 0 40 pthread_mutex_t { 100.00 100.00 0.00 0 40 struct __pthread_mutex_s __data { 45.21 84.54 0.00 0 4 int __lock; 0.00 0.00 0.00 4 4 unsigned int __count; 0.00 1.83 0.00 8 4 int __owner; 5.19 10.65 0.00 12 4 unsigned int __nusers; 49.61 2.97 0.00 16 4 int __kind; 0.00 0.00 0.00 20 2 short int __spins; 0.00 0.00 0.00 22 2 short int __elision; 0.00 0.00 0.00 24 16 __pthread_list_t __list { 0.00 0.00 0.00 24 8 struct __pthread_internal_list* __prev; 0.00 0.00 0.00 32 8 struct __pthread_internal_list* __next; }; }; 0.00 0.00 0.00 0 0 char[] __size; 45.21 84.54 0.00 0 8 long int __align; }; Skipping it: root@number:~# perf annotate --stdio --group --data-type --skip-empty | head -24 Annotate type: 'pthread_mutex_t' in /usr/lib64/libc.so.6 (50 samples): event[0] = cpu_atom/mem-loads,ldlat=30/P event[1] = cpu_atom/mem-stores/P ============================================================================ Percent offset size field 100.00 100.00 0 40 pthread_mutex_t { 100.00 100.00 0 40 struct __pthread_mutex_s __data { 45.21 84.54 0 4 int __lock; 0.00 0.00 4 4 unsigned int __count; 0.00 1.83 8 4 int __owner; 5.19 10.65 12 4 unsigned int __nusers; 49.61 2.97 16 4 int __kind; 0.00 0.00 20 2 short int __spins; 0.00 0.00 22 2 short int __elision; 0.00 0.00 24 16 __pthread_list_t __list { 0.00 0.00 24 8 struct __pthread_internal_list* __prev; 0.00 0.00 32 8 struct __pthread_internal_list* __next; }; }; 0.00 0.00 0 0 char[] __size; 45.21 84.54 0 8 long int __align; }; Annotate type: 'pthread_mutexattr_t' in /usr/lib64/libc.so.6 (1 samples): root@number:~# Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240807061713.1642924-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/annotate-data.c | 30 +++++++++++++++++------ tools/perf/util/annotate-data.c | 44 +++++++++++++++++----------------- 2 files changed, 45 insertions(+), 29 deletions(-) (limited to 'tools') diff --git a/tools/perf/ui/browsers/annotate-data.c b/tools/perf/ui/browsers/annotate-data.c index 8d6bf08d371d..c3db80a7589a 100644 --- a/tools/perf/ui/browsers/annotate-data.c +++ b/tools/perf/ui/browsers/annotate-data.c @@ -46,15 +46,18 @@ static int get_member_overhead(struct annotated_data_type *adt, struct annotated_member *member = entry->data; int i, k; - for (i = 0; i < member->size; i++) { + for (i = 0, k = 0; i < member->size; i++) { struct type_hist *h; struct evsel *evsel; int offset = member->offset + i; for_each_group_evsel(evsel, leader) { + if (symbol_conf.skip_empty && + evsel__hists(evsel)->stats.nr_samples == 0) + continue; + h = adt->histograms[evsel->core.idx]; - k = evsel__group_idx(evsel); - update_hist_entry(&entry->hists[k], &h->addr[offset]); + update_hist_entry(&entry->hists[k++], &h->addr[offset]); } } return 0; @@ -203,6 +206,7 @@ static void browser__write(struct ui_browser *uib, void *entry, int row) struct annotated_data_type *adt = he->mem_type; struct evsel *leader = hists_to_evsel(he->hists); struct evsel *evsel; + int idx = 0; if (member == 
NULL) { bool current = ui_browser__is_current_entry(uib, row); @@ -219,9 +223,12 @@ static void browser__write(struct ui_browser *uib, void *entry, int row) /* print the number */ for_each_group_evsel(evsel, leader) { struct type_hist *h = adt->histograms[evsel->core.idx]; - int idx = evsel__group_idx(evsel); - browser__write_overhead(uib, h, &be->hists[idx], row); + if (symbol_conf.skip_empty && + evsel__hists(evsel)->stats.nr_samples == 0) + continue; + + browser__write_overhead(uib, h, &be->hists[idx++], row); } /* print type info */ @@ -300,8 +307,17 @@ int hist_entry__annotate_data_tui(struct hist_entry *he, struct evsel *evsel, ui_helpline__push("Press ESC to exit"); - if (evsel__is_group_event(evsel)) - browser.nr_events = evsel->core.nr_members; + if (evsel__is_group_event(evsel)) { + struct evsel *pos; + int nr = 0; + + for_each_group_evsel(pos, evsel) { + if (!symbol_conf.skip_empty || + evsel__hists(pos)->stats.nr_samples) + nr++; + } + browser.nr_events = nr; + } ret = annotated_data_browser__collect_entries(&browser); if (ret == 0) diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index c18272092a6b..ff85d190e3ac 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -1502,10 +1502,15 @@ static void print_annotated_data_header(struct hist_entry *he, struct evsel *evs struct evsel *pos; int i = 0; - for_each_group_evsel(pos, evsel) - printf(" event[%d] = %s\n", i++, pos->name); + nr_members = 0; + for_each_group_evsel(pos, evsel) { + if (symbol_conf.skip_empty && + evsel__hists(pos)->stats.nr_samples == 0) + continue; - nr_members = evsel->core.nr_members; + printf(" event[%d] = %s\n", i++, pos->name); + nr_members++; + } } if (symbol_conf.show_total_period) { @@ -1540,31 +1545,26 @@ static void print_annotated_data_type(struct annotated_data_type *mem_type, { struct annotated_member *child; struct type_hist *h = mem_type->histograms[evsel->core.idx]; - int i, nr_events = 1, samples = 0; + int i, nr_events = 0, samples = 0; u64 period = 0; int width = symbol_conf.show_total_period ? 
11 : 7; + struct evsel *pos; - for (i = 0; i < member->size; i++) { - samples += h->addr[member->offset + i].nr_samples; - period += h->addr[member->offset + i].period; - } - print_annotated_data_value(h, period, samples); - - if (evsel__is_group_event(evsel)) { - struct evsel *pos; + for_each_group_evsel(pos, evsel) { + h = mem_type->histograms[pos->core.idx]; - for_each_group_member(pos, evsel) { - h = mem_type->histograms[pos->core.idx]; + if (symbol_conf.skip_empty && + evsel__hists(pos)->stats.nr_samples == 0) + continue; - samples = 0; - period = 0; - for (i = 0; i < member->size; i++) { - samples += h->addr[member->offset + i].nr_samples; - period += h->addr[member->offset + i].period; - } - print_annotated_data_value(h, period, samples); + samples = 0; + period = 0; + for (i = 0; i < member->size; i++) { + samples += h->addr[member->offset + i].nr_samples; + period += h->addr[member->offset + i].period; } - nr_events = evsel->core.nr_members; + print_annotated_data_value(h, period, samples); + nr_events++; } printf(" %10d %10d %*s%s\t%s", -- cgit v1.2.3 From 2c402bd2e85b44dc00ef85b5c0e217de684b5372 Mon Sep 17 00:00:00 2001 From: Li Ming Date: Fri, 9 Aug 2024 08:27:50 +0000 Subject: cxl/test: Skip cxl_setup_parent_dport() for emulated dports The cxl_test unit test environment on qemu always hits the call trace below with KASAN enabled: BUG: KASAN: slab-out-of-bounds in cxl_setup_parent_dport+0x480/0x530 [cxl_core] Read of size 1 at addr ff110000676014f8 by task (udev-worker)/676[ 24.424403] CPU: 2 PID: 676 Comm: (udev-worker) Tainted: G O N 6.10.0-qemucxl #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS edk2-20240214-2.el9 02/14/2024 Call Trace: dump_stack_lvl+0xea/0x150 print_report+0xce/0x610 ? kasan_complete_mode_report_info+0x40/0x200 kasan_report+0xcc/0x110 __asan_report_load1_noabort+0x18/0x20 cxl_setup_parent_dport+0x480/0x530 [cxl_core] cxl_mem_probe+0x49b/0xaa0 [cxl_mem] The cxl_test module models a CXL topology for testing; it creates some emulated dports with platform devices in the CXL topology, so the dport_dev of an emulated dport points to a platform device rather than a PCI device or a PCI host bridge in this case. Currently, cxl_setup_parent_dport() is used to set up RAS and AER capability on the dport connected to the CXL memory device, but cxl_test does not support RAS or AER functionality yet, so the fix is implementing a __wrap_cxl_setup_parent_dport() to filter out all emulated dports, guaranteeing that only real dports are handled by cxl_setup_parent_dport().
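The mock relies on the linker's symbol wrapping, which cxl_test already uses for its other overrides (see the Kbuild hunk below). As a generic illustration of the mechanism, independent of this patch: with ld --wrap=foo, every call to foo() is redirected to __wrap_foo(), while the original definition stays reachable as __real_foo():

  /* generic ld --wrap=foo sketch; foo is a placeholder name */
  extern void __real_foo(void);

  void __wrap_foo(void)
  {
          /* filter or mock here, then fall through to the real symbol */
          __real_foo();
  }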
Fixes: f05fd10d138d ("cxl/pci: Add RCH downstream port AER register discovery") Reported-by: Pengfei Xu Closes: https://lore.kernel.org/linux-cxl/ZrHTBp2O+HtUe6kt@xpf.sh.intel.com/T/#t Signed-off-by: Li Ming Reviewed-by: Dan Williams Reviewed-by: Ira Weiny Reviewed-by: Alison Schofield Tested-by: Ira Weiny Tested-by: Alison Schofield Link: https://patch.msgid.link/20240809082750.3015641-3-ming4.li@intel.com Signed-off-by: Dave Jiang --- tools/testing/cxl/Kbuild | 1 + tools/testing/cxl/test/mock.c | 12 ++++++++++++ 2 files changed, 13 insertions(+) (limited to 'tools') diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index 030b388800f0..3d1ca9e38b1f 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -14,6 +14,7 @@ ldflags-y += --wrap=cxl_dvsec_rr_decode ldflags-y += --wrap=devm_cxl_add_rch_dport ldflags-y += --wrap=cxl_rcd_component_reg_phys ldflags-y += --wrap=cxl_endpoint_parse_cdat +ldflags-y += --wrap=cxl_setup_parent_dport DRIVERS := ../../../drivers CXL_SRC := $(DRIVERS)/cxl diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c index 6f737941dc0e..d619672faa49 100644 --- a/tools/testing/cxl/test/mock.c +++ b/tools/testing/cxl/test/mock.c @@ -299,6 +299,18 @@ void __wrap_cxl_endpoint_parse_cdat(struct cxl_port *port) } EXPORT_SYMBOL_NS_GPL(__wrap_cxl_endpoint_parse_cdat, CXL); +void __wrap_cxl_setup_parent_dport(struct device *host, struct cxl_dport *dport) +{ + int index; + struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); + + if (!ops || !ops->is_mock_port(dport->dport_dev)) + cxl_setup_parent_dport(host, dport); + + put_cxl_mock_ops(index); +} +EXPORT_SYMBOL_NS_GPL(__wrap_cxl_setup_parent_dport, CXL); + MODULE_LICENSE("GPL v2"); MODULE_IMPORT_NS(ACPI); MODULE_IMPORT_NS(CXL); -- cgit v1.2.3 From 55850eb4e582f0696f40b8315ac31a45d6a4955e Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 7 Aug 2024 23:51:37 +0200 Subject: tools/nolibc: arm: use clang-compatible asm syntax MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The clang assembler rejects the current syntax. Switch to a syntax accepted by both GCC and clang. Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20240807-nolibc-llvm-v2-1-c20f2f5fc7c2@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/arch-arm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/include/nolibc/arch-arm.h b/tools/include/nolibc/arch-arm.h index cae4afa7c1c7..d1c19d973e55 100644 --- a/tools/include/nolibc/arch-arm.h +++ b/tools/include/nolibc/arch-arm.h @@ -188,8 +188,8 @@ void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_stack_protector _start(void) { __asm__ volatile ( - "mov %r0, sp\n" /* save stack pointer to %r0, as arg1 of _start_c */ - "and ip, %r0, #-8\n" /* sp must be 8-byte aligned in the callee */ + "mov r0, sp\n" /* save stack pointer to %r0, as arg1 of _start_c */ + "and ip, r0, #-8\n" /* sp must be 8-byte aligned in the callee */ "mov sp, ip\n" "bl _start_c\n" /* transfer to c runtime */ ); -- cgit v1.2.3 From 0daf8c86a45163f35b8d4f7795068c2511dd0ad9 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 7 Aug 2024 23:51:38 +0200 Subject: tools/nolibc: mips: load current function to $t9 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The MIPS calling convention requires the address of the current function to be available in $t9. This was not done so far. 
For GCC this seems to have worked, but when compiled with clang the executable segfaults instantly. Properly load the address of _start_c() into $t9 before calling it. Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20240807-nolibc-llvm-v2-2-c20f2f5fc7c2@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/arch-mips.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/include/nolibc/arch-mips.h b/tools/include/nolibc/arch-mips.h index 62cc50ef3288..a2ee77ed2fbb 100644 --- a/tools/include/nolibc/arch-mips.h +++ b/tools/include/nolibc/arch-mips.h @@ -194,7 +194,9 @@ void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_ "li $t0, -8\n" "and $sp, $sp, $t0\n" /* $sp must be 8-byte aligned */ "addiu $sp, $sp, -16\n" /* the callee expects to save a0..a3 there */ - "jal _start_c\n" /* transfer to c runtime */ + "lui $t9, %hi(_start_c)\n" /* ABI requires current function address in $t9 */ + "ori $t9, %lo(_start_c)\n" + "jalr $t9\n" /* transfer to c runtime */ " nop\n" /* delayed slot */ ".set pop\n" ); -- cgit v1.2.3 From 1daea158d0aae0770371f3079305a29fdb66829e Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 7 Aug 2024 23:51:39 +0200 Subject: tools/nolibc: powerpc: limit stack-protector workaround to GCC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As mentioned in the comment, the workaround for __attribute__((no_stack_protector)) is only necessary on GCC. Avoid applying the workaround on clang, as clang does not recognize __attribute__((__optimize__)) and would fail. Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20240807-nolibc-llvm-v2-3-c20f2f5fc7c2@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/arch-powerpc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/include/nolibc/arch-powerpc.h b/tools/include/nolibc/arch-powerpc.h index ac212e6185b2..41ebd394b90c 100644 --- a/tools/include/nolibc/arch-powerpc.h +++ b/tools/include/nolibc/arch-powerpc.h @@ -172,7 +172,7 @@ _ret; \ }) -#ifndef __powerpc64__ +#if !defined(__powerpc64__) && !defined(__clang__) /* FIXME: For 32-bit PowerPC, with newer gcc compilers (e.g. gcc 13.1.0), * "omit-frame-pointer" fails with __attribute__((no_stack_protector)) but * works with __attribute__((__optimize__("-fno-stack-protector"))) -- cgit v1.2.3 From 02a62b551cee585f7c2c93f54d1230d5714ff7d4 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 7 Aug 2024 23:51:40 +0200 Subject: tools/nolibc: compiler: introduce __nolibc_has_attribute() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recent compilers support __has_attribute() to check if a certain compiler attribute is supported. Unfortunately we first have to check whether __has_attribute itself is supported, and only then whether a specific attribute is present. These two checks can't be folded into a single condition as that would lead to errors. Nesting the two conditions like below works, but becomes ugly as soon as #else blocks are used, as those need to be duplicated for both levels of #if. #if defined __has_attribute # if __has_attribute (nonnull) # define ATTR_NONNULL __attribute__ ((nonnull)) # endif #endif Introduce a new helper which makes the usage of __has_attribute() nicer, and migrate the current user to it.
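With the helper, the nested conditional above collapses to a single flat check; reusing the same hypothetical nonnull example:

  #if __nolibc_has_attribute(nonnull)
  # define ATTR_NONNULL __attribute__((nonnull))
  #else
  # define ATTR_NONNULL
  #endif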
Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20240807-nolibc-llvm-v2-4-c20f2f5fc7c2@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/compiler.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/include/nolibc/compiler.h b/tools/include/nolibc/compiler.h index beddc3665d69..1730d0454a55 100644 --- a/tools/include/nolibc/compiler.h +++ b/tools/include/nolibc/compiler.h @@ -6,20 +6,22 @@ #ifndef _NOLIBC_COMPILER_H #define _NOLIBC_COMPILER_H +#if defined(__has_attribute) +# define __nolibc_has_attribute(attr) __has_attribute(attr) +#else +# define __nolibc_has_attribute(attr) 0 +#endif + #if defined(__SSP__) || defined(__SSP_STRONG__) || defined(__SSP_ALL__) || defined(__SSP_EXPLICIT__) #define _NOLIBC_STACKPROTECTOR #endif /* defined(__SSP__) ... */ -#if defined(__has_attribute) -# if __has_attribute(no_stack_protector) -# define __no_stack_protector __attribute__((no_stack_protector)) -# else -# define __no_stack_protector __attribute__((__optimize__("-fno-stack-protector"))) -# endif +#if __nolibc_has_attribute(no_stack_protector) +# define __no_stack_protector __attribute__((no_stack_protector)) #else # define __no_stack_protector __attribute__((__optimize__("-fno-stack-protector"))) -#endif /* defined(__has_attribute) */ +#endif /* __nolibc_has_attribute(no_stack_protector) */ #endif /* _NOLIBC_COMPILER_H */ -- cgit v1.2.3 From d0f8a8973f265f6a276f99d091af99edfb2b87de Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 8 Aug 2024 00:14:13 +0000 Subject: mm/memblock: introduce a new helper memblock_estimated_nr_free_pages() During bootup, a subsystem may need the number of free pages in the whole system to do some calculations before all pages are freed to the buddy system. Usually this number is obtained from totalram_pages(). Since we plan to move the free pages accounting into __free_pages_core(), this value may not represent the total number of free pages at the early stage, especially when CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled. Instead of using the raw memblock API, let's introduce a new helper that lets users get the estimated number of free pages from the memblock point of view.
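A hypothetical early-boot caller (the function name and sizing policy below are invented for illustration) would consult the estimate instead of totalram_pages():

  /* size an early hash table from roughly 0.1% of the estimated free memory */
  static unsigned long __init early_table_pages(void)
  {
          return memblock_estimated_nr_free_pages() / 1024;
  }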
Signed-off-by: Wei Yang CC: David Hildenbrand Reviewed-by: David Hildenbrand Link: https://lore.kernel.org/r/20240808001415.6298-1-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (Microsoft) --- include/linux/memblock.h | 1 + mm/memblock.c | 17 +++++++++++++++++ tools/include/linux/pfn.h | 1 + 3 files changed, 19 insertions(+) (limited to 'tools') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index fc4d75c6cec3..673d5cae7c81 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -467,6 +467,7 @@ static inline __init_memblock bool memblock_bottom_up(void) phys_addr_t memblock_phys_mem_size(void); phys_addr_t memblock_reserved_size(void); +unsigned long memblock_estimated_nr_free_pages(void); phys_addr_t memblock_start_of_DRAM(void); phys_addr_t memblock_end_of_DRAM(void); void memblock_enforce_memory_limit(phys_addr_t memory_limit); diff --git a/mm/memblock.c b/mm/memblock.c index 3b9dc2d89b8a..213057603b65 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1731,6 +1731,23 @@ phys_addr_t __init_memblock memblock_reserved_size(void) return memblock.reserved.total_size; } +/** + * memblock_estimated_nr_free_pages - return estimated number of free pages + * from memblock point of view + * + * During bootup, subsystems might need a rough estimate of the number of free + * pages in the whole system, before precise numbers are available from the + * buddy. Especially with CONFIG_DEFERRED_STRUCT_PAGE_INIT, the numbers + * obtained from the buddy might be very imprecise during bootup. + * + * Return: + * An estimated number of free pages from memblock point of view. + */ +unsigned long __init memblock_estimated_nr_free_pages(void) +{ + return PHYS_PFN(memblock_phys_mem_size() - memblock_reserved_size()); +} + /* lowest address */ phys_addr_t __init_memblock memblock_start_of_DRAM(void) { diff --git a/tools/include/linux/pfn.h b/tools/include/linux/pfn.h index 7512a58189eb..f77a30d70152 100644 --- a/tools/include/linux/pfn.h +++ b/tools/include/linux/pfn.h @@ -7,4 +7,5 @@ #define PFN_UP(x) (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT) #define PFN_DOWN(x) ((x) >> PAGE_SHIFT) #define PFN_PHYS(x) ((phys_addr_t)(x) << PAGE_SHIFT) +#define PHYS_PFN(x) ((unsigned long)((x) >> PAGE_SHIFT)) #endif -- cgit v1.2.3 From fe8340a750002269a3e4178efa1229a88814a656 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Fri, 2 Aug 2024 14:45:35 +0200 Subject: selftests: rust: config: add trailing newline If adding multiple config files to the merge_config.sh script and rust/config is the first one, then the last config fragment in this file and the first config fragment in the second file won't be set, since there isn't a newline in this file, so those two fragments end up on the same row, like: CONFIG_SAMPLE_RUST_PRINT=mCONFIG_FRAGMENT=y And none of those will be enabled when running 'olddefconfig' afterwards. Fix the issue by adding a newline to the file.
Signed-off-by: Anders Roxell Acked-by: Miguel Ojeda Signed-off-by: Shuah Khan --- tools/testing/selftests/rust/config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/rust/config b/tools/testing/selftests/rust/config index b4002acd40bc..fa06cebae232 100644 --- a/tools/testing/selftests/rust/config +++ b/tools/testing/selftests/rust/config @@ -2,4 +2,4 @@ CONFIG_RUST=y CONFIG_SAMPLES=y CONFIG_SAMPLES_RUST=y CONFIG_SAMPLE_RUST_MINIMAL=m -CONFIG_SAMPLE_RUST_PRINT=m \ No newline at end of file +CONFIG_SAMPLE_RUST_PRINT=m -- cgit v1.2.3 From 8afc0816f5f6213c2f40399317e2ecd43371729c Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Fri, 2 Aug 2024 14:45:36 +0200 Subject: selftests: rust: config: disable GCC_PLUGINS CONFIG_RUST depends on !CONFIG_GCC_PLUGINS. Disable CONFIG_GCC_PLUGINS in rust/config file to make sure it doesn't get enabled. Signed-off-by: Anders Roxell Acked-by: Miguel Ojeda Signed-off-by: Shuah Khan --- tools/testing/selftests/rust/config | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/rust/config b/tools/testing/selftests/rust/config index fa06cebae232..5f942b5c8c17 100644 --- a/tools/testing/selftests/rust/config +++ b/tools/testing/selftests/rust/config @@ -1,3 +1,4 @@ +# CONFIG_GCC_PLUGINS is not set CONFIG_RUST=y CONFIG_SAMPLES=y CONFIG_SAMPLES_RUST=y -- cgit v1.2.3 From 10fbe8c082fdde74b1f0814bc30e194cf330cdf7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 9 Aug 2024 22:37:17 -0700 Subject: selftests: drv-net: rss_ctx: add identifier to traffic comments Include the "name" of the context in the comment for traffic checks. Makes it easier to reason about which context failed when we loop over 32 contexts (it may matter if we failed in first vs last, for example). Reviewed-by: Gal Pressman Reviewed-by: Joe Damato Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- tools/testing/selftests/drivers/net/hw/rss_ctx.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/hw/rss_ctx.py b/tools/testing/selftests/drivers/net/hw/rss_ctx.py index 011508ca604b..1da6b214f4fe 100755 --- a/tools/testing/selftests/drivers/net/hw/rss_ctx.py +++ b/tools/testing/selftests/drivers/net/hw/rss_ctx.py @@ -90,10 +90,10 @@ def _send_traffic_check(cfg, port, name, params): ksft_ge(directed, 20000, f"traffic on {name}: " + str(cnts)) if params.get('noise'): ksft_lt(sum(cnts[i] for i in params['noise']), directed / 2, - "traffic on other queues:" + str(cnts)) + f"traffic on other queues ({name})':" + str(cnts)) if params.get('empty'): ksft_eq(sum(cnts[i] for i in params['empty']), 0, - "traffic on inactive queues: " + str(cnts)) + f"traffic on inactive queues ({name}): " + str(cnts)) def test_rss_key_indir(cfg): -- cgit v1.2.3 From c1ad8ef804e40fc1ef3cb008e0113d76f4acc1a0 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 9 Aug 2024 22:37:28 -0700 Subject: selftests: drv-net: rss_ctx: test dumping RSS contexts Add a test for dumping RSS contexts. Make sure indir table and key are sane when contexts are created with various combination of inputs. Test the dump filtering by ifname and start-context. Reviewed-by: Petr Machata Reviewed-by: Edward Cree Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- tools/testing/selftests/drivers/net/hw/rss_ctx.py | 76 ++++++++++++++++++++++- tools/testing/selftests/net/lib/py/ksft.py | 6 ++ 2 files changed, 80 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/hw/rss_ctx.py b/tools/testing/selftests/drivers/net/hw/rss_ctx.py index 1da6b214f4fe..9d7adb3cf33b 100755 --- a/tools/testing/selftests/drivers/net/hw/rss_ctx.py +++ b/tools/testing/selftests/drivers/net/hw/rss_ctx.py @@ -3,7 +3,7 @@ import datetime import random -from lib.py import ksft_run, ksft_pr, ksft_exit, ksft_eq, ksft_ge, ksft_lt +from lib.py import ksft_run, ksft_pr, ksft_exit, ksft_eq, ksft_ne, ksft_ge, ksft_lt from lib.py import NetDrvEpEnv from lib.py import EthtoolFamily, NetdevFamily from lib.py import KsftSkipEx @@ -302,6 +302,78 @@ def test_hitless_key_update(cfg): ksft_eq(carrier1 - carrier0, 0) +def test_rss_context_dump(cfg): + """ + Test dumping RSS contexts. This tests mostly exercises the kernel APIs. + """ + + # Get a random key of the right size + data = get_rss(cfg) + if 'rss-hash-key' in data: + key_data = _rss_key_rand(len(data['rss-hash-key'])) + key = _rss_key_str(key_data) + else: + key_data = [] + key = "ba:ad" + + ids = [] + try: + ids.append(ethtool_create(cfg, "-X", f"context new")) + defer(ethtool, f"-X {cfg.ifname} context {ids[-1]} delete") + + ids.append(ethtool_create(cfg, "-X", f"context new weight 1 1")) + defer(ethtool, f"-X {cfg.ifname} context {ids[-1]} delete") + + ids.append(ethtool_create(cfg, "-X", f"context new hkey {key}")) + defer(ethtool, f"-X {cfg.ifname} context {ids[-1]} delete") + except CmdExitFailure: + if not ids: + raise KsftSkipEx("Unable to add any contexts") + ksft_pr(f"Added only {len(ids)} out of 3 contexts") + + expect_tuples = set([(cfg.ifname, -1)] + [(cfg.ifname, ctx_id) for ctx_id in ids]) + + # Dump all + ctxs = cfg.ethnl.rss_get({}, dump=True) + tuples = [(c['header']['dev-name'], c.get('context', -1)) for c in ctxs] + ksft_eq(len(tuples), len(set(tuples)), "duplicates in context dump") + ctx_tuples = set([ctx for ctx in tuples if ctx[0] == cfg.ifname]) + ksft_eq(expect_tuples, ctx_tuples) + + # Sanity-check the results + for data in ctxs: + ksft_ne(set(data['indir']), {0}, "indir table is all zero") + ksft_ne(set(data.get('hkey', [1])), {0}, "key is all zero") + + # More specific checks + if len(ids) > 1 and data.get('context') == ids[1]: + ksft_eq(set(data['indir']), {0, 1}, + "ctx1 - indir table mismatch") + if len(ids) > 2 and data.get('context') == ids[2]: + ksft_eq(data['hkey'], bytes(key_data), "ctx2 - key mismatch") + + # Ifindex filter + ctxs = cfg.ethnl.rss_get({'header': {'dev-name': cfg.ifname}}, dump=True) + tuples = [(c['header']['dev-name'], c.get('context', -1)) for c in ctxs] + ctx_tuples = set(tuples) + ksft_eq(len(tuples), len(ctx_tuples), "duplicates in context dump") + ksft_eq(expect_tuples, ctx_tuples) + + # Skip ctx 0 + expect_tuples.remove((cfg.ifname, -1)) + + ctxs = cfg.ethnl.rss_get({'start-context': 1}, dump=True) + tuples = [(c['header']['dev-name'], c.get('context', -1)) for c in ctxs] + ksft_eq(len(tuples), len(set(tuples)), "duplicates in context dump") + ctx_tuples = set([ctx for ctx in tuples if ctx[0] == cfg.ifname]) + ksft_eq(expect_tuples, ctx_tuples) + + # And finally both with ifindex and skip main + ctxs = cfg.ethnl.rss_get({'header': {'dev-name': cfg.ifname}, 'start-context': 1}, dump=True) + ctx_tuples = set([(c['header']['dev-name'], c.get('context', -1)) for c in ctxs]) + ksft_eq(expect_tuples, ctx_tuples) + + def 
test_rss_context(cfg, ctx_cnt=1, create_with_cfg=None): """ Test separating traffic into RSS contexts. @@ -542,7 +614,7 @@ def main() -> None: ksft_run([test_rss_key_indir, test_rss_queue_reconfigure, test_rss_resize, test_hitless_key_update, test_rss_context, test_rss_context4, test_rss_context32, - test_rss_context_queue_reconfigure, + test_rss_context_dump, test_rss_context_queue_reconfigure, test_rss_context_overlap, test_rss_context_overlap2, test_rss_context_out_of_order, test_rss_context4_create_with_cfg], args=(cfg, )) diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py index f663e0daec0d..477ae76de93d 100644 --- a/tools/testing/selftests/net/lib/py/ksft.py +++ b/tools/testing/selftests/net/lib/py/ksft.py @@ -55,6 +55,12 @@ def ksft_eq(a, b, comment=""): _fail("Check failed", a, "!=", b, comment) +def ksft_ne(a, b, comment=""): + global KSFT_RESULT + if a == b: + _fail("Check failed", a, "==", b, comment) + + def ksft_true(a, comment=""): if not a: _fail("Check failed", a, "does not eval to True", comment) -- cgit v1.2.3 From 4f21bfed691c96da1fc85b76c16e8f6d8fe98a3d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 12 Aug 2024 09:57:20 -0300 Subject: perf tests pmu: Initialize all fields of test_pmu variable Instead of explicitly initializing just the .name and .alias_name, use named struct member initialization of just the non-null .name field; the compiler will initialize all the other non-explicitly initialized fields to NULL. This makes the code more robust, avoiding the error recently fixed when the .alias_name was used and contained a random value. Reviewed-by: Veronika Molnarova Cc: Adrian Hunter Cc: Ian Rogers Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Michael Petlan Cc: Namhyung Kim Cc: Radostin Stoyanov Link: https://lore.kernel.org/lkml/e26941f9-f86c-4f2e-b812-20c49fb2c0d3@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/pmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/pmu.c b/tools/perf/tests/pmu.c index a4730b5dc0d9..be18506f6a24 100644 --- a/tools/perf/tests/pmu.c +++ b/tools/perf/tests/pmu.c @@ -458,10 +458,10 @@ static int test__name_cmp(struct test_suite *test __maybe_unused, int subtest __ */ static int test__pmu_match(struct test_suite *test __maybe_unused, int subtest __maybe_unused) { - struct perf_pmu test_pmu; - test_pmu.alias_name = NULL; + struct perf_pmu test_pmu = { + .name = "pmuname", + }; - test_pmu.name = "pmuname"; TEST_ASSERT_EQUAL("Exact match", perf_pmu__match(&test_pmu, "pmuname"), true); TEST_ASSERT_EQUAL("Longer token", perf_pmu__match(&test_pmu, "longertoken"), false); TEST_ASSERT_EQUAL("Shorter token", perf_pmu__match(&test_pmu, "pmu"), false); -- cgit v1.2.3 From e6b56ae7c2d82976398fdbf00858f31193cf5971 Mon Sep 17 00:00:00 2001 From: Martin Liška Date: Fri, 19 Jul 2024 12:57:08 +0200 Subject: perf script: add --addr2line option Similarly to other subcommands (like report, top), it would be handy to provide a path for the addr2line command.
Signed-off-by: Martin Liska Cc: Ian Rogers Link: https://lore.kernel.org/r/eadc3e36-029d-4848-9d69-272fe5a83a26@foxlink.cz Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-script.txt | 3 +++ tools/perf/builtin-script.c | 2 ++ tools/perf/util/symbol_conf.h | 2 +- 3 files changed, 6 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index ff086ef05a0c..5abb960c4960 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -369,6 +369,9 @@ OPTIONS --demangle-kernel:: Demangle kernel symbol names to human readable form (for C++ kernels). +--addr2line=:: + Path to addr2line binary. + --header Show perf.data header. diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index c16224b1fef3..932167b2362b 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -4052,6 +4052,8 @@ int cmd_script(int argc, const char **argv) "Enable symbol demangling"), OPT_BOOLEAN(0, "demangle-kernel", &symbol_conf.demangle_kernel, "Enable kernel symbol demangling"), + OPT_STRING(0, "addr2line", &symbol_conf.addr2line_path, "path", + "addr2line binary to use for line numbers"), OPT_STRING(0, "time", &script.time_str, "str", "Time span of interest (start,stop)"), OPT_BOOLEAN(0, "inline", &symbol_conf.inline_name, diff --git a/tools/perf/util/symbol_conf.h b/tools/perf/util/symbol_conf.h index 657cfa5af43c..a9c51acc722f 100644 --- a/tools/perf/util/symbol_conf.h +++ b/tools/perf/util/symbol_conf.h @@ -64,7 +64,7 @@ struct symbol_conf { *sym_list_str, *col_width_list_str, *bt_stop_list_str; - char *addr2line_path; + const char *addr2line_path; unsigned long time_quantum; struct strlist *dso_list, *comm_list, -- cgit v1.2.3 From 043da846c2b2dfd5b187bf3a9993b8fb0a6ed94a Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Mon, 12 Aug 2024 10:34:59 +0100 Subject: perf docs: Refine the description for the buffer size The current description for the AUX trace buffer size is misleading. When a user specifies the option '-m,512M', it represents a size value in bytes (512MiB), not 512M pages (512M x 4KiB, given a page size of 4KiB). Make the document clear that the normal buffer and the AUX tracing buffer share the same semantics. Sync the documents for consistent text. Reviewed-by: James Clark Signed-off-by: Leo Yan Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240812093459.2575278-1-leo.yan@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-kvm.txt | 6 +++--- tools/perf/Documentation/perf-record.txt | 9 +++++---- tools/perf/Documentation/perf-top.txt | 4 ++-- tools/perf/Documentation/perf-trace.txt | 4 ++-- 4 files changed, 12 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-kvm.txt b/tools/perf/Documentation/perf-kvm.txt index b66be66fe836..c26524d38f47 100644 --- a/tools/perf/Documentation/perf-kvm.txt +++ b/tools/perf/Documentation/perf-kvm.txt @@ -115,9 +115,9 @@ STAT LIVE OPTIONS -m:: --mmap-pages=:: - Number of mmap data pages (must be a power of two) or size - specification with appended unit character - B/K/M/G. The - size is rounded up to have nearest pages power of two value. + Number of mmap data pages (must be a power of two) or size + specification in bytes with appended unit character - B/K/M/G. + The size is rounded up to the nearest power-of-two page value.
-a:: --all-cpus:: diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 41e36b4dc765..242223240a08 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -273,10 +273,11 @@ OPTIONS -m:: --mmap-pages=:: Number of mmap data pages (must be a power of two) or size - specification with appended unit character - B/K/M/G. The - size is rounded up to have nearest pages power of two value. - Also, by adding a comma, the number of mmap pages for AUX - area tracing can be specified. + specification in bytes with appended unit character - B/K/M/G. + The size is rounded up to the nearest power-of-two page value. + By adding a comma, an additional parameter with the same + semantics used for the normal mmap areas can be specified for + AUX tracing area. -g:: Enables call-graph (stack chain/backtrace) recording for both diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt index 667e5102075e..af3e4230c72f 100644 --- a/tools/perf/Documentation/perf-top.txt +++ b/tools/perf/Documentation/perf-top.txt @@ -83,8 +83,8 @@ Default is to monitor all CPUS. -m :: --mmap-pages=:: Number of mmap data pages (must be a power of two) or size - specification with appended unit character - B/K/M/G. The - size is rounded up to have nearest pages power of two value. + specification in bytes with appended unit character - B/K/M/G. + The size is rounded up to the nearest power-of-two page value. -p :: --pid=:: diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt index f0da8cf63e9a..6e0cc50bbc13 100644 --- a/tools/perf/Documentation/perf-trace.txt +++ b/tools/perf/Documentation/perf-trace.txt @@ -106,8 +106,8 @@ filter out the startup phase of the program, which is often very different. -m:: --mmap-pages=:: Number of mmap data pages (must be a power of two) or size - specification with appended unit character - B/K/M/G. The - size is rounded up to have nearest pages power of two value. + specification in bytes with appended unit character - B/K/M/G. + The size is rounded up to the nearest power-of-two page value. -C:: --cpu:: -- cgit v1.2.3 From 4a4c013d3385b0db85dc361203dc42ff048b6fd6 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Sat, 10 Aug 2024 10:35:04 +0100 Subject: libbpf: Fix license for btf_relocate.c License should be // SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) ...as with other libbpf files. Fixes: 19e00c897d50 ("libbpf: Split BTF relocation") Reported-by: Neill Kapron Signed-off-by: Alan Maguire Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240810093504.2111134-1-alan.maguire@oracle.com --- tools/lib/bpf/btf_relocate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/btf_relocate.c b/tools/lib/bpf/btf_relocate.c index 17f8b32f94a0..4f7399d85eab 100644 --- a/tools/lib/bpf/btf_relocate.c +++ b/tools/lib/bpf/btf_relocate.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) /* Copyright (c) 2024, Oracle and/or its affiliates. 
*/ #ifndef _GNU_SOURCE -- cgit v1.2.3 From ef32e9b6a325d1d013b30d898b4dff94082902cd Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 7 Aug 2024 23:51:41 +0200 Subject: tools/nolibc: move entrypoint specifics to compiler.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The specific attributes for the _start entrypoint are duplicated for each architecture. Deduplicate them into a dedicated #define in compiler.h. For clang compatibility, the epilogue will also need to be adapted, so move that one, too. Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20240807-nolibc-llvm-v2-5-c20f2f5fc7c2@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/arch-aarch64.h | 4 ++-- tools/include/nolibc/arch-arm.h | 4 ++-- tools/include/nolibc/arch-i386.h | 4 ++-- tools/include/nolibc/arch-loongarch.h | 4 ++-- tools/include/nolibc/arch-mips.h | 4 ++-- tools/include/nolibc/arch-powerpc.h | 4 ++-- tools/include/nolibc/arch-riscv.h | 4 ++-- tools/include/nolibc/arch-s390.h | 4 ++-- tools/include/nolibc/arch-x86_64.h | 4 ++-- tools/include/nolibc/compiler.h | 3 +++ 10 files changed, 21 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/include/nolibc/arch-aarch64.h b/tools/include/nolibc/arch-aarch64.h index b23ac1f04035..06fdef7b291a 100644 --- a/tools/include/nolibc/arch-aarch64.h +++ b/tools/include/nolibc/arch-aarch64.h @@ -142,13 +142,13 @@ }) /* startup code */ -void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_stack_protector _start(void) +void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void) { __asm__ volatile ( "mov x0, sp\n" /* save stack pointer to x0, as arg1 of _start_c */ "and sp, x0, -16\n" /* sp must be 16-byte aligned in the callee */ "bl _start_c\n" /* transfer to c runtime */ ); - __builtin_unreachable(); + __nolibc_entrypoint_epilogue(); } #endif /* _NOLIBC_ARCH_AARCH64_H */ diff --git a/tools/include/nolibc/arch-arm.h b/tools/include/nolibc/arch-arm.h index d1c19d973e55..6180ff99ab43 100644 --- a/tools/include/nolibc/arch-arm.h +++ b/tools/include/nolibc/arch-arm.h @@ -185,7 +185,7 @@ }) /* startup code */ -void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_stack_protector _start(void) +void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void) { __asm__ volatile ( "mov r0, sp\n" /* save stack pointer to %r0, as arg1 of _start_c */ @@ -193,7 +193,7 @@ void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_ "mov sp, ip\n" "bl _start_c\n" /* transfer to c runtime */ ); - __builtin_unreachable(); + __nolibc_entrypoint_epilogue(); } #endif /* _NOLIBC_ARCH_ARM_H */ diff --git a/tools/include/nolibc/arch-i386.h b/tools/include/nolibc/arch-i386.h index 28c26a00a762..ff5afc35bbd8 100644 --- a/tools/include/nolibc/arch-i386.h +++ b/tools/include/nolibc/arch-i386.h @@ -162,7 +162,7 @@ * 2) The deepest stack frame should be set to zero * */ -void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_stack_protector _start(void) +void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void) { __asm__ volatile ( "xor %ebp, %ebp\n" /* zero the stack frame */ @@ -174,7 +174,7 @@ void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_ "call _start_c\n" /* transfer to c runtime */ "hlt\n" /* ensure it does not return */ ); - __builtin_unreachable(); + __nolibc_entrypoint_epilogue(); }
#endif /* _NOLIBC_ARCH_I386_H */ diff --git a/tools/include/nolibc/arch-loongarch.h b/tools/include/nolibc/arch-loongarch.h index 3f8ef8f86c0f..fb519545959e 100644 --- a/tools/include/nolibc/arch-loongarch.h +++ b/tools/include/nolibc/arch-loongarch.h @@ -149,14 +149,14 @@ #endif /* startup code */ -void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_stack_protector _start(void) +void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void) { __asm__ volatile ( "move $a0, $sp\n" /* save stack pointer to $a0, as arg1 of _start_c */ LONG_BSTRINS " $sp, $zero, 3, 0\n" /* $sp must be 16-byte aligned */ "bl _start_c\n" /* transfer to c runtime */ ); - __builtin_unreachable(); + __nolibc_entrypoint_epilogue(); } #endif /* _NOLIBC_ARCH_LOONGARCH_H */ diff --git a/tools/include/nolibc/arch-mips.h b/tools/include/nolibc/arch-mips.h index a2ee77ed2fbb..1791a8ce58da 100644 --- a/tools/include/nolibc/arch-mips.h +++ b/tools/include/nolibc/arch-mips.h @@ -179,7 +179,7 @@ }) /* startup code, note that it's called __start on MIPS */ -void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_stack_protector __start(void) +void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector __start(void) { __asm__ volatile ( ".set push\n" @@ -200,7 +200,7 @@ void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_ " nop\n" /* delayed slot */ ".set pop\n" ); - __builtin_unreachable(); + __nolibc_entrypoint_epilogue(); } #endif /* _NOLIBC_ARCH_MIPS_H */ diff --git a/tools/include/nolibc/arch-powerpc.h b/tools/include/nolibc/arch-powerpc.h index 41ebd394b90c..ee2fdb8d601d 100644 --- a/tools/include/nolibc/arch-powerpc.h +++ b/tools/include/nolibc/arch-powerpc.h @@ -184,7 +184,7 @@ #endif /* !__powerpc64__ */ /* startup code */ -void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_stack_protector _start(void) +void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void) { #ifdef __powerpc64__ #if _CALL_ELF == 2 @@ -215,7 +215,7 @@ void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_ "bl _start_c\n" /* transfer to c runtime */ ); #endif - __builtin_unreachable(); + __nolibc_entrypoint_epilogue(); } #endif /* _NOLIBC_ARCH_POWERPC_H */ diff --git a/tools/include/nolibc/arch-riscv.h b/tools/include/nolibc/arch-riscv.h index 1927c643c739..8827bf936212 100644 --- a/tools/include/nolibc/arch-riscv.h +++ b/tools/include/nolibc/arch-riscv.h @@ -140,7 +140,7 @@ }) /* startup code */ -void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_stack_protector _start(void) +void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void) { __asm__ volatile ( ".option push\n" @@ -151,7 +151,7 @@ void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_ "andi sp, a0, -16\n" /* sp must be 16-byte aligned */ "call _start_c\n" /* transfer to c runtime */ ); - __builtin_unreachable(); + __nolibc_entrypoint_epilogue(); } #endif /* _NOLIBC_ARCH_RISCV_H */ diff --git a/tools/include/nolibc/arch-s390.h b/tools/include/nolibc/arch-s390.h index 5d60fd43f883..2ec13d8b9a2d 100644 --- a/tools/include/nolibc/arch-s390.h +++ b/tools/include/nolibc/arch-s390.h @@ -139,7 +139,7 @@ }) /* startup code */ -void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_stack_protector _start(void) +void __attribute__((weak, noreturn)) __nolibc_entrypoint 
__no_stack_protector _start(void) { __asm__ volatile ( "lgr %r2, %r15\n" /* save stack pointer to %r2, as arg1 of _start_c */ @@ -147,7 +147,7 @@ void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_ "xc 0(8,%r15), 0(%r15)\n" /* clear backchain */ "brasl %r14, _start_c\n" /* transfer to c runtime */ ); - __builtin_unreachable(); + __nolibc_entrypoint_epilogue(); } struct s390_mmap_arg_struct { diff --git a/tools/include/nolibc/arch-x86_64.h b/tools/include/nolibc/arch-x86_64.h index 68609f421934..65252c005a30 100644 --- a/tools/include/nolibc/arch-x86_64.h +++ b/tools/include/nolibc/arch-x86_64.h @@ -161,7 +161,7 @@ * 2) The deepest stack frame should be zero (the %rbp). * */ -void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_stack_protector _start(void) +void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void) { __asm__ volatile ( "xor %ebp, %ebp\n" /* zero the stack frame */ @@ -170,7 +170,7 @@ void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_ "call _start_c\n" /* transfer to c runtime */ "hlt\n" /* ensure it does not return */ ); - __builtin_unreachable(); + __nolibc_entrypoint_epilogue(); } #define NOLIBC_ARCH_HAS_MEMMOVE diff --git a/tools/include/nolibc/compiler.h b/tools/include/nolibc/compiler.h index 1730d0454a55..3a1d5b8d4fb9 100644 --- a/tools/include/nolibc/compiler.h +++ b/tools/include/nolibc/compiler.h @@ -12,6 +12,9 @@ # define __nolibc_has_attribute(attr) 0 #endif +#define __nolibc_entrypoint __attribute__((optimize("Os", "omit-frame-pointer"))) +#define __nolibc_entrypoint_epilogue() __builtin_unreachable() + #if defined(__SSP__) || defined(__SSP_STRONG__) || defined(__SSP_ALL__) || defined(__SSP_EXPLICIT__) #define _NOLIBC_STACKPROTECTOR -- cgit v1.2.3 From e098eebb63cb1c03813559b5db9da4451ba3a318 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 7 Aug 2024 23:51:42 +0200 Subject: tools/nolibc: compiler: use attribute((naked)) if available MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current entrypoint attributes optimize("Os", "omit-frame-pointer") are intended to avoid all compiler-generated code, like the function prologue and epilogue. This is the exact use case implemented by the attribute "naked". Unfortunately this is not implemented by GCC for all targets, so only use it where available. This also provides compatibility with clang, which recognizes the "naked" attribute but not the previously used attribute "optimize".
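As an illustrative sketch of what the attribute buys (x86-flavored, with made-up names, not part of the patch), a naked function gets no compiler-generated prologue or epilogue at all, so its body may consist solely of inline asm:

  __attribute__((naked)) void my_start(void)
  {
          /* no prologue emitted; the body is only this asm statement */
          __asm__ volatile ("call my_start_c\n");  /* my_start_c is hypothetical */
          /* no epilogue either, so no __builtin_unreachable() is needed */
  }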
Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20240807-nolibc-llvm-v2-6-c20f2f5fc7c2@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/compiler.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/include/nolibc/compiler.h b/tools/include/nolibc/compiler.h index 3a1d5b8d4fb9..9bc6a706a332 100644 --- a/tools/include/nolibc/compiler.h +++ b/tools/include/nolibc/compiler.h @@ -12,8 +12,13 @@ # define __nolibc_has_attribute(attr) 0 #endif -#define __nolibc_entrypoint __attribute__((optimize("Os", "omit-frame-pointer"))) -#define __nolibc_entrypoint_epilogue() __builtin_unreachable() +#if __nolibc_has_attribute(naked) +# define __nolibc_entrypoint __attribute__((naked)) +# define __nolibc_entrypoint_epilogue() +#else +# define __nolibc_entrypoint __attribute__((optimize("Os", "omit-frame-pointer"))) +# define __nolibc_entrypoint_epilogue() __builtin_unreachable() +#endif /* __nolibc_has_attribute(naked) */ #if defined(__SSP__) || defined(__SSP_STRONG__) || defined(__SSP_ALL__) || defined(__SSP_EXPLICIT__) -- cgit v1.2.3 From ddae1d7fab8c5dc5d12da120dc410c4f374d37c3 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 7 Aug 2024 23:51:43 +0200 Subject: selftests/nolibc: report failure if no testcase passed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When nolibc-test is so broken that it doesn't even start, don't report success. Reviewed-by: Shuah Khan Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20240807-nolibc-llvm-v2-7-c20f2f5fc7c2@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 3fbabab46958..46dfbb50fae5 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -157,7 +157,7 @@ LDFLAGS := REPORT ?= awk '/\[OK\][\r]*$$/{p++} /\[FAIL\][\r]*$$/{if (!f) printf("\n"); f++; print;} /\[SKIPPED\][\r]*$$/{s++} \ END{ printf("\n%3d test(s): %3d passed, %3d skipped, %3d failed => status: ", p+s+f, p, s, f); \ - if (f) printf("failure\n"); else if (s) printf("warning\n"); else printf("success\n");; \ + if (f || !p) printf("failure\n"); else if (s) printf("warning\n"); else printf("success\n");; \ printf("\nSee all results in %s\n", ARGV[1]); }' help: -- cgit v1.2.3 From f1a58f61d88642ae1e6e97e9d72d73bc70a93cb8 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 7 Aug 2024 23:51:44 +0200 Subject: selftests/nolibc: avoid passing NULL to printf("%s") MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clang on higher optimization levels detects that NULL is passed to printf("%s") and warns about it. While printf() from nolibc gracefully handles that NULL, it is undefined behavior as per POSIX, so the warning is reasonable. Avoid the warning by transforming NULL into a non-NULL placeholder.
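The pattern, reduced to a standalone sketch (plain libc, illustrative names only):

  #include <stdio.h>

  static void print_field(const char *s)
  {
          /* passing NULL to "%s" is undefined behavior per POSIX, so
           * substitute a visible placeholder instead */
          printf("<%s>\n", s ? s : "(null)");
  }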
Reviewed-by: Shuah Khan Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20240807-nolibc-llvm-v2-8-c20f2f5fc7c2@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/nolibc-test.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index 0800b10fc3f7..6fba7025c5e3 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -542,7 +542,7 @@ int expect_strzr(const char *expr, int llen) { int ret = 0; - llen += printf(" = <%s> ", expr); + llen += printf(" = <%s> ", expr ? expr : "(null)"); if (expr) { ret = 1; result(llen, FAIL); @@ -561,7 +561,7 @@ int expect_strnz(const char *expr, int llen) { int ret = 0; - llen += printf(" = <%s> ", expr); + llen += printf(" = <%s> ", expr ? expr : "(null)"); if (!expr) { ret = 1; result(llen, FAIL); -- cgit v1.2.3 From 1a1200b66fd52dca33255270a09a093592c3b877 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 7 Aug 2024 23:51:45 +0200 Subject: selftests/nolibc: determine $(srctree) first MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The nolibc-test Makefile includes various other Makefiles from the tree. At first these are included with relative paths like "../../../build/Build.include" but as soon as $(srctree) is set up, the inclusions use that instead to build full paths. To keep the style of inclusions consistent, perform the setup of $(srctree) as early as possible and use it for all inclusions. Reviewed-by: Shuah Khan Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20240807-nolibc-llvm-v2-9-c20f2f5fc7c2@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/Makefile | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 46dfbb50fae5..803a4e1bbe24 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -1,9 +1,14 @@ # SPDX-License-Identifier: GPL-2.0 # Makefile for nolibc tests -include ../../../scripts/Makefile.include -include ../../../scripts/utilities.mak +# we're in ".../tools/testing/selftests/nolibc" +ifeq ($(srctree),) +srctree := $(patsubst %/tools/testing/selftests/,%,$(dir $(CURDIR))) +endif + +include $(srctree)/tools/scripts/Makefile.include +include $(srctree)/tools/scripts/utilities.mak # We need this for the "cc-option" macro. -include ../../../build/Build.include +include $(srctree)/tools/build/Build.include ifneq ($(O),) ifneq ($(call is-absolute,$(O)),y) @@ -11,11 +16,6 @@ $(error Only absolute O= parameters are supported) endif endif -# we're in ".../tools/testing/selftests/nolibc" -ifeq ($(srctree),) -srctree := $(patsubst %/tools/testing/selftests/,%,$(dir $(CURDIR))) -endif - ifeq ($(ARCH),) include $(srctree)/scripts/subarch.include ARCH = $(SUBARCH) -- cgit v1.2.3 From ae574ae37059da67b90916f14b482bbce0c0ab01 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 7 Aug 2024 23:51:46 +0200 Subject: selftests/nolibc: add support for LLVM= parameter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Makefile.include can modify CC and CFLAGS for usage with clang. Make use of it. Makefile.include is currently used to handle the O= variable.
This is incompatible with the LLVM= handling as for O= it has to be included as early as possible, while for LLVM= it needs to be included after CFLAGS are set up. To avoid this incompatibility, switch the O= handling to custom logic. Reviewed-by: Shuah Khan Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20240807-nolibc-llvm-v2-10-c20f2f5fc7c2@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/Makefile | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 803a4e1bbe24..cdff317c35f2 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -5,7 +5,6 @@ ifeq ($(srctree),) srctree := $(patsubst %/tools/testing/selftests/,%,$(dir $(CURDIR))) endif -include $(srctree)/tools/scripts/Makefile.include include $(srctree)/tools/scripts/utilities.mak # We need this for the "cc-option" macro. include $(srctree)/tools/build/Build.include @@ -14,6 +13,9 @@ ifneq ($(O),) ifneq ($(call is-absolute,$(O)),y) $(error Only absolute O= parameters are supported) endif +objtree := $(O) +else +objtree ?= $(srctree) endif ifeq ($(ARCH),) @@ -21,8 +23,6 @@ include $(srctree)/scripts/subarch.include ARCH = $(SUBARCH) endif -objtree ?= $(srctree) - # XARCH extends the kernel's ARCH with a few variants of the same # architecture that only differ by the configuration, the toolchain # and the Qemu program used. It is copied as-is into ARCH except for @@ -155,6 +155,9 @@ CFLAGS ?= -Os -fno-ident -fno-asynchronous-unwind-tables -std=c89 -W -Wall -Wex $(CFLAGS_$(XARCH)) $(CFLAGS_STACKPROTECTOR) $(CFLAGS_EXTRA) LDFLAGS := +# Modify CFLAGS based on LLVM= +include $(srctree)/tools/scripts/Makefile.include + REPORT ?= awk '/\[OK\][\r]*$$/{p++} /\[FAIL\][\r]*$$/{if (!f) printf("\n"); f++; print;} /\[SKIPPED\][\r]*$$/{s++} \ END{ printf("\n%3d test(s): %3d passed, %3d skipped, %3d failed => status: ", p+s+f, p, s, f); \ if (f || !p) printf("failure\n"); else if (s) printf("warning\n"); else printf("success\n");; \ -- cgit v1.2.3 From 1bd75aeb5446eb2b1351465e88a98fd784003250 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 7 Aug 2024 23:51:47 +0200 Subject: selftests/nolibc: add cc-option compatible with clang cross builds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cc-option macro from Build.include is not compatible with clang cross builds, as it does not respect the "--target" and similar flags set up by Makefile.include. Provide a custom variant which works correctly. Reviewed-by: Shuah Khan Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20240807-nolibc-llvm-v2-11-c20f2f5fc7c2@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index cdff317c35f2..b8577086e008 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -6,8 +6,8 @@ srctree := $(patsubst %/tools/testing/selftests/,%,$(dir $(CURDIR))) endif include $(srctree)/tools/scripts/utilities.mak -# We need this for the "__cc-option" macro.
+include $(srctree)/scripts/Makefile.compiler ifneq ($(O),) ifneq ($(call is-absolute,$(O)),y) @@ -23,6 +23,8 @@ include $(srctree)/scripts/subarch.include ARCH = $(SUBARCH) endif +cc-option = $(call __cc-option, $(CC),$(CLANG_CROSS_FLAGS),$(1),$(2)) + # XARCH extends the kernel's ARCH with a few variants of the same # architecture that only differ by the configuration, the toolchain # and the Qemu program used. It is copied as-is into ARCH except for -- cgit v1.2.3 From 27e458bbebdb0aa2aecce86d22c7f02b66e68ee1 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 7 Aug 2024 23:51:48 +0200 Subject: selftests/nolibc: run-tests.sh: avoid overwriting CFLAGS_EXTRA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the user specified their own CFLAGS_EXTRA, these should not be overwritten by `-e`. Reviewed-by: Shuah Khan Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20240807-nolibc-llvm-v2-12-c20f2f5fc7c2@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/run-tests.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/run-tests.sh b/tools/testing/selftests/nolibc/run-tests.sh index 0446e6326a40..324509b99e2c 100755 --- a/tools/testing/selftests/nolibc/run-tests.sh +++ b/tools/testing/selftests/nolibc/run-tests.sh @@ -15,7 +15,7 @@ download_location="${cache_dir}/crosstools/" build_location="$(realpath "${cache_dir}"/nolibc-tests/)" perform_download=0 test_mode=system -CFLAGS_EXTRA="-Werror" +werror=1 archs="i386 x86_64 arm64 arm mips32le mips32be ppc ppc64 ppc64le riscv s390 loongarch" TEMP=$(getopt -o 'j:d:c:b:a:m:peh' -n "$0" -- "$@") eval set -- "$TEMP" unset TEMP @@ -69,7 +69,7 @@ while true; do test_mode="$2" shift 2; continue ;; '-e') - CFLAGS_EXTRA="" + werror=0 shift; continue ;; '-h') print_usage @@ -140,6 +140,9 @@ test_arch() { ct_abi=$(crosstool_abi "$1") cross_compile=$(realpath "${download_location}gcc-${crosstool_version}-nolibc/${ct_arch}-${ct_abi}/bin/${ct_arch}-${ct_abi}-") build_dir="${build_location}/${arch}" + if [ "$werror" -ne 0 ]; then + CFLAGS_EXTRA="$CFLAGS_EXTRA -Werror" + fi MAKE=(make -j"${nproc}" XARCH="${arch}" CROSS_COMPILE="${cross_compile}" O="${build_dir}") mkdir -p "$build_dir" -- cgit v1.2.3 From 801cf69ca03040a75ed73be263aa6f0fdcb8af5d Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 7 Aug 2024 23:51:49 +0200 Subject: selftests/nolibc: don't use libgcc when building with clang MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The logic in clang to find libgcc.a from a GCC toolchain for a specific ABI does not work reliably and can lead to errors. Instead, disable libgcc when building with clang, as it's not needed anyway.
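For background (an illustration, not part of the patch): -lgcc is normally needed with GCC because code generation may emit calls to libgcc helpers, e.g. a 64-bit division on a 32-bit target is typically lowered to a call to __udivdi3, whereas clang provides equivalent builtins via compiler-rt:

  /* on a 32-bit target GCC typically turns this division into a call
   * to the libgcc helper __udivdi3 */
  unsigned long long div64(unsigned long long a, unsigned long long b)
  {
          return a / b;
  }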
Acked-by: Willy Tarreau Reviewed-by: Shuah Khan Link: https://lore.kernel.org/r/20240807-nolibc-llvm-v2-13-c20f2f5fc7c2@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/Makefile | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index b8577086e008..f0ce8264baac 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -157,6 +157,13 @@ CFLAGS ?= -Os -fno-ident -fno-asynchronous-unwind-tables -std=c89 -W -Wall -Wex $(CFLAGS_$(XARCH)) $(CFLAGS_STACKPROTECTOR) $(CFLAGS_EXTRA) LDFLAGS := +LIBGCC := -lgcc + +ifneq ($(LLVM),) +# Not needed for clang +LIBGCC := +endif + # Modify CFLAGS based on LLVM= include $(srctree)/tools/scripts/Makefile.include @@ -209,11 +216,11 @@ sysroot/$(ARCH)/include: ifneq ($(NOLIBC_SYSROOT),0) nolibc-test: nolibc-test.c nolibc-test-linkage.c sysroot/$(ARCH)/include $(QUIET_CC)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ \ - -nostdlib -nostdinc -static -Isysroot/$(ARCH)/include nolibc-test.c nolibc-test-linkage.c -lgcc + -nostdlib -nostdinc -static -Isysroot/$(ARCH)/include nolibc-test.c nolibc-test-linkage.c $(LIBGCC) else nolibc-test: nolibc-test.c nolibc-test-linkage.c $(QUIET_CC)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ \ - -nostdlib -static -include $(srctree)/tools/include/nolibc/nolibc.h nolibc-test.c nolibc-test-linkage.c -lgcc + -nostdlib -static -include $(srctree)/tools/include/nolibc/nolibc.h nolibc-test.c nolibc-test-linkage.c $(LIBGCC) endif libc-test: nolibc-test.c nolibc-test-linkage.c -- cgit v1.2.3 From 8404af7e13eeef91d0e69b44214cbafc2767b7f2 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 7 Aug 2024 23:51:50 +0200 Subject: selftests/nolibc: use correct clang target for s390/systemz MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The target names between GCC and clang differ for s390. While GCC uses "s390", clang uses "systemz". This mapping is not handled by tools/scripts/Makefile.include, so do it in the nolibc-test Makefile. Acked-by: Willy Tarreau Reviewed-by: Shuah Khan Link: https://lore.kernel.org/r/20240807-nolibc-llvm-v2-14-c20f2f5fc7c2@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/Makefile | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index f0ce8264baac..8de98ea7af80 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -167,6 +167,9 @@ endif # Modify CFLAGS based on LLVM= include $(srctree)/tools/scripts/Makefile.include +# GCC uses "s390", clang "systemz" +CLANG_CROSS_FLAGS := $(subst --target=s390-linux,--target=systemz-linux,$(CLANG_CROSS_FLAGS)) + REPORT ?= awk '/\[OK\][\r]*$$/{p++} /\[FAIL\][\r]*$$/{if (!f) printf("\n"); f++; print;} /\[SKIPPED\][\r]*$$/{s++} \ END{ printf("\n%3d test(s): %3d passed, %3d skipped, %3d failed => status: ", p+s+f, p, s, f); \ if (f || !p) printf("failure\n"); else if (s) printf("warning\n"); else printf("success\n");; \ -- cgit v1.2.3 From 22ba81c50a49468d80055674d8d6f78afb1c92c4 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 7 Aug 2024 23:51:51 +0200 Subject: selftests/nolibc: run-tests.sh: allow building through LLVM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The nolibc tests can now be properly built with LLVM. 
Expose this through run-tests.sh. Reviewed-by: Shuah Khan Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20240807-nolibc-llvm-v2-15-c20f2f5fc7c2@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/run-tests.sh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/nolibc/run-tests.sh b/tools/testing/selftests/nolibc/run-tests.sh index 324509b99e2c..e7ecda4ae796 100755 --- a/tools/testing/selftests/nolibc/run-tests.sh +++ b/tools/testing/selftests/nolibc/run-tests.sh @@ -16,9 +16,10 @@ build_location="$(realpath "${cache_dir}"/nolibc-tests/)" perform_download=0 test_mode=system werror=1 +llvm= archs="i386 x86_64 arm64 arm mips32le mips32be ppc ppc64 ppc64le riscv s390 loongarch" -TEMP=$(getopt -o 'j:d:c:b:a:m:peh' -n "$0" -- "$@") +TEMP=$(getopt -o 'j:d:c:b:a:m:pelh' -n "$0" -- "$@") eval set -- "$TEMP" unset TEMP @@ -42,6 +43,7 @@ Options: -b [DIR] Build location (default: ${build_location}) -m [MODE] Test mode user/system (default: ${test_mode}) -e Disable -Werror + -l Build with LLVM/clang EOF } @@ -71,6 +73,9 @@ while true; do '-e') werror=0 shift; continue ;; + '-l') + llvm=1 + shift; continue ;; '-h') print_usage exit 0 @@ -143,7 +148,7 @@ test_arch() { if [ "$werror" -ne 0 ]; then CFLAGS_EXTRA="$CFLAGS_EXTRA -Werror" fi - MAKE=(make -j"${nproc}" XARCH="${arch}" CROSS_COMPILE="${cross_compile}" O="${build_dir}") + MAKE=(make -j"${nproc}" XARCH="${arch}" CROSS_COMPILE="${cross_compile}" LLVM="${llvm}" O="${build_dir}") mkdir -p "$build_dir" if [ "$test_mode" = "system" ] && [ ! -f "${build_dir}/.config" ]; then -- cgit v1.2.3 From 00b04242683ebae91164244883b46e6ab2669d63 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sat, 10 Aug 2024 12:15:02 -0700 Subject: perf annotate-data: Fix a buffer overflow in TUI browser In get_member_overhead(), k is updated when it has an entry in the histogram. But the entry->hists array is allocated with the number of evsels in the group. So k should be reset when iterating the events with for_each_group_evsel(); otherwise it'd crash due to a buffer overflow. Fixes: cb1898f58e0f175d ("perf annotate-data: Support --skip-empty option") Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240810191502.1947959-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/annotate-data.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/ui/browsers/annotate-data.c b/tools/perf/ui/browsers/annotate-data.c index c3db80a7589a..a937b55da736 100644 --- a/tools/perf/ui/browsers/annotate-data.c +++ b/tools/perf/ui/browsers/annotate-data.c @@ -46,11 +46,12 @@ static int get_member_overhead(struct annotated_data_type *adt, struct annotated_member *member = entry->data; int i, k; - for (i = 0, k = 0; i < member->size; i++) { + for (i = 0; i < member->size; i++) { struct type_hist *h; struct evsel *evsel; int offset = member->offset + i; + k = 0; for_each_group_evsel(evsel, leader) { if (symbol_conf.skip_empty && evsel__hists(evsel)->stats.nr_samples == 0) -- cgit v1.2.3 From 040c0f887fdcfe747a3f63c94e9cd29e9ed0b872 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 12 Aug 2024 10:25:33 -0700 Subject: perf lock contention: Change stack_id type to s32 The bpf_get_stackid() helper returns a signed type to check whether it failed to get a stacktrace or not.
But it saved the result in a u32 and checked whether the value is negative. 376 if (needs_callstack) { 377 pelem->stack_id = bpf_get_stackid(ctx, &stacks, 378 BPF_F_FAST_STACK_CMP | stack_skip); --> 379 if (pelem->stack_id < 0) ./tools/perf/util/bpf_skel/lock_contention.bpf.c:379 contention_begin() warn: unsigned 'pelem->stack_id' is never less than zero. Let's change the type to s32 instead. Fixes: 6d499a6b3d90277d ("perf lock: Print the number of lost entries for BPF") Reported-by: Dan Carpenter Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240812172533.2015291-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf_skel/lock_data.h | 4 ++-- tools/perf/util/bpf_skel/vmlinux/vmlinux.h | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf_skel/lock_data.h b/tools/perf/util/bpf_skel/lock_data.h index 36af11faad03..de12892f992f 100644 --- a/tools/perf/util/bpf_skel/lock_data.h +++ b/tools/perf/util/bpf_skel/lock_data.h @@ -7,11 +7,11 @@ struct tstamp_data { u64 timestamp; u64 lock; u32 flags; - u32 stack_id; + s32 stack_id; }; struct contention_key { - u32 stack_id; + s32 stack_id; u32 pid; u64 lock_addr_or_cgroup; }; diff --git a/tools/perf/util/bpf_skel/vmlinux/vmlinux.h b/tools/perf/util/bpf_skel/vmlinux/vmlinux.h index e9028235d771..d818e30c5457 100644 --- a/tools/perf/util/bpf_skel/vmlinux/vmlinux.h +++ b/tools/perf/util/bpf_skel/vmlinux/vmlinux.h @@ -15,6 +15,7 @@ typedef __u8 u8; typedef __u32 u32; +typedef __s32 s32; typedef __u64 u64; typedef __s64 s64; -- cgit v1.2.3 From 7a75c6c23a2ea8dd22d90805b3a42bd65c53830e Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 10 Aug 2024 21:20:04 -0700 Subject: perf vendor events: SKX, CLX, SNR uncore cache event fixes Cache home agent (CHA) events were setting the low rather than high config1 bits. SNR was using CLX CHA events; however, its CHA is similar to ICX's, so remove the events.
Incorporate the updates in: https://github.com/intel/perfmon/pull/215 https://github.com/intel/perfmon/pull/216 Fixes: 4cc49942444e958b ("perf vendor events: Update cascadelakex events/metrics") Closes: https://lore.kernel.org/linux-perf-users/CAPhsuW4nem9XZP+b=sJJ7kqXG-cafz0djZf51HsgjCiwkGBA+A@mail.gmail.com/ Reported-by: Song Liu Reviewed-by: Kan Liang Co-authored-by: Weilin Wang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240811042004.421869-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/x86/cascadelakex/uncore-cache.json | 60 +++++++++++----------- .../pmu-events/arch/x86/skylakex/uncore-cache.json | 60 +++++++++++----------- .../arch/x86/snowridgex/uncore-cache.json | 57 -------------------- 3 files changed, 60 insertions(+), 117 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/uncore-cache.json b/tools/perf/pmu-events/arch/x86/cascadelakex/uncore-cache.json index c9596e18ec09..6347eba48810 100644 --- a/tools/perf/pmu-events/arch/x86/cascadelakex/uncore-cache.json +++ b/tools/perf/pmu-events/arch/x86/cascadelakex/uncore-cache.json @@ -4577,7 +4577,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_CRD", - "Filter": "config1=0x40233", + "Filter": "config1=0x4023300000000", "PerPkg": "1", "PublicDescription": "TOR Inserts : CRds issued by iA Cores that Hit the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x11", @@ -4588,7 +4588,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_DRD", - "Filter": "config1=0x40433", + "Filter": "config1=0x4043300000000", "PerPkg": "1", "PublicDescription": "TOR Inserts : DRds issued by iA Cores that Hit the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x11", @@ -4599,7 +4599,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_LlcPrefCRD", - "Filter": "config1=0x4b233", + "Filter": "config1=0x4b23300000000", "PerPkg": "1", "UMask": "0x11", "Unit": "CHA" @@ -4609,7 +4609,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_LlcPrefDRD", - "Filter": "config1=0x4b433", + "Filter": "config1=0x4b43300000000", "PerPkg": "1", "UMask": "0x11", "Unit": "CHA" @@ -4619,7 +4619,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_LlcPrefRFO", - "Filter": "config1=0x4b033", + "Filter": "config1=0x4b03300000000", "PerPkg": "1", "PublicDescription": "TOR Inserts : LLCPrefRFO issued by iA Cores that hit the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. 
Does not include addressless requests such as locks and interrupts.", "UMask": "0x11", @@ -4630,7 +4630,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_RFO", - "Filter": "config1=0x40033", + "Filter": "config1=0x4003300000000", "PerPkg": "1", "PublicDescription": "TOR Inserts : RFOs issued by iA Cores that Hit the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x11", @@ -4651,7 +4651,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_CRD", - "Filter": "config1=0x40233", + "Filter": "config1=0x4023300000000", "PerPkg": "1", "PublicDescription": "TOR Inserts : CRds issued by iA Cores that Missed the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x21", @@ -4662,7 +4662,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_DRD", - "Filter": "config1=0x40433", + "Filter": "config1=0x4043300000000", "PerPkg": "1", "PublicDescription": "TOR Inserts : DRds issued by iA Cores that Missed the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x21", @@ -4673,7 +4673,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LlcPrefCRD", - "Filter": "config1=0x4b233", + "Filter": "config1=0x4b23300000000", "PerPkg": "1", "UMask": "0x21", "Unit": "CHA" @@ -4683,7 +4683,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LlcPrefDRD", - "Filter": "config1=0x4b433", + "Filter": "config1=0x4b43300000000", "PerPkg": "1", "UMask": "0x21", "Unit": "CHA" @@ -4693,7 +4693,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LlcPrefRFO", - "Filter": "config1=0x4b033", + "Filter": "config1=0x4b03300000000", "PerPkg": "1", "PublicDescription": "TOR Inserts : LLCPrefRFO issued by iA Cores that missed the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x21", @@ -4704,7 +4704,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_RFO", - "Filter": "config1=0x40033", + "Filter": "config1=0x4003300000000", "PerPkg": "1", "PublicDescription": "TOR Inserts : RFOs issued by iA Cores that Missed the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x21", @@ -4747,7 +4747,7 @@ "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_ITOM", "Experimental": "1", - "Filter": "config1=0x49033", + "Filter": "config1=0x4903300000000", "PerPkg": "1", "PublicDescription": "Counts the number of entries successfully inserted into the TOR that are generated from local IO ItoM requests that miss the LLC. 
An ItoM request is used by IIO to request a data write without first reading the data for ownership.", "UMask": "0x24", @@ -4759,7 +4759,7 @@ "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_RDCUR", "Experimental": "1", - "Filter": "config1=0x43C33", + "Filter": "config1=0x43c3300000000", "PerPkg": "1", "PublicDescription": "Counts the number of entries successfully inserted into the TOR that are generated from local IO RdCur requests and miss the LLC. A RdCur request is used by IIO to read data without changing state.", "UMask": "0x24", @@ -4771,7 +4771,7 @@ "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_RFO", "Experimental": "1", - "Filter": "config1=0x40033", + "Filter": "config1=0x4003300000000", "PerPkg": "1", "PublicDescription": "Counts the number of entries successfully inserted into the TOR that are generated from local IO RFO requests that miss the LLC. A read for ownership (RFO) requests a cache line to be cached in E state with the intent to modify.", "UMask": "0x24", @@ -4999,7 +4999,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_CRD", - "Filter": "config1=0x40233", + "Filter": "config1=0x4023300000000", "PerPkg": "1", "PublicDescription": "TOR Occupancy : CRds issued by iA Cores that Hit the LLC : For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x11", @@ -5010,7 +5010,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_DRD", - "Filter": "config1=0x40433", + "Filter": "config1=0x4043300000000", "PerPkg": "1", "PublicDescription": "TOR Occupancy : DRds issued by iA Cores that Hit the LLC : For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x11", @@ -5021,7 +5021,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_LlcPrefCRD", - "Filter": "config1=0x4b233", + "Filter": "config1=0x4b23300000000", "PerPkg": "1", "UMask": "0x11", "Unit": "CHA" @@ -5031,7 +5031,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_LlcPrefDRD", - "Filter": "config1=0x4b433", + "Filter": "config1=0x4b43300000000", "PerPkg": "1", "UMask": "0x11", "Unit": "CHA" @@ -5041,7 +5041,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_LlcPrefRFO", - "Filter": "config1=0x4b033", + "Filter": "config1=0x4b03300000000", "PerPkg": "1", "PublicDescription": "TOR Occupancy : LLCPrefRFO issued by iA Cores that hit the LLC : For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x11", @@ -5052,7 +5052,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_RFO", - "Filter": "config1=0x40033", + "Filter": "config1=0x4003300000000", "PerPkg": "1", "PublicDescription": "TOR Occupancy : RFOs issued by iA Cores that Hit the LLC : For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. 
Does not include addressless requests such as locks and interrupts.", "UMask": "0x11", @@ -5073,7 +5073,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_CRD", - "Filter": "config1=0x40233", + "Filter": "config1=0x4023300000000", "PerPkg": "1", "PublicDescription": "TOR Occupancy : CRds issued by iA Cores that Missed the LLC : For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x21", @@ -5084,7 +5084,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD", - "Filter": "config1=0x40433", + "Filter": "config1=0x4043300000000", "PerPkg": "1", "PublicDescription": "TOR Occupancy : DRds issued by iA Cores that Missed the LLC : For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x21", @@ -5095,7 +5095,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LlcPrefCRD", - "Filter": "config1=0x4b233", + "Filter": "config1=0x4b23300000000", "PerPkg": "1", "UMask": "0x21", "Unit": "CHA" @@ -5105,7 +5105,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LlcPrefDRD", - "Filter": "config1=0x4b433", + "Filter": "config1=0x4b43300000000", "PerPkg": "1", "UMask": "0x21", "Unit": "CHA" @@ -5115,7 +5115,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LlcPrefRFO", - "Filter": "config1=0x4b033", + "Filter": "config1=0x4b03300000000", "PerPkg": "1", "PublicDescription": "TOR Occupancy : LLCPrefRFO issued by iA Cores that missed the LLC : For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x21", @@ -5126,7 +5126,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_RFO", - "Filter": "config1=0x40033", + "Filter": "config1=0x4003300000000", "PerPkg": "1", "PublicDescription": "TOR Occupancy : RFOs issued by iA Cores that Missed the LLC : For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x21", @@ -5171,7 +5171,7 @@ "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOM", "Experimental": "1", - "Filter": "config1=0x49033", + "Filter": "config1=0x4903300000000", "PerPkg": "1", "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that are generated from local IO ItoM requests that miss the LLC. An ItoM is used by IIO to request a data write without first reading the data for ownership.", "UMask": "0x24", @@ -5183,7 +5183,7 @@ "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_RDCUR", "Experimental": "1", - "Filter": "config1=0x43C33", + "Filter": "config1=0x43c3300000000", "PerPkg": "1", "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that are generated from local IO RdCur requests that miss the LLC. 
A RdCur request is used by IIO to read data without changing state.", "UMask": "0x24", @@ -5195,7 +5195,7 @@ "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_RFO", "Experimental": "1", - "Filter": "config1=0x40033", + "Filter": "config1=0x4003300000000", "PerPkg": "1", "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that are generated from local IO RFO requests that miss the LLC. A read for ownership (RFO) requests data to be cached in E state with the intent to modify.", "UMask": "0x24", diff --git a/tools/perf/pmu-events/arch/x86/skylakex/uncore-cache.json b/tools/perf/pmu-events/arch/x86/skylakex/uncore-cache.json index da46a3aeb58c..4fc818626491 100644 --- a/tools/perf/pmu-events/arch/x86/skylakex/uncore-cache.json +++ b/tools/perf/pmu-events/arch/x86/skylakex/uncore-cache.json @@ -4454,7 +4454,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_CRD", - "Filter": "config1=0x40233", + "Filter": "config1=0x4023300000000", "PerPkg": "1", "PublicDescription": "TOR Inserts : CRds issued by iA Cores that Hit the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x11", @@ -4465,7 +4465,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_DRD", - "Filter": "config1=0x40433", + "Filter": "config1=0x4043300000000", "PerPkg": "1", "PublicDescription": "TOR Inserts : DRds issued by iA Cores that Hit the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x11", @@ -4476,7 +4476,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_LlcPrefCRD", - "Filter": "config1=0x4b233", + "Filter": "config1=0x4b23300000000", "PerPkg": "1", "UMask": "0x11", "Unit": "CHA" @@ -4486,7 +4486,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_LlcPrefDRD", - "Filter": "config1=0x4b433", + "Filter": "config1=0x4b43300000000", "PerPkg": "1", "UMask": "0x11", "Unit": "CHA" @@ -4496,7 +4496,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_LlcPrefRFO", - "Filter": "config1=0x4b033", + "Filter": "config1=0x4b03300000000", "PerPkg": "1", "PublicDescription": "TOR Inserts : LLCPrefRFO issued by iA Cores that hit the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x11", @@ -4507,7 +4507,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_RFO", - "Filter": "config1=0x40033", + "Filter": "config1=0x4003300000000", "PerPkg": "1", "PublicDescription": "TOR Inserts : RFOs issued by iA Cores that Hit the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. 
Does not include addressless requests such as locks and interrupts.", "UMask": "0x11", @@ -4528,7 +4528,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_CRD", - "Filter": "config1=0x40233", + "Filter": "config1=0x4023300000000", "PerPkg": "1", "PublicDescription": "TOR Inserts : CRds issued by iA Cores that Missed the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x21", @@ -4539,7 +4539,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_DRD", - "Filter": "config1=0x40433", + "Filter": "config1=0x4043300000000", "PerPkg": "1", "PublicDescription": "TOR Inserts : DRds issued by iA Cores that Missed the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x21", @@ -4550,7 +4550,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LlcPrefCRD", - "Filter": "config1=0x4b233", + "Filter": "config1=0x4b23300000000", "PerPkg": "1", "UMask": "0x21", "Unit": "CHA" @@ -4560,7 +4560,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LlcPrefDRD", - "Filter": "config1=0x4b433", + "Filter": "config1=0x4b43300000000", "PerPkg": "1", "UMask": "0x21", "Unit": "CHA" @@ -4570,7 +4570,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LlcPrefRFO", - "Filter": "config1=0x4b033", + "Filter": "config1=0x4b03300000000", "PerPkg": "1", "PublicDescription": "TOR Inserts : LLCPrefRFO issued by iA Cores that missed the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x21", @@ -4581,7 +4581,7 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_RFO", - "Filter": "config1=0x40033", + "Filter": "config1=0x4003300000000", "PerPkg": "1", "PublicDescription": "TOR Inserts : RFOs issued by iA Cores that Missed the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x21", @@ -4624,7 +4624,7 @@ "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_ITOM", "Experimental": "1", - "Filter": "config1=0x49033", + "Filter": "config1=0x4903300000000", "PerPkg": "1", "PublicDescription": "Counts the number of entries successfully inserted into the TOR that are generated from local IO ItoM requests that miss the LLC. An ItoM request is used by IIO to request a data write without first reading the data for ownership.", "UMask": "0x24", @@ -4636,7 +4636,7 @@ "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_RDCUR", "Experimental": "1", - "Filter": "config1=0x43C33", + "Filter": "config1=0x43c3300000000", "PerPkg": "1", "PublicDescription": "Counts the number of entries successfully inserted into the TOR that are generated from local IO RdCur requests and miss the LLC. 
A RdCur request is used by IIO to read data without changing state.", "UMask": "0x24", @@ -4648,7 +4648,7 @@ "EventCode": "0x35", "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_RFO", "Experimental": "1", - "Filter": "config1=0x40033", + "Filter": "config1=0x4003300000000", "PerPkg": "1", "PublicDescription": "Counts the number of entries successfully inserted into the TOR that are generated from local IO RFO requests that miss the LLC. A read for ownership (RFO) requests a cache line to be cached in E state with the intent to modify.", "UMask": "0x24", @@ -4865,7 +4865,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_CRD", - "Filter": "config1=0x40233", + "Filter": "config1=0x4023300000000", "PerPkg": "1", "PublicDescription": "TOR Occupancy : CRds issued by iA Cores that Hit the LLC : For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x11", @@ -4876,7 +4876,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_DRD", - "Filter": "config1=0x40433", + "Filter": "config1=0x4043300000000", "PerPkg": "1", "PublicDescription": "TOR Occupancy : DRds issued by iA Cores that Hit the LLC : For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x11", @@ -4887,7 +4887,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_LlcPrefCRD", - "Filter": "config1=0x4b233", + "Filter": "config1=0x4b23300000000", "PerPkg": "1", "UMask": "0x11", "Unit": "CHA" @@ -4897,7 +4897,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_LlcPrefDRD", - "Filter": "config1=0x4b433", + "Filter": "config1=0x4b43300000000", "PerPkg": "1", "UMask": "0x11", "Unit": "CHA" @@ -4907,7 +4907,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_LlcPrefRFO", - "Filter": "config1=0x4b033", + "Filter": "config1=0x4b03300000000", "PerPkg": "1", "PublicDescription": "TOR Occupancy : LLCPrefRFO issued by iA Cores that hit the LLC : For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x11", @@ -4918,7 +4918,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_RFO", - "Filter": "config1=0x40033", + "Filter": "config1=0x4003300000000", "PerPkg": "1", "PublicDescription": "TOR Occupancy : RFOs issued by iA Cores that Hit the LLC : For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x11", @@ -4939,7 +4939,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_CRD", - "Filter": "config1=0x40233", + "Filter": "config1=0x4023300000000", "PerPkg": "1", "PublicDescription": "TOR Occupancy : CRds issued by iA Cores that Missed the LLC : For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. 
Does not include addressless requests such as locks and interrupts.", "UMask": "0x21", @@ -4950,7 +4950,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD", - "Filter": "config1=0x40433", + "Filter": "config1=0x4043300000000", "PerPkg": "1", "PublicDescription": "TOR Occupancy : DRds issued by iA Cores that Missed the LLC : For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x21", @@ -4961,7 +4961,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LlcPrefCRD", - "Filter": "config1=0x4b233", + "Filter": "config1=0x4b23300000000", "PerPkg": "1", "UMask": "0x21", "Unit": "CHA" @@ -4971,7 +4971,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LlcPrefDRD", - "Filter": "config1=0x4b433", + "Filter": "config1=0x4b43300000000", "PerPkg": "1", "UMask": "0x21", "Unit": "CHA" @@ -4981,7 +4981,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LlcPrefRFO", - "Filter": "config1=0x4b033", + "Filter": "config1=0x4b03300000000", "PerPkg": "1", "PublicDescription": "TOR Occupancy : LLCPrefRFO issued by iA Cores that missed the LLC : For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x21", @@ -4992,7 +4992,7 @@ "Counter": "0", "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_RFO", - "Filter": "config1=0x40033", + "Filter": "config1=0x4003300000000", "PerPkg": "1", "PublicDescription": "TOR Occupancy : RFOs issued by iA Cores that Missed the LLC : For each cycle, this event accumulates the number of valid entries in the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", "UMask": "0x21", @@ -5037,7 +5037,7 @@ "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOM", "Experimental": "1", - "Filter": "config1=0x49033", + "Filter": "config1=0x4903300000000", "PerPkg": "1", "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that are generated from local IO ItoM requests that miss the LLC. An ItoM is used by IIO to request a data write without first reading the data for ownership.", "UMask": "0x24", @@ -5049,7 +5049,7 @@ "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_RDCUR", "Experimental": "1", - "Filter": "config1=0x43C33", + "Filter": "config1=0x43c3300000000", "PerPkg": "1", "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that are generated from local IO RdCur requests that miss the LLC. A RdCur request is used by IIO to read data without changing state.", "UMask": "0x24", @@ -5061,7 +5061,7 @@ "EventCode": "0x36", "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_RFO", "Experimental": "1", - "Filter": "config1=0x40033", + "Filter": "config1=0x4003300000000", "PerPkg": "1", "PublicDescription": "For each cycle, this event accumulates the number of valid entries in the TOR that are generated from local IO RFO requests that miss the LLC. 
A read for ownership (RFO) requests data to be cached in E state with the intent to modify.", "UMask": "0x24", diff --git a/tools/perf/pmu-events/arch/x86/snowridgex/uncore-cache.json b/tools/perf/pmu-events/arch/x86/snowridgex/uncore-cache.json index 7551fb91a9d7..a81776deb2e6 100644 --- a/tools/perf/pmu-events/arch/x86/snowridgex/uncore-cache.json +++ b/tools/perf/pmu-events/arch/x86/snowridgex/uncore-cache.json @@ -1,61 +1,4 @@ [ - { - "BriefDescription": "MMIO reads. Derived from unc_cha_tor_inserts.ia_miss", - "Counter": "0,1,2,3", - "EventCode": "0x35", - "EventName": "LLC_MISSES.MMIO_READ", - "Filter": "config1=0x40040e33", - "PerPkg": "1", - "PublicDescription": "TOR Inserts : All requests from iA Cores that Missed the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", - "UMask": "0xc001fe01", - "Unit": "CHA" - }, - { - "BriefDescription": "MMIO writes. Derived from unc_cha_tor_inserts.ia_miss", - "Counter": "0,1,2,3", - "EventCode": "0x35", - "EventName": "LLC_MISSES.MMIO_WRITE", - "Filter": "config1=0x40041e33", - "PerPkg": "1", - "PublicDescription": "TOR Inserts : All requests from iA Cores that Missed the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", - "UMask": "0xc001fe01", - "Unit": "CHA" - }, - { - "BriefDescription": "LLC misses - Uncacheable reads (from cpu) . Derived from unc_cha_tor_inserts.ia_miss", - "Counter": "0,1,2,3", - "EventCode": "0x35", - "EventName": "LLC_MISSES.UNCACHEABLE", - "Filter": "config1=0x40e33", - "PerPkg": "1", - "PublicDescription": "TOR Inserts : All requests from iA Cores that Missed the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", - "UMask": "0xc001fe01", - "Unit": "CHA" - }, - { - "BriefDescription": "Streaming stores (full cache line). Derived from unc_cha_tor_inserts.ia_miss", - "Counter": "0,1,2,3", - "EventCode": "0x35", - "EventName": "LLC_REFERENCES.STREAMING_FULL", - "Filter": "config1=0x41833", - "PerPkg": "1", - "PublicDescription": "TOR Inserts : All requests from iA Cores that Missed the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. Does not include addressless requests such as locks and interrupts.", - "ScaleUnit": "64Bytes", - "UMask": "0xc001fe01", - "Unit": "CHA" - }, - { - "BriefDescription": "Streaming stores (partial cache line). Derived from unc_cha_tor_inserts.ia_miss", - "Counter": "0,1,2,3", - "EventCode": "0x35", - "EventName": "LLC_REFERENCES.STREAMING_PARTIAL", - "Filter": "config1=0x41a33", - "PerPkg": "1", - "PublicDescription": "TOR Inserts : All requests from iA Cores that Missed the LLC : Counts the number of entries successfully inserted into the TOR that match qualifications specified by the subevent. 
Does not include addressless requests such as locks and interrupts.", - "ScaleUnit": "64Bytes", - "UMask": "0xc001fe01", - "Unit": "CHA" - }, { "BriefDescription": "CMS Agent0 AD Credits Acquired : For Transgress 0", "Counter": "0,1,2,3", -- cgit v1.2.3 From 05fc5b7de395c82a17871e441a551566a49c1ea8 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 12 Aug 2024 12:44:45 -0700 Subject: perf annotate-data: Support folding in TUI browser MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Like in the hists browser, it should support folding current entry so that it can hide unwanted details in some data structures. The folded entries will be displayed with the '+' sign, while unfolded entries will have the '-' sign. Entries that have no children will not show any signs. Annotate type: 'struct socket' (1 samples) Percent Offset Size Field - 100.00 0 128 struct socket { ◆ 0.00 0 4 socket_state state; ▒ 0.00 4 2 short int type; ▒ 0.00 8 8 long unsigned int flags; ▒ 0.00 16 8 struct file* file; ▒ 100.00 24 8 struct sock* sk; ▒ 0.00 32 8 struct proto_ops* ops; ▒ - 0.00 64 64 struct socket_wq wq { ▒ - 0.00 64 24 wait_queue_head_t wait { ▒ + 0.00 64 4 spinlock_t lock; ▒ - 0.00 72 16 struct list_head head { ▒ 0.00 72 8 struct list_head* next; ▒ 0.00 80 8 struct list_head* prev; ▒ }; ▒ }; ▒ 0.00 88 8 struct fasync_struct* fasync_list; ▒ 0.00 96 8 long unsigned int flags; ▒ + 0.00 104 16 struct callback_head rcu; ▒ }; ▒ }; ▒ This just adds the display logic for folding, actually folding action will be implemented in the next patch. Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240812194447.2049187-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/annotate-data.c | 235 +++++++++++++++++++++++++++++---- 1 file changed, 212 insertions(+), 23 deletions(-) (limited to 'tools') diff --git a/tools/perf/ui/browsers/annotate-data.c b/tools/perf/ui/browsers/annotate-data.c index a937b55da736..04c73b67cd6c 100644 --- a/tools/perf/ui/browsers/annotate-data.c +++ b/tools/perf/ui/browsers/annotate-data.c @@ -14,6 +14,10 @@ #include "util/evlist.h" #include "util/sort.h" +#define FOLDED_SIGN '+' +#define UNFOLD_SIGN '-' +#define NOCHLD_SIGN ' ' + struct annotated_data_browser { struct ui_browser b; struct list_head entries; @@ -24,7 +28,11 @@ struct browser_entry { struct list_head node; struct annotated_member *data; struct type_hist_entry *hists; - int indent; + struct browser_entry *parent; + struct list_head children; + int indent; /*indentation level, starts from 0 */ + int nr_entries; /* # of visible entries: self + descendents */ + bool folded; /* only can be false when it has children */ }; static struct annotated_data_browser *get_browser(struct ui_browser *uib) @@ -65,13 +73,14 @@ static int get_member_overhead(struct annotated_data_type *adt, } static int add_child_entries(struct annotated_data_browser *browser, + struct browser_entry *parent, struct annotated_data_type *adt, struct annotated_member *member, struct evsel *evsel, int indent) { struct annotated_member *pos; struct browser_entry *entry; - int nr_entries = 0; + struct list_head *parent_list; entry = zalloc(sizeof(*entry)); if (entry == NULL) @@ -84,36 +93,60 @@ static int add_child_entries(struct annotated_data_browser *browser, } entry->data = member; + entry->parent = parent; entry->indent = indent; if 
(get_member_overhead(adt, entry, evsel) < 0) { free(entry); return -1; } - list_add_tail(&entry->node, &browser->entries); - nr_entries++; + INIT_LIST_HEAD(&entry->children); + if (parent) + parent_list = &parent->children; + else + parent_list = &browser->entries; - list_for_each_entry(pos, &member->children, node) { - int nr = add_child_entries(browser, adt, pos, evsel, indent + 1); + list_add_tail(&entry->node, parent_list); + list_for_each_entry(pos, &member->children, node) { + int nr = add_child_entries(browser, entry, adt, pos, evsel, + indent + 1); if (nr < 0) return nr; - - nr_entries += nr; } /* add an entry for the closing bracket ("}") */ if (!list_empty(&member->children)) { - entry = zalloc(sizeof(*entry)); - if (entry == NULL) + struct browser_entry *bracket; + + bracket = zalloc(sizeof(*bracket)); + if (bracket == NULL) return -1; - entry->indent = indent; - list_add_tail(&entry->node, &browser->entries); - nr_entries++; + bracket->indent = indent; + bracket->parent = entry; + bracket->folded = true; + bracket->nr_entries = 1; + + INIT_LIST_HEAD(&bracket->children); + list_add_tail(&bracket->node, &entry->children); } - return nr_entries; + /* fold child entries by default */ + entry->folded = true; + entry->nr_entries = 1; + return 0; +} + +static u32 count_visible_entries(struct annotated_data_browser *browser) +{ + int nr = 0; + struct browser_entry *entry; + + list_for_each_entry(entry, &browser->entries, node) + nr += entry->nr_entries; + + return nr; } static int annotated_data_browser__collect_entries(struct annotated_data_browser *browser) @@ -123,9 +156,12 @@ static int annotated_data_browser__collect_entries(struct annotated_data_browser struct evsel *evsel = hists_to_evsel(he->hists); INIT_LIST_HEAD(&browser->entries); + + add_child_entries(browser, /*parent=*/NULL, adt, &adt->self, evsel, + /*indent=*/0); + browser->b.entries = &browser->entries; - browser->b.nr_entries = add_child_entries(browser, adt, &adt->self, - evsel, /*indent=*/0); + browser->b.nr_entries = count_visible_entries(browser); return 0; } @@ -140,9 +176,155 @@ static void annotated_data_browser__delete_entries(struct annotated_data_browser } } +static struct browser_entry *get_first_child(struct browser_entry *entry) +{ + if (list_empty(&entry->children)) + return NULL; + + return list_first_entry(&entry->children, struct browser_entry, node); +} + +static struct browser_entry *get_last_child(struct browser_entry *entry) +{ + if (list_empty(&entry->children)) + return NULL; + + return list_last_entry(&entry->children, struct browser_entry, node); +} + +static bool is_first_child(struct browser_entry *entry) +{ + /* This will be checked in a different way */ + if (entry->parent == NULL) + return false; + + return get_first_child(entry->parent) == entry; +} + +static bool is_last_child(struct browser_entry *entry) +{ + /* This will be checked in a different way */ + if (entry->parent == NULL) + return false; + + return get_last_child(entry->parent) == entry; +} + +static struct browser_entry *browser__prev_entry(struct ui_browser *uib, + struct browser_entry *entry) +{ + struct annotated_data_browser *browser = get_browser(uib); + struct browser_entry *first; + + first = list_first_entry(&browser->entries, struct browser_entry, node); + + while (entry != first) { + if (is_first_child(entry)) + entry = entry->parent; + else { + entry = list_prev_entry(entry, node); + while (!entry->folded) + entry = get_last_child(entry); + } + + if (!uib->filter || !uib->filter(uib, &entry->node)) + return 
entry; + } + return first; +} + +static struct browser_entry *browser__next_entry(struct ui_browser *uib, + struct browser_entry *entry) +{ + struct annotated_data_browser *browser = get_browser(uib); + struct browser_entry *last; + + last = list_last_entry(&browser->entries, struct browser_entry, node); + while (!last->folded) + last = get_last_child(last); + + while (entry != last) { + if (!entry->folded) + entry = get_first_child(entry); + else { + while (is_last_child(entry)) + entry = entry->parent; + + entry = list_next_entry(entry, node); + } + + if (!uib->filter || !uib->filter(uib, &entry->node)) + return entry; + } + return last; +} + +static void browser__seek(struct ui_browser *uib, off_t offset, int whence) +{ + struct annotated_data_browser *browser = get_browser(uib); + struct browser_entry *entry; + + if (uib->nr_entries == 0) + return; + + switch (whence) { + case SEEK_SET: + entry = list_first_entry(&browser->entries, typeof(*entry), node); + if (uib->filter && uib->filter(uib, &entry->node)) + entry = browser__next_entry(uib, entry); + break; + case SEEK_CUR: + entry = list_entry(uib->top, typeof(*entry), node); + break; + case SEEK_END: + entry = list_last_entry(&browser->entries, typeof(*entry), node); + while (!entry->folded) + entry = get_last_child(entry); + if (uib->filter && uib->filter(uib, &entry->node)) + entry = browser__prev_entry(uib, entry); + break; + default: + return; + } + + assert(entry != NULL); + + if (offset > 0) { + while (offset-- != 0) + entry = browser__next_entry(uib, entry); + } else { + while (offset++ != 0) + entry = browser__prev_entry(uib, entry); + } + + uib->top = &entry->node; +} + static unsigned int browser__refresh(struct ui_browser *uib) { - return ui_browser__list_head_refresh(uib); + struct browser_entry *entry, *next; + int row = 0; + + if (uib->top == NULL || uib->top == uib->entries) + browser__seek(uib, SEEK_SET, 0); + + entry = list_entry(uib->top, typeof(*entry), node); + + while (true) { + if (!uib->filter || !uib->filter(uib, &entry->node)) { + ui_browser__gotorc(uib, row, 0); + uib->write(uib, entry, row); + if (++row == uib->rows) + break; + } + next = browser__next_entry(uib, entry); + if (next == entry) + break; + + entry = next; + } + + return row; } static int browser__show(struct ui_browser *uib) @@ -171,7 +353,7 @@ static int browser__show(struct ui_browser *uib) strcpy(title, "Percent"); ui_browser__printf(uib, "%*s %10s %10s %10s %s", - 11 * (browser->nr_events - 1), "", + 2 + 11 * (browser->nr_events - 1), "", title, "Offset", "Size", "Field"); ui_browser__write_nstring(uib, "", uib->width); return 0; @@ -208,12 +390,12 @@ static void browser__write(struct ui_browser *uib, void *entry, int row) struct evsel *leader = hists_to_evsel(he->hists); struct evsel *evsel; int idx = 0; + bool current = ui_browser__is_current_entry(uib, row); if (member == NULL) { - bool current = ui_browser__is_current_entry(uib, row); - /* print the closing bracket */ ui_browser__set_percent_color(uib, 0, current); + ui_browser__printf(uib, "%c ", NOCHLD_SIGN); ui_browser__write_nstring(uib, "", 11 * browser->nr_events); ui_browser__printf(uib, " %10s %10s %*s};", "", "", be->indent * 4, ""); @@ -221,6 +403,13 @@ static void browser__write(struct ui_browser *uib, void *entry, int row) return; } + ui_browser__set_percent_color(uib, 0, current); + + if (!list_empty(&be->children)) + ui_browser__printf(uib, "%c ", be->folded ? 
FOLDED_SIGN : UNFOLD_SIGN); + else + ui_browser__printf(uib, "%c ", NOCHLD_SIGN); + /* print the number */ for_each_group_evsel(evsel, leader) { struct type_hist *h = adt->histograms[evsel->core.idx]; @@ -237,13 +426,13 @@ static void browser__write(struct ui_browser *uib, void *entry, int row) ui_browser__printf(uib, " %10d %10d %s%s", member->offset, member->size, member->type_name, - list_empty(&member->children) ? ";" : " {"); + list_empty(&member->children) || be->folded? ";" : " {"); } else { ui_browser__printf(uib, " %10d %10d %*s%s\t%s%s", member->offset, member->size, be->indent * 4, "", member->type_name, member->var_name ?: "", - list_empty(&member->children) ? ";" : " {"); + list_empty(&member->children) || be->folded ? ";" : " {"); } /* fill the rest */ ui_browser__write_nstring(uib, "", uib->width); @@ -297,7 +486,7 @@ int hist_entry__annotate_data_tui(struct hist_entry *he, struct evsel *evsel, struct annotated_data_browser browser = { .b = { .refresh = browser__refresh, - .seek = ui_browser__list_head_seek, + .seek = browser__seek, .write = browser__write, .priv = he, .extra_title_lines = 1, -- cgit v1.2.3 From af73856e9ac818bfe8e2f95b32734acbcabe6689 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 12 Aug 2024 12:44:46 -0700 Subject: perf annotate-data: Implement folding in TUI browser Like 'perf report', use 'e' or 'E' key to toggle folding the current entry so that it can control displaying child entries. Note I didn't add the 'c' and 'C' key to collapse the entry because it's also handled with the 'e'/'E' since it toggles the state. Committer testing: Do some 'perf mem record' for some workload of the whole system, using the target options, as usual (--pid/-p, -C/--cpu, -a for the system wide profiling, etc) and then: # perf annotate --skip-empty --data-type=pthread_mutex_t That, by default, will start as --tui, then press 'E' to see the whole struct unfolded, etc. 
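As a minimal standalone sketch of the bookkeeping this toggle must keep consistent (the simplified 'node' type and toggle_fold() helper below are hypothetical stand-ins, not the actual perf structures), every ancestor's visible-entry count and the browser total are adjusted around the state flip, mirroring the subtract/flip/add-back sequence of annotated_data_browser__toggle_fold() in the diff that follows:

    #include <stdbool.h>

    /* Hypothetical, simplified stand-in for the real browser_entry. */
    struct node {
        struct node *parent;
        struct node *child;    /* first child, or NULL for a leaf */
        struct node *sibling;  /* next sibling at the same level */
        int nr_entries;        /* visible rows: self + unfolded descendants */
        bool folded;
    };

    static void toggle_fold(struct node *n, int *total)
    {
        struct node *p, *c;

        if (!n->child)         /* entries without children cannot fold */
            return;

        /* retract the old contribution from every ancestor and the total */
        for (p = n->parent; p; p = p->parent)
            p->nr_entries -= n->nr_entries;
        *total -= n->nr_entries;

        n->folded = !n->folded;
        n->nr_entries = 1;     /* folded: only the entry itself shows */
        if (!n->folded)
            for (c = n->child; c; c = c->sibling)
                n->nr_entries += c->nr_entries;

        /* then add the new contribution back */
        for (p = n->parent; p; p = p->parent)
            p->nr_entries += n->nr_entries;
        *total += n->nr_entries;
    }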
Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240812194447.2049187-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/annotate-data.c | 98 +++++++++++++++++++++++++++++++--- 1 file changed, 92 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/ui/browsers/annotate-data.c b/tools/perf/ui/browsers/annotate-data.c index 04c73b67cd6c..a5c5ad63425e 100644 --- a/tools/perf/ui/browsers/annotate-data.c +++ b/tools/perf/ui/browsers/annotate-data.c @@ -18,12 +18,6 @@ #define UNFOLD_SIGN '-' #define NOCHLD_SIGN ' ' -struct annotated_data_browser { - struct ui_browser b; - struct list_head entries; - int nr_events; -}; - struct browser_entry { struct list_head node; struct annotated_member *data; @@ -35,6 +29,13 @@ struct browser_entry { bool folded; /* only can be false when it has children */ }; +struct annotated_data_browser { + struct ui_browser b; + struct list_head entries; + struct browser_entry *curr; + int nr_events; +}; + static struct annotated_data_browser *get_browser(struct ui_browser *uib) { return container_of(uib, struct annotated_data_browser, b); @@ -302,6 +303,7 @@ static void browser__seek(struct ui_browser *uib, off_t offset, int whence) static unsigned int browser__refresh(struct ui_browser *uib) { + struct annotated_data_browser *browser = get_browser(uib); struct browser_entry *entry, *next; int row = 0; @@ -314,6 +316,8 @@ static unsigned int browser__refresh(struct ui_browser *uib) if (!uib->filter || !uib->filter(uib, &entry->node)) { ui_browser__gotorc(uib, row, 0); uib->write(uib, entry, row); + if (uib->top_idx + row == uib->index) + browser->curr = entry; if (++row == uib->rows) break; } @@ -438,6 +442,78 @@ static void browser__write(struct ui_browser *uib, void *entry, int row) ui_browser__write_nstring(uib, "", uib->width); } +static void annotated_data_browser__fold(struct annotated_data_browser *browser, + struct browser_entry *entry, + bool recursive) +{ + struct browser_entry *child; + + if (list_empty(&entry->children)) + return; + if (entry->folded && !recursive) + return; + + if (recursive) { + list_for_each_entry(child, &entry->children, node) + annotated_data_browser__fold(browser, child, true); + } + + entry->nr_entries = 1; + entry->folded = true; +} + +static void annotated_data_browser__unfold(struct annotated_data_browser *browser, + struct browser_entry *entry, + bool recursive) +{ + struct browser_entry *child; + int nr_entries; + + if (list_empty(&entry->children)) + return; + if (!entry->folded && !recursive) + return; + + nr_entries = 1; /* for self */ + list_for_each_entry(child, &entry->children, node) { + if (recursive) + annotated_data_browser__unfold(browser, child, true); + + nr_entries += child->nr_entries; + } + + entry->nr_entries = nr_entries; + entry->folded = false; +} + +static void annotated_data_browser__toggle_fold(struct annotated_data_browser *browser, + bool recursive) +{ + struct browser_entry *curr = browser->curr; + struct browser_entry *parent; + + parent = curr->parent; + while (parent) { + parent->nr_entries -= curr->nr_entries; + parent = parent->parent; + } + browser->b.nr_entries -= curr->nr_entries; + + if (curr->folded) + annotated_data_browser__unfold(browser, curr, recursive); + else + annotated_data_browser__fold(browser, curr, recursive); + + parent = curr->parent; + while (parent) { + parent->nr_entries += 
curr->nr_entries; + parent = parent->parent; + } + browser->b.nr_entries += curr->nr_entries; + + assert(browser->b.nr_entries == count_visible_entries(browser)); +} + static int annotated_data_browser__run(struct annotated_data_browser *browser, struct evsel *evsel __maybe_unused, struct hist_browser_timer *hbt) @@ -462,8 +538,18 @@ static int annotated_data_browser__run(struct annotated_data_browser *browser, "UP/DOWN/PGUP\n" "PGDN/SPACE Navigate\n" " Move to prev/next symbol\n" + "e Expand/Collapse current entry\n" + "E Expand/Collapse all children of the current\n" "q/ESC/CTRL+C Exit\n\n"); continue; + case 'e': + annotated_data_browser__toggle_fold(browser, + /*recursive=*/false); + break; + case 'E': + annotated_data_browser__toggle_fold(browser, + /*recursive=*/true); + break; case K_LEFT: case '<': case '>': -- cgit v1.2.3 From 7f3c8f13ad93044ab1d18949b8b840fed7c59c30 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 12 Aug 2024 12:44:47 -0700 Subject: perf annotate-data: Show first-level children by default in TUI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now default is to fold everything but it only shows the name of the top-level data type which is not very useful. Instead just expand the top level entry so that it can show the layout at a higher level. Annotate type: 'struct task_struct' (4 samples) Percent Offset Size Field - 100.00 0 9792 struct task_struct { ◆ + 0.50 0 24 struct thread_info thread_info; ▒ 0.00 24 4 unsigned int __state; ▒ 0.00 32 8 void* stack; ▒ + 0.00 40 4 refcount_t usage; ▒ 0.00 44 4 unsigned int flags; ▒ 0.00 48 4 unsigned int ptrace; ▒ 0.00 52 4 int on_cpu; ▒ + 0.00 56 16 struct __call_single_node wake_entry; ▒ 0.00 72 4 unsigned int wakee_flips; ▒ 0.00 80 8 long unsigned int wakee_flip_decay_ts;▒ 0.00 88 8 struct task_struct* last_wakee; ▒ 0.00 96 4 int recent_used_cpu; ▒ 0.00 100 4 int wake_cpu; ▒ 0.00 104 4 int on_rq; ▒ 0.00 108 4 int prio; ▒ 0.00 112 4 int static_prio; ▒ 0.00 116 4 int normal_prio; ▒ 0.00 120 4 unsigned int rt_priority; ▒ + 0.00 128 256 struct sched_entity se; ▒ + 0.00 384 48 struct sched_rt_entity rt; ▒ + 0.00 432 224 struct sched_dl_entity dl; ▒ 0.00 656 8 struct sched_class* sched_class; ▒ ... 
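In code, the fix amounts to two calls at browser setup, shown verbatim in the diff further below: one refresh so the browser establishes its current (top-level) entry, then a single non-recursive unfold of that entry:

    /* To get the top and current entry */
    browser__refresh(&browser.b);
    /* Show the first-level child entries by default */
    annotated_data_browser__toggle_fold(&browser, /*recursive=*/false);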
Committer testing: # perf mem record -a sleep 5s # perf annotate --group --data-type=pthread_mutex_t Annotate type: 'pthread_mutex_t' (13 samples) Percent Offset Size Field - 100.00 0 40 pthread_mutex_t { ▒ - 100.00 0 40 struct __pthread_mutex_s __data { ▒ 39.45 0 4 int __lock; ▒ 0.00 4 4 unsigned int __count; ▒ 7.80 8 4 int __owner; ▒ 6.88 12 4 unsigned int __nusers; ▒ 45.87 16 4 int __kind; ▒ 0.00 20 2 short int __spins; ▒ 0.00 22 2 short int __elision; ▒ + 0.00 24 16 __pthread_list_t __list; ▒ }; ▒ 0.00 0 0 char[] __size; ▒ 39.45 0 8 long int __align; Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240812194447.2049187-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/annotate-data.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/ui/browsers/annotate-data.c b/tools/perf/ui/browsers/annotate-data.c index a5c5ad63425e..f563a3bb072c 100644 --- a/tools/perf/ui/browsers/annotate-data.c +++ b/tools/perf/ui/browsers/annotate-data.c @@ -596,9 +596,17 @@ int hist_entry__annotate_data_tui(struct hist_entry *he, struct evsel *evsel, } ret = annotated_data_browser__collect_entries(&browser); - if (ret == 0) - ret = annotated_data_browser__run(&browser, evsel, hbt); + if (ret < 0) + goto out; + /* To get the top and current entry */ + browser__refresh(&browser.b); + /* Show the first-level child entries by default */ + annotated_data_browser__toggle_fold(&browser, /*recursive=*/false); + + ret = annotated_data_browser__run(&browser, evsel, hbt); + +out: annotated_data_browser__delete_entries(&browser); return ret; -- cgit v1.2.3 From 79bcd34e0f3da39fda841406ccc957405e724852 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 29 Jul 2024 15:06:20 -0700 Subject: perf inject: Fix leader sampling inserting additional samples The processing of leader samples would turn an individual sample with a group of read values into multiple samples. 'perf inject' would pass through the additional samples increasing the output data file size: $ perf record -g -e "{instructions,cycles}:S" -o perf.orig.data true $ perf script -D -i perf.orig.data | sed -e 's/perf.orig.data/perf.data/g' > orig.txt $ perf inject -i perf.orig.data -o perf.new.data $ perf script -D -i perf.new.data | sed -e 's/perf.new.data/perf.data/g' > new.txt $ diff -u orig.txt new.txt --- orig.txt 2024-07-29 14:29:40.606576769 -0700 +++ new.txt 2024-07-29 14:30:04.142737434 -0700 ... -0xc550@perf.data [0x30]: event: 3 +0xc550@perf.data [0xd0]: event: 9 +. +. ... raw event: size 208 bytes +. 0000: 09 00 00 00 01 00 d0 00 fc 72 01 86 ff ff ff ff .........r...... +. 0010: 74 7d 2c 00 74 7d 2c 00 fb c3 79 f9 ba d5 05 00 t},.t},...y..... +. 0020: e6 cb 1a 00 00 00 00 00 01 00 00 00 00 00 00 00 ................ +. 0030: 02 00 00 00 00 00 00 00 76 01 00 00 00 00 00 00 ........v....... +. 0040: e6 cb 1a 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +. 0050: 62 18 00 00 00 00 00 00 f6 cb 1a 00 00 00 00 00 b............... +. 0060: 00 00 00 00 00 00 00 00 0c 00 00 00 00 00 00 00 ................ +. 0070: 80 ff ff ff ff ff ff ff fc 72 01 86 ff ff ff ff .........r...... +. 0080: f3 0e 6e 85 ff ff ff ff 0c cb 7f 85 ff ff ff ff ..n............. +. 0090: bc f2 87 85 ff ff ff ff 44 af 7f 85 ff ff ff ff ........D....... +. 00a0: bd be 7f 85 ff ff ff ff 26 d0 7f 85 ff ff ff ff ........&....... +. 
00b0: 6d a4 ff 85 ff ff ff ff ea 00 20 86 ff ff ff ff m......... ..... +. 00c0: 00 fe ff ff ff ff ff ff 57 14 4f 43 fc 7e 00 00 ........W.OC.~.. + +1642373909693435 0xc550 [0xd0]: PERF_RECORD_SAMPLE(IP, 0x1): 2915700/2915700: 0xffffffff860172fc period: 1 addr: 0 +... FP chain: nr:12 +..... 0: ffffffffffffff80 +..... 1: ffffffff860172fc +..... 2: ffffffff856e0ef3 +..... 3: ffffffff857fcb0c +..... 4: ffffffff8587f2bc +..... 5: ffffffff857faf44 +..... 6: ffffffff857fbebd +..... 7: ffffffff857fd026 +..... 8: ffffffff85ffa46d +..... 9: ffffffff862000ea +..... 10: fffffffffffffe00 +..... 11: 00007efc434f1457 +... sample_read: +.... group nr 2 +..... id 00000000001acbe6, value 0000000000000176, lost 0 +..... id 00000000001acbf6, value 0000000000001862, lost 0 + +0xc620@perf.data [0x30]: event: 3 ... This behavior is incorrect as in the case above 'perf inject' should have done nothing. Fix this behavior by disabling separating samples for a tool that requests it. Only request this for `perf inject` so as to not affect other perf tools. With the patch and the test above there are no differences between the orig.txt and new.txt. Fixes: e4caec0d1af3d608 ("perf evsel: Add PERF_SAMPLE_READ sample related processing") Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiri Olsa Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240729220620.2957754-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-inject.c | 1 + tools/perf/util/session.c | 3 +++ tools/perf/util/tool.h | 1 + 3 files changed, 5 insertions(+) (limited to 'tools') diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 7b4a5d56d279..8f3dd8980da0 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -2204,6 +2204,7 @@ int cmd_inject(int argc, const char **argv) .finished_init = perf_event__repipe_op2_synth, .compressed = perf_event__repipe_op4_synth, .auxtrace = perf_event__repipe_auxtrace, + .dont_split_sample_group = true, }, .input_name = "-", .samples = LIST_HEAD_INIT(inject.samples), diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index f9072e003367..453ca0297c80 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1513,6 +1513,9 @@ static int deliver_sample_group(struct evlist *evlist, int ret = -EINVAL; struct sample_read_value *v = sample->read.group.values; + if (tool->dont_split_sample_group) + return deliver_sample_value(evlist, tool, event, sample, v, machine); + sample_read_group__for_each(v, sample->read.group.nr, read_format) { ret = deliver_sample_value(evlist, tool, event, sample, v, machine); diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h index c957fb849ac6..62bbc9cec151 100644 --- a/tools/perf/util/tool.h +++ b/tools/perf/util/tool.h @@ -85,6 +85,7 @@ struct perf_tool { bool namespace_events; bool cgroup_events; bool no_warn; + bool dont_split_sample_group; enum show_feature_header show_feat_hdr; }; -- cgit v1.2.3 From 4e322c785514e85c5ede49c54d72795a5573fa79 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:46:53 -0700 Subject: perf auxtrace: Remove dummy tools Add perf_session__deliver_synth_attr_event that synthesizes a perf_record_header_attr event with one id. Remove use of perf_event__synthesize_attr that necessitates the use of the dummy tool in order to pass the session. 
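As a rough usage sketch (the attr contents here are illustrative, not taken from the patch), a hardware tracing decoder can now hand the session its synthesized event attribute directly, instead of smuggling the session through a dummy struct perf_tool; note that, as the session.c hunk below shows, the helper rejects an attr whose .size does not match the struct perf_event_attr this perf was built against:

    struct perf_event_attr attr = {
        .size        = sizeof(attr),        /* checked by the helper */
        .type        = PERF_TYPE_HARDWARE,
        .config      = PERF_COUNT_HW_INSTRUCTIONS,
        .sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID,
    };
    u64 id = 1;                             /* illustrative sample id */
    int err = perf_session__deliver_synth_attr_event(session, &attr, id);

    if (err)
        return err;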
Reviewed-by: Adrian Hunter Signed-off-by: Ian Rogers Tested-by: Adrian Hunter Tested-by: Leo Yan Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/arm-spe.c | 49 +++++++++------------------------------------ tools/perf/util/cs-etm.c | 33 ++---------------------------- tools/perf/util/intel-bts.c | 31 +--------------------------- tools/perf/util/intel-pt.c | 24 +--------------------- tools/perf/util/session.c | 24 ++++++++++++++++++++++ tools/perf/util/session.h | 3 +++ 6 files changed, 41 insertions(+), 123 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c index afbd5869f6bf..fa40f3cb6266 100644 --- a/tools/perf/util/arm-spe.c +++ b/tools/perf/util/arm-spe.c @@ -1073,35 +1073,6 @@ static void arm_spe_print_info(__u64 *arr) fprintf(stdout, arm_spe_info_fmts[ARM_SPE_PMU_TYPE], arr[ARM_SPE_PMU_TYPE]); } -struct arm_spe_synth { - struct perf_tool dummy_tool; - struct perf_session *session; -}; - -static int arm_spe_event_synth(struct perf_tool *tool, - union perf_event *event, - struct perf_sample *sample __maybe_unused, - struct machine *machine __maybe_unused) -{ - struct arm_spe_synth *arm_spe_synth = - container_of(tool, struct arm_spe_synth, dummy_tool); - - return perf_session__deliver_synth_event(arm_spe_synth->session, - event, NULL); -} - -static int arm_spe_synth_event(struct perf_session *session, - struct perf_event_attr *attr, u64 id) -{ - struct arm_spe_synth arm_spe_synth; - - memset(&arm_spe_synth, 0, sizeof(struct arm_spe_synth)); - arm_spe_synth.session = session; - - return perf_event__synthesize_attr(&arm_spe_synth.dummy_tool, attr, 1, - &id, arm_spe_event_synth); -} - static void arm_spe_set_event_name(struct evlist *evlist, u64 id, const char *name) { @@ -1172,7 +1143,7 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) spe->sample_flc = true; /* Level 1 data cache miss */ - err = arm_spe_synth_event(session, &attr, id); + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) return err; spe->l1d_miss_id = id; @@ -1180,7 +1151,7 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) id += 1; /* Level 1 data cache access */ - err = arm_spe_synth_event(session, &attr, id); + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) return err; spe->l1d_access_id = id; @@ -1192,7 +1163,7 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) spe->sample_llc = true; /* Last level cache miss */ - err = arm_spe_synth_event(session, &attr, id); + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) return err; spe->llc_miss_id = id; @@ -1200,7 +1171,7 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) id += 1; /* Last level cache access */ - err = arm_spe_synth_event(session, &attr, id); + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) return err; spe->llc_access_id = id; @@ -1212,7 +1183,7 @@ 
arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) spe->sample_tlb = true; /* TLB miss */ - err = arm_spe_synth_event(session, &attr, id); + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) return err; spe->tlb_miss_id = id; @@ -1220,7 +1191,7 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) id += 1; /* TLB access */ - err = arm_spe_synth_event(session, &attr, id); + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) return err; spe->tlb_access_id = id; @@ -1232,7 +1203,7 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) spe->sample_branch = true; /* Branch miss */ - err = arm_spe_synth_event(session, &attr, id); + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) return err; spe->branch_miss_id = id; @@ -1244,7 +1215,7 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) spe->sample_remote_access = true; /* Remote access */ - err = arm_spe_synth_event(session, &attr, id); + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) return err; spe->remote_access_id = id; @@ -1255,7 +1226,7 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) if (spe->synth_opts.mem) { spe->sample_memory = true; - err = arm_spe_synth_event(session, &attr, id); + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) return err; spe->memory_id = id; @@ -1276,7 +1247,7 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) attr.config = PERF_COUNT_HW_INSTRUCTIONS; attr.sample_period = spe->synth_opts.period; spe->instructions_sample_period = attr.sample_period; - err = arm_spe_synth_event(session, &attr, id); + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) return err; spe->instructions_id = id; diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index d3e9c64d17d4..566498ebc694 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -1599,35 +1599,6 @@ static int cs_etm__synth_branch_sample(struct cs_etm_queue *etmq, return ret; } -struct cs_etm_synth { - struct perf_tool dummy_tool; - struct perf_session *session; -}; - -static int cs_etm__event_synth(struct perf_tool *tool, - union perf_event *event, - struct perf_sample *sample __maybe_unused, - struct machine *machine __maybe_unused) -{ - struct cs_etm_synth *cs_etm_synth = - container_of(tool, struct cs_etm_synth, dummy_tool); - - return perf_session__deliver_synth_event(cs_etm_synth->session, - event, NULL); -} - -static int cs_etm__synth_event(struct perf_session *session, - struct perf_event_attr *attr, u64 id) -{ - struct cs_etm_synth cs_etm_synth; - - memset(&cs_etm_synth, 0, sizeof(struct cs_etm_synth)); - cs_etm_synth.session = session; - - return perf_event__synthesize_attr(&cs_etm_synth.dummy_tool, attr, 1, - &id, cs_etm__event_synth); -} - static int cs_etm__synth_events(struct cs_etm_auxtrace *etm, struct perf_session *session) { @@ -1679,7 +1650,7 @@ static int cs_etm__synth_events(struct cs_etm_auxtrace *etm, attr.config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS; attr.sample_period = 1; attr.sample_type |= PERF_SAMPLE_ADDR; - err = cs_etm__synth_event(session, &attr, id); + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) return err; etm->branches_sample_type = attr.sample_type; @@ -1702,7 +1673,7 @@ static int cs_etm__synth_events(struct cs_etm_auxtrace *etm, attr.config = PERF_COUNT_HW_INSTRUCTIONS; attr.sample_period = 
etm->synth_opts.period; etm->instructions_sample_period = attr.sample_period; - err = cs_etm__synth_event(session, &attr, id); + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) return err; etm->instructions_sample_type = attr.sample_type; diff --git a/tools/perf/util/intel-bts.c b/tools/perf/util/intel-bts.c index ec1b3bd9f530..160d346cb450 100644 --- a/tools/perf/util/intel-bts.c +++ b/tools/perf/util/intel-bts.c @@ -737,35 +737,6 @@ static bool intel_bts_evsel_is_auxtrace(struct perf_session *session, return evsel->core.attr.type == bts->pmu_type; } -struct intel_bts_synth { - struct perf_tool dummy_tool; - struct perf_session *session; -}; - -static int intel_bts_event_synth(struct perf_tool *tool, - union perf_event *event, - struct perf_sample *sample __maybe_unused, - struct machine *machine __maybe_unused) -{ - struct intel_bts_synth *intel_bts_synth = - container_of(tool, struct intel_bts_synth, dummy_tool); - - return perf_session__deliver_synth_event(intel_bts_synth->session, - event, NULL); -} - -static int intel_bts_synth_event(struct perf_session *session, - struct perf_event_attr *attr, u64 id) -{ - struct intel_bts_synth intel_bts_synth; - - memset(&intel_bts_synth, 0, sizeof(struct intel_bts_synth)); - intel_bts_synth.session = session; - - return perf_event__synthesize_attr(&intel_bts_synth.dummy_tool, attr, 1, - &id, intel_bts_event_synth); -} - static int intel_bts_synth_events(struct intel_bts *bts, struct perf_session *session) { @@ -814,7 +785,7 @@ static int intel_bts_synth_events(struct intel_bts *bts, attr.sample_type |= PERF_SAMPLE_ADDR; pr_debug("Synthesizing 'branches' event with id %" PRIu64 " sample type %#" PRIx64 "\n", id, (u64)attr.sample_type); - err = intel_bts_synth_event(session, &attr, id); + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) { pr_err("%s: failed to synthesize 'branches' event type\n", __func__); diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index d6d7b7512505..1fc03bfd14c3 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -3659,37 +3659,15 @@ static int intel_pt_queue_data(struct perf_session *session, data_offset, timestamp); } -struct intel_pt_synth { - struct perf_tool dummy_tool; - struct perf_session *session; -}; - -static int intel_pt_event_synth(struct perf_tool *tool, - union perf_event *event, - struct perf_sample *sample __maybe_unused, - struct machine *machine __maybe_unused) -{ - struct intel_pt_synth *intel_pt_synth = - container_of(tool, struct intel_pt_synth, dummy_tool); - - return perf_session__deliver_synth_event(intel_pt_synth->session, event, - NULL); -} - static int intel_pt_synth_event(struct perf_session *session, const char *name, struct perf_event_attr *attr, u64 id) { - struct intel_pt_synth intel_pt_synth; int err; pr_debug("Synthesizing '%s' event with id %" PRIu64 " sample type %#" PRIx64 "\n", name, id, (u64)attr->sample_type); - memset(&intel_pt_synth, 0, sizeof(struct intel_pt_synth)); - intel_pt_synth.session = session; - - err = perf_event__synthesize_attr(&intel_pt_synth.dummy_tool, attr, 1, - &id, intel_pt_event_synth); + err = perf_session__deliver_synth_attr_event(session, attr, id); if (err) pr_err("%s: failed to synthesize '%s' event type\n", __func__, name); diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 453ca0297c80..aa02c1d7b1f9 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1769,6 +1769,30 @@ int perf_session__deliver_synth_event(struct 
perf_session *session, return machines__deliver_event(&session->machines, evlist, event, sample, tool, 0, NULL); } +int perf_session__deliver_synth_attr_event(struct perf_session *session, + const struct perf_event_attr *attr, + u64 id) +{ + union { + struct { + struct perf_record_header_attr attr; + u64 ids[1]; + } attr_id; + union perf_event ev; + } ev = { + .attr_id.attr.header.type = PERF_RECORD_HEADER_ATTR, + .attr_id.attr.header.size = sizeof(ev.attr_id), + .attr_id.ids[0] = id, + }; + + if (attr->size != sizeof(ev.attr_id.attr.attr)) { + pr_debug("Unexpected perf_event_attr size\n"); + return -EINVAL; + } + ev.attr_id.attr.attr = *attr; + return perf_session__deliver_synth_event(session, &ev.ev, NULL); +} + static void event_swap(union perf_event *event, bool sample_id_all) { perf_event__swap_op swap; diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index 4c29dc86956f..cc28976bb00e 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -154,6 +154,9 @@ extern volatile int session_done; int perf_session__deliver_synth_event(struct perf_session *session, union perf_event *event, struct perf_sample *sample); +int perf_session__deliver_synth_attr_event(struct perf_session *session, + const struct perf_event_attr *attr, + u64 id); int perf_session__dsos_hit_all(struct perf_session *session); -- cgit v1.2.3 From 1816dc4bc5be050cc543e6e4fcedf3805a2aea20 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:46:54 -0700 Subject: perf s390-cpumsf: Remove unused struct struct s390_cpumsf_synth was likely cargo-culted from other auxtrace examples. It has no users, so remove it. Reviewed-by: Adrian Hunter Signed-off-by: Ian Rogers Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/s390-cpumsf.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/s390-cpumsf.c b/tools/perf/util/s390-cpumsf.c index 6fe478b0b61b..4ec583e511af 100644 --- a/tools/perf/util/s390-cpumsf.c +++ b/tools/perf/util/s390-cpumsf.c @@ -952,11 +952,6 @@ s390_cpumsf_process_event(struct perf_session *session, return err; } -struct s390_cpumsf_synth { - struct perf_tool cpumsf_tool; - struct perf_session *session; -}; - static int s390_cpumsf_process_auxtrace_event(struct perf_session *session, union perf_event *event __maybe_unused, -- cgit v1.2.3 From 30f29bae9142f34e978a4861ed07aa512af21416 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:46:55 -0700 Subject: perf tool: Constify tool pointers The tool pointer (to a struct largely of function pointers) is passed around but is unchanged except at initialization. Change parameter and variable types to be const to limit what can be done with a tool.
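For illustration only (not part of this patch, and using hypothetical names), a minimal C sketch of what the const qualifier buys here: callbacks can still be invoked through a const pointer to a struct of function pointers, but the members can no longer be reassigned behind the caller's back:

    #include <stdio.h>

    struct tool {
            int (*sample)(const struct tool *tool, int cpu);
    };

    static int print_sample(const struct tool *tool, int cpu)
    {
            (void)tool; /* unused in this sketch */
            printf("sample on cpu %d\n", cpu);
            return 0;
    }

    static int deliver(const struct tool *tool, int cpu)
    {
            /* Calling a callback through the const pointer is fine, */
            return tool->sample(tool, cpu);
            /*
             * but "tool->sample = NULL;" would now fail to compile:
             * assignment of member 'sample' in read-only object.
             */
    }

    int main(void)
    {
            struct tool t = { .sample = print_sample };

            return deliver(&t, 0);
    }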
Reviewed-by: Adrian Hunter Signed-off-by: Ian Rogers Tested-by: Adrian Hunter Tested-by: Leo Yan Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/util/event.c | 4 +- tools/perf/bench/synthesize.c | 2 +- tools/perf/builtin-annotate.c | 2 +- tools/perf/builtin-c2c.c | 2 +- tools/perf/builtin-diff.c | 2 +- tools/perf/builtin-inject.c | 70 ++++++++++++++++---------------- tools/perf/builtin-kmem.c | 3 +- tools/perf/builtin-kvm.c | 4 +- tools/perf/builtin-kwork.c | 22 +++++----- tools/perf/builtin-lock.c | 4 +- tools/perf/builtin-mem.c | 4 +- tools/perf/builtin-record.c | 14 +++---- tools/perf/builtin-report.c | 12 +++--- tools/perf/builtin-sched.c | 34 ++++++++-------- tools/perf/builtin-script.c | 41 ++++++++++--------- tools/perf/builtin-stat.c | 8 ++-- tools/perf/builtin-timechart.c | 8 ++-- tools/perf/builtin-top.c | 2 +- tools/perf/builtin-trace.c | 4 +- tools/perf/tests/cpumap.c | 6 +-- tools/perf/tests/dlfilter-test.c | 2 +- tools/perf/tests/dwarf-unwind.c | 2 +- tools/perf/tests/event_update.c | 8 ++-- tools/perf/tests/stat.c | 6 +-- tools/perf/tests/thread-map.c | 2 +- tools/perf/util/arm-spe.c | 6 +-- tools/perf/util/auxtrace.c | 12 +++--- tools/perf/util/auxtrace.h | 20 +++++----- tools/perf/util/bpf-event.c | 4 +- tools/perf/util/build-id.c | 4 +- tools/perf/util/build-id.h | 4 +- tools/perf/util/cs-etm.c | 6 +-- tools/perf/util/data-convert-bt.c | 4 +- tools/perf/util/data-convert-json.c | 4 +- tools/perf/util/event.c | 34 ++++++++-------- tools/perf/util/event.h | 34 ++++++++-------- tools/perf/util/header.c | 6 +-- tools/perf/util/header.h | 4 +- tools/perf/util/hisi-ptt.c | 6 +-- tools/perf/util/intel-bts.c | 6 +-- tools/perf/util/intel-pt.c | 6 +-- tools/perf/util/jitdump.c | 4 +- tools/perf/util/s390-cpumsf.c | 6 +-- tools/perf/util/session.c | 30 +++++++------- tools/perf/util/session.h | 2 +- tools/perf/util/synthetic-events.c | 80 ++++++++++++++++++------------------- tools/perf/util/synthetic-events.h | 70 ++++++++++++++++---------------- tools/perf/util/tool.h | 8 ++-- tools/perf/util/tsc.c | 2 +- 49 files changed, 314 insertions(+), 316 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/x86/util/event.c b/tools/perf/arch/x86/util/event.c index e65b7dbe27fb..a0400707180c 100644 --- a/tools/perf/arch/x86/util/event.c +++ b/tools/perf/arch/x86/util/event.c @@ -15,7 +15,7 @@ #if defined(__x86_64__) struct perf_event__synthesize_extra_kmaps_cb_args { - struct perf_tool *tool; + const struct perf_tool *tool; perf_event__handler_t process; struct machine *machine; union perf_event *event; @@ -65,7 +65,7 @@ static int perf_event__synthesize_extra_kmaps_cb(struct map *map, void *data) return 0; } -int perf_event__synthesize_extra_kmaps(struct perf_tool *tool, +int perf_event__synthesize_extra_kmaps(const struct perf_tool *tool, perf_event__handler_t process, struct machine *machine) { diff --git a/tools/perf/bench/synthesize.c b/tools/perf/bench/synthesize.c index 7401ebbac100..9b333276cbdb 100644 --- 
a/tools/perf/bench/synthesize.c +++ b/tools/perf/bench/synthesize.c @@ -49,7 +49,7 @@ static const char *const bench_usage[] = { static atomic_t event_count; -static int process_synthesized_event(struct perf_tool *tool __maybe_unused, +static int process_synthesized_event(const struct perf_tool *tool __maybe_unused, union perf_event *event __maybe_unused, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 1bfe41783a7c..cc65e6f8f4da 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -279,7 +279,7 @@ static int evsel__add_sample(struct evsel *evsel, struct perf_sample *sample, return ret; } -static int process_sample_event(struct perf_tool *tool, +static int process_sample_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index c157bd31f2e5..88c131d05186 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -273,7 +273,7 @@ static void compute_stats(struct c2c_hist_entry *c2c_he, update_stats(&cstats->load, weight); } -static int process_sample_event(struct perf_tool *tool __maybe_unused, +static int process_sample_event(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index 57d300d8e570..4c0567882a7a 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -388,7 +388,7 @@ struct hist_entry_ops block_hist_ops = { .free = block_hist_free, }; -static int diff__process_sample_event(struct perf_tool *tool, +static int diff__process_sample_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 8f3dd8980da0..283429ccd034 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -134,7 +134,7 @@ struct event_entry { union perf_event event[]; }; -static int dso__inject_build_id(struct dso *dso, struct perf_tool *tool, +static int dso__inject_build_id(struct dso *dso, const struct perf_tool *tool, struct machine *machine, u8 cpumode, u32 flags); static int output_bytes(struct perf_inject *inject, void *buf, size_t sz) @@ -149,7 +149,7 @@ static int output_bytes(struct perf_inject *inject, void *buf, size_t sz) return 0; } -static int perf_event__repipe_synth(struct perf_tool *tool, +static int perf_event__repipe_synth(const struct perf_tool *tool, union perf_event *event) { struct perf_inject *inject = container_of(tool, struct perf_inject, @@ -158,7 +158,7 @@ static int perf_event__repipe_synth(struct perf_tool *tool, return output_bytes(inject, event, event->header.size); } -static int perf_event__repipe_oe_synth(struct perf_tool *tool, +static int perf_event__repipe_oe_synth(const struct perf_tool *tool, union perf_event *event, struct ordered_events *oe __maybe_unused) { @@ -166,7 +166,7 @@ static int perf_event__repipe_oe_synth(struct perf_tool *tool, } #ifdef HAVE_JITDUMP -static int perf_event__drop_oe(struct perf_tool *tool __maybe_unused, +static int perf_event__drop_oe(const struct perf_tool *tool __maybe_unused, union perf_event *event __maybe_unused, struct ordered_events *oe __maybe_unused) { @@ -188,7 +188,7 @@ static int perf_event__repipe_op4_synth(struct perf_session *session, return 
perf_event__repipe_synth(session->tool, event); } -static int perf_event__repipe_attr(struct perf_tool *tool, +static int perf_event__repipe_attr(const struct perf_tool *tool, union perf_event *event, struct evlist **pevlist) { @@ -206,7 +206,7 @@ static int perf_event__repipe_attr(struct perf_tool *tool, return perf_event__repipe_synth(tool, event); } -static int perf_event__repipe_event_update(struct perf_tool *tool, +static int perf_event__repipe_event_update(const struct perf_tool *tool, union perf_event *event, struct evlist **pevlist __maybe_unused) { @@ -237,7 +237,7 @@ static int copy_bytes(struct perf_inject *inject, struct perf_data *data, off_t static s64 perf_event__repipe_auxtrace(struct perf_session *session, union perf_event *event) { - struct perf_tool *tool = session->tool; + const struct perf_tool *tool = session->tool; struct perf_inject *inject = container_of(tool, struct perf_inject, tool); int ret; @@ -284,7 +284,7 @@ perf_event__repipe_auxtrace(struct perf_session *session __maybe_unused, #endif -static int perf_event__repipe(struct perf_tool *tool, +static int perf_event__repipe(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -292,7 +292,7 @@ static int perf_event__repipe(struct perf_tool *tool, return perf_event__repipe_synth(tool, event); } -static int perf_event__drop(struct perf_tool *tool __maybe_unused, +static int perf_event__drop(const struct perf_tool *tool __maybe_unused, union perf_event *event __maybe_unused, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -300,7 +300,7 @@ static int perf_event__drop(struct perf_tool *tool __maybe_unused, return 0; } -static int perf_event__drop_aux(struct perf_tool *tool, +static int perf_event__drop_aux(const struct perf_tool *tool, union perf_event *event __maybe_unused, struct perf_sample *sample, struct machine *machine __maybe_unused) @@ -341,13 +341,13 @@ perf_inject__cut_auxtrace_sample(struct perf_inject *inject, return ev; } -typedef int (*inject_handler)(struct perf_tool *tool, +typedef int (*inject_handler)(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, struct machine *machine); -static int perf_event__repipe_sample(struct perf_tool *tool, +static int perf_event__repipe_sample(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, @@ -372,7 +372,7 @@ static int perf_event__repipe_sample(struct perf_tool *tool, return perf_event__repipe_synth(tool, event); } -static int perf_event__repipe_mmap(struct perf_tool *tool, +static int perf_event__repipe_mmap(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -386,7 +386,7 @@ static int perf_event__repipe_mmap(struct perf_tool *tool, } #ifdef HAVE_JITDUMP -static int perf_event__jit_repipe_mmap(struct perf_tool *tool, +static int perf_event__jit_repipe_mmap(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -455,7 +455,7 @@ static struct dso *findnew_dso(int pid, int tid, const char *filename, return dso; } -static int perf_event__repipe_buildid_mmap(struct perf_tool *tool, +static int perf_event__repipe_buildid_mmap(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -474,7 +474,7 @@ static int perf_event__repipe_buildid_mmap(struct perf_tool *tool, return 
perf_event__repipe(tool, event, sample, machine); } -static int perf_event__repipe_mmap2(struct perf_tool *tool, +static int perf_event__repipe_mmap2(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -500,7 +500,7 @@ static int perf_event__repipe_mmap2(struct perf_tool *tool, } #ifdef HAVE_JITDUMP -static int perf_event__jit_repipe_mmap2(struct perf_tool *tool, +static int perf_event__jit_repipe_mmap2(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -524,7 +524,7 @@ static int perf_event__jit_repipe_mmap2(struct perf_tool *tool, } #endif -static int perf_event__repipe_buildid_mmap2(struct perf_tool *tool, +static int perf_event__repipe_buildid_mmap2(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -565,7 +565,7 @@ static int perf_event__repipe_buildid_mmap2(struct perf_tool *tool, return 0; } -static int perf_event__repipe_fork(struct perf_tool *tool, +static int perf_event__repipe_fork(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -578,7 +578,7 @@ static int perf_event__repipe_fork(struct perf_tool *tool, return err; } -static int perf_event__repipe_comm(struct perf_tool *tool, +static int perf_event__repipe_comm(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -591,7 +591,7 @@ static int perf_event__repipe_comm(struct perf_tool *tool, return err; } -static int perf_event__repipe_namespaces(struct perf_tool *tool, +static int perf_event__repipe_namespaces(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -603,7 +603,7 @@ static int perf_event__repipe_namespaces(struct perf_tool *tool, return err; } -static int perf_event__repipe_exit(struct perf_tool *tool, +static int perf_event__repipe_exit(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -712,7 +712,7 @@ static bool perf_inject__lookup_known_build_id(struct perf_inject *inject, return false; } -static int dso__inject_build_id(struct dso *dso, struct perf_tool *tool, +static int dso__inject_build_id(struct dso *dso, const struct perf_tool *tool, struct machine *machine, u8 cpumode, u32 flags) { struct perf_inject *inject = container_of(tool, struct perf_inject, @@ -743,7 +743,7 @@ static int dso__inject_build_id(struct dso *dso, struct perf_tool *tool, return 0; } -int perf_event__inject_buildid(struct perf_tool *tool, union perf_event *event, +int perf_event__inject_buildid(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel __maybe_unused, struct machine *machine) @@ -776,7 +776,7 @@ repipe: return 0; } -static int perf_inject__sched_process_exit(struct perf_tool *tool, +static int perf_inject__sched_process_exit(const struct perf_tool *tool, union perf_event *event __maybe_unused, struct perf_sample *sample, struct evsel *evsel __maybe_unused, @@ -796,7 +796,7 @@ static int perf_inject__sched_process_exit(struct perf_tool *tool, return 0; } -static int perf_inject__sched_switch(struct perf_tool *tool, +static int perf_inject__sched_switch(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, @@ -821,7 +821,7 @@ static int perf_inject__sched_switch(struct perf_tool *tool, } #ifdef HAVE_LIBTRACEEVENT -static 
int perf_inject__sched_stat(struct perf_tool *tool, +static int perf_inject__sched_stat(const struct perf_tool *tool, union perf_event *event __maybe_unused, struct perf_sample *sample, struct evsel *evsel, @@ -866,7 +866,7 @@ static int guest_session__output_bytes(struct guest_session *gs, void *buf, size return ret < 0 ? ret : 0; } -static int guest_session__repipe(struct perf_tool *tool, +static int guest_session__repipe(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -1032,7 +1032,7 @@ static struct guest_id *guest_session__lookup_id(struct guest_session *gs, u64 i return NULL; } -static int process_attr(struct perf_tool *tool, union perf_event *event, +static int process_attr(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) { @@ -1160,7 +1160,7 @@ static u64 evlist__first_id(struct evlist *evlist) return 0; } -static int process_build_id(struct perf_tool *tool, +static int process_build_id(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -1210,7 +1210,7 @@ static int guest_session__add_build_ids(struct guest_session *gs) gs); } -static int guest_session__ksymbol_event(struct perf_tool *tool, +static int guest_session__ksymbol_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -1574,7 +1574,7 @@ static int guest_session__flush_events(struct guest_session *gs) return guest_session__inject_events(gs, -1); } -static int host__repipe(struct perf_tool *tool, +static int host__repipe(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -1647,7 +1647,7 @@ static int host__finished_init(struct perf_session *session, union perf_event *e * guest events up to the same time. Finally write out the FINISHED_ROUND event * itself. 
*/ -static int host__finished_round(struct perf_tool *tool, +static int host__finished_round(const struct perf_tool *tool, union perf_event *event, struct ordered_events *oe) { @@ -1665,7 +1665,7 @@ static int host__finished_round(struct perf_tool *tool, return perf_event__repipe_oe_synth(tool, event, oe); } -static int host__context_switch(struct perf_tool *tool, +static int host__context_switch(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -1719,7 +1719,7 @@ static int evsel__check_stype(struct evsel *evsel, u64 sample_type, const char * return 0; } -static int drop_sample(struct perf_tool *tool __maybe_unused, +static int drop_sample(const struct perf_tool *tool __maybe_unused, union perf_event *event __maybe_unused, struct perf_sample *sample __maybe_unused, struct evsel *evsel __maybe_unused, diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 6fd95be5032b..859ff018eace 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -955,7 +955,7 @@ static bool perf_kmem__skip_sample(struct perf_sample *sample) typedef int (*tracepoint_handler)(struct evsel *evsel, struct perf_sample *sample); -static int process_sample_event(struct perf_tool *tool __maybe_unused, +static int process_sample_event(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, @@ -2061,4 +2061,3 @@ out_delete: return ret; } - diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index 71165036e4ca..a3b903cf4311 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -1166,7 +1166,7 @@ static void print_result(struct perf_kvm_stat *kvm) } #if defined(HAVE_TIMERFD_SUPPORT) && defined(HAVE_LIBTRACEEVENT) -static int process_lost_event(struct perf_tool *tool, +static int process_lost_event(const struct perf_tool *tool, union perf_event *event __maybe_unused, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -1187,7 +1187,7 @@ static bool skip_sample(struct perf_kvm_stat *kvm, return false; } -static int process_sample_event(struct perf_tool *tool, +static int process_sample_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, diff --git a/tools/perf/builtin-kwork.c b/tools/perf/builtin-kwork.c index 56e3f3a5e03a..8ffaa80a2d1d 100644 --- a/tools/perf/builtin-kwork.c +++ b/tools/perf/builtin-kwork.c @@ -958,7 +958,7 @@ static int top_sched_switch_event(struct perf_kwork *kwork, } static struct kwork_class kwork_irq; -static int process_irq_handler_entry_event(struct perf_tool *tool, +static int process_irq_handler_entry_event(const struct perf_tool *tool, struct evsel *evsel, struct perf_sample *sample, struct machine *machine) @@ -971,7 +971,7 @@ static int process_irq_handler_entry_event(struct perf_tool *tool, return 0; } -static int process_irq_handler_exit_event(struct perf_tool *tool, +static int process_irq_handler_exit_event(const struct perf_tool *tool, struct evsel *evsel, struct perf_sample *sample, struct machine *machine) @@ -1037,7 +1037,7 @@ static struct kwork_class kwork_irq = { }; static struct kwork_class kwork_softirq; -static int process_softirq_raise_event(struct perf_tool *tool, +static int process_softirq_raise_event(const struct perf_tool *tool, struct evsel *evsel, struct perf_sample *sample, struct machine *machine) @@ -1051,7 +1051,7 @@ static int process_softirq_raise_event(struct perf_tool *tool, return 0; } 
-static int process_softirq_entry_event(struct perf_tool *tool, +static int process_softirq_entry_event(const struct perf_tool *tool, struct evsel *evsel, struct perf_sample *sample, struct machine *machine) @@ -1065,7 +1065,7 @@ static int process_softirq_entry_event(struct perf_tool *tool, return 0; } -static int process_softirq_exit_event(struct perf_tool *tool, +static int process_softirq_exit_event(const struct perf_tool *tool, struct evsel *evsel, struct perf_sample *sample, struct machine *machine) @@ -1167,7 +1167,7 @@ static struct kwork_class kwork_softirq = { }; static struct kwork_class kwork_workqueue; -static int process_workqueue_activate_work_event(struct perf_tool *tool, +static int process_workqueue_activate_work_event(const struct perf_tool *tool, struct evsel *evsel, struct perf_sample *sample, struct machine *machine) @@ -1181,7 +1181,7 @@ static int process_workqueue_activate_work_event(struct perf_tool *tool, return 0; } -static int process_workqueue_execute_start_event(struct perf_tool *tool, +static int process_workqueue_execute_start_event(const struct perf_tool *tool, struct evsel *evsel, struct perf_sample *sample, struct machine *machine) @@ -1195,7 +1195,7 @@ static int process_workqueue_execute_start_event(struct perf_tool *tool, return 0; } -static int process_workqueue_execute_end_event(struct perf_tool *tool, +static int process_workqueue_execute_end_event(const struct perf_tool *tool, struct evsel *evsel, struct perf_sample *sample, struct machine *machine) @@ -1266,7 +1266,7 @@ static struct kwork_class kwork_workqueue = { }; static struct kwork_class kwork_sched; -static int process_sched_switch_event(struct perf_tool *tool, +static int process_sched_switch_event(const struct perf_tool *tool, struct evsel *evsel, struct perf_sample *sample, struct machine *machine) @@ -1945,12 +1945,12 @@ static int perf_kwork__report(struct perf_kwork *kwork) return 0; } -typedef int (*tracepoint_handler)(struct perf_tool *tool, +typedef int (*tracepoint_handler)(const struct perf_tool *tool, struct evsel *evsel, struct perf_sample *sample, struct machine *machine); -static int perf_kwork__process_tracepoint_sample(struct perf_tool *tool, +static int perf_kwork__process_tracepoint_sample(const struct perf_tool *tool, union perf_event *event __maybe_unused, struct perf_sample *sample, struct evsel *evsel, diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index 0253184b3b58..6efa9d646637 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -1501,7 +1501,7 @@ static const struct evsel_str_handler contention_tracepoints[] = { { "lock:contention_end", evsel__process_contention_end, }, }; -static int process_event_update(struct perf_tool *tool, +static int process_event_update(const struct perf_tool *tool, union perf_event *event, struct evlist **pevlist) { @@ -1520,7 +1520,7 @@ static int process_event_update(struct perf_tool *tool, typedef int (*tracepoint_handler)(struct evsel *evsel, struct perf_sample *sample); -static int process_sample_event(struct perf_tool *tool __maybe_unused, +static int process_sample_event(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index efa700d14c98..0fb4d75ab959 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -179,7 +179,7 @@ out: } static int -dump_raw_samples(struct perf_tool *tool, +dump_raw_samples(const struct perf_tool *tool, union 
perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -253,7 +253,7 @@ out_put: return 0; } -static int process_sample_event(struct perf_tool *tool, +static int process_sample_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel __maybe_unused, diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 72345d1e54b0..46410eb3a76b 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -609,7 +609,7 @@ static int record__comp_enabled(struct record *rec) return rec->opts.comp_level > 0; } -static int process_synthesized_event(struct perf_tool *tool, +static int process_synthesized_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -620,7 +620,7 @@ static int process_synthesized_event(struct perf_tool *tool, static struct mutex synth_lock; -static int process_locked_synthesized_event(struct perf_tool *tool, +static int process_locked_synthesized_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -705,7 +705,7 @@ static void record__sig_exit(void) #ifdef HAVE_AUXTRACE_SUPPORT -static int record__process_auxtrace(struct perf_tool *tool, +static int record__process_auxtrace(const struct perf_tool *tool, struct mmap *map, union perf_event *event, void *data1, size_t len1, void *data2, size_t len2) @@ -1417,7 +1417,7 @@ static void set_timestamp_boundary(struct record *rec, u64 sample_time) rec->evlist->last_sample_time = sample_time; } -static int process_sample_event(struct perf_tool *tool, +static int process_sample_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, @@ -3244,7 +3244,7 @@ static const char * const __record_usage[] = { }; const char * const *record_usage = __record_usage; -static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event, +static int build_id__process_mmap(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) { /* @@ -3256,7 +3256,7 @@ static int build_id__process_mmap(struct perf_tool *tool, union perf_event *even return perf_event__process_mmap(tool, event, sample, machine); } -static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event, +static int build_id__process_mmap2(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) { /* @@ -3269,7 +3269,7 @@ static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *eve return perf_event__process_mmap2(tool, event, sample, machine); } -static int process_timestamp_boundary(struct perf_tool *tool, +static int process_timestamp_boundary(const struct perf_tool *tool, union perf_event *event __maybe_unused, struct perf_sample *sample, struct machine *machine __maybe_unused) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 930052961c1a..5f609a7791ea 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -263,7 +263,7 @@ static int process_feature_event(struct perf_session *session, return 0; } -static int process_sample_event(struct perf_tool *tool, +static int process_sample_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, @@ -339,7 +339,7 @@ out_put: return ret; } -static int 
process_read_event(struct perf_tool *tool, +static int process_read_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample __maybe_unused, struct evsel *evsel, @@ -765,7 +765,7 @@ static void report__output_resort(struct report *rep) ui_progress__finish(); } -static int count_sample_event(struct perf_tool *tool __maybe_unused, +static int count_sample_event(const struct perf_tool *tool __maybe_unused, union perf_event *event __maybe_unused, struct perf_sample *sample __maybe_unused, struct evsel *evsel, @@ -777,7 +777,7 @@ static int count_sample_event(struct perf_tool *tool __maybe_unused, return 0; } -static int count_lost_samples_event(struct perf_tool *tool, +static int count_lost_samples_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine __maybe_unused) @@ -793,7 +793,7 @@ static int count_lost_samples_event(struct perf_tool *tool, return 0; } -static int process_attr(struct perf_tool *tool __maybe_unused, +static int process_attr(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct evlist **pevlist); @@ -1233,7 +1233,7 @@ parse_percent_limit(const struct option *opt, const char *str, return 0; } -static int process_attr(struct perf_tool *tool __maybe_unused, +static int process_attr(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct evlist **pevlist) { diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 8750b5f2d49b..2c60bd3a8149 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -1489,7 +1489,7 @@ again: } } -static int process_sched_wakeup_event(struct perf_tool *tool, +static int process_sched_wakeup_event(const struct perf_tool *tool, struct evsel *evsel, struct perf_sample *sample, struct machine *machine) @@ -1502,7 +1502,7 @@ static int process_sched_wakeup_event(struct perf_tool *tool, return 0; } -static int process_sched_wakeup_ignore(struct perf_tool *tool __maybe_unused, +static int process_sched_wakeup_ignore(const struct perf_tool *tool __maybe_unused, struct evsel *evsel __maybe_unused, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -1770,7 +1770,7 @@ out: return 0; } -static int process_sched_switch_event(struct perf_tool *tool, +static int process_sched_switch_event(const struct perf_tool *tool, struct evsel *evsel, struct perf_sample *sample, struct machine *machine) @@ -1796,7 +1796,7 @@ static int process_sched_switch_event(struct perf_tool *tool, return err; } -static int process_sched_runtime_event(struct perf_tool *tool, +static int process_sched_runtime_event(const struct perf_tool *tool, struct evsel *evsel, struct perf_sample *sample, struct machine *machine) @@ -1809,7 +1809,7 @@ static int process_sched_runtime_event(struct perf_tool *tool, return 0; } -static int perf_sched__process_fork_event(struct perf_tool *tool, +static int perf_sched__process_fork_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -1826,7 +1826,7 @@ static int perf_sched__process_fork_event(struct perf_tool *tool, return 0; } -static int process_sched_migrate_task_event(struct perf_tool *tool, +static int process_sched_migrate_task_event(const struct perf_tool *tool, struct evsel *evsel, struct perf_sample *sample, struct machine *machine) @@ -1839,12 +1839,12 @@ static int process_sched_migrate_task_event(struct perf_tool *tool, return 0; } -typedef int (*tracepoint_handler)(struct 
perf_tool *tool, +typedef int (*tracepoint_handler)(const struct perf_tool *tool, struct evsel *evsel, struct perf_sample *sample, struct machine *machine); -static int perf_sched__process_tracepoint_sample(struct perf_tool *tool __maybe_unused, +static int perf_sched__process_tracepoint_sample(const struct perf_tool *tool __maybe_unused, union perf_event *event __maybe_unused, struct perf_sample *sample, struct evsel *evsel, @@ -1860,7 +1860,7 @@ static int perf_sched__process_tracepoint_sample(struct perf_tool *tool __maybe_ return err; } -static int perf_sched__process_comm(struct perf_tool *tool __maybe_unused, +static int perf_sched__process_comm(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -2506,7 +2506,7 @@ static void timehist_print_wakeup_event(struct perf_sched *sched, printf("\n"); } -static int timehist_sched_wakeup_ignore(struct perf_tool *tool __maybe_unused, +static int timehist_sched_wakeup_ignore(const struct perf_tool *tool __maybe_unused, union perf_event *event __maybe_unused, struct evsel *evsel __maybe_unused, struct perf_sample *sample __maybe_unused, @@ -2515,7 +2515,7 @@ static int timehist_sched_wakeup_ignore(struct perf_tool *tool __maybe_unused, return 0; } -static int timehist_sched_wakeup_event(struct perf_tool *tool, +static int timehist_sched_wakeup_event(const struct perf_tool *tool, union perf_event *event __maybe_unused, struct evsel *evsel, struct perf_sample *sample, @@ -2599,7 +2599,7 @@ static void timehist_print_migration_event(struct perf_sched *sched, printf("\n"); } -static int timehist_migrate_task_event(struct perf_tool *tool, +static int timehist_migrate_task_event(const struct perf_tool *tool, union perf_event *event __maybe_unused, struct evsel *evsel, struct perf_sample *sample, @@ -2627,7 +2627,7 @@ static int timehist_migrate_task_event(struct perf_tool *tool, return 0; } -static int timehist_sched_change_event(struct perf_tool *tool, +static int timehist_sched_change_event(const struct perf_tool *tool, union perf_event *event, struct evsel *evsel, struct perf_sample *sample, @@ -2758,7 +2758,7 @@ out: return rc; } -static int timehist_sched_switch_event(struct perf_tool *tool, +static int timehist_sched_switch_event(const struct perf_tool *tool, union perf_event *event, struct evsel *evsel, struct perf_sample *sample, @@ -2767,7 +2767,7 @@ static int timehist_sched_switch_event(struct perf_tool *tool, return timehist_sched_change_event(tool, event, evsel, sample, machine); } -static int process_lost(struct perf_tool *tool __maybe_unused, +static int process_lost(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct machine *machine __maybe_unused) @@ -3010,13 +3010,13 @@ static void timehist_print_summary(struct perf_sched *sched, printf(" (x %d)\n", sched->max_cpu.cpu); } -typedef int (*sched_handler)(struct perf_tool *tool, +typedef int (*sched_handler)(const struct perf_tool *tool, union perf_event *event, struct evsel *evsel, struct perf_sample *sample, struct machine *machine); -static int perf_timehist__process_sample(struct perf_tool *tool, +static int perf_timehist__process_sample(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 932167b2362b..31a6f800d55f 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -2399,7 +2399,7 @@ static bool 
filter_cpu(struct perf_sample *sample) return false; } -static int process_sample_event(struct perf_tool *tool, +static int process_sample_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, @@ -2486,7 +2486,7 @@ out_put: // Used when scr->per_event_dump is not set static struct evsel_script es_stdout; -static int process_attr(struct perf_tool *tool, union perf_event *event, +static int process_attr(const struct perf_tool *tool, union perf_event *event, struct evlist **pevlist) { struct perf_script *scr = container_of(tool, struct perf_script, tool); @@ -2552,7 +2552,7 @@ static int process_attr(struct perf_tool *tool, union perf_event *event, return 0; } -static int print_event_with_time(struct perf_tool *tool, +static int print_event_with_time(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine, @@ -2588,14 +2588,14 @@ static int print_event_with_time(struct perf_tool *tool, return 0; } -static int print_event(struct perf_tool *tool, union perf_event *event, +static int print_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine, pid_t pid, pid_t tid) { return print_event_with_time(tool, event, sample, machine, pid, tid, 0); } -static int process_comm_event(struct perf_tool *tool, +static int process_comm_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -2607,7 +2607,7 @@ static int process_comm_event(struct perf_tool *tool, event->comm.tid); } -static int process_namespaces_event(struct perf_tool *tool, +static int process_namespaces_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -2619,7 +2619,7 @@ static int process_namespaces_event(struct perf_tool *tool, event->namespaces.tid); } -static int process_cgroup_event(struct perf_tool *tool, +static int process_cgroup_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -2631,7 +2631,7 @@ static int process_cgroup_event(struct perf_tool *tool, sample->tid); } -static int process_fork_event(struct perf_tool *tool, +static int process_fork_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -2643,7 +2643,7 @@ static int process_fork_event(struct perf_tool *tool, event->fork.pid, event->fork.tid, event->fork.time); } -static int process_exit_event(struct perf_tool *tool, +static int process_exit_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -2656,7 +2656,7 @@ static int process_exit_event(struct perf_tool *tool, return perf_event__process_exit(tool, event, sample, machine); } -static int process_mmap_event(struct perf_tool *tool, +static int process_mmap_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -2668,7 +2668,7 @@ static int process_mmap_event(struct perf_tool *tool, event->mmap.tid); } -static int process_mmap2_event(struct perf_tool *tool, +static int process_mmap2_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -2680,7 +2680,7 @@ static int process_mmap2_event(struct perf_tool *tool, event->mmap2.tid); } -static int process_switch_event(struct perf_tool *tool, +static int process_switch_event(const 
struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -2712,7 +2712,7 @@ static int process_auxtrace_error(struct perf_session *session, } static int -process_lost_event(struct perf_tool *tool, +process_lost_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -2722,7 +2722,7 @@ process_lost_event(struct perf_tool *tool, } static int -process_throttle_event(struct perf_tool *tool __maybe_unused, +process_throttle_event(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -2733,7 +2733,7 @@ process_throttle_event(struct perf_tool *tool __maybe_unused, } static int -process_finished_round_event(struct perf_tool *tool __maybe_unused, +process_finished_round_event(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct ordered_events *oe __maybe_unused) @@ -2743,7 +2743,7 @@ process_finished_round_event(struct perf_tool *tool __maybe_unused, } static int -process_bpf_events(struct perf_tool *tool __maybe_unused, +process_bpf_events(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -2755,7 +2755,7 @@ process_bpf_events(struct perf_tool *tool __maybe_unused, sample->tid); } -static int process_text_poke_events(struct perf_tool *tool, +static int process_text_poke_events(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -3757,7 +3757,7 @@ static int process_thread_map_event(struct perf_session *session, union perf_event *event) { - struct perf_tool *tool = session->tool; + const struct perf_tool *tool = session->tool; struct perf_script *script = container_of(tool, struct perf_script, tool); if (dump_trace) @@ -3779,7 +3779,7 @@ static int process_cpu_map_event(struct perf_session *session, union perf_event *event) { - struct perf_tool *tool = session->tool; + const struct perf_tool *tool = session->tool; struct perf_script *script = container_of(tool, struct perf_script, tool); if (dump_trace) @@ -3809,11 +3809,10 @@ static int process_feature_event(struct perf_session *session, static int perf_script__process_auxtrace_info(struct perf_session *session, union perf_event *event) { - struct perf_tool *tool = session->tool; - int ret = perf_event__process_auxtrace_info(session, event); if (ret == 0) { + const struct perf_tool *tool = session->tool; struct perf_script *script = container_of(tool, struct perf_script, tool); ret = perf_script__setup_per_event_dump(script); diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 1f92445f7480..d44e708f8a44 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -248,7 +248,7 @@ static void perf_stat__reset_stats(void) perf_stat__reset_shadow_stats(); } -static int process_synthesized_event(struct perf_tool *tool __maybe_unused, +static int process_synthesized_event(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -2180,7 +2180,7 @@ static int process_stat_config_event(struct perf_session *session, union perf_event *event) { - struct perf_tool *tool = session->tool; + const struct perf_tool *tool = session->tool; struct perf_stat *st = container_of(tool, struct perf_stat, tool); perf_event__read_stat_config(&stat_config, &event->stat_config); @@ -2229,7 +2229,7 @@ static int 
process_thread_map_event(struct perf_session *session, union perf_event *event) { - struct perf_tool *tool = session->tool; + const struct perf_tool *tool = session->tool; struct perf_stat *st = container_of(tool, struct perf_stat, tool); if (st->threads) { @@ -2248,7 +2248,7 @@ static int process_cpu_map_event(struct perf_session *session, union perf_event *event) { - struct perf_tool *tool = session->tool; + const struct perf_tool *tool = session->tool; struct perf_stat *st = container_of(tool, struct perf_stat, tool); struct perf_cpu_map *cpus; diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c index 19d4542ea18a..5bf818baa662 100644 --- a/tools/perf/builtin-timechart.c +++ b/tools/perf/builtin-timechart.c @@ -320,7 +320,7 @@ static int *cpus_cstate_state; static u64 *cpus_pstate_start_times; static u64 *cpus_pstate_state; -static int process_comm_event(struct perf_tool *tool, +static int process_comm_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -330,7 +330,7 @@ static int process_comm_event(struct perf_tool *tool, return 0; } -static int process_fork_event(struct perf_tool *tool, +static int process_fork_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -340,7 +340,7 @@ static int process_fork_event(struct perf_tool *tool, return 0; } -static int process_exit_event(struct perf_tool *tool, +static int process_exit_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -571,7 +571,7 @@ typedef int (*tracepoint_handler)(struct timechart *tchart, struct perf_sample *sample, const char *backtrace); -static int process_sample_event(struct perf_tool *tool, +static int process_sample_event(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index d1a06a88d693..1cdac4e190e9 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -740,7 +740,7 @@ static int hist_iter__top_callback(struct hist_entry_iter *iter, return 0; } -static void perf_event__process_sample(struct perf_tool *tool, +static void perf_event__process_sample(const struct perf_tool *tool, const union perf_event *event, struct evsel *evsel, struct perf_sample *sample, diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index ef951ce1a0dd..d6ca541fdc78 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1778,7 +1778,7 @@ static int trace__process_event(struct trace *trace, struct machine *machine, return ret; } -static int trace__tool_process(struct perf_tool *tool, +static int trace__tool_process(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -3179,7 +3179,7 @@ static void trace__set_base_time(struct trace *trace, trace->base_time = sample->time; } -static int trace__process_sample(struct perf_tool *tool, +static int trace__process_sample(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, diff --git a/tools/perf/tests/cpumap.c b/tools/perf/tests/cpumap.c index bd8e396f3e57..2f0168b2a5a9 100644 --- a/tools/perf/tests/cpumap.c +++ b/tools/perf/tests/cpumap.c @@ -11,7 +11,7 @@ struct machine; -static int process_event_mask(struct 
perf_tool *tool __maybe_unused, +static int process_event_mask(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -47,7 +47,7 @@ static int process_event_mask(struct perf_tool *tool __maybe_unused, return 0; } -static int process_event_cpus(struct perf_tool *tool __maybe_unused, +static int process_event_cpus(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -73,7 +73,7 @@ static int process_event_cpus(struct perf_tool *tool __maybe_unused, return 0; } -static int process_event_range_cpus(struct perf_tool *tool __maybe_unused, +static int process_event_range_cpus(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) diff --git a/tools/perf/tests/dlfilter-test.c b/tools/perf/tests/dlfilter-test.c index da3a9b50b1b1..54f59d1246bc 100644 --- a/tools/perf/tests/dlfilter-test.c +++ b/tools/perf/tests/dlfilter-test.c @@ -62,7 +62,7 @@ static int test_result(const char *msg, int ret) return ret; } -static int process(struct perf_tool *tool, union perf_event *event, +static int process(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) { diff --git a/tools/perf/tests/dwarf-unwind.c b/tools/perf/tests/dwarf-unwind.c index d01aa931fe81..f85d391ced98 100644 --- a/tools/perf/tests/dwarf-unwind.c +++ b/tools/perf/tests/dwarf-unwind.c @@ -37,7 +37,7 @@ #define NO_TAIL_CALL_BARRIER __asm__ __volatile__("" : : : "memory"); #endif -static int mmap_handler(struct perf_tool *tool __maybe_unused, +static int mmap_handler(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct machine *machine) diff --git a/tools/perf/tests/event_update.c b/tools/perf/tests/event_update.c index d093a9b878d1..fdecad920f59 100644 --- a/tools/perf/tests/event_update.c +++ b/tools/perf/tests/event_update.c @@ -12,7 +12,7 @@ #include "tests.h" #include "debug.h" -static int process_event_unit(struct perf_tool *tool __maybe_unused, +static int process_event_unit(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -25,7 +25,7 @@ static int process_event_unit(struct perf_tool *tool __maybe_unused, return 0; } -static int process_event_scale(struct perf_tool *tool __maybe_unused, +static int process_event_scale(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -43,7 +43,7 @@ struct event_name { const char *name; }; -static int process_event_name(struct perf_tool *tool, +static int process_event_name(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -57,7 +57,7 @@ static int process_event_name(struct perf_tool *tool, return 0; } -static int process_event_cpus(struct perf_tool *tool __maybe_unused, +static int process_event_cpus(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) diff --git a/tools/perf/tests/stat.c b/tools/perf/tests/stat.c index 706780fb5695..6468cc0d0204 100644 --- 
a/tools/perf/tests/stat.c +++ b/tools/perf/tests/stat.c @@ -21,7 +21,7 @@ static bool has_term(struct perf_record_stat_config *config, return false; } -static int process_stat_config_event(struct perf_tool *tool __maybe_unused, +static int process_stat_config_event(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -62,7 +62,7 @@ static int test__synthesize_stat_config(struct test_suite *test __maybe_unused, return 0; } -static int process_stat_event(struct perf_tool *tool __maybe_unused, +static int process_stat_event(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -93,7 +93,7 @@ static int test__synthesize_stat(struct test_suite *test __maybe_unused, int sub return 0; } -static int process_stat_round_event(struct perf_tool *tool __maybe_unused, +static int process_stat_round_event(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) diff --git a/tools/perf/tests/thread-map.c b/tools/perf/tests/thread-map.c index 74308c1368fe..1fe521466bf4 100644 --- a/tools/perf/tests/thread-map.c +++ b/tools/perf/tests/thread-map.c @@ -60,7 +60,7 @@ static int test__thread_map(struct test_suite *test __maybe_unused, int subtest return 0; } -static int process_event(struct perf_tool *tool __maybe_unused, +static int process_event(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c index fa40f3cb6266..138ffc71b32d 100644 --- a/tools/perf/util/arm-spe.c +++ b/tools/perf/util/arm-spe.c @@ -899,7 +899,7 @@ static int arm_spe_context_switch(struct arm_spe *spe, union perf_event *event, static int arm_spe_process_event(struct perf_session *session, union perf_event *event, struct perf_sample *sample, - struct perf_tool *tool) + const struct perf_tool *tool) { int err = 0; u64 timestamp; @@ -947,7 +947,7 @@ static int arm_spe_process_event(struct perf_session *session, static int arm_spe_process_auxtrace_event(struct perf_session *session, union perf_event *event, - struct perf_tool *tool __maybe_unused) + const struct perf_tool *tool __maybe_unused) { struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe, auxtrace); @@ -985,7 +985,7 @@ static int arm_spe_process_auxtrace_event(struct perf_session *session, } static int arm_spe_flush(struct perf_session *session __maybe_unused, - struct perf_tool *tool __maybe_unused) + const struct perf_tool *tool __maybe_unused) { struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe, auxtrace); diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index e2f317063eec..cbb773ed6f1a 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -1240,7 +1240,7 @@ void auxtrace_synth_error(struct perf_record_auxtrace_error *auxtrace_error, int } int perf_event__synthesize_auxtrace_info(struct auxtrace_record *itr, - struct perf_tool *tool, + const struct perf_tool *tool, struct perf_session *session, perf_event__handler_t process) { @@ -1831,7 +1831,7 @@ int __weak compat_auxtrace_mmap__write_tail(struct auxtrace_mmap *mm, u64 tail) static int __auxtrace_mmap__read(struct mmap *map, struct auxtrace_record *itr, - struct perf_tool *tool, 
process_auxtrace_t fn, + const struct perf_tool *tool, process_auxtrace_t fn, bool snapshot, size_t snapshot_size) { struct auxtrace_mmap *mm = &map->auxtrace_mmap; @@ -1942,14 +1942,14 @@ static int __auxtrace_mmap__read(struct mmap *map, } int auxtrace_mmap__read(struct mmap *map, struct auxtrace_record *itr, - struct perf_tool *tool, process_auxtrace_t fn) + const struct perf_tool *tool, process_auxtrace_t fn) { return __auxtrace_mmap__read(map, itr, tool, fn, false, 0); } int auxtrace_mmap__read_snapshot(struct mmap *map, struct auxtrace_record *itr, - struct perf_tool *tool, process_auxtrace_t fn, + const struct perf_tool *tool, process_auxtrace_t fn, size_t snapshot_size) { return __auxtrace_mmap__read(map, itr, tool, fn, true, snapshot_size); @@ -2829,7 +2829,7 @@ int auxtrace_parse_filters(struct evlist *evlist) } int auxtrace__process_event(struct perf_session *session, union perf_event *event, - struct perf_sample *sample, struct perf_tool *tool) + struct perf_sample *sample, const struct perf_tool *tool) { if (!session->auxtrace) return 0; @@ -2847,7 +2847,7 @@ void auxtrace__dump_auxtrace_sample(struct perf_session *session, session->auxtrace->dump_auxtrace_sample(session, sample); } -int auxtrace__flush_events(struct perf_session *session, struct perf_tool *tool) +int auxtrace__flush_events(struct perf_session *session, const struct perf_tool *tool) { if (!session->auxtrace) return 0; diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h index 8a6ec9565835..d405efcd8708 100644 --- a/tools/perf/util/auxtrace.h +++ b/tools/perf/util/auxtrace.h @@ -208,17 +208,17 @@ struct auxtrace { int (*process_event)(struct perf_session *session, union perf_event *event, struct perf_sample *sample, - struct perf_tool *tool); + const struct perf_tool *tool); int (*process_auxtrace_event)(struct perf_session *session, union perf_event *event, - struct perf_tool *tool); + const struct perf_tool *tool); int (*queue_data)(struct perf_session *session, struct perf_sample *sample, union perf_event *event, u64 data_offset); void (*dump_auxtrace_sample)(struct perf_session *session, struct perf_sample *sample); int (*flush_events)(struct perf_session *session, - struct perf_tool *tool); + const struct perf_tool *tool); void (*free_events)(struct perf_session *session); void (*free)(struct perf_session *session); bool (*evsel_is_auxtrace)(struct perf_session *session, @@ -508,17 +508,17 @@ void auxtrace_mmap_params__set_idx(struct auxtrace_mmap_params *mp, struct evlist *evlist, struct evsel *evsel, int idx); -typedef int (*process_auxtrace_t)(struct perf_tool *tool, +typedef int (*process_auxtrace_t)(const struct perf_tool *tool, struct mmap *map, union perf_event *event, void *data1, size_t len1, void *data2, size_t len2); int auxtrace_mmap__read(struct mmap *map, struct auxtrace_record *itr, - struct perf_tool *tool, process_auxtrace_t fn); + const struct perf_tool *tool, process_auxtrace_t fn); int auxtrace_mmap__read_snapshot(struct mmap *map, struct auxtrace_record *itr, - struct perf_tool *tool, process_auxtrace_t fn, + const struct perf_tool *tool, process_auxtrace_t fn, size_t snapshot_size); int auxtrace_queues__init_nr(struct auxtrace_queues *queues, int nr_queues); @@ -639,10 +639,10 @@ int addr_filters__parse_bare_filter(struct addr_filters *filts, int auxtrace_parse_filters(struct evlist *evlist); int auxtrace__process_event(struct perf_session *session, union perf_event *event, - struct perf_sample *sample, struct perf_tool *tool); + struct perf_sample *sample, const 
struct perf_tool *tool); void auxtrace__dump_auxtrace_sample(struct perf_session *session, struct perf_sample *sample); -int auxtrace__flush_events(struct perf_session *session, struct perf_tool *tool); +int auxtrace__flush_events(struct perf_session *session, const struct perf_tool *tool); void auxtrace__free_events(struct perf_session *session); void auxtrace__free(struct perf_session *session); bool auxtrace__evsel_is_auxtrace(struct perf_session *session, @@ -809,7 +809,7 @@ static inline int auxtrace__process_event(struct perf_session *session __maybe_unused, union perf_event *event __maybe_unused, struct perf_sample *sample __maybe_unused, - struct perf_tool *tool __maybe_unused) + const struct perf_tool *tool __maybe_unused) { return 0; } @@ -822,7 +822,7 @@ void auxtrace__dump_auxtrace_sample(struct perf_session *session __maybe_unused, static inline int auxtrace__flush_events(struct perf_session *session __maybe_unused, - struct perf_tool *tool __maybe_unused) + const struct perf_tool *tool __maybe_unused) { return 0; } diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c index 827695cd0408..13608237c50e 100644 --- a/tools/perf/util/bpf-event.c +++ b/tools/perf/util/bpf-event.c @@ -170,7 +170,7 @@ static int perf_event__synthesize_one_bpf_prog(struct perf_session *session, { struct perf_record_ksymbol *ksymbol_event = &event->ksymbol; struct perf_record_bpf_event *bpf_event = &event->bpf; - struct perf_tool *tool = session->tool; + const struct perf_tool *tool = session->tool; struct bpf_prog_info_node *info_node; struct perf_bpil *info_linear; struct bpf_prog_info *info; @@ -310,7 +310,7 @@ struct kallsyms_parse { union perf_event *event; perf_event__handler_t process; struct machine *machine; - struct perf_tool *tool; + const struct perf_tool *tool; }; static int diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c index 83a1581e8cf1..098fcc625d91 100644 --- a/tools/perf/util/build-id.c +++ b/tools/perf/util/build-id.c @@ -42,7 +42,7 @@ static bool no_buildid_cache; -int build_id__mark_dso_hit(struct perf_tool *tool __maybe_unused, +int build_id__mark_dso_hit(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct evsel *evsel __maybe_unused, @@ -67,7 +67,7 @@ int build_id__mark_dso_hit(struct perf_tool *tool __maybe_unused, return 0; } -static int perf_event__exit_del_thread(struct perf_tool *tool __maybe_unused, +static int perf_event__exit_del_thread(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused, diff --git a/tools/perf/util/build-id.h b/tools/perf/util/build-id.h index 3fa8bffb07ca..ae87c4c58d5b 100644 --- a/tools/perf/util/build-id.h +++ b/tools/perf/util/build-id.h @@ -35,11 +35,11 @@ char *dso__build_id_filename(const struct dso *dso, char *bf, size_t size, char *__dso__build_id_filename(const struct dso *dso, char *bf, size_t size, bool is_debug, bool is_kallsyms); -int build_id__mark_dso_hit(struct perf_tool *tool, union perf_event *event, +int build_id__mark_dso_hit(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, struct machine *machine); -int perf_event__inject_buildid(struct perf_tool *tool, union perf_event *event, +int perf_event__inject_buildid(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, struct machine *machine); diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 
566498ebc694..34552940ec1d 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -788,7 +788,7 @@ static void cs_etm__dump_event(struct cs_etm_queue *etmq, } static int cs_etm__flush_events(struct perf_session *session, - struct perf_tool *tool) + const struct perf_tool *tool) { struct cs_etm_auxtrace *etm = container_of(session->auxtrace, struct cs_etm_auxtrace, @@ -2715,7 +2715,7 @@ static int cs_etm__process_switch_cpu_wide(struct cs_etm_auxtrace *etm, static int cs_etm__process_event(struct perf_session *session, union perf_event *event, struct perf_sample *sample, - struct perf_tool *tool) + const struct perf_tool *tool) { struct cs_etm_auxtrace *etm = container_of(session->auxtrace, struct cs_etm_auxtrace, @@ -2785,7 +2785,7 @@ static void dump_queued_data(struct cs_etm_auxtrace *etm, static int cs_etm__process_auxtrace_event(struct perf_session *session, union perf_event *event, - struct perf_tool *tool __maybe_unused) + const struct perf_tool *tool __maybe_unused) { struct cs_etm_auxtrace *etm = container_of(session->auxtrace, struct cs_etm_auxtrace, diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c index 2b732bccabad..9e2170604b66 100644 --- a/tools/perf/util/data-convert-bt.c +++ b/tools/perf/util/data-convert-bt.c @@ -792,7 +792,7 @@ static bool is_flush_needed(struct ctf_stream *cs) return cs->count >= STREAM_FLUSH_COUNT; } -static int process_sample_event(struct perf_tool *tool, +static int process_sample_event(const struct perf_tool *tool, union perf_event *_event, struct perf_sample *sample, struct evsel *evsel, @@ -871,7 +871,7 @@ do { \ } while(0) #define __FUNC_PROCESS_NON_SAMPLE(_name, body) \ -static int process_##_name##_event(struct perf_tool *tool, \ +static int process_##_name##_event(const struct perf_tool *tool, \ union perf_event *_event, \ struct perf_sample *sample, \ struct machine *machine) \ diff --git a/tools/perf/util/data-convert-json.c b/tools/perf/util/data-convert-json.c index 3cf64f5b23ee..905ea9823f9d 100644 --- a/tools/perf/util/data-convert-json.c +++ b/tools/perf/util/data-convert-json.c @@ -118,7 +118,7 @@ static void output_json_key_format(FILE *out, bool comma, int depth, va_end(args); } -static void output_sample_callchain_entry(struct perf_tool *tool, +static void output_sample_callchain_entry(const struct perf_tool *tool, u64 ip, struct addr_location *al) { struct convert_json *c = container_of(tool, struct convert_json, tool); @@ -146,7 +146,7 @@ static void output_sample_callchain_entry(struct perf_tool *tool, output_json_format(out, false, 4, "}"); } -static int process_sample_event(struct perf_tool *tool, +static int process_sample_event(const struct perf_tool *tool, union perf_event *event __maybe_unused, struct perf_sample *sample, struct evsel *evsel __maybe_unused, diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index f32f9abf6344..c2f0e7f40ad5 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -216,7 +216,7 @@ size_t perf_event__fprintf_cgroup(union perf_event *event, FILE *fp) event->cgroup.id, event->cgroup.path); } -int perf_event__process_comm(struct perf_tool *tool __maybe_unused, +int perf_event__process_comm(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -224,7 +224,7 @@ int perf_event__process_comm(struct perf_tool *tool __maybe_unused, return machine__process_comm_event(machine, event, sample); } -int perf_event__process_namespaces(struct perf_tool *tool 
__maybe_unused, +int perf_event__process_namespaces(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -232,7 +232,7 @@ int perf_event__process_namespaces(struct perf_tool *tool __maybe_unused, return machine__process_namespaces_event(machine, event, sample); } -int perf_event__process_cgroup(struct perf_tool *tool __maybe_unused, +int perf_event__process_cgroup(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -240,7 +240,7 @@ int perf_event__process_cgroup(struct perf_tool *tool __maybe_unused, return machine__process_cgroup_event(machine, event, sample); } -int perf_event__process_lost(struct perf_tool *tool __maybe_unused, +int perf_event__process_lost(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -248,7 +248,7 @@ int perf_event__process_lost(struct perf_tool *tool __maybe_unused, return machine__process_lost_event(machine, event, sample); } -int perf_event__process_aux(struct perf_tool *tool __maybe_unused, +int perf_event__process_aux(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine) @@ -256,7 +256,7 @@ int perf_event__process_aux(struct perf_tool *tool __maybe_unused, return machine__process_aux_event(machine, event); } -int perf_event__process_itrace_start(struct perf_tool *tool __maybe_unused, +int perf_event__process_itrace_start(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine) @@ -264,7 +264,7 @@ int perf_event__process_itrace_start(struct perf_tool *tool __maybe_unused, return machine__process_itrace_start_event(machine, event); } -int perf_event__process_aux_output_hw_id(struct perf_tool *tool __maybe_unused, +int perf_event__process_aux_output_hw_id(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine) @@ -272,7 +272,7 @@ int perf_event__process_aux_output_hw_id(struct perf_tool *tool __maybe_unused, return machine__process_aux_output_hw_id_event(machine, event); } -int perf_event__process_lost_samples(struct perf_tool *tool __maybe_unused, +int perf_event__process_lost_samples(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -280,7 +280,7 @@ int perf_event__process_lost_samples(struct perf_tool *tool __maybe_unused, return machine__process_lost_samples_event(machine, event, sample); } -int perf_event__process_switch(struct perf_tool *tool __maybe_unused, +int perf_event__process_switch(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine) @@ -288,7 +288,7 @@ int perf_event__process_switch(struct perf_tool *tool __maybe_unused, return machine__process_switch_event(machine, event); } -int perf_event__process_ksymbol(struct perf_tool *tool __maybe_unused, +int perf_event__process_ksymbol(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused, struct machine *machine) @@ -296,7 +296,7 @@ int perf_event__process_ksymbol(struct perf_tool *tool __maybe_unused, return machine__process_ksymbol(machine, event, sample); } -int perf_event__process_bpf(struct perf_tool *tool __maybe_unused, 
+int perf_event__process_bpf(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -304,7 +304,7 @@ int perf_event__process_bpf(struct perf_tool *tool __maybe_unused, return machine__process_bpf(machine, event, sample); } -int perf_event__process_text_poke(struct perf_tool *tool __maybe_unused, +int perf_event__process_text_poke(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -387,7 +387,7 @@ size_t perf_event__fprintf_cpu_map(union perf_event *event, FILE *fp) return ret; } -int perf_event__process_mmap(struct perf_tool *tool __maybe_unused, +int perf_event__process_mmap(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -395,7 +395,7 @@ int perf_event__process_mmap(struct perf_tool *tool __maybe_unused, return machine__process_mmap_event(machine, event, sample); } -int perf_event__process_mmap2(struct perf_tool *tool __maybe_unused, +int perf_event__process_mmap2(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -410,7 +410,7 @@ size_t perf_event__fprintf_task(union perf_event *event, FILE *fp) event->fork.ppid, event->fork.ptid); } -int perf_event__process_fork(struct perf_tool *tool __maybe_unused, +int perf_event__process_fork(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -418,7 +418,7 @@ int perf_event__process_fork(struct perf_tool *tool __maybe_unused, return machine__process_fork_event(machine, event, sample); } -int perf_event__process_exit(struct perf_tool *tool __maybe_unused, +int perf_event__process_exit(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -587,7 +587,7 @@ size_t perf_event__fprintf(union perf_event *event, struct machine *machine, FIL return ret; } -int perf_event__process(struct perf_tool *tool __maybe_unused, +int perf_event__process(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, struct machine *machine) diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index d8bcee2e9b93..4b24f1c580fd 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -267,71 +267,71 @@ struct perf_tool; void perf_event__read_stat_config(struct perf_stat_config *config, struct perf_record_stat_config *event); -int perf_event__process_comm(struct perf_tool *tool, +int perf_event__process_comm(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); -int perf_event__process_lost(struct perf_tool *tool, +int perf_event__process_lost(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); -int perf_event__process_lost_samples(struct perf_tool *tool, +int perf_event__process_lost_samples(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); -int perf_event__process_aux(struct perf_tool *tool, +int perf_event__process_aux(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); -int perf_event__process_itrace_start(struct perf_tool *tool, +int perf_event__process_itrace_start(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, 
struct machine *machine); -int perf_event__process_aux_output_hw_id(struct perf_tool *tool, +int perf_event__process_aux_output_hw_id(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); -int perf_event__process_switch(struct perf_tool *tool, +int perf_event__process_switch(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); -int perf_event__process_namespaces(struct perf_tool *tool, +int perf_event__process_namespaces(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); -int perf_event__process_cgroup(struct perf_tool *tool, +int perf_event__process_cgroup(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); -int perf_event__process_mmap(struct perf_tool *tool, +int perf_event__process_mmap(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); -int perf_event__process_mmap2(struct perf_tool *tool, +int perf_event__process_mmap2(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); -int perf_event__process_fork(struct perf_tool *tool, +int perf_event__process_fork(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); -int perf_event__process_exit(struct perf_tool *tool, +int perf_event__process_exit(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); -int perf_event__process_ksymbol(struct perf_tool *tool, +int perf_event__process_ksymbol(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); -int perf_event__process_bpf(struct perf_tool *tool, +int perf_event__process_bpf(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); -int perf_event__process_text_poke(struct perf_tool *tool, +int perf_event__process_text_poke(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); -int perf_event__process(struct perf_tool *tool, +int perf_event__process(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 55e9553861d0..3309fe7f1d12 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -4326,7 +4326,7 @@ out_delete_evlist: int perf_event__process_feature(struct perf_session *session, union perf_event *event) { - struct perf_tool *tool = session->tool; + const struct perf_tool *tool = session->tool; struct feat_fd ff = { .fd = 0 }; struct perf_record_header_feature *fe = (struct perf_record_header_feature *)event; int type = fe->header.type; @@ -4405,7 +4405,7 @@ size_t perf_event__fprintf_event_update(union perf_event *event, FILE *fp) return ret; } -int perf_event__process_attr(struct perf_tool *tool __maybe_unused, +int perf_event__process_attr(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct evlist **pevlist) { @@ -4444,7 +4444,7 @@ int perf_event__process_attr(struct perf_tool *tool __maybe_unused, return 0; } -int perf_event__process_event_update(struct perf_tool *tool __maybe_unused, +int perf_event__process_event_update(const struct perf_tool *tool __maybe_unused, union perf_event *event, struct evlist **pevlist) { diff 
--git a/tools/perf/util/header.h b/tools/perf/util/header.h index 7c16a250e738..07ff647197ff 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -156,9 +156,9 @@ int perf_header__fprintf_info(struct perf_session *s, FILE *fp, bool full); int perf_event__process_feature(struct perf_session *session, union perf_event *event); -int perf_event__process_attr(struct perf_tool *tool, union perf_event *event, +int perf_event__process_attr(const struct perf_tool *tool, union perf_event *event, struct evlist **pevlist); -int perf_event__process_event_update(struct perf_tool *tool, +int perf_event__process_event_update(const struct perf_tool *tool, union perf_event *event, struct evlist **pevlist); size_t perf_event__fprintf_event_update(union perf_event *event, FILE *fp); diff --git a/tools/perf/util/hisi-ptt.c b/tools/perf/util/hisi-ptt.c index 37ea987017f6..e4cc4785f744 100644 --- a/tools/perf/util/hisi-ptt.c +++ b/tools/perf/util/hisi-ptt.c @@ -79,14 +79,14 @@ static void hisi_ptt_dump_event(struct hisi_ptt *ptt, unsigned char *buf, static int hisi_ptt_process_event(struct perf_session *session __maybe_unused, union perf_event *event __maybe_unused, struct perf_sample *sample __maybe_unused, - struct perf_tool *tool __maybe_unused) + const struct perf_tool *tool __maybe_unused) { return 0; } static int hisi_ptt_process_auxtrace_event(struct perf_session *session, union perf_event *event, - struct perf_tool *tool __maybe_unused) + const struct perf_tool *tool __maybe_unused) { struct hisi_ptt *ptt = container_of(session->auxtrace, struct hisi_ptt, auxtrace); @@ -123,7 +123,7 @@ static int hisi_ptt_process_auxtrace_event(struct perf_session *session, } static int hisi_ptt_flush(struct perf_session *session __maybe_unused, - struct perf_tool *tool __maybe_unused) + const struct perf_tool *tool __maybe_unused) { return 0; } diff --git a/tools/perf/util/intel-bts.c b/tools/perf/util/intel-bts.c index 160d346cb450..27d9b5c9fec8 100644 --- a/tools/perf/util/intel-bts.c +++ b/tools/perf/util/intel-bts.c @@ -591,7 +591,7 @@ static int intel_bts_process_queues(struct intel_bts *bts, u64 timestamp) static int intel_bts_process_event(struct perf_session *session, union perf_event *event, struct perf_sample *sample, - struct perf_tool *tool) + const struct perf_tool *tool) { struct intel_bts *bts = container_of(session->auxtrace, struct intel_bts, auxtrace); @@ -634,7 +634,7 @@ static int intel_bts_process_event(struct perf_session *session, static int intel_bts_process_auxtrace_event(struct perf_session *session, union perf_event *event, - struct perf_tool *tool __maybe_unused) + const struct perf_tool *tool __maybe_unused) { struct intel_bts *bts = container_of(session->auxtrace, struct intel_bts, auxtrace); @@ -675,7 +675,7 @@ static int intel_bts_process_auxtrace_event(struct perf_session *session, } static int intel_bts_flush(struct perf_session *session, - struct perf_tool *tool __maybe_unused) + const struct perf_tool *tool __maybe_unused) { struct intel_bts *bts = container_of(session->auxtrace, struct intel_bts, auxtrace); diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index 1fc03bfd14c3..fd2597613f3d 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -3449,7 +3449,7 @@ out: static int intel_pt_process_event(struct perf_session *session, union perf_event *event, struct perf_sample *sample, - struct perf_tool *tool) + const struct perf_tool *tool) { struct intel_pt *pt = container_of(session->auxtrace, struct intel_pt, auxtrace); @@ 
-3533,7 +3533,7 @@ static int intel_pt_process_event(struct perf_session *session, return err; } -static int intel_pt_flush(struct perf_session *session, struct perf_tool *tool) +static int intel_pt_flush(struct perf_session *session, const struct perf_tool *tool) { struct intel_pt *pt = container_of(session->auxtrace, struct intel_pt, auxtrace); @@ -3600,7 +3600,7 @@ static bool intel_pt_evsel_is_auxtrace(struct perf_session *session, static int intel_pt_process_auxtrace_event(struct perf_session *session, union perf_event *event, - struct perf_tool *tool __maybe_unused) + const struct perf_tool *tool __maybe_unused) { struct intel_pt *pt = container_of(session->auxtrace, struct intel_pt, auxtrace); diff --git a/tools/perf/util/jitdump.c b/tools/perf/util/jitdump.c index 1f657ef8975f..5ce13653512b 100644 --- a/tools/perf/util/jitdump.c +++ b/tools/perf/util/jitdump.c @@ -424,7 +424,7 @@ static int jit_repipe_code_load(struct jit_buf_desc *jd, union jr_entry *jr) { struct perf_sample sample; union perf_event *event; - struct perf_tool *tool = jd->session->tool; + const struct perf_tool *tool = jd->session->tool; uint64_t code, addr; uintptr_t uaddr; char *filename; @@ -543,7 +543,7 @@ static int jit_repipe_code_move(struct jit_buf_desc *jd, union jr_entry *jr) { struct perf_sample sample; union perf_event *event; - struct perf_tool *tool = jd->session->tool; + const struct perf_tool *tool = jd->session->tool; char *filename; size_t size; struct stat st; diff --git a/tools/perf/util/s390-cpumsf.c b/tools/perf/util/s390-cpumsf.c index 4ec583e511af..73846b73d0cf 100644 --- a/tools/perf/util/s390-cpumsf.c +++ b/tools/perf/util/s390-cpumsf.c @@ -912,7 +912,7 @@ static int s390_cpumsf_process_event(struct perf_session *session, union perf_event *event, struct perf_sample *sample, - struct perf_tool *tool) + const struct perf_tool *tool) { struct s390_cpumsf *sf = container_of(session->auxtrace, struct s390_cpumsf, @@ -955,7 +955,7 @@ s390_cpumsf_process_event(struct perf_session *session, static int s390_cpumsf_process_auxtrace_event(struct perf_session *session, union perf_event *event __maybe_unused, - struct perf_tool *tool __maybe_unused) + const struct perf_tool *tool __maybe_unused) { struct s390_cpumsf *sf = container_of(session->auxtrace, struct s390_cpumsf, @@ -998,7 +998,7 @@ static void s390_cpumsf_free_events(struct perf_session *session __maybe_unused) } static int s390_cpumsf_flush(struct perf_session *session __maybe_unused, - struct perf_tool *tool __maybe_unused) + const struct perf_tool *tool __maybe_unused) { return 0; } diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index aa02c1d7b1f9..41beebd0913c 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -103,7 +103,7 @@ static int perf_session__process_compressed_event(struct perf_session *session, static int perf_session__deliver_event(struct perf_session *session, union perf_event *event, - struct perf_tool *tool, + const struct perf_tool *tool, u64 file_offset, const char *file_path); @@ -330,7 +330,7 @@ static int process_event_synth_tracing_data_stub(struct perf_session *session return 0; } -static int process_event_synth_attr_stub(struct perf_tool *tool __maybe_unused, +static int process_event_synth_attr_stub(const struct perf_tool *tool __maybe_unused, union perf_event *event __maybe_unused, struct evlist **pevlist __maybe_unused) @@ -339,7 +339,7 @@ static int process_event_synth_attr_stub(struct perf_tool *tool __maybe_unused, return 0; } -static int 
process_event_synth_event_update_stub(struct perf_tool *tool __maybe_unused, +static int process_event_synth_event_update_stub(const struct perf_tool *tool __maybe_unused, union perf_event *event __maybe_unused, struct evlist **pevlist __maybe_unused) @@ -351,7 +351,7 @@ static int process_event_synth_event_update_stub(struct perf_tool *tool __maybe_ return 0; } -static int process_event_sample_stub(struct perf_tool *tool __maybe_unused, +static int process_event_sample_stub(const struct perf_tool *tool __maybe_unused, union perf_event *event __maybe_unused, struct perf_sample *sample __maybe_unused, struct evsel *evsel __maybe_unused, @@ -361,7 +361,7 @@ static int process_event_sample_stub(struct perf_tool *tool __maybe_unused, return 0; } -static int process_event_stub(struct perf_tool *tool __maybe_unused, +static int process_event_stub(const struct perf_tool *tool __maybe_unused, union perf_event *event __maybe_unused, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) @@ -370,7 +370,7 @@ static int process_event_stub(struct perf_tool *tool __maybe_unused, return 0; } -static int process_finished_round_stub(struct perf_tool *tool __maybe_unused, +static int process_finished_round_stub(const struct perf_tool *tool __maybe_unused, union perf_event *event __maybe_unused, struct ordered_events *oe __maybe_unused) { @@ -1078,7 +1078,7 @@ static perf_event__swap_op perf_event__swap_ops[] = { * Flush every events below timestamp 7 * etc... */ -int perf_event__process_finished_round(struct perf_tool *tool __maybe_unused, +int perf_event__process_finished_round(const struct perf_tool *tool __maybe_unused, union perf_event *event __maybe_unused, struct ordered_events *oe) { @@ -1472,7 +1472,7 @@ static struct machine *machines__find_for_cpumode(struct machines *machines, } static int deliver_sample_value(struct evlist *evlist, - struct perf_tool *tool, + const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct sample_read_value *v, @@ -1504,7 +1504,7 @@ static int deliver_sample_value(struct evlist *evlist, } static int deliver_sample_group(struct evlist *evlist, - struct perf_tool *tool, + const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine, @@ -1526,7 +1526,7 @@ static int deliver_sample_group(struct evlist *evlist, return ret; } -static int evlist__deliver_sample(struct evlist *evlist, struct perf_tool *tool, +static int evlist__deliver_sample(struct evlist *evlist, const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, struct machine *machine) { @@ -1551,7 +1551,7 @@ static int machines__deliver_event(struct machines *machines, struct evlist *evlist, union perf_event *event, struct perf_sample *sample, - struct perf_tool *tool, u64 file_offset, + const struct perf_tool *tool, u64 file_offset, const char *file_path) { struct evsel *evsel; @@ -1639,7 +1639,7 @@ static int machines__deliver_event(struct machines *machines, static int perf_session__deliver_event(struct perf_session *session, union perf_event *event, - struct perf_tool *tool, + const struct perf_tool *tool, u64 file_offset, const char *file_path) { @@ -1672,7 +1672,7 @@ static s64 perf_session__process_user_event(struct perf_session *session, const char *file_path) { struct ordered_events *oe = &session->ordered_events; - struct perf_tool *tool = session->tool; + const struct perf_tool *tool = session->tool; struct perf_sample sample = { .time = 0, }; int fd = 
perf_data__fd(session->data); int err; @@ -1759,7 +1759,7 @@ int perf_session__deliver_synth_event(struct perf_session *session, struct perf_sample *sample) { struct evlist *evlist = session->evlist; - struct perf_tool *tool = session->tool; + const struct perf_tool *tool = session->tool; events_stats__inc(&evlist->stats, event->header.type); @@ -1891,7 +1891,7 @@ static s64 perf_session__process_event(struct perf_session *session, const char *file_path) { struct evlist *evlist = session->evlist; - struct perf_tool *tool = session->tool; + const struct perf_tool *tool = session->tool; int ret; if (session->header.needs_swap) diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index cc28976bb00e..6886cc85600f 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -163,7 +163,7 @@ int perf_session__dsos_hit_all(struct perf_session *session); int perf_event__process_id_index(struct perf_session *session, union perf_event *event); -int perf_event__process_finished_round(struct perf_tool *tool, +int perf_event__process_finished_round(const struct perf_tool *tool, union perf_event *event, struct ordered_events *oe); diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index 5498048f56ea..7f884d70de81 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0-only +// SPDX-License-Identifier: GPL-2.0-only #include "util/cgroup.h" #include "util/data.h" @@ -47,7 +47,7 @@ unsigned int proc_map_timeout = DEFAULT_PROC_MAP_PARSE_TIMEOUT; -int perf_tool__process_synth_event(struct perf_tool *tool, +int perf_tool__process_synth_event(const struct perf_tool *tool, union perf_event *event, struct machine *machine, perf_event__handler_t process) @@ -187,7 +187,7 @@ static int perf_event__prepare_comm(union perf_event *event, pid_t pid, pid_t ti return 0; } -pid_t perf_event__synthesize_comm(struct perf_tool *tool, +pid_t perf_event__synthesize_comm(const struct perf_tool *tool, union perf_event *event, pid_t pid, perf_event__handler_t process, struct machine *machine) @@ -218,7 +218,7 @@ static void perf_event__get_ns_link_info(pid_t pid, const char *ns, } } -int perf_event__synthesize_namespaces(struct perf_tool *tool, +int perf_event__synthesize_namespaces(const struct perf_tool *tool, union perf_event *event, pid_t pid, pid_t tgid, perf_event__handler_t process, @@ -257,7 +257,7 @@ int perf_event__synthesize_namespaces(struct perf_tool *tool, return 0; } -static int perf_event__synthesize_fork(struct perf_tool *tool, +static int perf_event__synthesize_fork(const struct perf_tool *tool, union perf_event *event, pid_t pid, pid_t tgid, pid_t ppid, perf_event__handler_t process, @@ -418,7 +418,7 @@ out: dso__put(dso); } -int perf_event__synthesize_mmap_events(struct perf_tool *tool, +int perf_event__synthesize_mmap_events(const struct perf_tool *tool, union perf_event *event, pid_t pid, pid_t tgid, perf_event__handler_t process, @@ -542,7 +542,7 @@ out: } #ifdef HAVE_FILE_HANDLE -static int perf_event__synthesize_cgroup(struct perf_tool *tool, +static int perf_event__synthesize_cgroup(const struct perf_tool *tool, union perf_event *event, char *path, size_t mount_len, perf_event__handler_t process, @@ -582,7 +582,7 @@ static int perf_event__synthesize_cgroup(struct perf_tool *tool, return 0; } -static int perf_event__walk_cgroup_tree(struct perf_tool *tool, +static int perf_event__walk_cgroup_tree(const struct perf_tool *tool, union perf_event *event, char 
*path, size_t mount_len, perf_event__handler_t process, @@ -630,7 +630,7 @@ static int perf_event__walk_cgroup_tree(struct perf_tool *tool, return ret; } -int perf_event__synthesize_cgroups(struct perf_tool *tool, +int perf_event__synthesize_cgroups(const struct perf_tool *tool, perf_event__handler_t process, struct machine *machine) { @@ -657,7 +657,7 @@ int perf_event__synthesize_cgroups(struct perf_tool *tool, return 0; } #else -int perf_event__synthesize_cgroups(struct perf_tool *tool __maybe_unused, +int perf_event__synthesize_cgroups(const struct perf_tool *tool __maybe_unused, perf_event__handler_t process __maybe_unused, struct machine *machine __maybe_unused) { @@ -666,7 +666,7 @@ int perf_event__synthesize_cgroups(struct perf_tool *tool __maybe_unused, #endif struct perf_event__synthesize_modules_maps_cb_args { - struct perf_tool *tool; + const struct perf_tool *tool; perf_event__handler_t process; struct machine *machine; union perf_event *event; @@ -717,7 +717,7 @@ static int perf_event__synthesize_modules_maps_cb(struct map *map, void *data) return 0; } -int perf_event__synthesize_modules(struct perf_tool *tool, perf_event__handler_t process, +int perf_event__synthesize_modules(const struct perf_tool *tool, perf_event__handler_t process, struct machine *machine) { int rc; @@ -763,7 +763,7 @@ static int __event__synthesize_thread(union perf_event *comm_event, union perf_event *fork_event, union perf_event *namespaces_event, pid_t pid, int full, perf_event__handler_t process, - struct perf_tool *tool, struct machine *machine, + const struct perf_tool *tool, struct machine *machine, bool needs_mmap, bool mmap_data) { char filename[PATH_MAX]; @@ -852,7 +852,7 @@ static int __event__synthesize_thread(union perf_event *comm_event, return rc; } -int perf_event__synthesize_thread_map(struct perf_tool *tool, +int perf_event__synthesize_thread_map(const struct perf_tool *tool, struct perf_thread_map *threads, perf_event__handler_t process, struct machine *machine, @@ -929,7 +929,7 @@ out: return err; } -static int __perf_event__synthesize_threads(struct perf_tool *tool, +static int __perf_event__synthesize_threads(const struct perf_tool *tool, perf_event__handler_t process, struct machine *machine, bool needs_mmap, @@ -993,7 +993,7 @@ out: } struct synthesize_threads_arg { - struct perf_tool *tool; + const struct perf_tool *tool; perf_event__handler_t process; struct machine *machine; bool needs_mmap; @@ -1015,7 +1015,7 @@ static void *synthesize_threads_worker(void *arg) return NULL; } -int perf_event__synthesize_threads(struct perf_tool *tool, +int perf_event__synthesize_threads(const struct perf_tool *tool, perf_event__handler_t process, struct machine *machine, bool needs_mmap, bool mmap_data, @@ -1104,14 +1104,14 @@ free_dirent: return err; } -int __weak perf_event__synthesize_extra_kmaps(struct perf_tool *tool __maybe_unused, +int __weak perf_event__synthesize_extra_kmaps(const struct perf_tool *tool __maybe_unused, perf_event__handler_t process __maybe_unused, struct machine *machine __maybe_unused) { return 0; } -static int __perf_event__synthesize_kernel_mmap(struct perf_tool *tool, +static int __perf_event__synthesize_kernel_mmap(const struct perf_tool *tool, perf_event__handler_t process, struct machine *machine) { @@ -1183,7 +1183,7 @@ static int __perf_event__synthesize_kernel_mmap(struct perf_tool *tool, return err; } -int perf_event__synthesize_kernel_mmap(struct perf_tool *tool, +int perf_event__synthesize_kernel_mmap(const struct perf_tool *tool, perf_event__handler_t 
process, struct machine *machine) { @@ -1196,7 +1196,7 @@ int perf_event__synthesize_kernel_mmap(struct perf_tool *tool, return perf_event__synthesize_extra_kmaps(tool, process, machine); } -int perf_event__synthesize_thread_map2(struct perf_tool *tool, +int perf_event__synthesize_thread_map2(const struct perf_tool *tool, struct perf_thread_map *threads, perf_event__handler_t process, struct machine *machine) @@ -1346,7 +1346,7 @@ static struct perf_record_cpu_map *cpu_map_event__new(const struct perf_cpu_map } -int perf_event__synthesize_cpu_map(struct perf_tool *tool, +int perf_event__synthesize_cpu_map(const struct perf_tool *tool, const struct perf_cpu_map *map, perf_event__handler_t process, struct machine *machine) @@ -1364,7 +1364,7 @@ int perf_event__synthesize_cpu_map(struct perf_tool *tool, return err; } -int perf_event__synthesize_stat_config(struct perf_tool *tool, +int perf_event__synthesize_stat_config(const struct perf_tool *tool, struct perf_stat_config *config, perf_event__handler_t process, struct machine *machine) @@ -1403,7 +1403,7 @@ int perf_event__synthesize_stat_config(struct perf_tool *tool, return err; } -int perf_event__synthesize_stat(struct perf_tool *tool, +int perf_event__synthesize_stat(const struct perf_tool *tool, struct perf_cpu cpu, u32 thread, u64 id, struct perf_counts_values *count, perf_event__handler_t process, @@ -1425,7 +1425,7 @@ int perf_event__synthesize_stat(struct perf_tool *tool, return process(tool, (union perf_event *) &event, NULL, machine); } -int perf_event__synthesize_stat_round(struct perf_tool *tool, +int perf_event__synthesize_stat_round(const struct perf_tool *tool, u64 evtime, u64 type, perf_event__handler_t process, struct machine *machine) @@ -1826,7 +1826,7 @@ int perf_event__synthesize_id_sample(__u64 *array, u64 type, const struct perf_s return (void *)array - (void *)start; } -int __perf_event__synthesize_id_index(struct perf_tool *tool, perf_event__handler_t process, +int __perf_event__synthesize_id_index(const struct perf_tool *tool, perf_event__handler_t process, struct evlist *evlist, struct machine *machine, size_t from) { union perf_event *ev; @@ -1918,13 +1918,13 @@ out_err: return err; } -int perf_event__synthesize_id_index(struct perf_tool *tool, perf_event__handler_t process, +int perf_event__synthesize_id_index(const struct perf_tool *tool, perf_event__handler_t process, struct evlist *evlist, struct machine *machine) { return __perf_event__synthesize_id_index(tool, process, evlist, machine, 0); } -int __machine__synthesize_threads(struct machine *machine, struct perf_tool *tool, +int __machine__synthesize_threads(struct machine *machine, const struct perf_tool *tool, struct target *target, struct perf_thread_map *threads, perf_event__handler_t process, bool needs_mmap, bool data_mmap, unsigned int nr_threads_synthesize) @@ -1985,7 +1985,7 @@ static struct perf_record_event_update *event_update_event__new(size_t size, u64 return ev; } -int perf_event__synthesize_event_update_unit(struct perf_tool *tool, struct evsel *evsel, +int perf_event__synthesize_event_update_unit(const struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t process) { size_t size = strlen(evsel->unit); @@ -2002,7 +2002,7 @@ int perf_event__synthesize_event_update_unit(struct perf_tool *tool, struct evse return err; } -int perf_event__synthesize_event_update_scale(struct perf_tool *tool, struct evsel *evsel, +int perf_event__synthesize_event_update_scale(const struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t 
process) { struct perf_record_event_update *ev; @@ -2019,7 +2019,7 @@ int perf_event__synthesize_event_update_scale(struct perf_tool *tool, struct evs return err; } -int perf_event__synthesize_event_update_name(struct perf_tool *tool, struct evsel *evsel, +int perf_event__synthesize_event_update_name(const struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t process) { struct perf_record_event_update *ev; @@ -2036,7 +2036,7 @@ int perf_event__synthesize_event_update_name(struct perf_tool *tool, struct evse return err; } -int perf_event__synthesize_event_update_cpus(struct perf_tool *tool, struct evsel *evsel, +int perf_event__synthesize_event_update_cpus(const struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t process) { struct synthesize_cpu_map_data syn_data = { .map = evsel->core.own_cpus }; @@ -2059,7 +2059,7 @@ int perf_event__synthesize_event_update_cpus(struct perf_tool *tool, struct evse return err; } -int perf_event__synthesize_attrs(struct perf_tool *tool, struct evlist *evlist, +int perf_event__synthesize_attrs(const struct perf_tool *tool, struct evlist *evlist, perf_event__handler_t process) { struct evsel *evsel; @@ -2087,7 +2087,7 @@ static bool has_scale(struct evsel *evsel) return evsel->scale != 1; } -int perf_event__synthesize_extra_attr(struct perf_tool *tool, struct evlist *evsel_list, +int perf_event__synthesize_extra_attr(const struct perf_tool *tool, struct evlist *evsel_list, perf_event__handler_t process, bool is_pipe) { struct evsel *evsel; @@ -2143,7 +2143,7 @@ int perf_event__synthesize_extra_attr(struct perf_tool *tool, struct evlist *evs return 0; } -int perf_event__synthesize_attr(struct perf_tool *tool, struct perf_event_attr *attr, +int perf_event__synthesize_attr(const struct perf_tool *tool, struct perf_event_attr *attr, u32 ids, u64 *id, perf_event__handler_t process) { union perf_event *ev; @@ -2177,7 +2177,7 @@ int perf_event__synthesize_attr(struct perf_tool *tool, struct perf_event_attr * } #ifdef HAVE_LIBTRACEEVENT -int perf_event__synthesize_tracing_data(struct perf_tool *tool, int fd, struct evlist *evlist, +int perf_event__synthesize_tracing_data(const struct perf_tool *tool, int fd, struct evlist *evlist, perf_event__handler_t process) { union perf_event ev; @@ -2225,7 +2225,7 @@ int perf_event__synthesize_tracing_data(struct perf_tool *tool, int fd, struct e } #endif -int perf_event__synthesize_build_id(struct perf_tool *tool, struct dso *pos, u16 misc, +int perf_event__synthesize_build_id(const struct perf_tool *tool, struct dso *pos, u16 misc, perf_event__handler_t process, struct machine *machine) { union perf_event ev; @@ -2249,7 +2249,7 @@ int perf_event__synthesize_build_id(struct perf_tool *tool, struct dso *pos, u16 return process(tool, &ev, NULL, machine); } -int perf_event__synthesize_stat_events(struct perf_stat_config *config, struct perf_tool *tool, +int perf_event__synthesize_stat_events(struct perf_stat_config *config, const struct perf_tool *tool, struct evlist *evlist, perf_event__handler_t process, bool attrs) { int err; @@ -2286,7 +2286,7 @@ int perf_event__synthesize_stat_events(struct perf_stat_config *config, struct p extern const struct perf_header_feature_ops feat_ops[HEADER_LAST_FEATURE]; -int perf_event__synthesize_features(struct perf_tool *tool, struct perf_session *session, +int perf_event__synthesize_features(const struct perf_tool *tool, struct perf_session *session, struct evlist *evlist, perf_event__handler_t process) { struct perf_header *header = &session->header; @@ -2349,7 
+2349,7 @@ int perf_event__synthesize_features(struct perf_tool *tool, struct perf_session return ret; } -int perf_event__synthesize_for_pipe(struct perf_tool *tool, +int perf_event__synthesize_for_pipe(const struct perf_tool *tool, struct perf_session *session, struct perf_data *data, perf_event__handler_t process) diff --git a/tools/perf/util/synthetic-events.h b/tools/perf/util/synthetic-events.h index 53737d1619a4..31df7653677f 100644 --- a/tools/perf/util/synthetic-events.h +++ b/tools/perf/util/synthetic-events.h @@ -40,45 +40,45 @@ enum perf_record_synth { int parse_synth_opt(char *str); -typedef int (*perf_event__handler_t)(struct perf_tool *tool, union perf_event *event, +typedef int (*perf_event__handler_t)(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); -int perf_event__synthesize_attrs(struct perf_tool *tool, struct evlist *evlist, perf_event__handler_t process); -int perf_event__synthesize_attr(struct perf_tool *tool, struct perf_event_attr *attr, u32 ids, u64 *id, perf_event__handler_t process); -int perf_event__synthesize_build_id(struct perf_tool *tool, struct dso *pos, u16 misc, perf_event__handler_t process, struct machine *machine); -int perf_event__synthesize_cpu_map(struct perf_tool *tool, const struct perf_cpu_map *cpus, perf_event__handler_t process, struct machine *machine); -int perf_event__synthesize_event_update_cpus(struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t process); -int perf_event__synthesize_event_update_name(struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t process); -int perf_event__synthesize_event_update_scale(struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t process); -int perf_event__synthesize_event_update_unit(struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t process); -int perf_event__synthesize_extra_attr(struct perf_tool *tool, struct evlist *evsel_list, perf_event__handler_t process, bool is_pipe); -int perf_event__synthesize_extra_kmaps(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine); -int perf_event__synthesize_features(struct perf_tool *tool, struct perf_session *session, struct evlist *evlist, perf_event__handler_t process); -int perf_event__synthesize_id_index(struct perf_tool *tool, perf_event__handler_t process, struct evlist *evlist, struct machine *machine); -int __perf_event__synthesize_id_index(struct perf_tool *tool, perf_event__handler_t process, struct evlist *evlist, struct machine *machine, size_t from); +int perf_event__synthesize_attrs(const struct perf_tool *tool, struct evlist *evlist, perf_event__handler_t process); +int perf_event__synthesize_attr(const struct perf_tool *tool, struct perf_event_attr *attr, u32 ids, u64 *id, perf_event__handler_t process); +int perf_event__synthesize_build_id(const struct perf_tool *tool, struct dso *pos, u16 misc, perf_event__handler_t process, struct machine *machine); +int perf_event__synthesize_cpu_map(const struct perf_tool *tool, const struct perf_cpu_map *cpus, perf_event__handler_t process, struct machine *machine); +int perf_event__synthesize_event_update_cpus(const struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t process); +int perf_event__synthesize_event_update_name(const struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t process); +int perf_event__synthesize_event_update_scale(const struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t process); +int 
perf_event__synthesize_event_update_unit(const struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t process); +int perf_event__synthesize_extra_attr(const struct perf_tool *tool, struct evlist *evsel_list, perf_event__handler_t process, bool is_pipe); +int perf_event__synthesize_extra_kmaps(const struct perf_tool *tool, perf_event__handler_t process, struct machine *machine); +int perf_event__synthesize_features(const struct perf_tool *tool, struct perf_session *session, struct evlist *evlist, perf_event__handler_t process); +int perf_event__synthesize_id_index(const struct perf_tool *tool, perf_event__handler_t process, struct evlist *evlist, struct machine *machine); +int __perf_event__synthesize_id_index(const struct perf_tool *tool, perf_event__handler_t process, struct evlist *evlist, struct machine *machine, size_t from); int perf_event__synthesize_id_sample(__u64 *array, u64 type, const struct perf_sample *sample); -int perf_event__synthesize_kernel_mmap(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine); -int perf_event__synthesize_mmap_events(struct perf_tool *tool, union perf_event *event, pid_t pid, pid_t tgid, perf_event__handler_t process, struct machine *machine, bool mmap_data); -int perf_event__synthesize_modules(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine); -int perf_event__synthesize_namespaces(struct perf_tool *tool, union perf_event *event, pid_t pid, pid_t tgid, perf_event__handler_t process, struct machine *machine); -int perf_event__synthesize_cgroups(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine); +int perf_event__synthesize_kernel_mmap(const struct perf_tool *tool, perf_event__handler_t process, struct machine *machine); +int perf_event__synthesize_mmap_events(const struct perf_tool *tool, union perf_event *event, pid_t pid, pid_t tgid, perf_event__handler_t process, struct machine *machine, bool mmap_data); +int perf_event__synthesize_modules(const struct perf_tool *tool, perf_event__handler_t process, struct machine *machine); +int perf_event__synthesize_namespaces(const struct perf_tool *tool, union perf_event *event, pid_t pid, pid_t tgid, perf_event__handler_t process, struct machine *machine); +int perf_event__synthesize_cgroups(const struct perf_tool *tool, perf_event__handler_t process, struct machine *machine); int perf_event__synthesize_sample(union perf_event *event, u64 type, u64 read_format, const struct perf_sample *sample); -int perf_event__synthesize_stat_config(struct perf_tool *tool, struct perf_stat_config *config, perf_event__handler_t process, struct machine *machine); -int perf_event__synthesize_stat_events(struct perf_stat_config *config, struct perf_tool *tool, struct evlist *evlist, perf_event__handler_t process, bool attrs); -int perf_event__synthesize_stat_round(struct perf_tool *tool, u64 time, u64 type, perf_event__handler_t process, struct machine *machine); -int perf_event__synthesize_stat(struct perf_tool *tool, struct perf_cpu cpu, u32 thread, u64 id, struct perf_counts_values *count, perf_event__handler_t process, struct machine *machine); -int perf_event__synthesize_thread_map2(struct perf_tool *tool, struct perf_thread_map *threads, perf_event__handler_t process, struct machine *machine); -int perf_event__synthesize_thread_map(struct perf_tool *tool, struct perf_thread_map *threads, perf_event__handler_t process, struct machine *machine, bool needs_mmap, bool mmap_data); -int perf_event__synthesize_threads(struct 
perf_tool *tool, perf_event__handler_t process, struct machine *machine, bool needs_mmap, bool mmap_data, unsigned int nr_threads_synthesize); -int perf_event__synthesize_tracing_data(struct perf_tool *tool, int fd, struct evlist *evlist, perf_event__handler_t process); -int perf_event__synth_time_conv(const struct perf_event_mmap_page *pc, struct perf_tool *tool, perf_event__handler_t process, struct machine *machine); -pid_t perf_event__synthesize_comm(struct perf_tool *tool, union perf_event *event, pid_t pid, perf_event__handler_t process, struct machine *machine); - -int perf_tool__process_synth_event(struct perf_tool *tool, union perf_event *event, struct machine *machine, perf_event__handler_t process); +int perf_event__synthesize_stat_config(const struct perf_tool *tool, struct perf_stat_config *config, perf_event__handler_t process, struct machine *machine); +int perf_event__synthesize_stat_events(struct perf_stat_config *config, const struct perf_tool *tool, struct evlist *evlist, perf_event__handler_t process, bool attrs); +int perf_event__synthesize_stat_round(const struct perf_tool *tool, u64 time, u64 type, perf_event__handler_t process, struct machine *machine); +int perf_event__synthesize_stat(const struct perf_tool *tool, struct perf_cpu cpu, u32 thread, u64 id, struct perf_counts_values *count, perf_event__handler_t process, struct machine *machine); +int perf_event__synthesize_thread_map2(const struct perf_tool *tool, struct perf_thread_map *threads, perf_event__handler_t process, struct machine *machine); +int perf_event__synthesize_thread_map(const struct perf_tool *tool, struct perf_thread_map *threads, perf_event__handler_t process, struct machine *machine, bool needs_mmap, bool mmap_data); +int perf_event__synthesize_threads(const struct perf_tool *tool, perf_event__handler_t process, struct machine *machine, bool needs_mmap, bool mmap_data, unsigned int nr_threads_synthesize); +int perf_event__synthesize_tracing_data(const struct perf_tool *tool, int fd, struct evlist *evlist, perf_event__handler_t process); +int perf_event__synth_time_conv(const struct perf_event_mmap_page *pc, const struct perf_tool *tool, perf_event__handler_t process, struct machine *machine); +pid_t perf_event__synthesize_comm(const struct perf_tool *tool, union perf_event *event, pid_t pid, perf_event__handler_t process, struct machine *machine); + +int perf_tool__process_synth_event(const struct perf_tool *tool, union perf_event *event, struct machine *machine, perf_event__handler_t process); size_t perf_event__sample_event_size(const struct perf_sample *sample, u64 type, u64 read_format); -int __machine__synthesize_threads(struct machine *machine, struct perf_tool *tool, +int __machine__synthesize_threads(struct machine *machine, const struct perf_tool *tool, struct target *target, struct perf_thread_map *threads, perf_event__handler_t process, bool needs_mmap, bool data_mmap, unsigned int nr_threads_synthesize); @@ -87,7 +87,7 @@ int machine__synthesize_threads(struct machine *machine, struct target *target, unsigned int nr_threads_synthesize); #ifdef HAVE_AUXTRACE_SUPPORT -int perf_event__synthesize_auxtrace_info(struct auxtrace_record *itr, struct perf_tool *tool, +int perf_event__synthesize_auxtrace_info(struct auxtrace_record *itr, const struct perf_tool *tool, struct perf_session *session, perf_event__handler_t process); #else // HAVE_AUXTRACE_SUPPORT @@ -96,7 +96,7 @@ int perf_event__synthesize_auxtrace_info(struct auxtrace_record *itr, struct per static inline int 
perf_event__synthesize_auxtrace_info(struct auxtrace_record *itr __maybe_unused, - struct perf_tool *tool __maybe_unused, + const struct perf_tool *tool __maybe_unused, struct perf_session *session __maybe_unused, perf_event__handler_t process __maybe_unused) { @@ -117,7 +117,7 @@ static inline int perf_event__synthesize_bpf_events(struct perf_session *session } #endif // HAVE_LIBBPF_SUPPORT -int perf_event__synthesize_for_pipe(struct perf_tool *tool, +int perf_event__synthesize_for_pipe(const struct perf_tool *tool, struct perf_session *session, struct perf_data *data, perf_event__handler_t process); diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h index 62bbc9cec151..2b12c674768f 100644 --- a/tools/perf/util/tool.h +++ b/tools/perf/util/tool.h @@ -15,14 +15,14 @@ struct perf_tool; struct machine; struct ordered_events; -typedef int (*event_sample)(struct perf_tool *tool, union perf_event *event, +typedef int (*event_sample)(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, struct machine *machine); -typedef int (*event_op)(struct perf_tool *tool, union perf_event *event, +typedef int (*event_op)(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); -typedef int (*event_attr_op)(struct perf_tool *tool, +typedef int (*event_attr_op)(const struct perf_tool *tool, union perf_event *event, struct evlist **pevlist); @@ -31,7 +31,7 @@ typedef s64 (*event_op3)(struct perf_session *session, union perf_event *event); typedef int (*event_op4)(struct perf_session *session, union perf_event *event, u64 data, const char *str); -typedef int (*event_oe)(struct perf_tool *tool, union perf_event *event, +typedef int (*event_oe)(const struct perf_tool *tool, union perf_event *event, struct ordered_events *oe); enum show_feature_header { diff --git a/tools/perf/util/tsc.c b/tools/perf/util/tsc.c index f19791d46e99..2e33a20e1e1b 100644 --- a/tools/perf/util/tsc.c +++ b/tools/perf/util/tsc.c @@ -72,7 +72,7 @@ int perf_read_tsc_conversion(const struct perf_event_mmap_page *pc, } int perf_event__synth_time_conv(const struct perf_event_mmap_page *pc, - struct perf_tool *tool, + const struct perf_tool *tool, perf_event__handler_t process, struct machine *machine) { -- cgit v1.2.3 From 564e5cbcfdf5c55491461edc61f6b7e2a4418063 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:46:56 -0700 Subject: perf tool: Move fill defaults into tool.c The aim here is to eventually make perf_tool__fill_defaults() an init function so that the tool struct can be more const. Create a tool.c to go along with tool.h. Move perf_tool__fill_defaults() out of session.c into tool.c along with the default stub values. Add perf_tool__compressed_is_stub() for a test in perf_session__process_user_event(). perf_session__process_compressed_event() is only used as a default initialization value, so migrate it into tool.c as well.
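
To make the pattern concrete, here is a minimal sketch of a consumer; it is not part of this patch. my_sample() and my_tool_init() are hypothetical names, but the callback signature follows the event_sample typedef and the perf_tool fields shown in the diffs above. A tool fills in only the handlers it cares about and lets perf_tool__fill_defaults() point every remaining NULL handler at a stub, so session code can invoke any handler unconditionally:

	/* hypothetical consumer; assumes the usual perf util headers
	 * (util/tool.h, util/event.h) are included */
	static int my_sample(const struct perf_tool *tool __maybe_unused,
			     union perf_event *event __maybe_unused,
			     struct perf_sample *sample __maybe_unused,
			     struct evsel *evsel __maybe_unused,
			     struct machine *machine __maybe_unused)
	{
		/* consume one sample */
		return 0;
	}

	static void my_tool_init(struct perf_tool *tool)
	{
		/* set only the handlers this consumer needs */
		*tool = (struct perf_tool){
			.sample		= my_sample,
			.ordered_events	= true,
		};
		/* every handler left NULL above now becomes a stub */
		perf_tool__fill_defaults(tool);
	}

Once callers initialize through a single function like this, the stub assignments can live in one init routine and the tool pointer can stay const across the whole processing path, which is what the constification in the previous patch is building toward.
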
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/Build | 1 + tools/perf/util/session.c | 310 +------------------------------------------ tools/perf/util/session.h | 2 - tools/perf/util/tool.c | 325 ++++++++++++++++++++++++++++++++++++++++++++++ tools/perf/util/tool.h | 4 + 5 files changed, 331 insertions(+), 311 deletions(-) create mode 100644 tools/perf/util/tool.c (limited to 'tools') diff --git a/tools/perf/util/Build b/tools/perf/util/Build index b24360c04aae..f6f18b4228bb 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -66,6 +66,7 @@ perf-util-y += map.o perf-util-y += maps.o perf-util-y += pstack.o perf-util-y += session.o +perf-util-y += tool.o perf-util-y += sample-raw.o perf-util-y += s390-sample-raw.o perf-util-y += amd-sample-raw.o diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 41beebd0913c..5c0e0341d39e 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -39,68 +39,6 @@ #include "annotate.h" #include -#ifdef HAVE_ZSTD_SUPPORT -static int perf_session__process_compressed_event(struct perf_session *session, - union perf_event *event, u64 file_offset, - const char *file_path) -{ - void *src; - size_t decomp_size, src_size; - u64 decomp_last_rem = 0; - size_t mmap_len, decomp_len = session->header.env.comp_mmap_len; - struct decomp *decomp, *decomp_last = session->active_decomp->decomp_last; - - if (decomp_last) { - decomp_last_rem = decomp_last->size - decomp_last->head; - decomp_len += decomp_last_rem; - } - - mmap_len = sizeof(struct decomp) + decomp_len; - decomp = mmap(NULL, mmap_len, PROT_READ|PROT_WRITE, - MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); - if (decomp == MAP_FAILED) { - pr_err("Couldn't allocate memory for decompression\n"); - return -1; - } - - decomp->file_pos = file_offset; - decomp->file_path = file_path; - decomp->mmap_len = mmap_len; - decomp->head = 0; - - if (decomp_last_rem) { - memcpy(decomp->data, &(decomp_last->data[decomp_last->head]), decomp_last_rem); - decomp->size = decomp_last_rem; - } - - src = (void *)event + sizeof(struct perf_record_compressed); - src_size = event->pack.header.size - sizeof(struct perf_record_compressed); - - decomp_size = zstd_decompress_stream(session->active_decomp->zstd_decomp, src, src_size, - &(decomp->data[decomp_last_rem]), decomp_len - decomp_last_rem); - if (!decomp_size) { - munmap(decomp, mmap_len); - pr_err("Couldn't decompress data\n"); - return -1; - } - - decomp->size += decomp_size; - - if (session->active_decomp->decomp == NULL) - session->active_decomp->decomp = decomp; - else - session->active_decomp->decomp_last->next = decomp; - - session->active_decomp->decomp_last = decomp; - - pr_debug("decomp (B): %zd to %zd\n", src_size, decomp_size); - - return 0; -} -#else /* !HAVE_ZSTD_SUPPORT */ -#define perf_session__process_compressed_event perf_session__process_compressed_event_stub -#endif - static int perf_session__deliver_event(struct perf_session *session, union 
perf_event *event, const struct perf_tool *tool, @@ -321,251 +259,6 @@ void perf_session__delete(struct perf_session *session) free(session); } -static int process_event_synth_tracing_data_stub(struct perf_session *session - __maybe_unused, - union perf_event *event - __maybe_unused) -{ - dump_printf(": unhandled!\n"); - return 0; -} - -static int process_event_synth_attr_stub(const struct perf_tool *tool __maybe_unused, - union perf_event *event __maybe_unused, - struct evlist **pevlist - __maybe_unused) -{ - dump_printf(": unhandled!\n"); - return 0; -} - -static int process_event_synth_event_update_stub(const struct perf_tool *tool __maybe_unused, - union perf_event *event __maybe_unused, - struct evlist **pevlist - __maybe_unused) -{ - if (dump_trace) - perf_event__fprintf_event_update(event, stdout); - - dump_printf(": unhandled!\n"); - return 0; -} - -static int process_event_sample_stub(const struct perf_tool *tool __maybe_unused, - union perf_event *event __maybe_unused, - struct perf_sample *sample __maybe_unused, - struct evsel *evsel __maybe_unused, - struct machine *machine __maybe_unused) -{ - dump_printf(": unhandled!\n"); - return 0; -} - -static int process_event_stub(const struct perf_tool *tool __maybe_unused, - union perf_event *event __maybe_unused, - struct perf_sample *sample __maybe_unused, - struct machine *machine __maybe_unused) -{ - dump_printf(": unhandled!\n"); - return 0; -} - -static int process_finished_round_stub(const struct perf_tool *tool __maybe_unused, - union perf_event *event __maybe_unused, - struct ordered_events *oe __maybe_unused) -{ - dump_printf(": unhandled!\n"); - return 0; -} - -static int skipn(int fd, off_t n) -{ - char buf[4096]; - ssize_t ret; - - while (n > 0) { - ret = read(fd, buf, min(n, (off_t)sizeof(buf))); - if (ret <= 0) - return ret; - n -= ret; - } - - return 0; -} - -static s64 process_event_auxtrace_stub(struct perf_session *session __maybe_unused, - union perf_event *event) -{ - dump_printf(": unhandled!\n"); - if (perf_data__is_pipe(session->data)) - skipn(perf_data__fd(session->data), event->auxtrace.size); - return event->auxtrace.size; -} - -static int process_event_op2_stub(struct perf_session *session __maybe_unused, - union perf_event *event __maybe_unused) -{ - dump_printf(": unhandled!\n"); - return 0; -} - - -static -int process_event_thread_map_stub(struct perf_session *session __maybe_unused, - union perf_event *event __maybe_unused) -{ - if (dump_trace) - perf_event__fprintf_thread_map(event, stdout); - - dump_printf(": unhandled!\n"); - return 0; -} - -static -int process_event_cpu_map_stub(struct perf_session *session __maybe_unused, - union perf_event *event __maybe_unused) -{ - if (dump_trace) - perf_event__fprintf_cpu_map(event, stdout); - - dump_printf(": unhandled!\n"); - return 0; -} - -static -int process_event_stat_config_stub(struct perf_session *session __maybe_unused, - union perf_event *event __maybe_unused) -{ - if (dump_trace) - perf_event__fprintf_stat_config(event, stdout); - - dump_printf(": unhandled!\n"); - return 0; -} - -static int process_stat_stub(struct perf_session *perf_session __maybe_unused, - union perf_event *event) -{ - if (dump_trace) - perf_event__fprintf_stat(event, stdout); - - dump_printf(": unhandled!\n"); - return 0; -} - -static int process_stat_round_stub(struct perf_session *perf_session __maybe_unused, - union perf_event *event) -{ - if (dump_trace) - perf_event__fprintf_stat_round(event, stdout); - - dump_printf(": unhandled!\n"); - return 0; -} - -static int 
process_event_time_conv_stub(struct perf_session *perf_session __maybe_unused, - union perf_event *event) -{ - if (dump_trace) - perf_event__fprintf_time_conv(event, stdout); - - dump_printf(": unhandled!\n"); - return 0; -} - -static int perf_session__process_compressed_event_stub(struct perf_session *session __maybe_unused, - union perf_event *event __maybe_unused, - u64 file_offset __maybe_unused, - const char *file_path __maybe_unused) -{ - dump_printf(": unhandled!\n"); - return 0; -} - -void perf_tool__fill_defaults(struct perf_tool *tool) -{ - if (tool->sample == NULL) - tool->sample = process_event_sample_stub; - if (tool->mmap == NULL) - tool->mmap = process_event_stub; - if (tool->mmap2 == NULL) - tool->mmap2 = process_event_stub; - if (tool->comm == NULL) - tool->comm = process_event_stub; - if (tool->namespaces == NULL) - tool->namespaces = process_event_stub; - if (tool->cgroup == NULL) - tool->cgroup = process_event_stub; - if (tool->fork == NULL) - tool->fork = process_event_stub; - if (tool->exit == NULL) - tool->exit = process_event_stub; - if (tool->lost == NULL) - tool->lost = perf_event__process_lost; - if (tool->lost_samples == NULL) - tool->lost_samples = perf_event__process_lost_samples; - if (tool->aux == NULL) - tool->aux = perf_event__process_aux; - if (tool->itrace_start == NULL) - tool->itrace_start = perf_event__process_itrace_start; - if (tool->context_switch == NULL) - tool->context_switch = perf_event__process_switch; - if (tool->ksymbol == NULL) - tool->ksymbol = perf_event__process_ksymbol; - if (tool->bpf == NULL) - tool->bpf = perf_event__process_bpf; - if (tool->text_poke == NULL) - tool->text_poke = perf_event__process_text_poke; - if (tool->aux_output_hw_id == NULL) - tool->aux_output_hw_id = perf_event__process_aux_output_hw_id; - if (tool->read == NULL) - tool->read = process_event_sample_stub; - if (tool->throttle == NULL) - tool->throttle = process_event_stub; - if (tool->unthrottle == NULL) - tool->unthrottle = process_event_stub; - if (tool->attr == NULL) - tool->attr = process_event_synth_attr_stub; - if (tool->event_update == NULL) - tool->event_update = process_event_synth_event_update_stub; - if (tool->tracing_data == NULL) - tool->tracing_data = process_event_synth_tracing_data_stub; - if (tool->build_id == NULL) - tool->build_id = process_event_op2_stub; - if (tool->finished_round == NULL) { - if (tool->ordered_events) - tool->finished_round = perf_event__process_finished_round; - else - tool->finished_round = process_finished_round_stub; - } - if (tool->id_index == NULL) - tool->id_index = process_event_op2_stub; - if (tool->auxtrace_info == NULL) - tool->auxtrace_info = process_event_op2_stub; - if (tool->auxtrace == NULL) - tool->auxtrace = process_event_auxtrace_stub; - if (tool->auxtrace_error == NULL) - tool->auxtrace_error = process_event_op2_stub; - if (tool->thread_map == NULL) - tool->thread_map = process_event_thread_map_stub; - if (tool->cpu_map == NULL) - tool->cpu_map = process_event_cpu_map_stub; - if (tool->stat_config == NULL) - tool->stat_config = process_event_stat_config_stub; - if (tool->stat == NULL) - tool->stat = process_stat_stub; - if (tool->stat_round == NULL) - tool->stat_round = process_stat_round_stub; - if (tool->time_conv == NULL) - tool->time_conv = process_event_time_conv_stub; - if (tool->feature == NULL) - tool->feature = process_event_op2_stub; - if (tool->compressed == NULL) - tool->compressed = perf_session__process_compressed_event; - if (tool->finished_init == NULL) - tool->finished_init = 
process_event_op2_stub; -} - static void swap_sample_id_all(union perf_event *event, void *data) { void *end = (void *) event + event->header.size; @@ -1677,8 +1370,7 @@ static s64 perf_session__process_user_event(struct perf_session *session, int fd = perf_data__fd(session->data); int err; - if (event->header.type != PERF_RECORD_COMPRESSED || - tool->compressed == perf_session__process_compressed_event_stub) + if (event->header.type != PERF_RECORD_COMPRESSED || perf_tool__compressed_is_stub(tool)) dump_event(session->evlist, event, file_offset, &sample, file_path); /* These events are processed right away */ diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index 6886cc85600f..7f69baeae7fb 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -92,8 +92,6 @@ int perf_session__process_events(struct perf_session *session); int perf_session__queue_event(struct perf_session *s, union perf_event *event, u64 timestamp, u64 file_offset, const char *file_path); -void perf_tool__fill_defaults(struct perf_tool *tool); - int perf_session__resolve_callchain(struct perf_session *session, struct evsel *evsel, struct thread *thread, diff --git a/tools/perf/util/tool.c b/tools/perf/util/tool.c new file mode 100644 index 000000000000..17219ecb8fa6 --- /dev/null +++ b/tools/perf/util/tool.c @@ -0,0 +1,325 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "data.h" +#include "debug.h" +#include "header.h" +#include "session.h" +#include "stat.h" +#include "tool.h" +#include "tsc.h" +#include +#include + +#ifdef HAVE_ZSTD_SUPPORT +static int perf_session__process_compressed_event(struct perf_session *session, + union perf_event *event, u64 file_offset, + const char *file_path) +{ + void *src; + size_t decomp_size, src_size; + u64 decomp_last_rem = 0; + size_t mmap_len, decomp_len = session->header.env.comp_mmap_len; + struct decomp *decomp, *decomp_last = session->active_decomp->decomp_last; + + if (decomp_last) { + decomp_last_rem = decomp_last->size - decomp_last->head; + decomp_len += decomp_last_rem; + } + + mmap_len = sizeof(struct decomp) + decomp_len; + decomp = mmap(NULL, mmap_len, PROT_READ|PROT_WRITE, + MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + if (decomp == MAP_FAILED) { + pr_err("Couldn't allocate memory for decompression\n"); + return -1; + } + + decomp->file_pos = file_offset; + decomp->file_path = file_path; + decomp->mmap_len = mmap_len; + decomp->head = 0; + + if (decomp_last_rem) { + memcpy(decomp->data, &(decomp_last->data[decomp_last->head]), decomp_last_rem); + decomp->size = decomp_last_rem; + } + + src = (void *)event + sizeof(struct perf_record_compressed); + src_size = event->pack.header.size - sizeof(struct perf_record_compressed); + + decomp_size = zstd_decompress_stream(session->active_decomp->zstd_decomp, src, src_size, + &(decomp->data[decomp_last_rem]), decomp_len - decomp_last_rem); + if (!decomp_size) { + munmap(decomp, mmap_len); + pr_err("Couldn't decompress data\n"); + return -1; + } + + decomp->size += decomp_size; + + if (session->active_decomp->decomp == NULL) + session->active_decomp->decomp = decomp; + else + session->active_decomp->decomp_last->next = decomp; + + session->active_decomp->decomp_last = decomp; + + pr_debug("decomp (B): %zd to %zd\n", src_size, decomp_size); + + return 0; +} +#endif + +static int process_event_synth_tracing_data_stub(struct perf_session *session + __maybe_unused, + union perf_event *event + __maybe_unused) +{ + dump_printf(": unhandled!\n"); + return 0; +} + +static int process_event_synth_attr_stub(const 
struct perf_tool *tool __maybe_unused, + union perf_event *event __maybe_unused, + struct evlist **pevlist + __maybe_unused) +{ + dump_printf(": unhandled!\n"); + return 0; +} + +static int process_event_synth_event_update_stub(const struct perf_tool *tool __maybe_unused, + union perf_event *event __maybe_unused, + struct evlist **pevlist + __maybe_unused) +{ + if (dump_trace) + perf_event__fprintf_event_update(event, stdout); + + dump_printf(": unhandled!\n"); + return 0; +} + +static int process_event_sample_stub(const struct perf_tool *tool __maybe_unused, + union perf_event *event __maybe_unused, + struct perf_sample *sample __maybe_unused, + struct evsel *evsel __maybe_unused, + struct machine *machine __maybe_unused) +{ + dump_printf(": unhandled!\n"); + return 0; +} + +static int process_event_stub(const struct perf_tool *tool __maybe_unused, + union perf_event *event __maybe_unused, + struct perf_sample *sample __maybe_unused, + struct machine *machine __maybe_unused) +{ + dump_printf(": unhandled!\n"); + return 0; +} + +static int process_finished_round_stub(const struct perf_tool *tool __maybe_unused, + union perf_event *event __maybe_unused, + struct ordered_events *oe __maybe_unused) +{ + dump_printf(": unhandled!\n"); + return 0; +} + +static int skipn(int fd, off_t n) +{ + char buf[4096]; + ssize_t ret; + + while (n > 0) { + ret = read(fd, buf, min(n, (off_t)sizeof(buf))); + if (ret <= 0) + return ret; + n -= ret; + } + + return 0; +} + +static s64 process_event_auxtrace_stub(struct perf_session *session __maybe_unused, + union perf_event *event) +{ + dump_printf(": unhandled!\n"); + if (perf_data__is_pipe(session->data)) + skipn(perf_data__fd(session->data), event->auxtrace.size); + return event->auxtrace.size; +} + +static int process_event_op2_stub(struct perf_session *session __maybe_unused, + union perf_event *event __maybe_unused) +{ + dump_printf(": unhandled!\n"); + return 0; +} + + +static +int process_event_thread_map_stub(struct perf_session *session __maybe_unused, + union perf_event *event __maybe_unused) +{ + if (dump_trace) + perf_event__fprintf_thread_map(event, stdout); + + dump_printf(": unhandled!\n"); + return 0; +} + +static +int process_event_cpu_map_stub(struct perf_session *session __maybe_unused, + union perf_event *event __maybe_unused) +{ + if (dump_trace) + perf_event__fprintf_cpu_map(event, stdout); + + dump_printf(": unhandled!\n"); + return 0; +} + +static +int process_event_stat_config_stub(struct perf_session *session __maybe_unused, + union perf_event *event __maybe_unused) +{ + if (dump_trace) + perf_event__fprintf_stat_config(event, stdout); + + dump_printf(": unhandled!\n"); + return 0; +} + +static int process_stat_stub(struct perf_session *perf_session __maybe_unused, + union perf_event *event) +{ + if (dump_trace) + perf_event__fprintf_stat(event, stdout); + + dump_printf(": unhandled!\n"); + return 0; +} + +static int process_stat_round_stub(struct perf_session *perf_session __maybe_unused, + union perf_event *event) +{ + if (dump_trace) + perf_event__fprintf_stat_round(event, stdout); + + dump_printf(": unhandled!\n"); + return 0; +} + +static int process_event_time_conv_stub(struct perf_session *perf_session __maybe_unused, + union perf_event *event) +{ + if (dump_trace) + perf_event__fprintf_time_conv(event, stdout); + + dump_printf(": unhandled!\n"); + return 0; +} + +static int perf_session__process_compressed_event_stub(struct perf_session *session __maybe_unused, + union perf_event *event __maybe_unused, + u64 file_offset 
__maybe_unused, + const char *file_path __maybe_unused) +{ + dump_printf(": unhandled!\n"); + return 0; +} + +void perf_tool__fill_defaults(struct perf_tool *tool) +{ + if (tool->sample == NULL) + tool->sample = process_event_sample_stub; + if (tool->mmap == NULL) + tool->mmap = process_event_stub; + if (tool->mmap2 == NULL) + tool->mmap2 = process_event_stub; + if (tool->comm == NULL) + tool->comm = process_event_stub; + if (tool->namespaces == NULL) + tool->namespaces = process_event_stub; + if (tool->cgroup == NULL) + tool->cgroup = process_event_stub; + if (tool->fork == NULL) + tool->fork = process_event_stub; + if (tool->exit == NULL) + tool->exit = process_event_stub; + if (tool->lost == NULL) + tool->lost = perf_event__process_lost; + if (tool->lost_samples == NULL) + tool->lost_samples = perf_event__process_lost_samples; + if (tool->aux == NULL) + tool->aux = perf_event__process_aux; + if (tool->itrace_start == NULL) + tool->itrace_start = perf_event__process_itrace_start; + if (tool->context_switch == NULL) + tool->context_switch = perf_event__process_switch; + if (tool->ksymbol == NULL) + tool->ksymbol = perf_event__process_ksymbol; + if (tool->bpf == NULL) + tool->bpf = perf_event__process_bpf; + if (tool->text_poke == NULL) + tool->text_poke = perf_event__process_text_poke; + if (tool->aux_output_hw_id == NULL) + tool->aux_output_hw_id = perf_event__process_aux_output_hw_id; + if (tool->read == NULL) + tool->read = process_event_sample_stub; + if (tool->throttle == NULL) + tool->throttle = process_event_stub; + if (tool->unthrottle == NULL) + tool->unthrottle = process_event_stub; + if (tool->attr == NULL) + tool->attr = process_event_synth_attr_stub; + if (tool->event_update == NULL) + tool->event_update = process_event_synth_event_update_stub; + if (tool->tracing_data == NULL) + tool->tracing_data = process_event_synth_tracing_data_stub; + if (tool->build_id == NULL) + tool->build_id = process_event_op2_stub; + if (tool->finished_round == NULL) { + if (tool->ordered_events) + tool->finished_round = perf_event__process_finished_round; + else + tool->finished_round = process_finished_round_stub; + } + if (tool->id_index == NULL) + tool->id_index = process_event_op2_stub; + if (tool->auxtrace_info == NULL) + tool->auxtrace_info = process_event_op2_stub; + if (tool->auxtrace == NULL) + tool->auxtrace = process_event_auxtrace_stub; + if (tool->auxtrace_error == NULL) + tool->auxtrace_error = process_event_op2_stub; + if (tool->thread_map == NULL) + tool->thread_map = process_event_thread_map_stub; + if (tool->cpu_map == NULL) + tool->cpu_map = process_event_cpu_map_stub; + if (tool->stat_config == NULL) + tool->stat_config = process_event_stat_config_stub; + if (tool->stat == NULL) + tool->stat = process_stat_stub; + if (tool->stat_round == NULL) + tool->stat_round = process_stat_round_stub; + if (tool->time_conv == NULL) + tool->time_conv = process_event_time_conv_stub; + if (tool->feature == NULL) + tool->feature = process_event_op2_stub; + if (tool->compressed == NULL) { +#ifdef HAVE_ZSTD_SUPPORT + tool->compressed = perf_session__process_compressed_event; +#else + tool->compressed = perf_session__process_compressed_event_stub; +#endif + } + if (tool->finished_init == NULL) + tool->finished_init = process_event_op2_stub; +} + +bool perf_tool__compressed_is_stub(const struct perf_tool *tool) +{ + return tool->compressed == perf_session__process_compressed_event_stub; +} diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h index 2b12c674768f..b917b1fa6204 100644 --- 
a/tools/perf/util/tool.h +++ b/tools/perf/util/tool.h @@ -89,4 +89,8 @@ struct perf_tool { enum show_feature_header show_feat_hdr; }; +void perf_tool__fill_defaults(struct perf_tool *tool); + +bool perf_tool__compressed_is_stub(const struct perf_tool *tool); + #endif /* __PERF_TOOL_H */ -- cgit v1.2.3 From ae737b61029ced77391e2b850433cd316716ddf9 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:46:57 -0700 Subject: perf tool: Add perf_tool__init() Add init function that behaves like perf_tool__fill_defaults() but assumes all values haven't been initialized. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-6-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/tool.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++ tools/perf/util/tool.h | 1 + 2 files changed, 59 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/tool.c b/tools/perf/util/tool.c index 17219ecb8fa6..cbd9b888bd73 100644 --- a/tools/perf/util/tool.c +++ b/tools/perf/util/tool.c @@ -230,6 +230,64 @@ static int perf_session__process_compressed_event_stub(struct perf_session *sess return 0; } +void perf_tool__init(struct perf_tool *tool, bool ordered_events) +{ + tool->ordered_events = ordered_events; + tool->ordering_requires_timestamps = false; + tool->namespace_events = false; + tool->cgroup_events = false; + tool->no_warn = false; + tool->show_feat_hdr = SHOW_FEAT_NO_HEADER; + + tool->sample = process_event_sample_stub; + tool->mmap = process_event_stub; + tool->mmap2 = process_event_stub; + tool->comm = process_event_stub; + tool->namespaces = process_event_stub; + tool->cgroup = process_event_stub; + tool->fork = process_event_stub; + tool->exit = process_event_stub; + tool->lost = perf_event__process_lost; + tool->lost_samples = perf_event__process_lost_samples; + tool->aux = perf_event__process_aux; + tool->itrace_start = perf_event__process_itrace_start; + tool->context_switch = perf_event__process_switch; + tool->ksymbol = perf_event__process_ksymbol; + tool->bpf = perf_event__process_bpf; + tool->text_poke = perf_event__process_text_poke; + tool->aux_output_hw_id = perf_event__process_aux_output_hw_id; + tool->read = process_event_sample_stub; + tool->throttle = process_event_stub; + tool->unthrottle = process_event_stub; + tool->attr = process_event_synth_attr_stub; + tool->event_update = process_event_synth_event_update_stub; + tool->tracing_data = process_event_synth_tracing_data_stub; + tool->build_id = process_event_op2_stub; + + if (ordered_events) + tool->finished_round = perf_event__process_finished_round; + else + tool->finished_round = process_finished_round_stub; + + tool->id_index = process_event_op2_stub; + tool->auxtrace_info = process_event_op2_stub; + tool->auxtrace = process_event_auxtrace_stub; + tool->auxtrace_error = process_event_op2_stub; + tool->thread_map = process_event_thread_map_stub; + tool->cpu_map = process_event_cpu_map_stub; + tool->stat_config = process_event_stat_config_stub; + tool->stat = process_stat_stub; + 
tool->stat_round = process_stat_round_stub; + tool->time_conv = process_event_time_conv_stub; + tool->feature = process_event_op2_stub; +#ifdef HAVE_ZSTD_SUPPORT + tool->compressed = perf_session__process_compressed_event; +#else + tool->compressed = perf_session__process_compressed_event_stub; +#endif + tool->finished_init = process_event_op2_stub; +} + void perf_tool__fill_defaults(struct perf_tool *tool) { if (tool->sample == NULL) diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h index b917b1fa6204..7c7ce395e573 100644 --- a/tools/perf/util/tool.h +++ b/tools/perf/util/tool.h @@ -89,6 +89,7 @@ struct perf_tool { enum show_feature_header show_feat_hdr; }; +void perf_tool__init(struct perf_tool *tool, bool ordered_events); void perf_tool__fill_defaults(struct perf_tool *tool); bool perf_tool__compressed_is_stub(const struct perf_tool *tool); -- cgit v1.2.3 From f32b37cc783ac28a58ab75542de7cec2c7c01e74 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:46:58 -0700 Subject: perf kmem: Use perf_tool__init Reduce the scope of the tool from global/static to just that of the cmd_kmem function where the session is scoped. Use the perf_tool__init() to initialize default values. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-7-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-kmem.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 859ff018eace..b3cbac40b8c7 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -986,15 +986,6 @@ static int process_sample_event(const struct perf_tool *tool __maybe_unused, return err; } -static struct perf_tool perf_kmem = { - .sample = process_sample_event, - .comm = perf_event__process_comm, - .mmap = perf_event__process_mmap, - .mmap2 = perf_event__process_mmap2, - .namespaces = perf_event__process_namespaces, - .ordered_events = true, -}; - static double fragmentation(unsigned long n_req, unsigned long n_alloc) { if (n_alloc == 0) @@ -1971,6 +1962,7 @@ int cmd_kmem(int argc, const char **argv) NULL }; struct perf_session *session; + struct perf_tool perf_kmem; static const char errmsg[] = "No %s allocation events found. 
Have you run 'perf kmem record --%s'?\n"; int ret = perf_config(kmem_config, NULL); @@ -1998,6 +1990,13 @@ int cmd_kmem(int argc, const char **argv) data.path = input_name; + perf_tool__init(&perf_kmem, /*ordered_events=*/true); + perf_kmem.sample = process_sample_event; + perf_kmem.comm = perf_event__process_comm; + perf_kmem.mmap = perf_event__process_mmap; + perf_kmem.mmap2 = perf_event__process_mmap2; + perf_kmem.namespaces = perf_event__process_namespaces; + kmem_session = session = perf_session__new(&data, &perf_kmem); if (IS_ERR(session)) return PTR_ERR(session); -- cgit v1.2.3 From 584a268f50758a35b7176ecad798bbe77d672d21 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:46:59 -0700 Subject: perf buildid-list: Use perf_tool__init Reduce scope of build_id__mark_dso_hit_ops() to the scope of function perf_session__list_build_ids, its only use, and use perf_tool__init() for the default values. Move perf_event__exit_del_thread() to event.[ch] so it can be used in builtin-buildid-list.c. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-8-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-buildid-list.c | 10 ++++++++++ tools/perf/util/build-id.c | 32 -------------------------------- tools/perf/util/build-id.h | 4 +--- tools/perf/util/event.c | 20 ++++++++++++++++++++ tools/perf/util/event.h | 4 ++++ 5 files changed, 35 insertions(+), 35 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c index 383d5de36ce4..52dfacaff8e3 100644 --- a/tools/perf/builtin-buildid-list.c +++ b/tools/perf/builtin-buildid-list.c @@ -89,6 +89,7 @@ static int perf_session__list_build_ids(bool force, bool with_hits) .mode = PERF_DATA_MODE_READ, .force = force, }; + struct perf_tool build_id__mark_dso_hit_ops; symbol__elf_init(); /* @@ -97,6 +98,15 @@ static int perf_session__list_build_ids(bool force, bool with_hits) if (filename__fprintf_build_id(input_name, stdout) > 0) goto out; + perf_tool__init(&build_id__mark_dso_hit_ops, /*ordered_events=*/true); + build_id__mark_dso_hit_ops.sample = build_id__mark_dso_hit; + build_id__mark_dso_hit_ops.mmap = perf_event__process_mmap; + build_id__mark_dso_hit_ops.mmap2 = perf_event__process_mmap2; + build_id__mark_dso_hit_ops.fork = perf_event__process_fork; + build_id__mark_dso_hit_ops.exit = perf_event__exit_del_thread; + build_id__mark_dso_hit_ops.attr = perf_event__process_attr; + build_id__mark_dso_hit_ops.build_id = perf_event__process_build_id; + session = perf_session__new(&data, &build_id__mark_dso_hit_ops); if (IS_ERR(session)) return PTR_ERR(session); diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c index 098fcc625d91..451d145fa4ed 100644 --- a/tools/perf/util/build-id.c +++ b/tools/perf/util/build-id.c @@ -67,38 +67,6 @@ int build_id__mark_dso_hit(const struct perf_tool *tool __maybe_unused, return 0; } -static int perf_event__exit_del_thread(const struct perf_tool *tool __maybe_unused, - union perf_event *event, - struct 
perf_sample *sample - __maybe_unused, - struct machine *machine) -{ - struct thread *thread = machine__findnew_thread(machine, - event->fork.pid, - event->fork.tid); - - dump_printf("(%d:%d):(%d:%d)\n", event->fork.pid, event->fork.tid, - event->fork.ppid, event->fork.ptid); - - if (thread) { - machine__remove_thread(machine, thread); - thread__put(thread); - } - - return 0; -} - -struct perf_tool build_id__mark_dso_hit_ops = { - .sample = build_id__mark_dso_hit, - .mmap = perf_event__process_mmap, - .mmap2 = perf_event__process_mmap2, - .fork = perf_event__process_fork, - .exit = perf_event__exit_del_thread, - .attr = perf_event__process_attr, - .build_id = perf_event__process_build_id, - .ordered_events = true, -}; - int build_id__sprintf(const struct build_id *build_id, char *bf) { char *bid = bf; diff --git a/tools/perf/util/build-id.h b/tools/perf/util/build-id.h index ae87c4c58d5b..a212497bfdb0 100644 --- a/tools/perf/util/build-id.h +++ b/tools/perf/util/build-id.h @@ -16,11 +16,9 @@ struct build_id { size_t size; }; -struct nsinfo; - -extern struct perf_tool build_id__mark_dso_hit_ops; struct dso; struct feat_fd; +struct nsinfo; void build_id__init(struct build_id *bid, const u8 *data, size_t size); int build_id__sprintf(const struct build_id *build_id, char *bf); diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index c2f0e7f40ad5..aac96d5d1917 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -426,6 +426,26 @@ int perf_event__process_exit(const struct perf_tool *tool __maybe_unused, return machine__process_exit_event(machine, event, sample); } +int perf_event__exit_del_thread(const struct perf_tool *tool __maybe_unused, + union perf_event *event, + struct perf_sample *sample __maybe_unused, + struct machine *machine) +{ + struct thread *thread = machine__findnew_thread(machine, + event->fork.pid, + event->fork.tid); + + dump_printf("(%d:%d):(%d:%d)\n", event->fork.pid, event->fork.tid, + event->fork.ppid, event->fork.ptid); + + if (thread) { + machine__remove_thread(machine, thread); + thread__put(thread); + } + + return 0; +} + size_t perf_event__fprintf_aux(union perf_event *event, FILE *fp) { return fprintf(fp, " offset: %#"PRI_lx64" size: %#"PRI_lx64" flags: %#"PRI_lx64" [%s%s%s]\n", diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 4b24f1c580fd..f8742e6230a5 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -319,6 +319,10 @@ int perf_event__process_exit(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); +int perf_event__exit_del_thread(const struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample, + struct machine *machine); int perf_event__process_ksymbol(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, -- cgit v1.2.3 From a01a5ef98870bd7a72f7eee0cb6168233c41bf53 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:00 -0700 Subject: perf kvm: Use perf_tool__init() Use perf_tool__init() so that more uses of 'struct perf_tool' can be const and not relying on perf_tool__fill_defaults(). 
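The conversion applied here, and repeated in the follow-on patches, is mechanical: replace the partially-filled designated initializer, whose gaps perf_tool__fill_defaults() used to patch, with an up-front init to stubs followed by explicit overrides. Roughly, using the handlers visible in the diff below:

	/* Before: gaps left for perf_tool__fill_defaults() to fill in. */
	struct perf_tool eops = {
		.sample		= process_sample_event,
		.comm		= perf_event__process_comm,
		.ordered_events	= true,
	};

	/* After: every callback starts as a stub, then override only what
	 * this tool implements; no fill-defaults pass is needed. */
	struct perf_tool eops;

	perf_tool__init(&eops, /*ordered_events=*/true);
	eops.sample = process_sample_event;
	eops.comm = perf_event__process_comm;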
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-9-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-kvm.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index a3b903cf4311..692267b1b7e8 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -1603,19 +1603,17 @@ static int read_events(struct perf_kvm_stat *kvm) { int ret; - struct perf_tool eops = { - .sample = process_sample_event, - .comm = perf_event__process_comm, - .namespaces = perf_event__process_namespaces, - .ordered_events = true, - }; struct perf_data file = { .path = kvm->file_name, .mode = PERF_DATA_MODE_READ, .force = kvm->force, }; - kvm->tool = eops; + perf_tool__init(&kvm->tool, /*ordered_events=*/true); + kvm->tool.sample = process_sample_event; + kvm->tool.comm = perf_event__process_comm; + kvm->tool.namespaces = perf_event__process_namespaces; + kvm->session = perf_session__new(&file, &kvm->tool); if (IS_ERR(kvm->session)) { pr_err("Initializing perf session failed\n"); @@ -1919,14 +1917,13 @@ static int kvm_events_live(struct perf_kvm_stat *kvm, /* event handling */ + perf_tool__init(&kvm->tool, /*ordered_events=*/true); kvm->tool.sample = process_sample_event; kvm->tool.comm = perf_event__process_comm; kvm->tool.exit = perf_event__process_exit; kvm->tool.fork = perf_event__process_fork; kvm->tool.lost = process_lost_event; kvm->tool.namespaces = perf_event__process_namespaces; - kvm->tool.ordered_events = true; - perf_tool__fill_defaults(&kvm->tool); /* set defaults */ kvm->display_time = 1; -- cgit v1.2.3 From b4fd4d00f9d4f51eab54e5b336afe27a846f1578 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:01 -0700 Subject: perf lock: Use perf_tool__init() Use perf_tool__init() so that more uses of 'struct perf_tool' can be const and not relying on perf_tool__fill_defaults(). 
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-10-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-lock.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index 6efa9d646637..2c216427e929 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -1933,22 +1933,21 @@ static bool force; static int __cmd_report(bool display_info) { int err = -EINVAL; - struct perf_tool eops = { - .attr = perf_event__process_attr, - .event_update = process_event_update, - .sample = process_sample_event, - .comm = perf_event__process_comm, - .mmap = perf_event__process_mmap, - .namespaces = perf_event__process_namespaces, - .tracing_data = perf_event__process_tracing_data, - .ordered_events = true, - }; + struct perf_tool eops; struct perf_data data = { .path = input_name, .mode = PERF_DATA_MODE_READ, .force = force, }; + perf_tool__init(&eops, /*ordered_events=*/true); + eops.attr = perf_event__process_attr; + eops.event_update = process_event_update; + eops.sample = process_sample_event; + eops.comm = perf_event__process_comm; + eops.mmap = perf_event__process_mmap; + eops.namespaces = perf_event__process_namespaces; + eops.tracing_data = perf_event__process_tracing_data; session = perf_session__new(&data, &eops); if (IS_ERR(session)) { pr_err("Initializing perf session failed\n"); @@ -2069,15 +2068,7 @@ static int check_lock_contention_options(const struct option *options, static int __cmd_contention(int argc, const char **argv) { int err = -EINVAL; - struct perf_tool eops = { - .attr = perf_event__process_attr, - .event_update = process_event_update, - .sample = process_sample_event, - .comm = perf_event__process_comm, - .mmap = perf_event__process_mmap, - .tracing_data = perf_event__process_tracing_data, - .ordered_events = true, - }; + struct perf_tool eops; struct perf_data data = { .path = input_name, .mode = PERF_DATA_MODE_READ, @@ -2100,6 +2091,14 @@ static int __cmd_contention(int argc, const char **argv) con.result = &lockhash_table[0]; + perf_tool__init(&eops, /*ordered_events=*/true); + eops.attr = perf_event__process_attr; + eops.event_update = process_event_update; + eops.sample = process_sample_event; + eops.comm = perf_event__process_comm; + eops.mmap = perf_event__process_mmap; + eops.tracing_data = perf_event__process_tracing_data; + session = perf_session__new(use_bpf ? NULL : &data, &eops); if (IS_ERR(session)) { pr_err("Initializing perf session failed\n"); -- cgit v1.2.3 From 419cbc44f5b94c1bc6a68e4c694ee46fe3907fc0 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:02 -0700 Subject: perf evlist: Use perf_tool__init() Use perf_tool__init() so that more uses of 'struct perf_tool' can be const and not relying on perf_tool__fill_defaults(). 
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-11-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-evlist.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-evlist.c b/tools/perf/builtin-evlist.c index 7117656939e7..818ab21c3f73 100644 --- a/tools/perf/builtin-evlist.c +++ b/tools/perf/builtin-evlist.c @@ -35,13 +35,13 @@ static int __cmd_evlist(const char *file_name, struct perf_attr_details *details .mode = PERF_DATA_MODE_READ, .force = details->force, }; - struct perf_tool tool = { - /* only needed for pipe mode */ - .attr = perf_event__process_attr, - .feature = process_header_feature, - }; + struct perf_tool tool; bool has_tracepoint = false; + perf_tool__init(&tool, /*ordered_events=*/false); + /* only needed for pipe mode */ + tool.attr = perf_event__process_attr; + tool.feature = process_header_feature; session = perf_session__new(&data, &tool); if (IS_ERR(session)) return PTR_ERR(session); -- cgit v1.2.3 From cecb1cf154b301c6f8da434adccd7bcb855d3191 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:03 -0700 Subject: perf record: Use perf_tool__init() Use perf_tool__init() so that more uses of 'struct perf_tool' can be const and not relying on perf_tool__fill_defaults(). Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-12-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 33 ++++++++++++++++++++------------- tools/perf/util/tool.c | 10 +++++----- tools/perf/util/tool.h | 6 ++++++ 3 files changed, 31 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 46410eb3a76b..39367709fd99 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -194,6 +194,15 @@ static const char *affinity_tags[PERF_AFFINITY_MAX] = { "SYS", "NODE", "CPU" }; +static int build_id__process_mmap(const struct perf_tool *tool, union perf_event *event, + struct perf_sample *sample, struct machine *machine); +static int build_id__process_mmap2(const struct perf_tool *tool, union perf_event *event, + struct perf_sample *sample, struct machine *machine); +static int process_timestamp_boundary(const struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample, + struct machine *machine); + #ifndef HAVE_GETTID static inline pid_t gettid(void) { @@ -1459,7 +1468,7 @@ static int process_buildids(struct record *rec) * first/last samples. 
*/ if (rec->buildid_all && !rec->timestamp_boundary) - rec->tool.sample = NULL; + rec->tool.sample = process_event_sample_stub; return perf_session__process_events(session); } @@ -2387,6 +2396,16 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) signal(SIGUSR2, SIG_IGN); } + perf_tool__init(tool, /*ordered_events=*/true); + tool->sample = process_sample_event; + tool->fork = perf_event__process_fork; + tool->exit = perf_event__process_exit; + tool->comm = perf_event__process_comm; + tool->namespaces = perf_event__process_namespaces; + tool->mmap = build_id__process_mmap; + tool->mmap2 = build_id__process_mmap2; + tool->itrace_start = process_timestamp_boundary; + tool->aux = process_timestamp_boundary; session = perf_session__new(data, tool); if (IS_ERR(session)) { pr_err("Perf session creation failed.\n"); @@ -3327,18 +3346,6 @@ static struct record record = { .ctl_fd_ack = -1, .synth = PERF_SYNTH_ALL, }, - .tool = { - .sample = process_sample_event, - .fork = perf_event__process_fork, - .exit = perf_event__process_exit, - .comm = perf_event__process_comm, - .namespaces = perf_event__process_namespaces, - .mmap = build_id__process_mmap, - .mmap2 = build_id__process_mmap2, - .itrace_start = process_timestamp_boundary, - .aux = process_timestamp_boundary, - .ordered_events = true, - }, }; const char record_callchain_help[] = CALLCHAIN_RECORD_HELP diff --git a/tools/perf/util/tool.c b/tools/perf/util/tool.c index cbd9b888bd73..a4ca16e5eefe 100644 --- a/tools/perf/util/tool.c +++ b/tools/perf/util/tool.c @@ -99,11 +99,11 @@ static int process_event_synth_event_update_stub(const struct perf_tool *tool __ return 0; } -static int process_event_sample_stub(const struct perf_tool *tool __maybe_unused, - union perf_event *event __maybe_unused, - struct perf_sample *sample __maybe_unused, - struct evsel *evsel __maybe_unused, - struct machine *machine __maybe_unused) +int process_event_sample_stub(const struct perf_tool *tool __maybe_unused, + union perf_event *event __maybe_unused, + struct perf_sample *sample __maybe_unused, + struct evsel *evsel __maybe_unused, + struct machine *machine __maybe_unused) { dump_printf(": unhandled!\n"); return 0; diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h index 7c7ce395e573..c7fea58729a5 100644 --- a/tools/perf/util/tool.h +++ b/tools/perf/util/tool.h @@ -94,4 +94,10 @@ void perf_tool__fill_defaults(struct perf_tool *tool); bool perf_tool__compressed_is_stub(const struct perf_tool *tool); +int process_event_sample_stub(const struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample, + struct evsel *evsel, + struct machine *machine); + #endif /* __PERF_TOOL_H */ -- cgit v1.2.3 From 6bfb6df8663f3baf901f6540bd3f95ccb86c2b9b Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:04 -0700 Subject: perf c2c: Use perf_tool__init() Use perf_tool__init() so that more uses of 'struct perf_tool' can be const and not relying on perf_tool__fill_defaults(). 
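One wrinkle in this and the later conversions: perf_tool__init() resets every field, including ordering_requires_timestamps, so a tool that carried that flag in its static initializer must reassign it after the call, as the diff below does:

	perf_tool__init(&c2c.tool, /*ordered_events=*/true);
	/* init left this false; restore what the static initializer set: */
	c2c.tool.ordering_requires_timestamps = true;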
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-13-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-c2c.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index 88c131d05186..cd2bd573bfc3 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -385,24 +385,6 @@ free_mi: goto out; } -static struct perf_c2c c2c = { - .tool = { - .sample = process_sample_event, - .mmap = perf_event__process_mmap, - .mmap2 = perf_event__process_mmap2, - .comm = perf_event__process_comm, - .exit = perf_event__process_exit, - .fork = perf_event__process_fork, - .lost = perf_event__process_lost, - .attr = perf_event__process_attr, - .auxtrace_info = perf_event__process_auxtrace_info, - .auxtrace = perf_event__process_auxtrace, - .auxtrace_error = perf_event__process_auxtrace_error, - .ordered_events = true, - .ordering_requires_timestamps = true, - }, -}; - static const char * const c2c_usage[] = { "perf c2c {record|report}", NULL @@ -3070,6 +3052,19 @@ static int perf_c2c__report(int argc, const char **argv) data.path = input_name; data.force = symbol_conf.force; + perf_tool__init(&c2c.tool, /*ordered_events=*/true); + c2c.tool.sample = process_sample_event; + c2c.tool.mmap = perf_event__process_mmap; + c2c.tool.mmap2 = perf_event__process_mmap2; + c2c.tool.comm = perf_event__process_comm; + c2c.tool.exit = perf_event__process_exit; + c2c.tool.fork = perf_event__process_fork; + c2c.tool.lost = perf_event__process_lost; + c2c.tool.attr = perf_event__process_attr; + c2c.tool.auxtrace_info = perf_event__process_auxtrace_info; + c2c.tool.auxtrace = perf_event__process_auxtrace; + c2c.tool.auxtrace_error = perf_event__process_auxtrace_error; + c2c.tool.ordering_requires_timestamps = true; session = perf_session__new(&data, &c2c.tool); if (IS_ERR(session)) { err = PTR_ERR(session); -- cgit v1.2.3 From 2fa28ccb17c4bfc31f7752b59e4b90856d7b33b1 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:05 -0700 Subject: perf script: Use perf_tool__init() Use perf_tool__init() so that more uses of 'struct perf_tool' can be const and not relying on perf_tool__fill_defaults(). 
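Because ordered_events is now fixed by the init call rather than toggled afterwards, anything that feeds into it has to be settled before perf_tool__init() runs; perf script therefore drops its later "script.tool.ordered_events = false" assignment and folds the unsorted_dump flag straight into the init argument:

	/* ordered_events decided once, from already-parsed option state: */
	perf_tool__init(&script.tool, /*ordered_events=*/!unsorted_dump);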
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-14-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 65 +++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 35 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 31a6f800d55f..54598d1e815a 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -3899,38 +3899,7 @@ int cmd_script(int argc, const char **argv) const char *dlfilter_file = NULL; const char **__argv; int i, j, err = 0; - struct perf_script script = { - .tool = { - .sample = process_sample_event, - .mmap = perf_event__process_mmap, - .mmap2 = perf_event__process_mmap2, - .comm = perf_event__process_comm, - .namespaces = perf_event__process_namespaces, - .cgroup = perf_event__process_cgroup, - .exit = perf_event__process_exit, - .fork = perf_event__process_fork, - .attr = process_attr, - .event_update = perf_event__process_event_update, -#ifdef HAVE_LIBTRACEEVENT - .tracing_data = perf_event__process_tracing_data, -#endif - .feature = process_feature_event, - .build_id = perf_event__process_build_id, - .id_index = perf_event__process_id_index, - .auxtrace_info = perf_script__process_auxtrace_info, - .auxtrace = perf_event__process_auxtrace, - .auxtrace_error = perf_event__process_auxtrace_error, - .stat = perf_event__process_stat_event, - .stat_round = process_stat_round_event, - .stat_config = process_stat_config_event, - .thread_map = process_thread_map_event, - .cpu_map = process_cpu_map_event, - .throttle = process_throttle_event, - .unthrottle = process_throttle_event, - .ordered_events = true, - .ordering_requires_timestamps = true, - }, - }; + struct perf_script script = {}; struct perf_data data = { .mode = PERF_DATA_MODE_READ, }; @@ -4104,10 +4073,8 @@ int cmd_script(int argc, const char **argv) data.path = input_name; data.force = symbol_conf.force; - if (unsorted_dump) { + if (unsorted_dump) dump_trace = true; - script.tool.ordered_events = false; - } if (symbol__validate_sym_arguments()) return -1; @@ -4298,6 +4265,34 @@ script_found: use_browser = 0; } + perf_tool__init(&script.tool, !unsorted_dump); + script.tool.sample = process_sample_event; + script.tool.mmap = perf_event__process_mmap; + script.tool.mmap2 = perf_event__process_mmap2; + script.tool.comm = perf_event__process_comm; + script.tool.namespaces = perf_event__process_namespaces; + script.tool.cgroup = perf_event__process_cgroup; + script.tool.exit = perf_event__process_exit; + script.tool.fork = perf_event__process_fork; + script.tool.attr = process_attr; + script.tool.event_update = perf_event__process_event_update; +#ifdef HAVE_LIBTRACEEVENT + script.tool.tracing_data = perf_event__process_tracing_data; +#endif + script.tool.feature = process_feature_event; + script.tool.build_id = perf_event__process_build_id; + script.tool.id_index = perf_event__process_id_index; + script.tool.auxtrace_info = perf_script__process_auxtrace_info; + 
script.tool.auxtrace = perf_event__process_auxtrace; + script.tool.auxtrace_error = perf_event__process_auxtrace_error; + script.tool.stat = perf_event__process_stat_event; + script.tool.stat_round = process_stat_round_event; + script.tool.stat_config = process_stat_config_event; + script.tool.thread_map = process_thread_map_event; + script.tool.cpu_map = process_cpu_map_event; + script.tool.throttle = process_throttle_event; + script.tool.unthrottle = process_throttle_event; + script.tool.ordering_requires_timestamps = true; session = perf_session__new(&data, &script.tool); if (IS_ERR(session)) return PTR_ERR(session); -- cgit v1.2.3 From a37c0436f3799d02f13781d5e1c836b956b8c05d Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:06 -0700 Subject: perf inject: Use perf_tool__init() Use perf_tool__init() so that more uses of 'struct perf_tool' can be const and not relying on perf_tool__fill_defaults(). Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-15-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-inject.c | 91 +++++++++++++++++++++------------------------ 1 file changed, 43 insertions(+), 48 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 283429ccd034..ef9cba173dd2 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -2165,47 +2165,6 @@ static int __cmd_inject(struct perf_inject *inject) int cmd_inject(int argc, const char **argv) { struct perf_inject inject = { - .tool = { - .sample = perf_event__repipe_sample, - .read = perf_event__repipe_sample, - .mmap = perf_event__repipe, - .mmap2 = perf_event__repipe, - .comm = perf_event__repipe, - .namespaces = perf_event__repipe, - .cgroup = perf_event__repipe, - .fork = perf_event__repipe, - .exit = perf_event__repipe, - .lost = perf_event__repipe, - .lost_samples = perf_event__repipe, - .aux = perf_event__repipe, - .itrace_start = perf_event__repipe, - .aux_output_hw_id = perf_event__repipe, - .context_switch = perf_event__repipe, - .throttle = perf_event__repipe, - .unthrottle = perf_event__repipe, - .ksymbol = perf_event__repipe, - .bpf = perf_event__repipe, - .text_poke = perf_event__repipe, - .attr = perf_event__repipe_attr, - .event_update = perf_event__repipe_event_update, - .tracing_data = perf_event__repipe_op2_synth, - .finished_round = perf_event__repipe_oe_synth, - .build_id = perf_event__repipe_op2_synth, - .id_index = perf_event__repipe_op2_synth, - .auxtrace_info = perf_event__repipe_op2_synth, - .auxtrace_error = perf_event__repipe_op2_synth, - .time_conv = perf_event__repipe_op2_synth, - .thread_map = perf_event__repipe_op2_synth, - .cpu_map = perf_event__repipe_op2_synth, - .stat_config = perf_event__repipe_op2_synth, - .stat = perf_event__repipe_op2_synth, - .stat_round = perf_event__repipe_op2_synth, - .feature = perf_event__repipe_op2_synth, - .finished_init = perf_event__repipe_op2_synth, - .compressed = perf_event__repipe_op4_synth, - .auxtrace = perf_event__repipe_auxtrace, 
- .dont_split_sample_group = true, - }, .input_name = "-", .samples = LIST_HEAD_INIT(inject.samples), .output = { @@ -2270,6 +2229,7 @@ int cmd_inject(int argc, const char **argv) "perf inject []", NULL }; + bool ordered_events; if (!inject.itrace_synth_opts.set) { /* Disable eager loading of kernel symbols that adds overhead to perf inject. */ @@ -2334,7 +2294,48 @@ int cmd_inject(int argc, const char **argv) if (strcmp(inject.input_name, "-")) repipe = false; } - + ordered_events = inject.jit_mode || inject.sched_stat || + (inject.build_ids && !inject.build_id_all); + perf_tool__init(&inject.tool, ordered_events); + inject.tool.sample = perf_event__repipe_sample; + inject.tool.read = perf_event__repipe_sample; + inject.tool.mmap = perf_event__repipe; + inject.tool.mmap2 = perf_event__repipe; + inject.tool.comm = perf_event__repipe; + inject.tool.namespaces = perf_event__repipe; + inject.tool.cgroup = perf_event__repipe; + inject.tool.fork = perf_event__repipe; + inject.tool.exit = perf_event__repipe; + inject.tool.lost = perf_event__repipe; + inject.tool.lost_samples = perf_event__repipe; + inject.tool.aux = perf_event__repipe; + inject.tool.itrace_start = perf_event__repipe; + inject.tool.aux_output_hw_id = perf_event__repipe; + inject.tool.context_switch = perf_event__repipe; + inject.tool.throttle = perf_event__repipe; + inject.tool.unthrottle = perf_event__repipe; + inject.tool.ksymbol = perf_event__repipe; + inject.tool.bpf = perf_event__repipe; + inject.tool.text_poke = perf_event__repipe; + inject.tool.attr = perf_event__repipe_attr; + inject.tool.event_update = perf_event__repipe_event_update; + inject.tool.tracing_data = perf_event__repipe_op2_synth; + inject.tool.finished_round = perf_event__repipe_oe_synth; + inject.tool.build_id = perf_event__repipe_op2_synth; + inject.tool.id_index = perf_event__repipe_op2_synth; + inject.tool.auxtrace_info = perf_event__repipe_op2_synth; + inject.tool.auxtrace_error = perf_event__repipe_op2_synth; + inject.tool.time_conv = perf_event__repipe_op2_synth; + inject.tool.thread_map = perf_event__repipe_op2_synth; + inject.tool.cpu_map = perf_event__repipe_op2_synth; + inject.tool.stat_config = perf_event__repipe_op2_synth; + inject.tool.stat = perf_event__repipe_op2_synth; + inject.tool.stat_round = perf_event__repipe_op2_synth; + inject.tool.feature = perf_event__repipe_op2_synth; + inject.tool.finished_init = perf_event__repipe_op2_synth; + inject.tool.compressed = perf_event__repipe_op4_synth; + inject.tool.auxtrace = perf_event__repipe_auxtrace; + inject.tool.dont_split_sample_group = true; inject.session = __perf_session__new(&data, repipe, output_fd(&inject), &inject.tool); @@ -2373,7 +2374,6 @@ int cmd_inject(int argc, const char **argv) * mmaps. We cannot generate the buildid hit list and * inject the jit mmaps at the same time for now. 
*/ - inject.tool.ordered_events = true; inject.tool.ordering_requires_timestamps = true; if (known_build_ids != NULL) { inject.known_build_ids = @@ -2386,15 +2386,10 @@ int cmd_inject(int argc, const char **argv) } } - if (inject.sched_stat) { - inject.tool.ordered_events = true; - } - #ifdef HAVE_JITDUMP if (inject.jit_mode) { inject.tool.mmap2 = perf_event__jit_repipe_mmap2; inject.tool.mmap = perf_event__jit_repipe_mmap; - inject.tool.ordered_events = true; inject.tool.ordering_requires_timestamps = true; /* * JIT MMAP injection injects all MMAP events in one go, so it -- cgit v1.2.3 From 113f614c6dd0c2fb13f4c6d7f3c4a82212de0c0f Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:07 -0700 Subject: perf report: Use perf_tool__init() Use perf_tool__init() so that more uses of 'struct perf_tool' can be const and not relying on perf_tool__fill_defaults(). Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-16-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 55 ++++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 28 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 5f609a7791ea..dfb47fa85e5c 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -799,7 +799,7 @@ static int process_attr(const struct perf_tool *tool __maybe_unused, static void stats_setup(struct report *rep) { - memset(&rep->tool, 0, sizeof(rep->tool)); + perf_tool__init(&rep->tool, /*ordered_events=*/false); rep->tool.attr = process_attr; rep->tool.sample = count_sample_event; rep->tool.lost_samples = count_lost_samples_event; @@ -817,8 +817,7 @@ static int stats_print(struct report *rep) static void tasks_setup(struct report *rep) { - memset(&rep->tool, 0, sizeof(rep->tool)); - rep->tool.ordered_events = true; + perf_tool__init(&rep->tool, /*ordered_events=*/true); if (rep->mmaps_mode) { rep->tool.mmap = perf_event__process_mmap; rep->tool.mmap2 = perf_event__process_mmap2; @@ -1272,30 +1271,6 @@ int cmd_report(int argc, const char **argv) NULL }; struct report report = { - .tool = { - .sample = process_sample_event, - .mmap = perf_event__process_mmap, - .mmap2 = perf_event__process_mmap2, - .comm = perf_event__process_comm, - .namespaces = perf_event__process_namespaces, - .cgroup = perf_event__process_cgroup, - .exit = perf_event__process_exit, - .fork = perf_event__process_fork, - .lost = perf_event__process_lost, - .read = process_read_event, - .attr = process_attr, -#ifdef HAVE_LIBTRACEEVENT - .tracing_data = perf_event__process_tracing_data, -#endif - .build_id = perf_event__process_build_id, - .id_index = perf_event__process_id_index, - .auxtrace_info = perf_event__process_auxtrace_info, - .auxtrace = perf_event__process_auxtrace, - .event_update = perf_event__process_event_update, - .feature = process_feature_event, - .ordered_events = true, - .ordering_requires_timestamps = true, - }, .max_stack = PERF_MAX_STACK_DEPTH, .pretty_printing_style = 
"normal", .socket_filter = -1, @@ -1477,6 +1452,7 @@ int cmd_report(int argc, const char **argv) }; int ret = hists__init(); char sort_tmp[128]; + bool ordered_events = true; if (ret < 0) goto exit; @@ -1531,7 +1507,7 @@ int cmd_report(int argc, const char **argv) report.tasks_mode = true; if (dump_trace && report.disable_order) - report.tool.ordered_events = false; + ordered_events = false; if (quiet) perf_quiet_option(); @@ -1562,6 +1538,29 @@ int cmd_report(int argc, const char **argv) symbol_conf.skip_empty = report.skip_empty; repeat: + perf_tool__init(&report.tool, ordered_events); + report.tool.sample = process_sample_event; + report.tool.mmap = perf_event__process_mmap; + report.tool.mmap2 = perf_event__process_mmap2; + report.tool.comm = perf_event__process_comm; + report.tool.namespaces = perf_event__process_namespaces; + report.tool.cgroup = perf_event__process_cgroup; + report.tool.exit = perf_event__process_exit; + report.tool.fork = perf_event__process_fork; + report.tool.lost = perf_event__process_lost; + report.tool.read = process_read_event; + report.tool.attr = process_attr; +#ifdef HAVE_LIBTRACEEVENT + report.tool.tracing_data = perf_event__process_tracing_data; +#endif + report.tool.build_id = perf_event__process_build_id; + report.tool.id_index = perf_event__process_id_index; + report.tool.auxtrace_info = perf_event__process_auxtrace_info; + report.tool.auxtrace = perf_event__process_auxtrace; + report.tool.event_update = perf_event__process_event_update; + report.tool.feature = process_feature_event; + report.tool.ordering_requires_timestamps = true; + session = perf_session__new(&data, &report.tool); if (IS_ERR(session)) { ret = PTR_ERR(session); -- cgit v1.2.3 From 071b117e755bd5000ea13f488e8d19ed9e42121c Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:08 -0700 Subject: perf stat: Use perf_tool__init() Use perf_tool__init() so that more uses of 'struct perf_tool' can be const and not relying on perf_tool__fill_defaults(). 
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-17-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index d44e708f8a44..487eff9a8582 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -2271,15 +2271,6 @@ static const char * const stat_report_usage[] = { }; static struct perf_stat perf_stat = { - .tool = { - .attr = perf_event__process_attr, - .event_update = perf_event__process_event_update, - .thread_map = process_thread_map_event, - .cpu_map = process_cpu_map_event, - .stat_config = process_stat_config_event, - .stat = perf_event__process_stat_event, - .stat_round = process_stat_round_event, - }, .aggr_mode = AGGR_UNSET, .aggr_level = 0, }; @@ -2322,6 +2313,15 @@ static int __cmd_report(int argc, const char **argv) perf_stat.data.path = input_name; perf_stat.data.mode = PERF_DATA_MODE_READ; + perf_tool__init(&perf_stat.tool, /*ordered_events=*/false); + perf_stat.tool.attr = perf_event__process_attr; + perf_stat.tool.event_update = perf_event__process_event_update; + perf_stat.tool.thread_map = process_thread_map_event; + perf_stat.tool.cpu_map = process_cpu_map_event; + perf_stat.tool.stat_config = process_stat_config_event; + perf_stat.tool.stat = perf_event__process_stat_event; + perf_stat.tool.stat_round = process_stat_round_event; + session = perf_session__new(&perf_stat.data, &perf_stat.tool); if (IS_ERR(session)) return PTR_ERR(session); -- cgit v1.2.3 From d48940cabcf70c186901f2c75fe89fed12d4ec18 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:09 -0700 Subject: perf annotate: Use perf_tool__init() Use perf_tool__init() so that more uses of 'struct perf_tool' can be const and not relying on perf_tool__fill_defaults(). 
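Every commit in this stretch of the series applies the same mechanical conversion, so a condensed sketch of the resulting shape may help. It is illustrative only: "run_my_cmd" and "process_my_sample" are placeholder names, and the snippet assumes perf's in-tree headers (util/tool.h, util/session.h, util/data.h) rather than being a standalone program.

	/* Placeholder handler, assumed to be defined by the command. */
	static int process_my_sample(const struct perf_tool *tool, union perf_event *event,
				     struct perf_sample *sample, struct evsel *evsel,
				     struct machine *machine);

	static int run_my_cmd(struct perf_data *data)
	{
		struct perf_tool tool;
		struct perf_session *session;
		int ret;

		/* Instead of a designated initializer that leaves most
		 * callbacks NULL for perf_tool__fill_defaults() to stub
		 * lazily, install a stub for every callback up front... */
		perf_tool__init(&tool, /*ordered_events=*/true);
		/* ...then override only what this command implements. */
		tool.sample = process_my_sample;
		tool.mmap = perf_event__process_mmap;
		tool.ordering_requires_timestamps = true;

		session = perf_session__new(data, &tool);
		if (IS_ERR(session))
			return PTR_ERR(session);

		ret = perf_session__process_events(session);
		perf_session__delete(session);
		return ret;
	}

The ordered_events flag moves into the initializer because the finished_round stub chosen depends on it (visible in the perf_tool__fill_defaults() removal later in this series), which is why callers pass /*ordered_events=*/ explicitly instead of assigning the field afterwards.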
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-18-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 42 ++++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 22 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index cc65e6f8f4da..d6f6ba5a569d 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -696,28 +696,7 @@ static const char * const annotate_usage[] = { int cmd_annotate(int argc, const char **argv) { - struct perf_annotate annotate = { - .tool = { - .sample = process_sample_event, - .mmap = perf_event__process_mmap, - .mmap2 = perf_event__process_mmap2, - .comm = perf_event__process_comm, - .exit = perf_event__process_exit, - .fork = perf_event__process_fork, - .namespaces = perf_event__process_namespaces, - .attr = perf_event__process_attr, - .build_id = perf_event__process_build_id, -#ifdef HAVE_LIBTRACEEVENT - .tracing_data = perf_event__process_tracing_data, -#endif - .id_index = perf_event__process_id_index, - .auxtrace_info = perf_event__process_auxtrace_info, - .auxtrace = perf_event__process_auxtrace, - .feature = process_feature_event, - .ordered_events = true, - .ordering_requires_timestamps = true, - }, - }; + struct perf_annotate annotate = {}; struct perf_data data = { .mode = PERF_DATA_MODE_READ, }; @@ -876,6 +855,25 @@ int cmd_annotate(int argc, const char **argv) data.path = input_name; + perf_tool__init(&annotate.tool, /*ordered_events=*/true); + annotate.tool.sample = process_sample_event; + annotate.tool.mmap = perf_event__process_mmap; + annotate.tool.mmap2 = perf_event__process_mmap2; + annotate.tool.comm = perf_event__process_comm; + annotate.tool.exit = perf_event__process_exit; + annotate.tool.fork = perf_event__process_fork; + annotate.tool.namespaces = perf_event__process_namespaces; + annotate.tool.attr = perf_event__process_attr; + annotate.tool.build_id = perf_event__process_build_id; +#ifdef HAVE_LIBTRACEEVENT + annotate.tool.tracing_data = perf_event__process_tracing_data; +#endif + annotate.tool.id_index = perf_event__process_id_index; + annotate.tool.auxtrace_info = perf_event__process_auxtrace_info; + annotate.tool.auxtrace = perf_event__process_auxtrace; + annotate.tool.feature = process_feature_event; + annotate.tool.ordering_requires_timestamps = true; + annotate.session = perf_session__new(&data, &annotate.tool); if (IS_ERR(annotate.session)) return PTR_ERR(annotate.session); -- cgit v1.2.3 From 41860d49473c0c09dc0a2a4d148047f97aaa2539 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:10 -0700 Subject: perf sched: Use perf_tool__init() Use perf_tool__init() so that more uses of 'struct perf_tool' can be const and not relying on perf_tool__fill_defaults(). 
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-19-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 2c60bd3a8149..34fe8e540c43 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -3100,7 +3100,6 @@ static int perf_sched__timehist(struct perf_sched *sched) sched->tool.tracing_data = perf_event__process_tracing_data; sched->tool.build_id = perf_event__process_build_id; - sched->tool.ordered_events = true; sched->tool.ordering_requires_timestamps = true; symbol_conf.use_callchain = sched->show_callchain; @@ -3605,14 +3604,6 @@ int cmd_sched(int argc, const char **argv) { static const char default_sort_order[] = "avg, max, switch, runtime"; struct perf_sched sched = { - .tool = { - .sample = perf_sched__process_tracepoint_sample, - .comm = perf_sched__process_comm, - .namespaces = perf_event__process_namespaces, - .lost = perf_event__process_lost, - .fork = perf_sched__process_fork_event, - .ordered_events = true, - }, .cmp_pid = LIST_HEAD_INIT(sched.cmp_pid), .sort_list = LIST_HEAD_INIT(sched.sort_list), .sort_order = default_sort_order, @@ -3733,6 +3724,13 @@ int cmd_sched(int argc, const char **argv) }; int ret; + perf_tool__init(&sched.tool, /*ordered_events=*/true); + sched.tool.sample = perf_sched__process_tracepoint_sample; + sched.tool.comm = perf_sched__process_comm; + sched.tool.namespaces = perf_event__process_namespaces; + sched.tool.lost = perf_event__process_lost; + sched.tool.fork = perf_sched__process_fork_event; + argc = parse_options_subcommand(argc, argv, sched_options, sched_subcommands, sched_usage, PARSE_OPT_STOP_AT_NON_OPTION); if (!argc) -- cgit v1.2.3 From 4a20562bc4118478b4592bab8f0d90975df77095 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:11 -0700 Subject: perf mem: Use perf_tool__init() Use perf_tool__init() so that more uses of 'struct perf_tool' can be const and not relying on perf_tool__fill_defaults(). 
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-20-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-mem.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index 0fb4d75ab959..18e5f9a0ddc2 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -276,7 +276,23 @@ static int report_raw_events(struct perf_mem *mem) .force = mem->force, }; int ret; - struct perf_session *session = perf_session__new(&data, &mem->tool); + struct perf_session *session; + + perf_tool__init(&mem->tool, /*ordered_events=*/true); + mem->tool.sample = process_sample_event; + mem->tool.mmap = perf_event__process_mmap; + mem->tool.mmap2 = perf_event__process_mmap2; + mem->tool.comm = perf_event__process_comm; + mem->tool.lost = perf_event__process_lost; + mem->tool.fork = perf_event__process_fork; + mem->tool.attr = perf_event__process_attr; + mem->tool.build_id = perf_event__process_build_id; + mem->tool.namespaces = perf_event__process_namespaces; + mem->tool.auxtrace_info = perf_event__process_auxtrace_info; + mem->tool.auxtrace = perf_event__process_auxtrace; + mem->tool.auxtrace_error = perf_event__process_auxtrace_error; + + session = perf_session__new(&data, &mem->tool); if (IS_ERR(session)) return PTR_ERR(session); @@ -458,21 +474,6 @@ int cmd_mem(int argc, const char **argv) { struct stat st; struct perf_mem mem = { - .tool = { - .sample = process_sample_event, - .mmap = perf_event__process_mmap, - .mmap2 = perf_event__process_mmap2, - .comm = perf_event__process_comm, - .lost = perf_event__process_lost, - .fork = perf_event__process_fork, - .attr = perf_event__process_attr, - .build_id = perf_event__process_build_id, - .namespaces = perf_event__process_namespaces, - .auxtrace_info = perf_event__process_auxtrace_info, - .auxtrace = perf_event__process_auxtrace, - .auxtrace_error = perf_event__process_auxtrace_error, - .ordered_events = true, - }, .input_name = "perf.data", /* * default to both load an store sampling -- cgit v1.2.3 From 60b5fd3f62d0622b70450d4707dc30be47a724c7 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:12 -0700 Subject: perf timechart: Use perf_tool__init() Use perf_tool__init() so that more uses of 'struct perf_tool' can be const and not relying on perf_tool__fill_defaults(). 
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-21-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-timechart.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c index 5bf818baa662..218c8b44d7be 100644 --- a/tools/perf/builtin-timechart.c +++ b/tools/perf/builtin-timechart.c @@ -1606,10 +1606,16 @@ static int __cmd_timechart(struct timechart *tchart, const char *output_name) .mode = PERF_DATA_MODE_READ, .force = tchart->force, }; - - struct perf_session *session = perf_session__new(&data, &tchart->tool); + struct perf_session *session; int ret = -EINVAL; + perf_tool__init(&tchart->tool, /*ordered_events=*/true); + tchart->tool.comm = process_comm_event; + tchart->tool.fork = process_fork_event; + tchart->tool.exit = process_exit_event; + tchart->tool.sample = process_sample_event; + + session = perf_session__new(&data, &tchart->tool); if (IS_ERR(session)) return PTR_ERR(session); @@ -1924,13 +1930,6 @@ parse_time(const struct option *opt, const char *arg, int __maybe_unused unset) int cmd_timechart(int argc, const char **argv) { struct timechart tchart = { - .tool = { - .comm = process_comm_event, - .fork = process_fork_event, - .exit = process_exit_event, - .sample = process_sample_event, - .ordered_events = true, - }, .proc_num = 15, .min_time = NSEC_PER_MSEC, .merge_dist = 1000, -- cgit v1.2.3 From 1e1ec8f2e5fa914da0155170c63099d7189f3cfc Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:13 -0700 Subject: perf diff: Use perf_tool__init() Use perf_tool__init() so that more uses of 'struct perf_tool' can be const and not relying on perf_tool__fill_defaults(). 
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-22-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-diff.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index 4c0567882a7a..28c5208fcdc9 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -467,21 +467,7 @@ out: return ret; } -static struct perf_diff pdiff = { - .tool = { - .sample = diff__process_sample_event, - .mmap = perf_event__process_mmap, - .mmap2 = perf_event__process_mmap2, - .comm = perf_event__process_comm, - .exit = perf_event__process_exit, - .fork = perf_event__process_fork, - .lost = perf_event__process_lost, - .namespaces = perf_event__process_namespaces, - .cgroup = perf_event__process_cgroup, - .ordered_events = true, - .ordering_requires_timestamps = true, - }, -}; +static struct perf_diff pdiff; static struct evsel *evsel_match(struct evsel *evsel, struct evlist *evlist) @@ -1959,6 +1945,18 @@ int cmd_diff(int argc, const char **argv) if (ret < 0) return ret; + perf_tool__init(&pdiff.tool, /*ordered_events=*/true); + pdiff.tool.sample = diff__process_sample_event; + pdiff.tool.mmap = perf_event__process_mmap; + pdiff.tool.mmap2 = perf_event__process_mmap2; + pdiff.tool.comm = perf_event__process_comm; + pdiff.tool.exit = perf_event__process_exit; + pdiff.tool.fork = perf_event__process_fork; + pdiff.tool.lost = perf_event__process_lost; + pdiff.tool.namespaces = perf_event__process_namespaces; + pdiff.tool.cgroup = perf_event__process_cgroup; + pdiff.tool.ordering_requires_timestamps = true; + perf_config(diff__config, NULL); argc = parse_options(argc, argv, options, diff_usage, 0); -- cgit v1.2.3 From b9d276d1a282dfca186d0dbc2a001581b7c6f5b7 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:14 -0700 Subject: perf data convert json: Use perf_tool__init() Use perf_tool__init() so that more uses of 'struct perf_tool' can be const and not relying on perf_tool__fill_defaults(). 
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-23-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/data-convert-json.c | 43 +++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 23 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/data-convert-json.c b/tools/perf/util/data-convert-json.c index 905ea9823f9d..20bfb0884e9e 100644 --- a/tools/perf/util/data-convert-json.c +++ b/tools/perf/util/data-convert-json.c @@ -316,39 +316,36 @@ int bt_convert__perf2json(const char *input_name, const char *output_name, struct perf_session *session; int fd; int ret = -1; - struct convert_json c = { - .tool = { - .sample = process_sample_event, - .mmap = perf_event__process_mmap, - .mmap2 = perf_event__process_mmap2, - .comm = perf_event__process_comm, - .namespaces = perf_event__process_namespaces, - .cgroup = perf_event__process_cgroup, - .exit = perf_event__process_exit, - .fork = perf_event__process_fork, - .lost = perf_event__process_lost, -#ifdef HAVE_LIBTRACEEVENT - .tracing_data = perf_event__process_tracing_data, -#endif - .build_id = perf_event__process_build_id, - .id_index = perf_event__process_id_index, - .auxtrace_info = perf_event__process_auxtrace_info, - .auxtrace = perf_event__process_auxtrace, - .event_update = perf_event__process_event_update, - .ordered_events = true, - .ordering_requires_timestamps = true, - }, .first = true, .events_count = 0, }; - struct perf_data data = { .mode = PERF_DATA_MODE_READ, .path = input_name, .force = opts->force, }; + perf_tool__init(&c.tool, /*ordered_events=*/true); + c.tool.sample = process_sample_event; + c.tool.mmap = perf_event__process_mmap; + c.tool.mmap2 = perf_event__process_mmap2; + c.tool.comm = perf_event__process_comm; + c.tool.namespaces = perf_event__process_namespaces; + c.tool.cgroup = perf_event__process_cgroup; + c.tool.exit = perf_event__process_exit; + c.tool.fork = perf_event__process_fork; + c.tool.lost = perf_event__process_lost; +#ifdef HAVE_LIBTRACEEVENT + c.tool.tracing_data = perf_event__process_tracing_data; +#endif + c.tool.build_id = perf_event__process_build_id; + c.tool.id_index = perf_event__process_id_index; + c.tool.auxtrace_info = perf_event__process_auxtrace_info; + c.tool.auxtrace = perf_event__process_auxtrace; + c.tool.event_update = perf_event__process_event_update; + c.tool.ordering_requires_timestamps = true; + if (opts->all) { pr_err("--all is currently unsupported for JSON output.\n"); goto err; -- cgit v1.2.3 From 2721c6cc04d79057b855411a16964965d5a3da0a Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:15 -0700 Subject: perf data convert ctf: Use perf_tool__init() Use perf_tool__init() so that more uses of 'struct perf_tool' can be const and not relying on perf_tool__fill_defaults(). 
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-24-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/data-convert-bt.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c index 9e2170604b66..021e9b1d5cc5 100644 --- a/tools/perf/util/data-convert-bt.c +++ b/tools/perf/util/data-convert-bt.c @@ -1607,25 +1607,23 @@ int bt_convert__perf2ctf(const char *input, const char *path, .mode = PERF_DATA_MODE_READ, .force = opts->force, }; - struct convert c = { - .tool = { - .sample = process_sample_event, - .mmap = perf_event__process_mmap, - .mmap2 = perf_event__process_mmap2, - .comm = perf_event__process_comm, - .exit = perf_event__process_exit, - .fork = perf_event__process_fork, - .lost = perf_event__process_lost, - .tracing_data = perf_event__process_tracing_data, - .build_id = perf_event__process_build_id, - .namespaces = perf_event__process_namespaces, - .ordered_events = true, - .ordering_requires_timestamps = true, - }, - }; + struct convert c = {}; struct ctf_writer *cw = &c.writer; int err; + perf_tool__init(&c.tool, /*ordered_events=*/true); + c.tool.sample = process_sample_event; + c.tool.mmap = perf_event__process_mmap; + c.tool.mmap2 = perf_event__process_mmap2; + c.tool.comm = perf_event__process_comm; + c.tool.exit = perf_event__process_exit; + c.tool.fork = perf_event__process_fork; + c.tool.lost = perf_event__process_lost; + c.tool.tracing_data = perf_event__process_tracing_data; + c.tool.build_id = perf_event__process_build_id; + c.tool.namespaces = perf_event__process_namespaces; + c.tool.ordering_requires_timestamps = true; + if (opts->all) { c.tool.comm = process_comm_event; c.tool.exit = process_exit_event; -- cgit v1.2.3 From 332b897f34b95fea32c998ceca0569ac2c049d3d Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:16 -0700 Subject: perf test event_update: Ensure tool is initialized Ensure the tool is initialized, to avoid the lazy initialization pattern, so that more uses of struct perf_tool can be made const.
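What the explicit call buys is worth spelling out: after perf_tool__init() runs, every callback points at a stub rather than at NULL. A short annotated sketch, with the variable name taken from the test itself and the rest illustrative:

	/* Every handler in tmp.tool now resolves to a stub such as
	 * process_event_op2_stub, never NULL, so the synthesized
	 * event_update records can be dispatched without per-callback
	 * NULL checks and without perf_tool__fill_defaults() running
	 * behind the scenes. */
	perf_tool__init(&tmp.tool, /*ordered_events=*/false);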
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-25-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/event_update.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/perf/tests/event_update.c b/tools/perf/tests/event_update.c index fdecad920f59..d6b4ce3ef4ee 100644 --- a/tools/perf/tests/event_update.c +++ b/tools/perf/tests/event_update.c @@ -103,6 +103,7 @@ static int test__event_update(struct test_suite *test __maybe_unused, int subtes TEST_ASSERT_VAL("failed to synthesize attr update scale", !perf_event__synthesize_event_update_scale(NULL, evsel, process_event_scale)); + perf_tool__init(&tmp.tool, /*ordered_events=*/false); tmp.name = evsel__name(evsel); TEST_ASSERT_VAL("failed to synthesize attr update name", -- cgit v1.2.3 From fcd00f3e3b375ee87caf5be587840441ef1b4243 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:17 -0700 Subject: perf kwork: Use perf_tool__init() Use perf_tool__init() so that more uses of 'struct perf_tool' can be const and not relying on perf_tool__fill_defaults(). Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-26-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-kwork.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-kwork.c b/tools/perf/builtin-kwork.c index 8ffaa80a2d1d..6a4281b8fd10 100644 --- a/tools/perf/builtin-kwork.c +++ b/tools/perf/builtin-kwork.c @@ -2322,12 +2322,6 @@ int cmd_kwork(int argc, const char **argv) { static struct perf_kwork kwork = { .class_list = LIST_HEAD_INIT(kwork.class_list), - .tool = { - .mmap = perf_event__process_mmap, - .mmap2 = perf_event__process_mmap2, - .sample = perf_kwork__process_tracepoint_sample, - .ordered_events = true, - }, .atom_page_list = LIST_HEAD_INIT(kwork.atom_page_list), .sort_list = LIST_HEAD_INIT(kwork.sort_list), .cmp_id = LIST_HEAD_INIT(kwork.cmp_id), @@ -2462,6 +2456,11 @@ int cmd_kwork(int argc, const char **argv) "record", "report", "latency", "timehist", "top", NULL }; + perf_tool__init(&kwork.tool, /*ordered_events=*/true); + kwork.tool.mmap = perf_event__process_mmap; + kwork.tool.mmap2 = perf_event__process_mmap2; + kwork.tool.sample = perf_kwork__process_tracepoint_sample; + argc = parse_options_subcommand(argc, argv, kwork_options, kwork_subcommands, kwork_usage, PARSE_OPT_STOP_AT_NON_OPTION); -- cgit v1.2.3 From 15d4a6f41d7275799d318b198f479c9783d56f78 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: 
Mon, 12 Aug 2024 13:47:18 -0700 Subject: perf tool: Remove perf_tool__fill_defaults() Now that all tools are fully initialized prior to use, perf_tool__fill_defaults() has no remaining users, so remove it. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-27-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/session.c | 6 ---- tools/perf/util/tool.c | 89 ----------------------------------------------- tools/perf/util/tool.h | 1 - 3 files changed, 96 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 5c0e0341d39e..a8404cc99cfc 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1781,8 +1781,6 @@ static int __perf_session__process_pipe_events(struct perf_session *session) void *p; bool update_prog = false; - perf_tool__fill_defaults(tool); - /* * If it's from a file saving pipe data (by redirection), it would have * a file name other than "-". Then we can get the total size and show @@ -2206,8 +2204,6 @@ static int __perf_session__process_events(struct perf_session *session) struct ui_progress prog; int err; - perf_tool__fill_defaults(tool); - if (rd.data_size == 0) return -1; @@ -2260,8 +2256,6 @@ static int __perf_session__process_dir_events(struct perf_session *session) u64 total_size = perf_data__size(session->data); struct reader *rd; - perf_tool__fill_defaults(tool); - ui_progress__init_size(&prog, total_size, "Processing events..."); nr_readers = 1; diff --git a/tools/perf/util/tool.c b/tools/perf/util/tool.c index a4ca16e5eefe..3b7f390f26eb 100644 --- a/tools/perf/util/tool.c +++ b/tools/perf/util/tool.c @@ -288,95 +288,6 @@ void perf_tool__init(struct perf_tool *tool, bool ordered_events) tool->finished_init = process_event_op2_stub; } -void perf_tool__fill_defaults(struct perf_tool *tool) -{ - if (tool->sample == NULL) - tool->sample = process_event_sample_stub; - if (tool->mmap == NULL) - tool->mmap = process_event_stub; - if (tool->mmap2 == NULL) - tool->mmap2 = process_event_stub; - if (tool->comm == NULL) - tool->comm = process_event_stub; - if (tool->namespaces == NULL) - tool->namespaces = process_event_stub; - if (tool->cgroup == NULL) - tool->cgroup = process_event_stub; - if (tool->fork == NULL) - tool->fork = process_event_stub; - if (tool->exit == NULL) - tool->exit = process_event_stub; - if (tool->lost == NULL) - tool->lost = perf_event__process_lost; - if (tool->lost_samples == NULL) - tool->lost_samples = perf_event__process_lost_samples; - if (tool->aux == NULL) - tool->aux = perf_event__process_aux; - if (tool->itrace_start == NULL) - tool->itrace_start = perf_event__process_itrace_start; - if (tool->context_switch == NULL) - tool->context_switch = perf_event__process_switch; - if (tool->ksymbol == NULL) - tool->ksymbol = perf_event__process_ksymbol; - if (tool->bpf == NULL) - tool->bpf = perf_event__process_bpf; - if (tool->text_poke == NULL) - tool->text_poke = perf_event__process_text_poke; - if (tool->aux_output_hw_id == NULL) - tool->aux_output_hw_id = perf_event__process_aux_output_hw_id; - if
(tool->read == NULL) - tool->read = process_event_sample_stub; - if (tool->throttle == NULL) - tool->throttle = process_event_stub; - if (tool->unthrottle == NULL) - tool->unthrottle = process_event_stub; - if (tool->attr == NULL) - tool->attr = process_event_synth_attr_stub; - if (tool->event_update == NULL) - tool->event_update = process_event_synth_event_update_stub; - if (tool->tracing_data == NULL) - tool->tracing_data = process_event_synth_tracing_data_stub; - if (tool->build_id == NULL) - tool->build_id = process_event_op2_stub; - if (tool->finished_round == NULL) { - if (tool->ordered_events) - tool->finished_round = perf_event__process_finished_round; - else - tool->finished_round = process_finished_round_stub; - } - if (tool->id_index == NULL) - tool->id_index = process_event_op2_stub; - if (tool->auxtrace_info == NULL) - tool->auxtrace_info = process_event_op2_stub; - if (tool->auxtrace == NULL) - tool->auxtrace = process_event_auxtrace_stub; - if (tool->auxtrace_error == NULL) - tool->auxtrace_error = process_event_op2_stub; - if (tool->thread_map == NULL) - tool->thread_map = process_event_thread_map_stub; - if (tool->cpu_map == NULL) - tool->cpu_map = process_event_cpu_map_stub; - if (tool->stat_config == NULL) - tool->stat_config = process_event_stat_config_stub; - if (tool->stat == NULL) - tool->stat = process_stat_stub; - if (tool->stat_round == NULL) - tool->stat_round = process_stat_round_stub; - if (tool->time_conv == NULL) - tool->time_conv = process_event_time_conv_stub; - if (tool->feature == NULL) - tool->feature = process_event_op2_stub; - if (tool->compressed == NULL) { -#ifdef HAVE_ZSTD_SUPPORT - tool->compressed = perf_session__process_compressed_event; -#else - tool->compressed = perf_session__process_compressed_event_stub; -#endif - } - if (tool->finished_init == NULL) - tool->finished_init = process_event_op2_stub; -} - bool perf_tool__compressed_is_stub(const struct perf_tool *tool) { return tool->compressed == perf_session__process_compressed_event_stub; diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h index c7fea58729a5..db1c7642b0d1 100644 --- a/tools/perf/util/tool.h +++ b/tools/perf/util/tool.h @@ -90,7 +90,6 @@ struct perf_tool { }; void perf_tool__init(struct perf_tool *tool, bool ordered_events); -void perf_tool__fill_defaults(struct perf_tool *tool); bool perf_tool__compressed_is_stub(const struct perf_tool *tool); -- cgit v1.2.3 From 8f29be326da202963e3e6d33550d375313009f68 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 13:47:19 -0700 Subject: perf session: Constify tool Make tool const now that all uses are const and perf_tool__fill_defaults() won't be used. The aim is to better capture that sessions don't mutate tools. 
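One consequence is worth making explicit: with the session side constified, event handlers see the tool as const as well, matching the process_attr() hunk earlier in this series. A minimal sketch of the resulting handler shape; the function name is a placeholder and the signature follows the common four-argument event callbacks:

	/* A session dispatches events to the tool but may not mutate it,
	 * so handlers take a pointer to const. */
	static int count_my_event(const struct perf_tool *tool __maybe_unused,
				  union perf_event *event __maybe_unused,
				  struct perf_sample *sample __maybe_unused,
				  struct machine *machine __maybe_unused)
	{
		return 0;	/* accept and ignore the event */
	}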
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anshuman Khandual Cc: Athira Rajeev Cc: Huacai Chen Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Jonathan Cameron Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Nick Terrell Cc: Oliver Upton Cc: Peter Zijlstra Cc: Song Liu Cc: Sun Haiyong Cc: Suzuki Poulouse Cc: Will Deacon Cc: Yanteng Si Cc: Yicong Yang Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240812204720.631678-28-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/session.c | 6 +++--- tools/perf/util/session.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index a8404cc99cfc..d2bd563119bc 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1770,7 +1770,7 @@ static int __perf_session__process_decomp_events(struct perf_session *session); static int __perf_session__process_pipe_events(struct perf_session *session) { struct ordered_events *oe = &session->ordered_events; - struct perf_tool *tool = session->tool; + const struct perf_tool *tool = session->tool; struct ui_progress prog; union perf_event *event; uint32_t size, cur_size = 0; @@ -2200,7 +2200,7 @@ static int __perf_session__process_events(struct perf_session *session) .in_place_update = session->data->in_place_update, }; struct ordered_events *oe = &session->ordered_events; - struct perf_tool *tool = session->tool; + const struct perf_tool *tool = session->tool; struct ui_progress prog; int err; @@ -2250,7 +2250,7 @@ out_err: static int __perf_session__process_dir_events(struct perf_session *session) { struct perf_data *data = session->data; - struct perf_tool *tool = session->tool; + const struct perf_tool *tool = session->tool; int i, ret, readers, nr_readers; struct ui_progress prog; u64 total_size = perf_data__size(session->data); diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index 7f69baeae7fb..7c8dd6956330 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -43,7 +43,7 @@ struct perf_session { u64 one_mmap_offset; struct ordered_events ordered_events; struct perf_data *data; - struct perf_tool *tool; + const struct perf_tool *tool; u64 bytes_transferred; u64 bytes_compressed; struct zstd_data zstd_data; -- cgit v1.2.3 From 807746b9bdc2acc9b18130d0a7ff0be1626ccc76 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 20 Jul 2024 02:20:54 -0400 Subject: perf parse-events: Add a retirement latency modifier Retirement latency is a separate sampled count used on newer Intel CPUs. 
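Per the lexer change below, 'R' joins the existing modifier_event set, so on the command line it is appended to an event like any other modifier. A hedged usage sketch; the event name is a placeholder, since which events report retirement latency depends on the CPU's event list:

	# Request retirement latency for one event via the new R modifier
	# (the sampling machinery is wired up later in this series).
	perf stat -e 'mem_load_retired.l1_hit:R' -a -- sleep 1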
Reviewed-by: Namhyung Kim Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Caleb Biggers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Perry Taylor Cc: Peter Zijlstra Cc: Samantha Alt Link: https://lore.kernel.org/r/20240720062102.444578-2-weilin.wang@intel.com Signed-off-by: Weilin Wang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.h | 1 + tools/perf/util/parse-events.c | 2 ++ tools/perf/util/parse-events.h | 1 + tools/perf/util/parse-events.l | 3 ++- 4 files changed, 6 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 80b5f6dd868e..14f777b9e03e 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -98,6 +98,7 @@ struct evsel { bool bpf_counter; bool use_config_name; bool skippable; + bool retire_lat; int bpf_fd; struct bpf_object *bpf_obj; struct list_head config_terms; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 321586fb5556..fab01ba54e34 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1811,6 +1811,8 @@ static int parse_events__modifier_list(struct parse_events_state *parse_state, evsel->weak_group = true; if (mod.bpf) evsel->bpf_counter = true; + if (mod.retire_lat) + evsel->retire_lat = true; } return 0; } diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index e13de2c8b706..b735cd9e0acf 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -203,6 +203,7 @@ struct parse_events_modifier { bool hypervisor : 1; /* 'h' */ bool guest : 1; /* 'G' */ bool host : 1; /* 'H' */ + bool retire_lat : 1; /* 'R' */ }; int parse_events__modifier_event(struct parse_events_state *parse_state, void *loc, diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index 16045c383ada..5a0bcd7f166a 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -209,6 +209,7 @@ static int modifiers(struct parse_events_state *parse_state, yyscan_t scanner) CASE('W', weak); CASE('e', exclusive); CASE('b', bpf); + CASE('R', retire_lat); default: return PE_ERROR; } @@ -250,7 +251,7 @@ drv_cfg_term [a-zA-Z0-9_\.]+(=[a-zA-Z0-9_*?\.:]+)? * If you add a modifier you need to update check_modifier(). * Also, the letters in modifier_event must not be in modifier_bp. */ -modifier_event [ukhpPGHSDIWeb]{1,15} +modifier_event [ukhpPGHSDIWebR]{1,16} modifier_bp [rwx]{1,3} lc_type (L1-dcache|l1-d|l1d|L1-data|L1-icache|l1-i|l1i|L1-instruction|LLC|L2|dTLB|d-tlb|Data-TLB|iTLB|i-tlb|Instruction-TLB|branch|branches|bpu|btb|bpc|node) lc_op_result (load|loads|read|store|stores|write|prefetch|prefetches|speculative-read|speculative-load|refs|Reference|ops|access|misses|miss) -- cgit v1.2.3 From a9a4ca5767c1756b1bc5bd3397b515af2cbc2c82 Mon Sep 17 00:00:00 2001 From: Weilin Wang Date: Sat, 20 Jul 2024 02:20:55 -0400 Subject: perf data: Allow to use given fd in data->file.fd When in PIPE mode, allow using an fd that was dynamically opened and assigned to data->file.fd instead of STDIN_FILENO or STDOUT_FILENO.
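A condensed sketch of what the relaxed check permits; the helper, the pipe setup, and the field values are illustrative rather than from the patch, and assume <unistd.h> and <errno.h>:

	/* Hypothetical caller: hand a pre-opened pipe end to perf_data.
	 * After this change, check_pipe() keeps a positive file.fd instead
	 * of overwriting it with STDIN_FILENO/STDOUT_FILENO. */
	static int open_piped_input(struct perf_data *data)
	{
		int fds[2];

		if (pipe(fds) < 0)
			return -errno;

		data->path = "-";		/* "-" selects pipe mode */
		data->mode = PERF_DATA_MODE_READ;
		data->file.fd = fds[0];		/* honored now that fd > 0 is kept */
		return fds[1];			/* writer end for the producer */
	}

This appears to be the enabler for the TPEBS change later in the series, which reads 'perf record -o -' output through a pipe of its own rather than taking over stdin.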
Reviewed-by: Namhyung Kim Signed-off-by: Weilin Wang Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Caleb Biggers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Perry Taylor Cc: Peter Zijlstra Cc: Samantha Alt Link: https://lore.kernel.org/r/20240720062102.444578-3-weilin.wang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/data.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c index 08c4bfbd817f..98661ede2a73 100644 --- a/tools/perf/util/data.c +++ b/tools/perf/util/data.c @@ -204,7 +204,12 @@ static bool check_pipe(struct perf_data *data) data->file.fd = fd; data->use_stdio = false; } - } else { + + /* + * When is_pipe and data->file.fd is given, use given fd + * instead of STDIN_FILENO or STDOUT_FILENO + */ + } else if (data->file.fd <= 0) { data->file.fd = fd; } } -- cgit v1.2.3 From 110d3ffe9d2b42c04faaf92c3fc627004e3328c6 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 7 Aug 2024 16:13:48 +0200 Subject: selftests: router_mpath: Sleep after MZ In the context of an offloaded datapath, it may take a while for the ip link stats to be updated. This causes the test to fail when MZ_DELAY is too low. Sleep after the packets are sent for the link stats to get up to date. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/8b1971d948273afd7de2da3d6a2ba35200540e55.1723036486.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/router_mpath_nh.sh | 2 ++ tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh | 2 ++ tools/testing/selftests/net/forwarding/router_multipath.sh | 2 ++ 3 files changed, 6 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh.sh index 2ba44247c60a..c5a30f8f55b5 100755 --- a/tools/testing/selftests/net/forwarding/router_mpath_nh.sh +++ b/tools/testing/selftests/net/forwarding/router_mpath_nh.sh @@ -243,6 +243,7 @@ multipath4_test() ip vrf exec vrf-h1 $MZ $h1 -q -p 64 -A 192.0.2.2 -B 198.51.100.2 \ -d $MZ_DELAY -t udp "sp=1024,dp=0-32768" + sleep 1 t1_rp12=$(link_stats_tx_packets_get $rp12) t1_rp13=$(link_stats_tx_packets_get $rp13) @@ -276,6 +277,7 @@ multipath6_test() $MZ $h1 -6 -q -p 64 -A 2001:db8:1::2 -B 2001:db8:2::2 \ -d $MZ_DELAY -t udp "sp=1024,dp=0-32768" + sleep 1 t1_rp12=$(link_stats_tx_packets_get $rp12) t1_rp13=$(link_stats_tx_packets_get $rp13) diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh index cd9e346436fc..bd35fe8be9aa 100755 --- a/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh +++ b/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh @@ -244,6 +244,7 @@ multipath4_test() ip vrf exec vrf-h1 $MZ $h1 -q -p 64 -A 192.0.2.2 -B 198.51.100.2 \ -d $MZ_DELAY -t udp "sp=1024,dp=0-32768" + sleep 1 t1_rp12=$(link_stats_tx_packets_get $rp12) t1_rp13=$(link_stats_tx_packets_get $rp13) @@ -274,6 +275,7 @@ multipath6_l4_test() $MZ $h1 -6 -q -p 64 -A 2001:db8:1::2 -B 2001:db8:2::2 \ -d $MZ_DELAY -t udp "sp=1024,dp=0-32768" + sleep 1 t1_rp12=$(link_stats_tx_packets_get $rp12) t1_rp13=$(link_stats_tx_packets_get $rp13) diff --git a/tools/testing/selftests/net/forwarding/router_multipath.sh b/tools/testing/selftests/net/forwarding/router_multipath.sh index e2be354167a1..46f365b557b7 100755 --- 
a/tools/testing/selftests/net/forwarding/router_multipath.sh +++ b/tools/testing/selftests/net/forwarding/router_multipath.sh @@ -180,6 +180,7 @@ multipath4_test() ip vrf exec vrf-h1 $MZ $h1 -q -p 64 -A 192.0.2.2 -B 198.51.100.2 \ -d $MZ_DELAY -t udp "sp=1024,dp=0-32768" + sleep 1 t1_rp12=$(link_stats_tx_packets_get $rp12) t1_rp13=$(link_stats_tx_packets_get $rp13) @@ -217,6 +218,7 @@ multipath6_test() $MZ $h1 -6 -q -p 64 -A 2001:db8:1::2 -B 2001:db8:2::2 \ -d $MZ_DELAY -t udp "sp=1024,dp=0-32768" + sleep 1 t1_rp12=$(link_stats_tx_packets_get $rp12) t1_rp13=$(link_stats_tx_packets_get $rp13) -- cgit v1.2.3 From bb89fdacf99ccbec4dfe2d04f76f870f1483d3ea Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 7 Aug 2024 16:13:49 +0200 Subject: selftests: router_mpath_nh: Test 16-bit next hop weights Add tests that exercise full 16 bits of NH weight. To test the 255:65535, it is necessary to run more packets than for the other tests. On a debug kernel, the test can take up to a minute, therefore avoid the test when KSFT_MACHINE_SLOW. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/c0c257c00ad30b07afc3fa5e2afd135925405544.1723036486.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/lib.sh | 7 ++++ .../selftests/net/forwarding/router_mpath_nh.sh | 38 +++++++++++++++++----- .../net/forwarding/router_mpath_nh_lib.sh | 13 ++++++++ 3 files changed, 50 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index ff96bb7535ff..cb0fcd6f0293 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -509,6 +509,13 @@ xfail_on_slow() fi } +omit_on_slow() +{ + if [[ $KSFT_MACHINE_SLOW != yes ]]; then + "$@" + fi +} + xfail_on_veth() { local dev=$1; shift diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh.sh index c5a30f8f55b5..a7d8399c8d4f 100755 --- a/tools/testing/selftests/net/forwarding/router_mpath_nh.sh +++ b/tools/testing/selftests/net/forwarding/router_mpath_nh.sh @@ -40,6 +40,7 @@ ALL_TESTS=" ping_ipv4 ping_ipv6 multipath_test + multipath16_test ping_ipv4_blackhole ping_ipv6_blackhole nh_stats_test_v4 @@ -226,9 +227,11 @@ routing_nh_obj() multipath4_test() { - local desc="$1" - local weight_rp12=$2 - local weight_rp13=$3 + local desc=$1; shift + local weight_rp12=$1; shift + local weight_rp13=$1; shift + local ports=${1-sp=1024,dp=0-32768}; shift + local t0_rp12 t0_rp13 t1_rp12 t1_rp13 local packets_rp12 packets_rp13 @@ -242,7 +245,7 @@ multipath4_test() t0_rp13=$(link_stats_tx_packets_get $rp13) ip vrf exec vrf-h1 $MZ $h1 -q -p 64 -A 192.0.2.2 -B 198.51.100.2 \ - -d $MZ_DELAY -t udp "sp=1024,dp=0-32768" + -d $MZ_DELAY -t udp "$ports" sleep 1 t1_rp12=$(link_stats_tx_packets_get $rp12) @@ -259,9 +262,11 @@ multipath4_test() multipath6_test() { - local desc="$1" - local weight_rp12=$2 - local weight_rp13=$3 + local desc=$1; shift + local weight_rp12=$1; shift + local weight_rp13=$1; shift + local ports=${1-sp=1024,dp=0-32768}; shift + local t0_rp12 t0_rp13 t1_rp12 t1_rp13 local packets_rp12 packets_rp13 @@ -276,7 +281,7 @@ multipath6_test() t0_rp13=$(link_stats_tx_packets_get $rp13) $MZ $h1 -6 -q -p 64 -A 2001:db8:1::2 -B 2001:db8:2::2 \ - -d $MZ_DELAY -t udp "sp=1024,dp=0-32768" + -d $MZ_DELAY -t udp "$ports" sleep 1 t1_rp12=$(link_stats_tx_packets_get $rp12) @@ 
-315,6 +320,23 @@ multipath_test() multipath6_test "Weighted MP 11:45" 11 45 } +multipath16_test() +{ + check_nhgw16 104 || return + + log_info "Running 16-bit IPv4 multipath tests" + multipath4_test "65535:65535" 65535 65535 + multipath4_test "128:512" 128 512 + omit_on_slow \ + multipath4_test "255:65535" 255 65535 sp=1024-1026,dp=0-65535 + + log_info "Running 16-bit IPv6 multipath tests" + multipath6_test "65535:65535" 65535 65535 + multipath6_test "128:512" 128 512 + omit_on_slow \ + multipath6_test "255:65535" 255 65535 sp=1024-1026,dp=0-65535 +} + ping_ipv4_blackhole() { RET=0 diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh index 2903294d8bca..507b2852dabe 100644 --- a/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh +++ b/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh @@ -117,3 +117,16 @@ __nh_stats_test_v6() $MZ -6 $h1 -A 2001:db8:1::2 -B 2001:db8:2::2 sysctl_restore net.ipv6.fib_multipath_hash_policy } + +check_nhgw16() +{ + local nhid=$1; shift + + ip nexthop replace id 9999 group "$nhid,65535" &>/dev/null + if (( $? )); then + log_test_skip "16-bit multipath tests" \ + "iproute2 or the kernel do not support 16-bit next hop weights" + return 1 + fi + ip nexthop del id 9999 ||: +} -- cgit v1.2.3 From dce0765c1d5bf810bf3cd8e66cf7f6e73d14d7e2 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 7 Aug 2024 16:13:50 +0200 Subject: selftests: router_mpath_nh_res: Test 16-bit next hop weights Add tests that exercise full 16 bits of NH weight. Like in the previous patch, omit the 255:65535 test when KSFT_MACHINE_SLOW. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/a91d6ead9d1b1b4b7e276ca58a71ef814f42b7dd.1723036486.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/forwarding/router_mpath_nh_res.sh | 56 ++++++++++++++++++---- 1 file changed, 48 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh index bd35fe8be9aa..88ddae05b39d 100755 --- a/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh +++ b/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh @@ -40,6 +40,7 @@ ALL_TESTS=" ping_ipv4 ping_ipv6 multipath_test + multipath16_test nh_stats_test_v4 nh_stats_test_v6 " @@ -228,9 +229,11 @@ routing_nh_obj() multipath4_test() { - local desc="$1" - local weight_rp12=$2 - local weight_rp13=$3 + local desc=$1; shift + local weight_rp12=$1; shift + local weight_rp13=$1; shift + local ports=${1-sp=1024,dp=0-32768}; shift + local t0_rp12 t0_rp13 t1_rp12 t1_rp13 local packets_rp12 packets_rp13 @@ -243,7 +246,7 @@ multipath4_test() t0_rp13=$(link_stats_tx_packets_get $rp13) ip vrf exec vrf-h1 $MZ $h1 -q -p 64 -A 192.0.2.2 -B 198.51.100.2 \ - -d $MZ_DELAY -t udp "sp=1024,dp=0-32768" + -d $MZ_DELAY -t udp "$ports" sleep 1 t1_rp12=$(link_stats_tx_packets_get $rp12) @@ -259,9 +262,11 @@ multipath4_test() multipath6_l4_test() { - local desc="$1" - local weight_rp12=$2 - local weight_rp13=$3 + local desc=$1; shift + local weight_rp12=$1; shift + local weight_rp13=$1; shift + local ports=${1-sp=1024,dp=0-32768}; shift + local t0_rp12 t0_rp13 t1_rp12 t1_rp13 local packets_rp12 packets_rp13 @@ -274,7 +279,7 @@ multipath6_l4_test() t0_rp13=$(link_stats_tx_packets_get $rp13) $MZ $h1 -6 -q -p 64 -A 2001:db8:1::2 -B 2001:db8:2::2 \ - -d $MZ_DELAY -t udp 
"sp=1024,dp=0-32768" + -d $MZ_DELAY -t udp "$ports" sleep 1 t1_rp12=$(link_stats_tx_packets_get $rp12) @@ -373,6 +378,41 @@ multipath_test() ip nexthop replace id 106 group 104,1/105,1 type resilient } +multipath16_test() +{ + check_nhgw16 104 || return + + log_info "Running 16-bit IPv4 multipath tests" + ip nexthop replace id 103 group 101/102 type resilient idle_timer 0 + + ip nexthop replace id 103 group 101,65535/102,65535 type resilient + multipath4_test "65535:65535" 65535 65535 + + ip nexthop replace id 103 group 101,128/102,512 type resilient + multipath4_test "128:512" 128 512 + + ip nexthop replace id 103 group 101,255/102,65535 type resilient + omit_on_slow \ + multipath4_test "255:65535" 255 65535 sp=1024-1026,dp=0-65535 + + ip nexthop replace id 103 group 101,1/102,1 type resilient + + log_info "Running 16-bit IPv6 L4 hash multipath tests" + ip nexthop replace id 106 group 104/105 type resilient idle_timer 0 + + ip nexthop replace id 106 group 104,65535/105,65535 type resilient + multipath6_l4_test "65535:65535" 65535 65535 + + ip nexthop replace id 106 group 104,128/105,512 type resilient + multipath6_l4_test "128:512" 128 512 + + ip nexthop replace id 106 group 104,255/105,65535 type resilient + omit_on_slow \ + multipath6_l4_test "255:65535" 255 65535 sp=1024-1026,dp=0-65535 + + ip nexthop replace id 106 group 104,1/105,1 type resilient +} + nh_stats_test_v4() { __nh_stats_test_v4 resilient -- cgit v1.2.3 From 4b808f44733243f981ae04046ef93a65ecc631a9 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 7 Aug 2024 16:13:51 +0200 Subject: selftests: fib_nexthops: Test 16-bit next hop weights Add tests that attempt to create NH groups that use full 16 bits of NH weight. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/101cdd3f2bfd9511c9bec95f909d20ff56f70ba5.1723036486.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fib_nexthops.sh | 55 ++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index ac0b2c6a5761..77c83d9508d3 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -78,7 +78,12 @@ log_test() else ret=1 nfail=$((nfail+1)) - printf "TEST: %-60s [FAIL]\n" "${msg}" + if [[ $rc -eq $ksft_skip ]]; then + printf "TEST: %-60s [SKIP]\n" "${msg}" + else + printf "TEST: %-60s [FAIL]\n" "${msg}" + fi + if [ "$VERBOSE" = "1" ]; then echo " rc=$rc, expected $expected" fi @@ -923,6 +928,29 @@ ipv6_grp_fcnal() ipv6_grp_refs log_test $? 0 "Nexthop group replace refcounts" + + # + # 16-bit weights. + # + run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1" + run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1" + run_cmd "$IP nexthop add id 64 via 2001:db8:91::4 dev veth1" + run_cmd "$IP nexthop add id 65 via 2001:db8:91::5 dev veth1" + run_cmd "$IP nexthop add id 66 dev veth1" + + run_cmd "$IP nexthop add id 103 group 62,1000" + if [[ $? == 0 ]]; then + local GRP="id 103 group 62,254/63,255/64,256/65,257/66,65535" + run_cmd "$IP nexthop replace $GRP" + check_nexthop "id 103" "$GRP" + rc=$? 
+ else + rc=$ksft_skip + fi + + $IP nexthop flush >/dev/null 2>&1 + + log_test $rc 0 "16-bit weights" } ipv6_res_grp_fcnal() { @@ -987,6 +1015,31 @@ ipv6_res_grp_fcnal() check_nexthop_bucket "list id 102" \ "id 102 index 0 nhid 63 id 102 index 1 nhid 62 id 102 index 2 nhid 62 id 102 index 3 nhid 62" log_test $? 0 "Nexthop buckets updated after replace - nECMP" + + # + # 16-bit weights. + # + run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1" + run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1" + run_cmd "$IP nexthop add id 64 via 2001:db8:91::4 dev veth1" + run_cmd "$IP nexthop add id 65 via 2001:db8:91::5 dev veth1" + run_cmd "$IP nexthop add id 66 dev veth1" + + run_cmd "$IP nexthop add id 103 group 62,1000 type resilient buckets 32" + if [[ $? == 0 ]]; then + local GRP="id 103 group 62,254/63,255/64,256/65,257/66,65535 $(: + )type resilient buckets 32 idle_timer 0 $(: + )unbalanced_timer 0" + run_cmd "$IP nexthop replace $GRP" + check_nexthop "id 103" "$GRP unbalanced_time 0" + rc=$? + else + rc=$ksft_skip + fi + + $IP nexthop flush >/dev/null 2>&1 + + log_test $rc 0 "16-bit weights" } ipv6_fcnal_runtime() -- cgit v1.2.3 From 8db5cabcf1b6f9d62bd024aa6293de715d75518f Mon Sep 17 00:00:00 2001 From: Weilin Wang Date: Sat, 20 Jul 2024 02:20:56 -0400 Subject: perf stat: Fork and launch 'perf record' when 'perf stat' needs to get retire latency value for a metric. When a retire_latency value is used in a metric formula, evsel forks a 'perf record' process with the "-e" and "-W" options. 'perf record' collects the required retire_latency values in parallel while 'perf stat' is collecting counting values. When 'perf stat' stops counting, evsel stops 'perf record' by sending a SIGTERM signal to the 'perf record' process. The sampled data is then processed to get the retire latency value. Another thread is required to synchronize between 'perf stat' and 'perf record' when data is passed through a pipe. The retire_latency evsel is not opened by 'perf stat', so no counter is wasted on it. This commit includes code suggested by Namhyung to adjust the reading size for groups that include retire_latency evsels. In the current :R parsing implementation, the parser recognizes events with the retire_latency modifier and inserts them into the evlist like a normal event. Ideally, we need to avoid counting these events. In this commit, at the time a retire_latency evsel is read, the retire latency value computed from the sampled data is set as its count value. This sampled retire latency value is used for metric calculation and the final event count printout. No special metric calculation or event printout code is required for retire_latency events.
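Concretely, the sampling side is an ordinary 'perf record' child process. A rough rendering of the command line that get_perf_record_args() below assembles, with placeholder fd numbers, CPU list, and event name:

	# Forked by 'perf stat' when a metric needs retire latency; output
	# is written to a pipe ("-o -") and a reader thread parses the
	# samples into per-event retire_latency averages.
	perf record -W --synth=no --control=fd:<ctl-fd>,<ack-fd> \
		-C <cpu-list> -e <event> -o -

The --control descriptors give 'perf stat' a command/ack channel to the child; as the message above notes, final termination is signalled with SIGTERM.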
Reviewed-by: Namhyung Kim Signed-off-by: Weilin Wang Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Caleb Biggers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Perry Taylor Cc: Peter Zijlstra Cc: Samantha Alt Link: https://lore.kernel.org/r/20240720062102.444578-4-weilin.wang@intel.com [ Squashed the 3rd and 4th commit in the series to keep it building patch by patch ] [ Constified the 'struct perf_tool' pointer in process_sample_event() ] [ Use perf_tool__init(&tool, false) to address a segfault I reported and Ian/Weilin diagnosed ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/util/evlist.c | 6 + tools/perf/builtin-stat.c | 4 + tools/perf/util/Build | 1 + tools/perf/util/evlist.c | 2 + tools/perf/util/evsel.c | 81 ++++++- tools/perf/util/evsel.h | 5 + tools/perf/util/intel-tpebs.c | 432 ++++++++++++++++++++++++++++++++++++++ tools/perf/util/intel-tpebs.h | 35 +++ 8 files changed, 564 insertions(+), 2 deletions(-) create mode 100644 tools/perf/util/intel-tpebs.c create mode 100644 tools/perf/util/intel-tpebs.h (limited to 'tools') diff --git a/tools/perf/arch/x86/util/evlist.c b/tools/perf/arch/x86/util/evlist.c index b1ce0c52d88d..cebdd483149e 100644 --- a/tools/perf/arch/x86/util/evlist.c +++ b/tools/perf/arch/x86/util/evlist.c @@ -89,6 +89,12 @@ int arch_evlist__cmp(const struct evsel *lhs, const struct evsel *rhs) return 1; } + /* Retire latency event should not be group leader*/ + if (lhs->retire_lat && !rhs->retire_lat) + return 1; + if (!lhs->retire_lat && rhs->retire_lat) + return -1; + /* Default ordering by insertion index. */ return lhs->core.idx - rhs->core.idx; } diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 487eff9a8582..6162b98fc941 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -70,6 +70,7 @@ #include "util/bpf_counter.h" #include "util/iostat.h" #include "util/util.h" +#include "util/intel-tpebs.h" #include "asm/bug.h" #include @@ -683,6 +684,9 @@ static enum counter_recovery stat_handle_error(struct evsel *counter) if (child_pid != -1) kill(child_pid, SIGTERM); + + tpebs_delete(); + return COUNTER_FATAL; } diff --git a/tools/perf/util/Build b/tools/perf/util/Build index f6f18b4228bb..7ea261416c14 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -156,6 +156,7 @@ perf-util-y += clockid.o perf-util-y += list_sort.o perf-util-y += mutex.o perf-util-y += sharded_mutex.o +perf-util-$(CONFIG_X86_64) += intel-tpebs.o perf-util-$(CONFIG_LIBBPF) += bpf_map.o perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf_counter.o diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 1417f9a23083..4b2538e4f679 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -33,6 +33,7 @@ #include "util/bpf-filter.h" #include "util/stat.h" #include "util/util.h" +#include "util/intel-tpebs.h" #include #include #include @@ -179,6 +180,7 @@ void evlist__delete(struct evlist *evlist) if (evlist == NULL) return; + tpebs_delete(); evlist__free_stats(evlist); evlist__munmap(evlist); evlist__close(evlist); diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index bc603193c477..d607056b73c9 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -59,6 +59,7 @@ #include #include #include +#include "util/intel-tpebs.h" #include @@ -1539,6 +1540,11 @@ static int evsel__read_one(struct evsel *evsel, int cpu_map_idx, int thread) return perf_evsel__read(&evsel->core, cpu_map_idx, thread, count); } +static int evsel__read_retire_lat(struct evsel *evsel, int 
cpu_map_idx, int thread) +{ + return tpebs_set_evsel(evsel, cpu_map_idx, thread); +} + static void evsel__set_count(struct evsel *counter, int cpu_map_idx, int thread, u64 val, u64 ena, u64 run, u64 lost) { @@ -1546,6 +1552,12 @@ static void evsel__set_count(struct evsel *counter, int cpu_map_idx, int thread, count = perf_counts(counter->counts, cpu_map_idx, thread); + if (counter->retire_lat) { + evsel__read_retire_lat(counter, cpu_map_idx, thread); + perf_counts__set_loaded(counter->counts, cpu_map_idx, thread, true); + return; + } + count->val = val; count->ena = ena; count->run = run; @@ -1554,6 +1566,60 @@ static void evsel__set_count(struct evsel *counter, int cpu_map_idx, int thread, perf_counts__set_loaded(counter->counts, cpu_map_idx, thread, true); } +static bool evsel__group_has_tpebs(struct evsel *leader) +{ + struct evsel *evsel; + + for_each_group_evsel(evsel, leader) { + if (evsel__is_retire_lat(evsel)) + return true; + } + return false; +} + +static u64 evsel__group_read_nr_members(struct evsel *leader) +{ + u64 nr = leader->core.nr_members; + struct evsel *evsel; + + for_each_group_evsel(evsel, leader) { + if (evsel__is_retire_lat(evsel)) + nr--; + } + return nr; +} + +static u64 evsel__group_read_size(struct evsel *leader) +{ + u64 read_format = leader->core.attr.read_format; + int entry = sizeof(u64); /* value */ + int size = 0; + int nr = 1; + + if (!evsel__group_has_tpebs(leader)) + return perf_evsel__read_size(&leader->core); + + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + size += sizeof(u64); + + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + size += sizeof(u64); + + if (read_format & PERF_FORMAT_ID) + entry += sizeof(u64); + + if (read_format & PERF_FORMAT_LOST) + entry += sizeof(u64); + + if (read_format & PERF_FORMAT_GROUP) { + nr = evsel__group_read_nr_members(leader); + size += sizeof(u64); + } + + size += entry * nr; + return size; +} + static int evsel__process_group_data(struct evsel *leader, int cpu_map_idx, int thread, u64 *data) { u64 read_format = leader->core.attr.read_format; @@ -1562,7 +1628,7 @@ static int evsel__process_group_data(struct evsel *leader, int cpu_map_idx, int nr = *data++; - if (nr != (u64) leader->core.nr_members) + if (nr != evsel__group_read_nr_members(leader)) return -EINVAL; if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) @@ -1592,7 +1658,7 @@ static int evsel__read_group(struct evsel *leader, int cpu_map_idx, int thread) { struct perf_stat_evsel *ps = leader->stats; u64 read_format = leader->core.attr.read_format; - int size = perf_evsel__read_size(&leader->core); + int size = evsel__group_read_size(leader); u64 *data = ps->group_data; if (!(read_format & PERF_FORMAT_ID)) @@ -1784,6 +1850,9 @@ int evsel__read_counter(struct evsel *evsel, int cpu_map_idx, int thread) if (evsel__is_tool(evsel)) return evsel__read_tool(evsel, cpu_map_idx, thread); + if (evsel__is_retire_lat(evsel)) + return evsel__read_retire_lat(evsel, cpu_map_idx, thread); + if (evsel->core.attr.read_format & PERF_FORMAT_GROUP) return evsel__read_group(evsel, cpu_map_idx, thread); @@ -2200,6 +2269,9 @@ static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus, return 0; } + if (evsel__is_retire_lat(evsel)) + return tpebs_start(evsel->evlist); + err = __evsel__prepare_open(evsel, cpus, threads); if (err) return err; @@ -2392,6 +2464,8 @@ int evsel__open(struct evsel *evsel, struct perf_cpu_map *cpus, void evsel__close(struct evsel *evsel) { + if (evsel__is_retire_lat(evsel)) + tpebs_delete(); perf_evsel__close(&evsel->core); 
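+	/* For retire_latency evsels no counter fds were opened; the tpebs_delete() above mirrors the tpebs_start() call in evsel__open_cpu(). */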
perf_evsel__free_id(&evsel->core); } @@ -3357,6 +3431,9 @@ static int store_evsel_ids(struct evsel *evsel, struct evlist *evlist) { int cpu_map_idx, thread; + if (evsel__is_retire_lat(evsel)) + return 0; + for (cpu_map_idx = 0; cpu_map_idx < xyarray__max_x(evsel->core.fd); cpu_map_idx++) { for (thread = 0; thread < xyarray__max_y(evsel->core.fd); thread++) { diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 14f777b9e03e..a5da4b03bb1c 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -311,6 +311,11 @@ static inline bool evsel__is_tool(const struct evsel *evsel) return evsel->tool_event != PERF_TOOL_NONE; } +static inline bool evsel__is_retire_lat(const struct evsel *evsel) +{ + return evsel->retire_lat; +} + const char *evsel__group_name(struct evsel *evsel); int evsel__group_desc(struct evsel *evsel, char *buf, size_t size); diff --git a/tools/perf/util/intel-tpebs.c b/tools/perf/util/intel-tpebs.c new file mode 100644 index 000000000000..50a3c3e07160 --- /dev/null +++ b/tools/perf/util/intel-tpebs.c @@ -0,0 +1,432 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * intel_tpebs.c: Intel TPEBS support + */ + + +#include +#include +#include +#include "intel-tpebs.h" +#include +#include +#include +#include "sample.h" +#include "debug.h" +#include "evlist.h" +#include "evsel.h" +#include "session.h" +#include "tool.h" +#include "cpumap.h" +#include "metricgroup.h" +#include +#include +#include +#include + +#define PERF_DATA "-" + +bool tpebs_recording; +static pid_t tpebs_pid = -1; +static size_t tpebs_event_size; +static LIST_HEAD(tpebs_results); +static pthread_t tpebs_reader_thread; +static struct child_process *tpebs_cmd; + +struct tpebs_retire_lat { + struct list_head nd; + /* Event name */ + const char *name; + /* Event name with the TPEBS modifier R */ + const char *tpebs_name; + /* Count of retire_latency values found in sample data */ + size_t count; + /* Sum of all the retire_latency values in sample data */ + int sum; + /* Average of retire_latency, val = sum / count */ + double val; +}; + +static int get_perf_record_args(const char **record_argv, char buf[], + const char *cpumap_buf) +{ + struct tpebs_retire_lat *e; + int i = 0; + + pr_debug("tpebs: Prepare perf record for retire_latency\n"); + + record_argv[i++] = "perf"; + record_argv[i++] = "record"; + record_argv[i++] = "-W"; + record_argv[i++] = "--synth=no"; + record_argv[i++] = buf; + + if (!cpumap_buf) { + pr_err("tpebs: Require cpumap list to run sampling\n"); + return -ECANCELED; + } + /* Use -C when cpumap_buf is not "-1" */ + if (strcmp(cpumap_buf, "-1")) { + record_argv[i++] = "-C"; + record_argv[i++] = cpumap_buf; + } + + list_for_each_entry(e, &tpebs_results, nd) { + record_argv[i++] = "-e"; + record_argv[i++] = e->name; + } + + record_argv[i++] = "-o"; + record_argv[i++] = PERF_DATA; + + return 0; +} + +static int prepare_run_command(const char **argv) +{ + tpebs_cmd = zalloc(sizeof(struct child_process)); + if (!tpebs_cmd) + return -ENOMEM; + tpebs_cmd->argv = argv; + tpebs_cmd->out = -1; + return 0; +} + +static int start_perf_record(int control_fd[], int ack_fd[], + const char *cpumap_buf) +{ + const char **record_argv; + int ret; + char buf[32]; + + scnprintf(buf, sizeof(buf), "--control=fd:%d,%d", control_fd[0], ack_fd[1]); + + record_argv = calloc(12 + 2 * tpebs_event_size, sizeof(char *)); + if (!record_argv) + return -ENOMEM; + + ret = get_perf_record_args(record_argv, buf, cpumap_buf); + if (ret) + goto out; + + ret = prepare_run_command(record_argv); + if (ret) + 
goto out; + ret = start_command(tpebs_cmd); +out: + free(record_argv); + return ret; +} + +static int process_sample_event(const struct perf_tool *tool __maybe_unused, + union perf_event *event __maybe_unused, + struct perf_sample *sample, + struct evsel *evsel, + struct machine *machine __maybe_unused) +{ + int ret = 0; + const char *evname; + struct tpebs_retire_lat *t; + + evname = evsel__name(evsel); + + /* + * Need to handle per core results? We are assuming average retire + * latency value will be used. Save the number of samples and the sum of + * retire latency values for each event. + */ + list_for_each_entry(t, &tpebs_results, nd) { + if (!strcmp(evname, t->name)) { + t->count += 1; + t->sum += sample->retire_lat; + t->val = (double) t->sum / t->count; + break; + } + } + + return ret; +} + +static int process_feature_event(struct perf_session *session, + union perf_event *event) +{ + if (event->feat.feat_id < HEADER_LAST_FEATURE) + return perf_event__process_feature(session, event); + return 0; +} + +static void *__sample_reader(void *arg) +{ + struct child_process *child = arg; + struct perf_session *session; + struct perf_data data = { + .mode = PERF_DATA_MODE_READ, + .path = PERF_DATA, + .file.fd = child->out, + }; + struct perf_tool tool; + + perf_tool__init(&tool, /*ordered_events=*/false); + tool.sample = process_sample_event; + tool.feature = process_feature_event; + tool.attr = perf_event__process_attr; + + session = perf_session__new(&data, &tool); + if (IS_ERR(session)) + return NULL; + perf_session__process_events(session); + perf_session__delete(session); + + return NULL; +} + +/* + * tpebs_stop - stop the sample data read thread and the perf record process. + */ +static int tpebs_stop(void) +{ + int ret = 0; + + /* Like tpebs_start, we should only run tpebs_stop once. */ + if (tpebs_pid != -1) { + kill(tpebs_cmd->pid, SIGTERM); + tpebs_pid = -1; + pthread_join(tpebs_reader_thread, NULL); + close(tpebs_cmd->out); + ret = finish_command(tpebs_cmd); + if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL) + ret = 0; + } + return ret; +} + +/* + * tpebs_start - start tpebs execution. + * @evsel_list: retire_latency evsels in this list will be selected and sampled + * to get the average retire_latency value. + * + * This function will later be called at the evlist level once evlist__open() + * is called consistently. + */ +int tpebs_start(struct evlist *evsel_list) +{ + int ret = 0; + struct evsel *evsel; + char cpumap_buf[50]; + + /* + * We should only run tpebs_start when tpebs_recording is enabled. + * And we should only run it once with all the required events.
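+	 * If tpebs_recording is not set, this function is a no-op and + * tpebs_set_evsel() will later default the retire_latency counts to 0.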
+ */ + if (tpebs_pid != -1 || !tpebs_recording) + return 0; + + cpu_map__snprint(evsel_list->core.user_requested_cpus, cpumap_buf, sizeof(cpumap_buf)); + /* + * Prepare the perf record command for sampling the retire_latency + * events before forking and preparing the workload. + */ + evlist__for_each_entry(evsel_list, evsel) { + int i; + char *name; + struct tpebs_retire_lat *new; + + if (!evsel->retire_lat) + continue; + + pr_debug("tpebs: Retire_latency of event %s is required\n", evsel->name); + for (i = strlen(evsel->name) - 1; i > 0; i--) { + if (evsel->name[i] == 'R') + break; + } + if (i <= 0 || evsel->name[i] != 'R') { + ret = -1; + goto err; + } + + name = strdup(evsel->name); + if (!name) { + ret = -ENOMEM; + goto err; + } + name[i] = 'p'; + + new = zalloc(sizeof(*new)); + if (!new) { + ret = -1; + zfree(&name); + goto err; + } + new->name = name; + new->tpebs_name = evsel->name; + list_add_tail(&new->nd, &tpebs_results); + tpebs_event_size += 1; + } + + if (tpebs_event_size > 0) { + struct pollfd pollfd = { .events = POLLIN, }; + int control_fd[2], ack_fd[2], len; + char ack_buf[8]; + + /* Create the control and ack fds for --control */ + if (pipe(control_fd) < 0) { + pr_err("tpebs: Failed to create control fifo"); + ret = -1; + goto out; + } + if (pipe(ack_fd) < 0) { + pr_err("tpebs: Failed to create ack fifo"); + ret = -1; + goto out; + } + + ret = start_perf_record(control_fd, ack_fd, cpumap_buf); + if (ret) + goto out; + tpebs_pid = tpebs_cmd->pid; + if (pthread_create(&tpebs_reader_thread, NULL, __sample_reader, tpebs_cmd)) { + kill(tpebs_cmd->pid, SIGTERM); + close(tpebs_cmd->out); + pr_err("Could not create thread to process sample data.\n"); + ret = -1; + goto out; + } + /* Wait for perf record initialization. */ + len = strlen(EVLIST_CTL_CMD_ENABLE_TAG); + ret = write(control_fd[1], EVLIST_CTL_CMD_ENABLE_TAG, len); + if (ret != len) { + pr_err("tpebs: Failed to write the enable message to the perf record control fd\n"); + goto out; + } + + /* wait for an ack */ + pollfd.fd = ack_fd[0]; + + /* + * We need this poll to ensure the ack_fd PIPE will not hang + * when perf record failed for any reason. The timeout value + * 3000ms is an empirical selection. + */ + if (!poll(&pollfd, 1, 3000)) { + pr_err("tpebs failed: perf record ack timeout\n"); + ret = -1; + goto out; + } + + if (!(pollfd.revents & POLLIN)) { + pr_err("tpebs failed: did not receive an ack\n"); + ret = -1; + goto out; + } + + ret = read(ack_fd[0], ack_buf, sizeof(ack_buf)); + if (ret > 0) + ret = strcmp(ack_buf, EVLIST_CTL_CMD_ACK_TAG); + else { + pr_err("tpebs: perf record control ack failed\n"); + goto out; + } +out: + close(control_fd[0]); + close(control_fd[1]); + close(ack_fd[0]); + close(ack_fd[1]); + } +err: + if (ret) + tpebs_delete(); + return ret; +} + + +int tpebs_set_evsel(struct evsel *evsel, int cpu_map_idx, int thread) +{ + __u64 val; + bool found = false; + struct tpebs_retire_lat *t; + struct perf_counts_values *count; + + /* A non-retire_latency evsel should never enter this function. */ + if (!evsel__is_retire_lat(evsel)) + return -1; + + /* + * Need to stop the forked record to ensure the sampled data is read + * from the pipe and processed, and a non-zero retire_lat value is + * obtained for hybrid.
+ */ + tpebs_stop(); + count = perf_counts(evsel->counts, cpu_map_idx, thread); + + list_for_each_entry(t, &tpebs_results, nd) { + if (t->tpebs_name == evsel->name || + (evsel->metric_id && !strcmp(t->tpebs_name, evsel->metric_id))) { + found = true; + break; + } + } + + /* Set ena and run to non-zero */ + count->ena = count->run = 1; + count->lost = 0; + + if (!found) { + /* + * Set the default value 0 when retire_latency for this event is + * not found in the sampled data (record_tpebs not set or no + * samples recorded). + */ + count->val = 0; + return 0; + } + + /* + * Only set the retire_latency value for the first CPU and thread. + */ + if (cpu_map_idx == 0 && thread == 0) + val = rint(t->val); + else + val = 0; + + count->val = val; + return 0; +} + +static void tpebs_retire_lat__delete(struct tpebs_retire_lat *r) +{ + zfree(&r->name); + free(r); +} + + +/* + * tpebs_delete - delete tpebs related data and stop the created thread and + * process by calling tpebs_stop(). + * + * This function is called from evlist__delete() and also from builtin-stat + * stat_handle_error(). If tpebs_start() is called from places other than perf + * stat, we need to ensure tpebs_delete() is also called to safely free memory + * and close the data read thread and the forked perf record process. + * + * This function is also called in evsel__close() to be symmetric with + * tpebs_start() being called in evsel__open(). We will update this call site + * when moving tpebs_start() to the evlist level. + */ +void tpebs_delete(void) +{ + struct tpebs_retire_lat *r, *rtmp; + + if (tpebs_pid == -1) + return; + + tpebs_stop(); + + list_for_each_entry_safe(r, rtmp, &tpebs_results, nd) { + list_del_init(&r->nd); + tpebs_retire_lat__delete(r); + } + + if (tpebs_cmd) { + free(tpebs_cmd); + tpebs_cmd = NULL; + } +} diff --git a/tools/perf/util/intel-tpebs.h b/tools/perf/util/intel-tpebs.h new file mode 100644 index 000000000000..766b3fbd79f1 --- /dev/null +++ b/tools/perf/util/intel-tpebs.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * intel_tpebs.h: Intel TPEBS support + */ +#ifndef INCLUDE__PERF_INTEL_TPEBS_H__ +#define INCLUDE__PERF_INTEL_TPEBS_H__ + +#include "stat.h" +#include "evsel.h" + +#ifdef HAVE_ARCH_X86_64_SUPPORT + +extern bool tpebs_recording; +int tpebs_start(struct evlist *evsel_list); +void tpebs_delete(void); +int tpebs_set_evsel(struct evsel *evsel, int cpu_map_idx, int thread); + +#else + +static inline int tpebs_start(struct evlist *evsel_list __maybe_unused) +{ + return 0; +} + +static inline void tpebs_delete(void) {} + +static inline int tpebs_set_evsel(struct evsel *evsel __maybe_unused, + int cpu_map_idx __maybe_unused, + int thread __maybe_unused) +{ + return 0; +} + +#endif +#endif -- cgit v1.2.3 From 0a7381601b8a55d44c97ea132e23876c0dd9f90e Mon Sep 17 00:00:00 2001 From: Weilin Wang Date: Sat, 20 Jul 2024 02:20:58 -0400 Subject: perf vendor events intel: Add MTL metric JSON files Add the MTL metric JSON files for TMA 4.8. Some of the metrics' formulas use TPEBS retire_latency on MTL. This also includes the latest E-Core TMA 3.6 changes.
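As a usage sketch, a TPEBS-backed metric could be counted as below. The metric name is a placeholder, and --record-tpebs is the opt-in switch added by a companion patch in this series, so treat the exact spelling as an assumption:

	# Hypothetical invocation: perf stat forks a background perf record
	# to sample the retire_latency events the metric formula references.
	perf stat -M <metric_using_retire_latency> --record-tpebs -a -- sleep 1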
Reviewed-by: Namhyung Kim Signed-off-by: Weilin Wang Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Caleb Biggers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Perry Taylor Cc: Peter Zijlstra Cc: Samantha Alt Link: https://lore.kernel.org/r/20240720062102.444578-6-weilin.wang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/x86/meteorlake/metricgroups.json | 142 ++ .../arch/x86/meteorlake/mtl-metrics.json | 2535 ++++++++++++++++++++ 2 files changed, 2677 insertions(+) create mode 100644 tools/perf/pmu-events/arch/x86/meteorlake/metricgroups.json create mode 100644 tools/perf/pmu-events/arch/x86/meteorlake/mtl-metrics.json (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/metricgroups.json b/tools/perf/pmu-events/arch/x86/meteorlake/metricgroups.json new file mode 100644 index 000000000000..b54a5fc0861f --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/meteorlake/metricgroups.json @@ -0,0 +1,142 @@ +{ + "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BvBC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BvBO": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BvCB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BvFB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BvIO": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BvMB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BvML": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BvMP": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BvMS": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BvMT": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BvOB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "BvUW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "C0Wait": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "CodeGen": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DSBmiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "DataSharing": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Fed": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FetchLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Flops": "Grouping from 
Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpScalar": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "FpVector": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Frontend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "HPC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "IcMiss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Ifetch": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "InsType": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "IntVector": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Load_Store_Miss": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Mem_Exec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MemoryTLB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Memory_BW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Memory_Lat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "MicroSeq": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "OS": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Offcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PGO": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Pipeline": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "PortsUtil": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Power": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Prefetches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Ret": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Retire": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SMT": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Server": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Snoop": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "SoC": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "Summary": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL1": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL2": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TmaL3mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "TopdownL1": "Metrics for top-down breakdown 
at level 1", + "TopdownL2": "Metrics for top-down breakdown at level 2", + "TopdownL3": "Metrics for top-down breakdown at level 3", + "TopdownL4": "Metrics for top-down breakdown at level 4", + "TopdownL5": "Metrics for top-down breakdown at level 5", + "TopdownL6": "Metrics for top-down breakdown at level 6", + "load_store_bound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet", + "tma_L1_group": "Metrics for top-down breakdown at level 1", + "tma_L2_group": "Metrics for top-down breakdown at level 2", + "tma_L3_group": "Metrics for top-down breakdown at level 3", + "tma_L4_group": "Metrics for top-down breakdown at level 4", + "tma_L5_group": "Metrics for top-down breakdown at level 5", + "tma_L6_group": "Metrics for top-down breakdown at level 6", + "tma_alu_op_utilization_group": "Metrics contributing to tma_alu_op_utilization category", + "tma_assists_group": "Metrics contributing to tma_assists category", + "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category", + "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category", + "tma_branch_mispredicts_group": "Metrics contributing to tma_branch_mispredicts category", + "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category", + "tma_core_bound_group": "Metrics contributing to tma_core_bound category", + "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category", + "tma_dtlb_load_group": "Metrics contributing to tma_dtlb_load category", + "tma_dtlb_store_group": "Metrics contributing to tma_dtlb_store category", + "tma_fetch_bandwidth_group": "Metrics contributing to tma_fetch_bandwidth category", + "tma_fetch_latency_group": "Metrics contributing to tma_fetch_latency category", + "tma_fp_arith_group": "Metrics contributing to tma_fp_arith category", + "tma_fp_vector_group": "Metrics contributing to tma_fp_vector category", + "tma_frontend_bound_group": "Metrics contributing to tma_frontend_bound category", + "tma_heavy_operations_group": "Metrics contributing to tma_heavy_operations category", + "tma_ifetch_bandwidth_group": "Metrics contributing to tma_ifetch_bandwidth category", + "tma_ifetch_latency_group": "Metrics contributing to tma_ifetch_latency category", + "tma_int_operations_group": "Metrics contributing to tma_int_operations category", + "tma_issue2P": "Metrics related by the issue $issue2P", + "tma_issueBM": "Metrics related by the issue $issueBM", + "tma_issueBW": "Metrics related by the issue $issueBW", + "tma_issueComp": "Metrics related by the issue $issueComp", + "tma_issueD0": "Metrics related by the issue $issueD0", + "tma_issueFB": "Metrics related by the issue $issueFB", + "tma_issueFL": "Metrics related by the issue $issueFL", + "tma_issueL1": "Metrics related by the issue $issueL1", + "tma_issueLat": "Metrics related by the issue $issueLat", + "tma_issueMC": "Metrics related by the issue $issueMC", + "tma_issueMS": "Metrics related by the issue $issueMS", + "tma_issueMV": "Metrics related by the issue $issueMV", + "tma_issueRFO": "Metrics related by the issue $issueRFO", + "tma_issueSL": "Metrics related by the issue $issueSL", + "tma_issueSO": "Metrics related by the issue $issueSO", + "tma_issueSmSt": "Metrics related by the issue $issueSmSt", + "tma_issueSpSt": "Metrics related by the issue $issueSpSt", + "tma_issueSyncxn": "Metrics related by the issue $issueSyncxn", + "tma_issueTLB": "Metrics related by the issue $issueTLB", + "tma_l1_bound_group": "Metrics contributing to tma_l1_bound 
category", + "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category", + "tma_light_operations_group": "Metrics contributing to tma_light_operations category", + "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category", + "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category", + "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category", + "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category", + "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category", + "tma_mite_group": "Metrics contributing to tma_mite category", + "tma_other_light_ops_group": "Metrics contributing to tma_other_light_ops category", + "tma_ports_utilization_group": "Metrics contributing to tma_ports_utilization category", + "tma_ports_utilized_0_group": "Metrics contributing to tma_ports_utilized_0 category", + "tma_ports_utilized_3m_group": "Metrics contributing to tma_ports_utilized_3m category", + "tma_resource_bound_group": "Metrics contributing to tma_resource_bound category", + "tma_retiring_group": "Metrics contributing to tma_retiring category", + "tma_serializing_operation_group": "Metrics contributing to tma_serializing_operation category", + "tma_store_bound_group": "Metrics contributing to tma_store_bound category", + "tma_store_op_utilization_group": "Metrics contributing to tma_store_op_utilization category" +} diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/mtl-metrics.json b/tools/perf/pmu-events/arch/x86/meteorlake/mtl-metrics.json new file mode 100644 index 000000000000..1a7392f0da86 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/meteorlake/mtl-metrics.json @@ -0,0 +1,2535 @@ +[ + { + "BriefDescription": "C10 residency percent per package", + "MetricExpr": "cstate_pkg@c10\\-residency@ / TSC", + "MetricGroup": "Power", + "MetricName": "C10_Pkg_Residency", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "C1 residency percent per core", + "MetricExpr": "cstate_core@c1\\-residency@ / TSC", + "MetricGroup": "Power", + "MetricName": "C1_Core_Residency", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "C2 residency percent per package", + "MetricExpr": "cstate_pkg@c2\\-residency@ / TSC", + "MetricGroup": "Power", + "MetricName": "C2_Pkg_Residency", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "C3 residency percent per package", + "MetricExpr": "cstate_pkg@c3\\-residency@ / TSC", + "MetricGroup": "Power", + "MetricName": "C3_Pkg_Residency", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "C6 residency percent per core", + "MetricExpr": "cstate_core@c6\\-residency@ / TSC", + "MetricGroup": "Power", + "MetricName": "C6_Core_Residency", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "C6 residency percent per package", + "MetricExpr": "cstate_pkg@c6\\-residency@ / TSC", + "MetricGroup": "Power", + "MetricName": "C6_Pkg_Residency", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "C7 residency percent per core", + "MetricExpr": "cstate_core@c7\\-residency@ / TSC", + "MetricGroup": "Power", + "MetricName": "C7_Core_Residency", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "C7 residency percent per package", + "MetricExpr": "cstate_pkg@c7\\-residency@ / TSC", + "MetricGroup": "Power", + "MetricName": "C7_Pkg_Residency", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "C8 residency percent per package", + "MetricExpr": "cstate_pkg@c8\\-residency@ / TSC", + "MetricGroup": "Power", + "MetricName": "C8_Pkg_Residency", 
+ "ScaleUnit": "100%" + }, + { + "BriefDescription": "C9 residency percent per package", + "MetricExpr": "cstate_pkg@c9\\-residency@ / TSC", + "MetricGroup": "Power", + "MetricName": "C9_Pkg_Residency", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Percentage of cycles spent in System Management Interrupts.", + "MetricExpr": "((msr@aperf@ - cycles) / msr@aperf@ if msr@smi@ > 0 else 0)", + "MetricGroup": "smi", + "MetricName": "smi_cycles", + "MetricThreshold": "smi_cycles > 0.1", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Number of SMI interrupts.", + "MetricExpr": "msr@smi@", + "MetricGroup": "smi", + "MetricName": "smi_num", + "ScaleUnit": "1SMI#" + }, + { + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to certain allocation restrictions", + "MetricExpr": "tma_core_bound", + "MetricGroup": "TopdownL3;tma_L3_group;tma_core_bound_group", + "MetricName": "tma_allocation_restriction", + "MetricThreshold": "tma_allocation_restriction > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.1)", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls", + "MetricExpr": "cpu_atom@TOPDOWN_BE_BOUND.ALL_P@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_backend_bound", + "MetricThreshold": "tma_backend_bound > 0.1", + "MetricgroupNoGroup": "TopdownL1", + "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls. Note that uops must be available for consumption in order for this event to count. If a uop is not available (IQ is empty), this event will not count", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear", + "MetricExpr": "cpu_atom@TOPDOWN_BAD_SPECULATION.ALL_P@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_bad_speculation", + "MetricThreshold": "tma_bad_speculation > 0.15", + "MetricgroupNoGroup": "TopdownL1", + "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. Only issue slots wasted due to fast nukes such as memory ordering nukes are counted. Other nukes are not accounted for. Counts all issue slots blocked during this recovery window including relevant microcode flows and while uops are not yet available in the instruction queue (IQ). 
Also includes the issue slots that were consumed by the backend but were thrown away because they were younger than the mispredict or machine clear.", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to BACLEARS, which occurs when the Branch Target Buffer (BTB) prediction or lack thereof, was corrected by a later branch predictor in the frontend", + "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.BRANCH_DETECT@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL3;tma_L3_group;tma_ifetch_latency_group", + "MetricName": "tma_branch_detect", + "MetricThreshold": "tma_branch_detect > 0.05 & (tma_ifetch_latency > 0.15 & tma_frontend_bound > 0.2)", + "PublicDescription": "Counts the number of issue slots that were not delivered by the frontend due to BACLEARS, which occurs when the Branch Target Buffer (BTB) prediction or lack thereof, was corrected by a later branch predictor in the frontend. Includes BACLEARS due to all branch types including conditional and unconditional jumps, returns, and indirect branches.", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to branch mispredicts", + "MetricExpr": "cpu_atom@TOPDOWN_BAD_SPECULATION.MISPREDICT@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_branch_mispredicts", + "MetricThreshold": "tma_branch_mispredicts > 0.05 & tma_bad_speculation > 0.15", + "MetricgroupNoGroup": "TopdownL2", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to BTCLEARS, which occurs when the Branch Target Buffer (BTB) predicts a taken branch.", + "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.BRANCH_RESTEER@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL3;tma_L3_group;tma_ifetch_latency_group", + "MetricName": "tma_branch_resteer", + "MetricThreshold": "tma_branch_resteer > 0.05 & (tma_ifetch_latency > 0.15 & tma_frontend_bound > 0.2)", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to the microcode sequencer (MS).", + "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.CISC@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL3;tma_L3_group;tma_ifetch_bandwidth_group", + "MetricName": "tma_cisc", + "MetricThreshold": "tma_cisc > 0.05 & (tma_ifetch_bandwidth > 0.1 & tma_frontend_bound > 0.2)", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of cycles due to backend bound stalls that are bounded by core restrictions and not attributed to an outstanding load or stores, or resource limitation", + "MetricExpr": "cpu_atom@TOPDOWN_BE_BOUND.ALLOC_RESTRICTIONS@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_core_bound", + "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.1", + "MetricgroupNoGroup": "TopdownL2", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to decode stalls.", + "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.DECODE@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": 
"TopdownL3;tma_L3_group;tma_ifetch_bandwidth_group", + "MetricName": "tma_decode", + "MetricThreshold": "tma_decode > 0.05 & (tma_ifetch_bandwidth > 0.1 & tma_frontend_bound > 0.2)", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to a machine clear that does not require the use of microcode, classified as a fast nuke, due to memory ordering, memory disambiguation and memory renaming", + "MetricExpr": "cpu_atom@TOPDOWN_BAD_SPECULATION.FASTNUKE@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL3;tma_L3_group;tma_machine_clears_group", + "MetricName": "tma_fast_nuke", + "MetricThreshold": "tma_fast_nuke > 0.05 & (tma_machine_clears > 0.05 & tma_bad_speculation > 0.15)", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to frontend stalls.", + "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.ALL_P@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL1;tma_L1_group", + "MetricName": "tma_frontend_bound", + "MetricThreshold": "tma_frontend_bound > 0.2", + "MetricgroupNoGroup": "TopdownL1", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to instruction cache misses.", + "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.ICACHE@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL3;tma_L3_group;tma_ifetch_latency_group", + "MetricName": "tma_icache_misses", + "MetricThreshold": "tma_icache_misses > 0.05 & (tma_ifetch_latency > 0.15 & tma_frontend_bound > 0.2)", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to frontend bandwidth restrictions due to decode, predecode, cisc, and other limitations.", + "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.FRONTEND_BANDWIDTH@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_ifetch_bandwidth", + "MetricThreshold": "tma_ifetch_bandwidth > 0.1 & tma_frontend_bound > 0.2", + "MetricgroupNoGroup": "TopdownL2", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to frontend latency restrictions due to icache misses, itlb misses, branch detection, and resteer limitations.", + "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.FRONTEND_LATENCY@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL2;tma_L2_group;tma_frontend_bound_group", + "MetricName": "tma_ifetch_latency", + "MetricThreshold": "tma_ifetch_latency > 0.15 & tma_frontend_bound > 0.2", + "MetricgroupNoGroup": "TopdownL2", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Instructions per Floating Point (FP) Operation", + "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / cpu_atom@FP_FLOPS_RETIRED.ALL@", + "MetricGroup": "Flops", + "MetricName": "tma_info_arith_inst_mix_ipflop", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction", + "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / (cpu_atom@FP_INST_RETIRED.128B_DP@ + cpu_atom@FP_INST_RETIRED.128B_SP@)", + "MetricGroup": "Flops", + "MetricName": "tma_info_arith_inst_mix_ipfparith_avx128", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Instructions per FP 
Arithmetic Scalar Double-Precision instruction", + "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / cpu_atom@FP_INST_RETIRED.64B_DP@", + "MetricGroup": "Flops", + "MetricName": "tma_info_arith_inst_mix_ipfparith_scalar_dp", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction", + "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / cpu_atom@FP_INST_RETIRED.32B_SP@", + "MetricGroup": "Flops", + "MetricName": "tma_info_arith_inst_mix_ipfparith_scalar_sp", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of time that retirement is stalled due to a first level data TLB miss", + "MetricExpr": "100 * (cpu_atom@LD_HEAD.DTLB_MISS_AT_RET@ + cpu_atom@LD_HEAD.PGWALK_AT_RET@) / cpu_atom@CPU_CLK_UNHALTED.CORE@", + "MetricName": "tma_info_bottleneck_%_dtlb_miss_bound_cycles", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of time that allocation and retirement is stalled by the Frontend Cluster due to an Ifetch Miss, either Icache or ITLB Miss", + "MetricExpr": "100 * cpu_atom@MEM_BOUND_STALLS_IFETCH.ALL@ / cpu_atom@CPU_CLK_UNHALTED.CORE@", + "MetricGroup": "Ifetch", + "MetricName": "tma_info_bottleneck_%_ifetch_miss_bound_cycles", + "PublicDescription": "Percentage of time that allocation and retirement is stalled by the Frontend Cluster due to an Ifetch Miss, either Icache or ITLB Miss. See Info.Ifetch_Bound", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of time that retirement is stalled due to an L1 miss", + "MetricExpr": "100 * cpu_atom@MEM_BOUND_STALLS_LOAD.ALL@ / cpu_atom@CPU_CLK_UNHALTED.CORE@", + "MetricGroup": "Load_Store_Miss", + "MetricName": "tma_info_bottleneck_%_load_miss_bound_cycles", + "PublicDescription": "Percentage of time that retirement is stalled due to an L1 miss. See Info.Load_Miss_Bound", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of time that retirement is stalled by the Memory Cluster due to a pipeline stall", + "MetricExpr": "100 * cpu_atom@LD_HEAD.ANY_AT_RET@ / cpu_atom@CPU_CLK_UNHALTED.CORE@", + "MetricGroup": "Mem_Exec", + "MetricName": "tma_info_bottleneck_%_mem_exec_bound_cycles", + "PublicDescription": "Percentage of time that retirement is stalled by the Memory Cluster due to a pipeline stall. 
See Info.Mem_Exec_Bound", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", + "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / cpu_atom@BR_INST_RETIRED.ALL_BRANCHES@", + "MetricName": "tma_info_br_inst_mix_ipbranch", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Instruction per (near) call (lower number means higher occurrence rate)", + "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / cpu_atom@BR_INST_RETIRED.NEAR_CALL@", + "MetricName": "tma_info_br_inst_mix_ipcall", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", + "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / cpu_atom@BR_INST_RETIRED.FAR_BRANCH@u", + "MetricName": "tma_info_br_inst_mix_ipfarbranch", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Instructions per retired conditional Branch Misprediction where the branch was not taken", + "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / (cpu_atom@BR_MISP_RETIRED.COND@ - cpu_atom@BR_MISP_RETIRED.COND_TAKEN@)", + "MetricName": "tma_info_br_inst_mix_ipmisp_cond_ntaken", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Instructions per retired conditional Branch Misprediction where the branch was taken", + "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / cpu_atom@BR_MISP_RETIRED.COND_TAKEN@", + "MetricName": "tma_info_br_inst_mix_ipmisp_cond_taken", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Instructions per retired indirect call or jump Branch Misprediction", + "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / cpu_atom@BR_MISP_RETIRED.INDIRECT@", + "MetricName": "tma_info_br_inst_mix_ipmisp_indirect", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Instructions per retired return Branch Misprediction", + "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / cpu_atom@BR_MISP_RETIRED.RETURN@", + "MetricName": "tma_info_br_inst_mix_ipmisp_ret", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Instructions per retired Branch Misprediction", + "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / cpu_atom@BR_MISP_RETIRED.ALL_BRANCHES@", + "MetricName": "tma_info_br_inst_mix_ipmispredict", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Ratio of all branches which mispredict", + "MetricExpr": "cpu_atom@BR_MISP_RETIRED.ALL_BRANCHES@ / cpu_atom@BR_INST_RETIRED.ALL_BRANCHES@", + "MetricName": "tma_info_br_mispredict_bound_branch_mispredict_ratio", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Ratio between Mispredicted branches and unknown branches", + "MetricExpr": "cpu_atom@BR_MISP_RETIRED.ALL_BRANCHES@ / cpu_atom@BACLEARS.ANY@", + "MetricName": "tma_info_br_mispredict_bound_branch_mispredict_to_unknown_branch_ratio", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of time that allocation is stalled due to load buffer full", + "MetricExpr": "100 * cpu_atom@MEM_SCHEDULER_BLOCK.LD_BUF@ / cpu_atom@CPU_CLK_UNHALTED.CORE@", + "MetricName": "tma_info_buffer_stalls_%_load_buffer_stall_cycles", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of time that allocation is stalled due to memory reservation stations full", + "MetricExpr": "100 * cpu_atom@MEM_SCHEDULER_BLOCK.RSV@ / cpu_atom@CPU_CLK_UNHALTED.CORE@", + "MetricName": "tma_info_buffer_stalls_%_mem_rsv_stall_cycles", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of time that allocation is stalled due to store buffer full", + "MetricExpr": "100 * 
cpu_atom@MEM_SCHEDULER_BLOCK.ST_BUF@ / cpu_atom@CPU_CLK_UNHALTED.CORE@", + "MetricName": "tma_info_buffer_stalls_%_store_buffer_stall_cycles", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Cycles Per Instruction", + "MetricExpr": "cpu_atom@CPU_CLK_UNHALTED.CORE@ / cpu_atom@INST_RETIRED.ANY@", + "MetricName": "tma_info_core_cpi", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Floating Point Operations Per Cycle", + "MetricExpr": "cpu_atom@FP_FLOPS_RETIRED.ALL@ / cpu_atom@CPU_CLK_UNHALTED.CORE@", + "MetricGroup": "Flops", + "MetricName": "tma_info_core_flopc", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Instructions Per Cycle", + "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / cpu_atom@CPU_CLK_UNHALTED.CORE@", + "MetricName": "tma_info_core_ipc", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Uops Per Instruction", + "MetricExpr": "cpu_atom@TOPDOWN_RETIRING.ALL_P@ / cpu_atom@INST_RETIRED.ANY@", + "MetricName": "tma_info_core_upi", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of ifetch miss bound stalls, where the ifetch miss hits in the L2", + "MetricExpr": "100 * cpu_atom@MEM_BOUND_STALLS_IFETCH.L2_HIT@ / cpu_atom@MEM_BOUND_STALLS_IFETCH.ALL@", + "MetricName": "tma_info_ifetch_miss_bound_%_ifetchmissbound_with_l2hit", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of ifetch miss bound stalls, where the ifetch miss hits in the L3", + "MetricExpr": "100 * cpu_atom@MEM_BOUND_STALLS_IFETCH.LLC_HIT@ / cpu_atom@MEM_BOUND_STALLS_IFETCH.ALL@", + "MetricName": "tma_info_ifetch_miss_bound_%_ifetchmissbound_with_l3hit", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of ifetch miss bound stalls, where the ifetch miss subsequently misses in the L3", + "MetricExpr": "100 * cpu_atom@MEM_BOUND_STALLS_IFETCH.LLC_MISS@ / cpu_atom@MEM_BOUND_STALLS_IFETCH.ALL@", + "MetricName": "tma_info_ifetch_miss_bound_%_ifetchmissbound_with_l3miss", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of memory bound stalls where retirement is stalled due to an L1 miss that hit the L2", + "MetricExpr": "100 * cpu_atom@MEM_BOUND_STALLS_LOAD.L2_HIT@ / cpu_atom@MEM_BOUND_STALLS_LOAD.ALL@", + "MetricGroup": "load_store_bound", + "MetricName": "tma_info_load_miss_bound_%_loadmissbound_with_l2hit", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of memory bound stalls where retirement is stalled due to an L1 miss that hit the L3", + "MetricExpr": "100 * cpu_atom@MEM_BOUND_STALLS_LOAD.LLC_HIT@ / cpu_atom@MEM_BOUND_STALLS_LOAD.ALL@", + "MetricGroup": "load_store_bound", + "MetricName": "tma_info_load_miss_bound_%_loadmissbound_with_l3hit", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of memory bound stalls where retirement is stalled due to an L1 miss that subsequently misses the L3", + "MetricExpr": "100 * cpu_atom@MEM_BOUND_STALLS_LOAD.LLC_MISS@ / cpu_atom@MEM_BOUND_STALLS_LOAD.ALL@", + "MetricGroup": "load_store_bound", + "MetricName": "tma_info_load_miss_bound_%_loadmissbound_with_l3miss", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at retirement due to a pipeline block", + "MetricExpr": "100 * cpu_atom@LD_HEAD.L1_BOUND_AT_RET@ / cpu_atom@CPU_CLK_UNHALTED.CORE@", + "MetricGroup": "load_store_bound", + "MetricName": "tma_info_load_store_bound_l1_bound", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at retirement", + "MetricExpr": "100 * 
(cpu_atom@LD_HEAD.L1_BOUND_AT_RET@ + cpu_atom@MEM_BOUND_STALLS_LOAD.ALL@) / cpu_atom@CPU_CLK_UNHALTED.CORE@", + "MetricGroup": "load_store_bound", + "MetricName": "tma_info_load_store_bound_load_bound", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of cycles the core is stalled due to store buffer full", + "MetricExpr": "100 * (cpu_atom@MEM_SCHEDULER_BLOCK.ST_BUF@ / cpu_atom@MEM_SCHEDULER_BLOCK.ALL@) * tma_mem_scheduler", + "MetricGroup": "load_store_bound", + "MetricName": "tma_info_load_store_bound_store_bound", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of machine clears relative to thousands of instructions retired, due to floating point assists", + "MetricExpr": "1e3 * cpu_atom@MACHINE_CLEARS.FP_ASSIST@ / cpu_atom@INST_RETIRED.ANY@", + "MetricName": "tma_info_machine_clear_bound_machine_clears_fp_assist_pki", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of machine clears relative to thousands of instructions retired, due to page faults", + "MetricExpr": "1e3 * cpu_atom@MACHINE_CLEARS.PAGE_FAULT@ / cpu_atom@INST_RETIRED.ANY@", + "MetricName": "tma_info_machine_clear_bound_machine_clears_page_fault_pki", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of machine clears relative to thousands of instructions retired, due to self-modifying code", + "MetricExpr": "1e3 * cpu_atom@MACHINE_CLEARS.SMC@ / cpu_atom@INST_RETIRED.ANY@", + "MetricName": "tma_info_machine_clear_bound_machine_clears_smc_pki", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of total non-speculative loads with an address aliasing block", + "MetricExpr": "100 * cpu_atom@LD_BLOCKS.ADDRESS_ALIAS@ / cpu_atom@MEM_UOPS_RETIRED.ALL_LOADS@", + "MetricName": "tma_info_mem_exec_blocks_%_loads_with_adressaliasing", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of total non-speculative loads with a store forward or unknown store address block", + "MetricExpr": "100 * cpu_atom@LD_BLOCKS.DATA_UNKNOWN@ / cpu_atom@MEM_UOPS_RETIRED.ALL_LOADS@", + "MetricName": "tma_info_mem_exec_blocks_%_loads_with_storefwdblk", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of Memory Execution Bound due to a first level data cache miss", + "MetricExpr": "100 * cpu_atom@LD_HEAD.L1_MISS_AT_RET@ / cpu_atom@LD_HEAD.ANY_AT_RET@", + "MetricName": "tma_info_mem_exec_bound_%_loadhead_with_l1miss", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of Memory Execution Bound due to other block cases, such as pipeline conflicts, fences, etc", + "MetricExpr": "100 * cpu_atom@LD_HEAD.OTHER_AT_RET@ / cpu_atom@LD_HEAD.ANY_AT_RET@", + "MetricName": "tma_info_mem_exec_bound_%_loadhead_with_otherpipelineblks", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of Memory Execution Bound due to a pagewalk", + "MetricExpr": "100 * cpu_atom@LD_HEAD.PGWALK_AT_RET@ / cpu_atom@LD_HEAD.ANY_AT_RET@", + "MetricName": "tma_info_mem_exec_bound_%_loadhead_with_pagewalk", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of Memory Execution Bound due to a second level TLB miss", + "MetricExpr": "100 * cpu_atom@LD_HEAD.DTLB_MISS_AT_RET@ / cpu_atom@LD_HEAD.ANY_AT_RET@", + "MetricName": "tma_info_mem_exec_bound_%_loadhead_with_stlbhit", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of Memory Execution Bound due to a store forward address match", + "MetricExpr": "100 * cpu_atom@LD_HEAD.ST_ADDR_AT_RET@ / cpu_atom@LD_HEAD.ANY_AT_RET@", + "MetricName": 
"tma_info_mem_exec_bound_%_loadhead_with_storefwding", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Instructions per Load", + "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / cpu_atom@MEM_UOPS_RETIRED.ALL_LOADS@", + "MetricName": "tma_info_mem_mix_ipload", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Instructions per Store", + "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / cpu_atom@MEM_UOPS_RETIRED.ALL_STORES@", + "MetricName": "tma_info_mem_mix_ipstore", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of total non-speculative loads that perform one or more locks", + "MetricExpr": "100 * cpu_atom@MEM_UOPS_RETIRED.LOCK_LOADS@ / cpu_atom@MEM_UOPS_RETIRED.ALL_LOADS@", + "MetricName": "tma_info_mem_mix_load_locks_ratio", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of total non-speculative loads that are splits", + "MetricExpr": "100 * cpu_atom@MEM_UOPS_RETIRED.SPLIT_LOADS@ / cpu_atom@MEM_UOPS_RETIRED.ALL_LOADS@", + "MetricName": "tma_info_mem_mix_load_splits_ratio", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Ratio of mem load uops to all uops", + "MetricExpr": "1e3 * cpu_atom@MEM_UOPS_RETIRED.ALL_LOADS@ / cpu_atom@TOPDOWN_RETIRING.ALL_P@", + "MetricName": "tma_info_mem_mix_memload_ratio", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of time that the core is stalled due to a TPAUSE or UMWAIT instruction", + "MetricExpr": "100 * cpu_atom@SERIALIZATION.C01_MS_SCB@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricName": "tma_info_serialization _%_tpause_cycles", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Average CPU Utilization", + "MetricExpr": "cpu_atom@CPU_CLK_UNHALTED.REF_TSC@ / TSC", + "MetricName": "tma_info_system_cpu_utilization", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Giga Floating Point Operations Per Second", + "MetricExpr": "cpu_atom@FP_FLOPS_RETIRED.ALL@ / (duration_time * 1e9)", + "MetricGroup": "Flops", + "MetricName": "tma_info_system_gflops", + "PublicDescription": "Giga Floating Point Operations Per Second. 
Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Fraction of cycles spent in Kernel mode", + "MetricExpr": "cpu_atom@CPU_CLK_UNHALTED.CORE_P@k / cpu_atom@CPU_CLK_UNHALTED.CORE@", + "MetricGroup": "Summary", + "MetricName": "tma_info_system_kernel_utilization", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Average Frequency Utilization relative nominal frequency", + "MetricExpr": "cpu_atom@CPU_CLK_UNHALTED.CORE@ / cpu_atom@CPU_CLK_UNHALTED.REF_TSC@", + "MetricGroup": "Power", + "MetricName": "tma_info_system_turbo_utilization", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of all uops which are FPDiv uops", + "MetricExpr": "100 * cpu_atom@UOPS_RETIRED.FPDIV@ / cpu_atom@TOPDOWN_RETIRING.ALL_P@", + "MetricName": "tma_info_uop_mix_fpdiv_uop_ratio", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of all uops which are IDiv uops", + "MetricExpr": "100 * cpu_atom@UOPS_RETIRED.IDIV@ / cpu_atom@TOPDOWN_RETIRING.ALL_P@", + "MetricName": "tma_info_uop_mix_idiv_uop_ratio", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of all uops which are microcode ops", + "MetricExpr": "100 * cpu_atom@UOPS_RETIRED.MS@ / cpu_atom@TOPDOWN_RETIRING.ALL_P@", + "MetricName": "tma_info_uop_mix_microcode_uop_ratio", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Percentage of all uops which are x87 uops", + "MetricExpr": "100 * cpu_atom@UOPS_RETIRED.X87@ / cpu_atom@TOPDOWN_RETIRING.ALL_P@", + "MetricName": "tma_info_uop_mix_x87_uop_ratio", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to Instruction Table Lookaside Buffer (ITLB) misses.", + "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.ITLB_MISS@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL3;tma_L3_group;tma_ifetch_latency_group", + "MetricName": "tma_itlb_misses", + "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_ifetch_latency > 0.15 & tma_frontend_bound > 0.2)", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a machine clear (nuke) of any kind including memory ordering and memory disambiguation", + "MetricExpr": "cpu_atom@TOPDOWN_BAD_SPECULATION.MACHINE_CLEARS@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL2;tma_L2_group;tma_bad_speculation_group", + "MetricName": "tma_machine_clears", + "MetricThreshold": "tma_machine_clears > 0.05 & tma_bad_speculation > 0.15", + "MetricgroupNoGroup": "TopdownL2", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to memory reservation stalls in which a scheduler is not able to accept uops", + "MetricExpr": "cpu_atom@TOPDOWN_BE_BOUND.MEM_SCHEDULER@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL3;tma_L3_group;tma_resource_bound_group", + "MetricName": "tma_mem_scheduler", + "MetricThreshold": "tma_mem_scheduler > 0.1 & (tma_resource_bound > 0.2 & tma_backend_bound > 0.1)", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to IEC or FPC RAT stalls, which can be due to FIQ or IEC reservation stalls in which the integer, floating point or SIMD scheduler is not able to accept uops", + 
"MetricExpr": "cpu_atom@TOPDOWN_BE_BOUND.NON_MEM_SCHEDULER@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL3;tma_L3_group;tma_resource_bound_group", + "MetricName": "tma_non_mem_scheduler", + "MetricThreshold": "tma_non_mem_scheduler > 0.1 & (tma_resource_bound > 0.2 & tma_backend_bound > 0.1)", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to a machine clear that requires the use of microcode (slow nuke)", + "MetricExpr": "cpu_atom@TOPDOWN_BAD_SPECULATION.NUKE@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL3;tma_L3_group;tma_machine_clears_group", + "MetricName": "tma_nuke", + "MetricThreshold": "tma_nuke > 0.05 & (tma_machine_clears > 0.05 & tma_bad_speculation > 0.15)", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to other common frontend stalls not categorized.", + "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.OTHER@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL3;tma_L3_group;tma_ifetch_bandwidth_group", + "MetricName": "tma_other_fb", + "MetricThreshold": "tma_other_fb > 0.05 & (tma_ifetch_bandwidth > 0.1 & tma_frontend_bound > 0.2)", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to wrong predecodes.", + "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.PREDECODE@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL3;tma_L3_group;tma_ifetch_bandwidth_group", + "MetricName": "tma_predecode", + "MetricThreshold": "tma_predecode > 0.05 & (tma_ifetch_bandwidth > 0.1 & tma_frontend_bound > 0.2)", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to the physical register file unable to accept an entry (marble stalls)", + "MetricExpr": "cpu_atom@TOPDOWN_BE_BOUND.REGISTER@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL3;tma_L3_group;tma_resource_bound_group", + "MetricName": "tma_register", + "MetricThreshold": "tma_register > 0.1 & (tma_resource_bound > 0.2 & tma_backend_bound > 0.1)", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to the reorder buffer being full (ROB stalls)", + "MetricExpr": "cpu_atom@TOPDOWN_BE_BOUND.REORDER_BUFFER@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL3;tma_L3_group;tma_resource_bound_group", + "MetricName": "tma_reorder_buffer", + "MetricThreshold": "tma_reorder_buffer > 0.1 & (tma_resource_bound > 0.2 & tma_backend_bound > 0.1)", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of cycles the core is stalled due to a resource limitation", + "MetricExpr": "tma_backend_bound - tma_core_bound", + "MetricGroup": "TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_resource_bound", + "MetricThreshold": "tma_resource_bound > 0.2 & tma_backend_bound > 0.1", + "MetricgroupNoGroup": "TopdownL2", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that result in retirement slots", + "MetricExpr": "cpu_atom@TOPDOWN_RETIRING.ALL_P@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL1;tma_L1_group", + 
"MetricName": "tma_retiring", + "MetricThreshold": "tma_retiring > 0.75", + "MetricgroupNoGroup": "TopdownL1", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to scoreboards from the instruction queue (IQ), jump execution unit (JEU), or microcode sequencer (MS)", + "MetricExpr": "cpu_atom@TOPDOWN_BE_BOUND.SERIALIZATION@ / (6 * cpu_atom@CPU_CLK_UNHALTED.CORE@)", + "MetricGroup": "TopdownL3;tma_L3_group;tma_resource_bound_group", + "MetricName": "tma_serialization", + "MetricThreshold": "tma_serialization > 0.1 & (tma_resource_bound > 0.2 & tma_backend_bound > 0.1)", + "ScaleUnit": "100%", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Uncore frequency per die [GHZ]", + "MetricExpr": "tma_info_system_socket_clks / #num_dies / duration_time / 1e9", + "MetricGroup": "SoC", + "MetricName": "UNCORE_FREQ", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", + "MetricExpr": "(cpu_core@UOPS_DISPATCHED.PORT_0@ + cpu_core@UOPS_DISPATCHED.PORT_1@ + cpu_core@UOPS_DISPATCHED.PORT_5_11@ + cpu_core@UOPS_DISPATCHED.PORT_6@) / (5 * tma_info_core_core_clks)", + "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", + "MetricName": "tma_alu_op_utilization", + "MetricThreshold": "tma_alu_op_utilization > 0.4", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", + "MetricExpr": "78 * cpu_core@ASSISTS.ANY@ / tma_info_thread_slots", + "MetricGroup": "BvIO;TopdownL4;tma_L4_group;tma_microcode_sequencer_group", + "MetricName": "tma_assists", + "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", + "PublicDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists. Assists are long sequences of uops that are required in certain corner-cases for operations that cannot be handled natively by the execution pipeline. For example; when working with very small floating point values (so-called Denormals); the FP units are not set up to perform these operations natively. Instead; a sequence of instructions to perform the computation on the Denormals is injected into the pipeline. Since these microcode sequences might be dozens of uops long; Assists can be extremely deleterious to performance and they can be avoided in many cases. 
Sample with: ASSISTS.ANY",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of slots the CPU retired uops as a result of handling SSE to AVX* or AVX* to SSE transition Assists.",
+ "MetricExpr": "63 * cpu_core@ASSISTS.SSE_AVX_MIX@ / tma_info_thread_slots",
+ "MetricGroup": "HPC;TopdownL5;tma_L5_group;tma_assists_group",
+ "MetricName": "tma_avx_assists",
+ "MetricThreshold": "tma_avx_assists > 0.1",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend",
+ "MetricExpr": "cpu_core@topdown\\-be\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots",
+ "MetricGroup": "BvOB;TmaL1;TopdownL1;tma_L1_group",
+ "MetricName": "tma_backend_bound",
+ "MetricThreshold": "tma_backend_bound > 0.2",
+ "MetricgroupNoGroup": "TopdownL1",
+ "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. Sample with: TOPDOWN.BACKEND_BOUND_SLOTS",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations",
+ "MetricExpr": "max(1 - (tma_frontend_bound + tma_backend_bound + tma_retiring), 0)",
+ "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
+ "MetricName": "tma_bad_speculation",
+ "MetricThreshold": "tma_bad_speculation > 0.15",
+ "MetricgroupNoGroup": "TopdownL1",
+ "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This includes slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches is categorized under the Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction",
+ "MetricExpr": "cpu_core@topdown\\-br\\-mispredict@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots",
+ "MetricGroup": "BadSpec;BrMispredicts;BvMP;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
+ "MetricName": "tma_branch_mispredicts",
+ "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
+ "MetricgroupNoGroup": "TopdownL2",
+ "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction.
These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: TOPDOWN.BR_MISPREDICT_SLOTS. Related metrics: tma_info_bad_spec_branch_misprediction_cost, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers",
+ "MetricExpr": "cpu_core@INT_MISC.CLEAR_RESTEER_CYCLES@ / tma_info_thread_clks + tma_unknown_branches",
+ "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group",
+ "MetricName": "tma_branch_resteers",
+ "MetricThreshold": "tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
+ "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers. Branch Resteers estimates the Frontend delay in fetching operations from corrected path; following all sorts of miss-predicted branches. For example; branchy code with lots of miss-predictions might get categorized under Branch Resteers. Note the value of this node may overlap with its siblings. Sample with: BR_MISP_RETIRED.ALL_BRANCHES",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to staying in C0.1 power-performance optimized state (Faster wakeup time; Smaller power savings).",
+ "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.C01@ / tma_info_thread_clks",
+ "MetricGroup": "C0Wait;TopdownL4;tma_L4_group;tma_serializing_operation_group",
+ "MetricName": "tma_c01_wait",
+ "MetricThreshold": "tma_c01_wait > 0.05 & (tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to staying in C0.2 power-performance optimized state (Slower wakeup time; Larger power savings).",
+ "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.C02@ / tma_info_thread_clks",
+ "MetricGroup": "C0Wait;TopdownL4;tma_L4_group;tma_serializing_operation_group",
+ "MetricName": "tma_c02_wait",
+ "MetricThreshold": "tma_c02_wait > 0.05 & (tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instructions",
+ "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)",
+ "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group",
+ "MetricName": "tma_cisc",
+ "MetricThreshold": "tma_cisc > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)",
+ "PublicDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instructions. A CISC instruction has multiple uops that are required to perform the instruction's functionality as in the case of read-modify-write as an example. Since these instructions require multiple uops they may or may not imply sub-optimal use of machine resources.
Sample with: FRONTEND_RETIRED.MS_FLOWS",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears",
+ "MetricExpr": "(1 - tma_branch_mispredicts / tma_bad_speculation) * cpu_core@INT_MISC.CLEAR_RESTEER_CYCLES@ / tma_info_thread_clks",
+ "MetricGroup": "BadSpec;MachineClears;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueMC",
+ "MetricName": "tma_clears_resteers",
+ "MetricThreshold": "tma_clears_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))",
+ "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES. Related metrics: tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses",
+ "MetricExpr": "(cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS@ * min(cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS@R, 24 * tma_info_system_core_frequency) + cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD@ * min(cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD@R, 25 * tma_info_system_core_frequency) * (cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ / (cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ + cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD@))) * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_thread_clks",
+ "MetricGroup": "BvMS;DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group",
+ "MetricName": "tma_contested_accesses",
+ "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
+ "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses. Contested accesses occur when data written by one Logical Processor are read by another Logical Processor on a different Physical Core. Examples of contested accesses include synchronizations such as locks; true data sharing such as modified locked variables; and false sharing. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD;MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS. Related metrics: tma_data_sharing, tma_false_sharing, tma_machine_clears, tma_remote_cache",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots where Core non-memory issues were the bottleneck",
+ "MetricExpr": "max(0, tma_backend_bound - tma_memory_bound)",
+ "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
+ "MetricName": "tma_core_bound",
+ "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2",
+ "MetricgroupNoGroup": "TopdownL2",
+ "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were the bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g.
FP-chained long-latency arithmetic operations).", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", + "MetricExpr": "(cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD@ * min(cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD@R, 24 * tma_info_system_core_frequency) + cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD@ * min(cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD@R, 24 * tma_info_system_core_frequency) * (1 - cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ / (cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ + cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD@))) * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_thread_clks", + "MetricGroup": "BvMS;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", + "MetricName": "tma_data_sharing", + "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses. Data shared by multiple Logical Processors (even just read shared) may cause increased access latency due to cache coherency. Excessive data sharing can drastically harm multithreaded performance. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD. Related metrics: tma_contested_accesses, tma_false_sharing, tma_machine_clears, tma_remote_cache", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder", + "MetricExpr": "(cpu_core@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu_core@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_core_clks / 2", + "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_issueD0;tma_mite_group", + "MetricName": "tma_decoder0_alone", + "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & tma_fetch_bandwidth > 0.2)", + "PublicDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder. Related metrics: tma_few_uops_instructions", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", + "MetricExpr": "cpu_core@ARITH.DIV_ACTIVE@ / tma_info_thread_clks", + "MetricGroup": "BvCB;TopdownL3;tma_L3_group;tma_core_bound_group", + "MetricName": "tma_divider", + "MetricThreshold": "tma_divider > 0.2 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", + "PublicDescription": "This metric represents fraction of cycles where the Divider unit was active. Divide and square root instructions are performed by the Divider unit and can take considerably longer latency than integer or Floating Point addition; subtraction; or multiplication. 
Sample with: ARITH.DIVIDER_UOPS",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads",
+ "MetricExpr": "cpu_core@MEMORY_ACTIVITY.STALLS_L3_MISS@ / tma_info_thread_clks",
+ "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
+ "MetricName": "tma_dram_bound",
+ "MetricThreshold": "tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
+ "PublicDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads. Better caching can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_MISS_PS",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline",
+ "MetricExpr": "(cpu_core@IDQ.DSB_CYCLES_ANY@ - cpu_core@IDQ.DSB_CYCLES_OK@) / tma_info_core_core_clks / 2",
+ "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
+ "MetricName": "tma_dsb",
+ "MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2",
+ "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it are categorized here.",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines",
+ "MetricExpr": "cpu_core@DSB2MITE_SWITCHES.PENALTY_CYCLES@ / tma_info_thread_clks",
+ "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB",
+ "MetricName": "tma_dsb_switches",
+ "MetricThreshold": "tma_dsb_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
+ "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivers higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties; hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS. Related metrics: tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses",
+ "MetricExpr": "cpu_core@MEM_INST_RETIRED.STLB_HIT_LOADS@ * min(cpu_core@MEM_INST_RETIRED.STLB_HIT_LOADS@R, 7) / tma_info_thread_clks + tma_load_stlb_miss",
+ "MetricGroup": "BvMT;MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group",
+ "MetricName": "tma_dtlb_load",
+ "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
+ "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses.
TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", + "MetricExpr": "cpu_core@MEM_INST_RETIRED.STLB_HIT_STORES@ * min(cpu_core@MEM_INST_RETIRED.STLB_HIT_STORES@R, 7) / tma_info_thread_clks + tma_store_stlb_miss", + "MetricGroup": "BvMT;MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group", + "MetricName": "tma_dtlb_store", + "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", + "MetricExpr": "28 * tma_info_system_core_frequency * cpu_core@OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM@ / tma_info_thread_clks", + "MetricGroup": "BvMS;DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group", + "MetricName": "tma_false_sharing", + "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing. False Sharing is a multithreading hiccup; where multiple Logical Processors contend on different data-elements mapped into the same cache line. Sample with: OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM. Related metrics: tma_contested_accesses, tma_data_sharing, tma_machine_clears, tma_remote_cache", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed", + "MetricExpr": "cpu_core@L1D_PEND_MISS.FB_FULL@ / tma_info_thread_clks", + "MetricGroup": "BvMS;MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", + "MetricName": "tma_fb_full", + "MetricThreshold": "tma_fb_full > 0.3", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. 
The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints at approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues",
+ "MetricExpr": "max(0, tma_frontend_bound - tma_fetch_latency)",
+ "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
+ "MetricName": "tma_fetch_bandwidth",
+ "MetricThreshold": "tma_fetch_bandwidth > 0.2",
+ "MetricgroupNoGroup": "TopdownL2",
+ "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers a suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues",
+ "MetricExpr": "cpu_core@topdown\\-fetch\\-lat@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) - cpu_core@INT_MISC.UOP_DROPPING@ / tma_info_thread_slots",
+ "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
+ "MetricName": "tma_fetch_latency",
+ "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
+ "MetricgroupNoGroup": "TopdownL2",
+ "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: FRONTEND_RETIRED.LATENCY_GE_16_PS;FRONTEND_RETIRED.LATENCY_GE_8_PS",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots where the CPU was retiring instructions that are decoded into two or up to ([SNB+] four; [ADL+] five) uops",
+ "MetricExpr": "max(0, tma_heavy_operations - tma_microcode_sequencer)",
+ "MetricGroup": "TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueD0",
+ "MetricName": "tma_few_uops_instructions",
+ "MetricThreshold": "tma_few_uops_instructions > 0.05 & tma_heavy_operations > 0.1",
+ "PublicDescription": "This metric represents fraction of slots where the CPU was retiring instructions that are decoded into two or up to ([SNB+] four; [ADL+] five) uops. This highly-correlates with the number of uops in such instructions.
Related metrics: tma_decoder0_alone",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired)",
+ "MetricExpr": "tma_x87_use + tma_fp_scalar + tma_fp_vector",
+ "MetricGroup": "HPC;TopdownL3;tma_L3_group;tma_light_operations_group",
+ "MetricName": "tma_fp_arith",
+ "MetricThreshold": "tma_fp_arith > 0.2 & tma_light_operations > 0.6",
+ "PublicDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired). Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain and FMA double-counting.",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handling Floating Point (FP) Assists",
+ "MetricExpr": "30 * cpu_core@ASSISTS.FP@ / tma_info_thread_slots",
+ "MetricGroup": "HPC;TopdownL5;tma_L5_group;tma_assists_group",
+ "MetricName": "tma_fp_assists",
+ "MetricThreshold": "tma_fp_assists > 0.1",
+ "PublicDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handling Floating Point (FP) Assists. FP Assist may apply when working with very small floating point values (so-called Denormals).",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired",
+ "MetricExpr": "cpu_core@FP_ARITH_INST_RETIRED.SCALAR@ / (tma_retiring * tma_info_thread_slots)",
+ "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P",
+ "MetricName": "tma_fp_scalar",
+ "MetricThreshold": "tma_fp_scalar > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)",
+ "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting. Related metrics: tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_int_vector_128b, tma_int_vector_256b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths",
+ "MetricExpr": "cpu_core@FP_ARITH_INST_RETIRED.VECTOR@ / (tma_retiring * tma_info_thread_slots)",
+ "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P",
+ "MetricName": "tma_fp_vector",
+ "MetricThreshold": "tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)",
+ "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting.
Related metrics: tma_fp_scalar, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_int_vector_128b, tma_int_vector_256b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors", + "MetricExpr": "(cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE@) / (tma_retiring * tma_info_thread_slots)", + "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", + "MetricName": "tma_fp_vector_128b", + "MetricThreshold": "tma_fp_vector_128b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_256b, tma_fp_vector_512b, tma_int_vector_128b, tma_int_vector_256b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors", + "MetricExpr": "(cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE@ + cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@) / (tma_retiring * tma_info_thread_slots)", + "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", + "MetricName": "tma_fp_vector_256b", + "MetricThreshold": "tma_fp_vector_256b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", + "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_512b, tma_int_vector_128b, tma_int_vector_256b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", + "MetricExpr": "cpu_core@topdown\\-fe\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) - cpu_core@INT_MISC.UOP_DROPPING@ / tma_info_thread_slots", + "MetricGroup": "BvFB;BvIO;PGO;TmaL1;TopdownL1;tma_L1_group", + "MetricName": "tma_frontend_bound", + "MetricThreshold": "tma_frontend_bound > 0.15", + "MetricgroupNoGroup": "TopdownL1", + "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. 
For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions", + "MetricExpr": "tma_light_operations * cpu_core@INST_RETIRED.MACRO_FUSED@ / (tma_retiring * tma_info_thread_slots)", + "MetricGroup": "Branches;BvBO;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", + "MetricName": "tma_fused_instructions", + "MetricThreshold": "tma_fused_instructions > 0.1 & tma_light_operations > 0.6", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions. CMP+JCC or DEC+JCC are common examples of legacy fusions. {([MTL] Note new MOV+OP and Load+OP fusions appear under Other_Light_Ops in MTL!)}", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences", + "MetricExpr": "cpu_core@topdown\\-heavy\\-ops@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", + "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_heavy_operations", + "MetricThreshold": "tma_heavy_operations > 0.1", + "MetricgroupNoGroup": "TopdownL2", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .). Sample with: UOPS_RETIRED.HEAVY", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses", + "MetricExpr": "cpu_core@ICACHE_DATA.STALLS@ / tma_info_thread_clks", + "MetricGroup": "BigFootprint;BvBC;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricName": "tma_icache_misses", + "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", + "MetricExpr": "tma_info_bottleneck_mispredictions * tma_info_thread_slots / cpu_core@BR_MISP_RETIRED.ALL_BRANCHES@ / 100", + "MetricGroup": "Bad;BrMispredicts;tma_issueBM", + "MetricName": "tma_info_bad_spec_branch_misprediction_cost", + "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). 
Related metrics: tma_branch_mispredicts, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per retired mispredicts for conditional non-taken branches (lower number means higher occurrence rate).", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@BR_MISP_RETIRED.COND_NTAKEN@", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_cond_ntaken", + "MetricThreshold": "tma_info_bad_spec_ipmisp_cond_ntaken < 200", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per retired mispredicts for conditional taken branches (lower number means higher occurrence rate).", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@BR_MISP_RETIRED.COND_TAKEN@", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_cond_taken", + "MetricThreshold": "tma_info_bad_spec_ipmisp_cond_taken < 200", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@BR_MISP_RETIRED.INDIRECT@", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_indirect", + "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per retired mispredicts for return branches (lower number means higher occurrence rate).", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@BR_MISP_RETIRED.RET@", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_ret", + "MetricThreshold": "tma_info_bad_spec_ipmisp_ret < 500", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@BR_MISP_RETIRED.ALL_BRANCHES@", + "MetricGroup": "Bad;BadSpec;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmispredict", + "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Speculative to Retired ratio of all clears (covering mispredicts and nukes)", + "MetricExpr": "cpu_core@INT_MISC.CLEARS_COUNT@ / (cpu_core@BR_MISP_RETIRED.ALL_BRANCHES@ + cpu_core@MACHINE_CLEARS.COUNT@)", + "MetricGroup": "BrMispredicts", + "MetricName": "tma_info_bad_spec_spec_clears_ratio", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", + "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)", + "MetricGroup": "Cor;SMT", + "MetricName": "tma_info_botlnk_l0_core_bound_likely", + "MetricThreshold": "tma_info_botlnk_l0_core_bound_likely > 0.5", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost of DSB (uop cache) hits - subset of the Instruction_Fetch_BW Bottleneck", + "MetricExpr": "100 * (tma_frontend_bound * (tma_fetch_bandwidth / (tma_fetch_bandwidth + tma_fetch_latency)) * (tma_dsb / (tma_dsb + tma_lsd + tma_mite)))", + "MetricGroup": "DSB;FetchBW;tma_issueFB", + "MetricName": "tma_info_botlnk_l2_dsb_bandwidth", + "MetricThreshold": "tma_info_botlnk_l2_dsb_bandwidth > 10", + "PublicDescription": "Total pipeline cost of DSB (uop cache) hits - subset of the Instruction_Fetch_BW Bottleneck. 
Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck", + "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_lsd + tma_mite))", + "MetricGroup": "DSBmiss;Fed;tma_issueFB", + "MetricName": "tma_info_botlnk_l2_dsb_misses", + "MetricThreshold": "tma_info_botlnk_l2_dsb_misses > 10", + "PublicDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_bandwidth, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck", + "MetricExpr": "100 * (tma_fetch_latency * tma_icache_misses / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", + "MetricGroup": "Fed;FetchLat;IcMiss;tma_issueFL", + "MetricName": "tma_info_botlnk_l2_ic_misses", + "MetricThreshold": "tma_info_botlnk_l2_ic_misses > 5", + "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. Related metrics: ", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", + "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", + "MetricGroup": "BigFootprint;BvBC;Fed;Frontend;IcMiss;MemoryTLB", + "MetricName": "tma_info_bottleneck_big_code", + "MetricThreshold": "tma_info_bottleneck_big_code > 20", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost of instructions used for program control-flow - a subset of the Retiring category in TMA", + "MetricExpr": "100 * ((cpu_core@BR_INST_RETIRED.ALL_BRANCHES@ + 2 * cpu_core@BR_INST_RETIRED.NEAR_CALL@ + cpu_core@INST_RETIRED.NOP@) / tma_info_thread_slots)", + "MetricGroup": "BvBO;Ret", + "MetricName": "tma_info_bottleneck_branching_overhead", + "MetricThreshold": "tma_info_bottleneck_branching_overhead > 5", + "PublicDescription": "Total pipeline cost of instructions used for program control-flow - a subset of the Retiring category in TMA. Examples include function calls; loops and alignments. 
(A lower bound)", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_l1_hit_latency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))", + "MetricGroup": "BvMB;Mem;MemoryBW;Offcore;tma_issueBW", + "MetricName": "tma_info_bottleneck_cache_memory_bandwidth", + "MetricThreshold": "tma_info_bottleneck_cache_memory_bandwidth > 20", + "PublicDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l1_hit_latency / (tma_dtlb_load + tma_fb_full + tma_l1_hit_latency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))", + "MetricGroup": "BvML;Mem;MemoryLat;Offcore;tma_issueLat", + "MetricName": "tma_info_bottleneck_cache_memory_latency", + "MetricThreshold": "tma_info_bottleneck_cache_memory_latency > 20", + "PublicDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks. Related metrics: tma_l3_hit_latency, tma_mem_latency", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", + "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", + "MetricGroup": "BvCB;Cor;tma_issueComp", + "MetricName": "tma_info_bottleneck_compute_bound_est", + "MetricThreshold": "tma_info_bottleneck_compute_bound_est > 20", + "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. 
Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: ", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks (when the front-end could not sustain operations delivery to the back-end)", + "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - (1 - cpu_core@INST_RETIRED.REP_ITERATION@ / cpu_core@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))) - tma_info_bottleneck_big_code", + "MetricGroup": "BvFB;Fed;FetchBW;Frontend", + "MetricName": "tma_info_bottleneck_instruction_fetch_bw", + "MetricThreshold": "tma_info_bottleneck_instruction_fetch_bw > 20", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost of irregular execution (e.g", + "MetricExpr": "100 * ((1 - cpu_core@INST_RETIRED.REP_ITERATION@ / cpu_core@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + cpu_core@RS.EMPTY\\,umask\\=1@ / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", + "MetricGroup": "Bad;BvIO;Cor;Ret;tma_issueMS", + "MetricName": "tma_info_bottleneck_irregular_overhead", + "MetricThreshold": "tma_info_bottleneck_irregular_overhead > 10", + "PublicDescription": "Total pipeline cost of irregular execution (e.g. FP-assists in HPC, Wait time with work imbalance multithreaded workloads, overhead in system services or virtualized environments). 
Related metrics: tma_microcode_sequencer, tma_ms_switches", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", + "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / (tma_dtlb_load + tma_fb_full + tma_l1_hit_latency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", + "MetricGroup": "BvMT;Mem;MemoryTLB;Offcore;tma_issueTLB", + "MetricName": "tma_info_bottleneck_memory_data_tlbs", + "MetricThreshold": "tma_info_bottleneck_memory_data_tlbs > 20", + "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_synchronization", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)", + "MetricExpr": "100 * (tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_contested_accesses + tma_data_sharing) / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * tma_false_sharing / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores - tma_store_latency)) + tma_machine_clears * (1 - tma_other_nukes / tma_other_nukes))", + "MetricGroup": "BvMS;Mem;Offcore;tma_issueTLB", + "MetricName": "tma_info_bottleneck_memory_synchronization", + "MetricThreshold": "tma_info_bottleneck_memory_synchronization > 10", + "PublicDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", + "MetricExpr": "100 * (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", + "MetricGroup": "Bad;BadSpec;BrMispredicts;BvMP;tma_issueBM", + "MetricName": "tma_info_bottleneck_mispredictions", + "MetricThreshold": "tma_info_bottleneck_mispredictions > 20", + "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. 
Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost of remaining bottlenecks in the back-end", + "MetricExpr": "100 - (tma_info_bottleneck_big_code + tma_info_bottleneck_instruction_fetch_bw + tma_info_bottleneck_mispredictions + tma_info_bottleneck_cache_memory_bandwidth + tma_info_bottleneck_cache_memory_latency + tma_info_bottleneck_memory_data_tlbs + tma_info_bottleneck_memory_synchronization + tma_info_bottleneck_compute_bound_est + tma_info_bottleneck_irregular_overhead + tma_info_bottleneck_branching_overhead + tma_info_bottleneck_useful_work)", + "MetricGroup": "BvOB;Cor;Offcore", + "MetricName": "tma_info_bottleneck_other_bottlenecks", + "MetricThreshold": "tma_info_bottleneck_other_bottlenecks > 20", + "PublicDescription": "Total pipeline cost of remaining bottlenecks in the back-end. Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls.", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total pipeline cost of \"useful operations\" - the portion of Retiring category not covered by Branching_Overhead nor Irregular_Overhead.", + "MetricExpr": "100 * (tma_retiring - (cpu_core@BR_INST_RETIRED.ALL_BRANCHES@ + 2 * cpu_core@BR_INST_RETIRED.NEAR_CALL@ + cpu_core@INST_RETIRED.NOP@) / tma_info_thread_slots - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", + "MetricGroup": "BvUW;Ret", + "MetricName": "tma_info_bottleneck_useful_work", + "MetricThreshold": "tma_info_bottleneck_useful_work > 20", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Fraction of branches that are CALL or RET", + "MetricExpr": "(cpu_core@BR_INST_RETIRED.NEAR_CALL@ + cpu_core@BR_INST_RETIRED.NEAR_RETURN@) / cpu_core@BR_INST_RETIRED.ALL_BRANCHES@", + "MetricGroup": "Bad;Branches", + "MetricName": "tma_info_branches_callret", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Fraction of branches that are non-taken conditionals", + "MetricExpr": "cpu_core@BR_INST_RETIRED.COND_NTAKEN@ / cpu_core@BR_INST_RETIRED.ALL_BRANCHES@", + "MetricGroup": "Bad;Branches;CodeGen;PGO", + "MetricName": "tma_info_branches_cond_nt", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Fraction of branches that are taken conditionals", + "MetricExpr": "cpu_core@BR_INST_RETIRED.COND_TAKEN@ / cpu_core@BR_INST_RETIRED.ALL_BRANCHES@", + "MetricGroup": "Bad;Branches;CodeGen;PGO", + "MetricName": "tma_info_branches_cond_tk", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps", + "MetricExpr": "(cpu_core@BR_INST_RETIRED.NEAR_TAKEN@ - cpu_core@BR_INST_RETIRED.COND_TAKEN@ - 2 * cpu_core@BR_INST_RETIRED.NEAR_CALL@) / cpu_core@BR_INST_RETIRED.ALL_BRANCHES@", + "MetricGroup": "Bad;Branches", + "MetricName": "tma_info_branches_jump", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Fraction of branches of other types (not individually covered by other metrics in Info.Branches group)", + "MetricExpr": "1 - (tma_info_branches_cond_nt + tma_info_branches_cond_tk + tma_info_branches_callret + tma_info_branches_jump)", + "MetricGroup": "Bad;Branches", + "MetricName": "tma_info_branches_other_branches", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", + "MetricExpr": "(cpu_core@CPU_CLK_UNHALTED.DISTRIBUTED@ if 
#SMT_on else tma_info_thread_clks)", + "MetricGroup": "SMT", + "MetricName": "tma_info_core_core_clks", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / tma_info_core_core_clks", + "MetricGroup": "Ret;SMT;TmaL1;tma_L1_group", + "MetricName": "tma_info_core_coreipc", + "Unit": "cpu_core" + }, + { + "BriefDescription": "uops Executed per Cycle", + "MetricExpr": "cpu_core@UOPS_EXECUTED.THREAD@ / tma_info_thread_clks", + "MetricGroup": "Power", + "MetricName": "tma_info_core_epc", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Floating Point Operations Per Cycle", + "MetricExpr": "(cpu_core@FP_ARITH_INST_RETIRED.SCALAR@ + 2 * cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + 4 * cpu_core@FP_ARITH_INST_RETIRED.4_FLOPS@ + 8 * cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@) / tma_info_core_core_clks", + "MetricGroup": "Flops;Ret", + "MetricName": "tma_info_core_flopc", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", + "MetricExpr": "(cpu_core@FP_ARITH_DISPATCHED.PORT_0@ + cpu_core@FP_ARITH_DISPATCHED.PORT_1@ + cpu_core@FP_ARITH_DISPATCHED.PORT_5@) / (2 * tma_info_core_core_clks)", + "MetricGroup": "Cor;Flops;HPC", + "MetricName": "tma_info_core_fp_arith_utilization", + "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common).", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)", + "MetricExpr": "cpu_core@UOPS_EXECUTED.THREAD@ / cpu_core@UOPS_EXECUTED.THREAD\\,cmask\\=1@", + "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", + "MetricName": "tma_info_core_ilp", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", + "MetricExpr": "cpu_core@IDQ.DSB_UOPS@ / cpu_core@UOPS_ISSUED.ANY@", + "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", + "MetricName": "tma_info_frontend_dsb_coverage", + "MetricThreshold": "tma_info_frontend_dsb_coverage < 0.7 & tma_info_thread_ipc / 6 > 0.35", + "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). 
Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_inst_mix_iptb, tma_lcp", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average number of cycles of a switch from the DSB fetch-unit to MITE fetch unit - see DSB_Switches tree node for details.", + "MetricExpr": "cpu_core@DSB2MITE_SWITCHES.PENALTY_CYCLES@ / cpu_core@DSB2MITE_SWITCHES.PENALTY_CYCLES\\,cmask\\=1\\,edge@", + "MetricGroup": "DSBmiss", + "MetricName": "tma_info_frontend_dsb_switch_cost", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average number of Uops issued by front-end when it issued something", + "MetricExpr": "cpu_core@UOPS_ISSUED.ANY@ / cpu_core@UOPS_ISSUED.ANY\\,cmask\\=1@", + "MetricGroup": "Fed;FetchBW", + "MetricName": "tma_info_frontend_fetch_upc", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average Latency for L1 instruction cache misses", + "MetricExpr": "cpu_core@ICACHE_DATA.STALLS@ / cpu_core@ICACHE_DATA.STALLS\\,cmask\\=1\\,edge@", + "MetricGroup": "Fed;FetchLat;IcMiss", + "MetricName": "tma_info_frontend_icache_miss_latency", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per non-speculative DSB miss (lower number means higher occurrence rate)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@FRONTEND_RETIRED.ANY_DSB_MISS@", + "MetricGroup": "DSBmiss;Fed", + "MetricName": "tma_info_frontend_ipdsb_miss_ret", + "MetricThreshold": "tma_info_frontend_ipdsb_miss_ret < 50", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", + "MetricExpr": "tma_info_inst_mix_instructions / cpu_core@BACLEARS.ANY@", + "MetricGroup": "Fed", + "MetricName": "tma_info_frontend_ipunknown_branch", + "Unit": "cpu_core" + }, + { + "BriefDescription": "L2 cache true code cacheline misses per kilo instruction", + "MetricExpr": "1e3 * cpu_core@FRONTEND_RETIRED.L2_MISS@ / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "IcMiss", + "MetricName": "tma_info_frontend_l2mpki_code", + "Unit": "cpu_core" + }, + { + "BriefDescription": "L2 cache speculative code cacheline misses per kilo instruction", + "MetricExpr": "1e3 * cpu_core@L2_RQSTS.CODE_RD_MISS@ / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "IcMiss", + "MetricName": "tma_info_frontend_l2mpki_code_all", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Fraction of Uops delivered by the LSD (Loop Stream Detector; aka Loop Cache)", + "MetricExpr": "cpu_core@LSD.UOPS@ / cpu_core@UOPS_ISSUED.ANY@", + "MetricGroup": "Fed;LSD", + "MetricName": "tma_info_frontend_lsd_coverage", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average number of cycles the front-end was delayed due to an Unknown Branch detection", + "MetricExpr": "cpu_core@INT_MISC.UNKNOWN_BRANCH_CYCLES@ / cpu_core@INT_MISC.UNKNOWN_BRANCH_CYCLES\\,cmask\\=1\\,edge@", + "MetricGroup": "Fed", + "MetricName": "tma_info_frontend_unknown_branch_cost", + "PublicDescription": "Average number of cycles the front-end was delayed due to an Unknown Branch detection. 
See Unknown_Branches node.", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Branch instructions per taken branch.", + "MetricExpr": "cpu_core@BR_INST_RETIRED.ALL_BRANCHES@ / cpu_core@BR_INST_RETIRED.NEAR_TAKEN@", + "MetricGroup": "Branches;Fed;PGO", + "MetricName": "tma_info_inst_mix_bptkbranch", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total number of retired Instructions", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "Summary;TmaL1;tma_L1_group", + "MetricName": "tma_info_inst_mix_instructions", + "PublicDescription": "Total number of retired Instructions. Sample with: INST_RETIRED.PREC_DIST", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / (cpu_core@FP_ARITH_INST_RETIRED.SCALAR@ + cpu_core@FP_ARITH_INST_RETIRED.VECTOR@)", + "MetricGroup": "Flops;InsType", + "MetricName": "tma_info_inst_mix_iparith", + "MetricThreshold": "tma_info_inst_mix_iparith < 10", + "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW.", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / (cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE@)", + "MetricGroup": "Flops;FpVector;InsType", + "MetricName": "tma_info_inst_mix_iparith_avx128", + "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10", + "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting.", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / (cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE@ + cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@)", + "MetricGroup": "Flops;FpVector;InsType", + "MetricName": "tma_info_inst_mix_iparith_avx256", + "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10", + "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting.", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@FP_ARITH_INST_RETIRED.SCALAR_DOUBLE@", + "MetricGroup": "Flops;FpScalar;InsType", + "MetricName": "tma_info_inst_mix_iparith_scalar_dp", + "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10", + "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). 
Values < 1 are possible due to intentional FMA double counting.", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@FP_ARITH_INST_RETIRED.SCALAR_SINGLE@", + "MetricGroup": "Flops;FpScalar;InsType", + "MetricName": "tma_info_inst_mix_iparith_scalar_sp", + "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10", + "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting.", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@BR_INST_RETIRED.ALL_BRANCHES@", + "MetricGroup": "Branches;Fed;InsType", + "MetricName": "tma_info_inst_mix_ipbranch", + "MetricThreshold": "tma_info_inst_mix_ipbranch < 8", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per (near) call (lower number means higher occurrence rate)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@BR_INST_RETIRED.NEAR_CALL@", + "MetricGroup": "Branches;Fed;PGO", + "MetricName": "tma_info_inst_mix_ipcall", + "MetricThreshold": "tma_info_inst_mix_ipcall < 200", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / (cpu_core@FP_ARITH_INST_RETIRED.SCALAR@ + 2 * cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + 4 * cpu_core@FP_ARITH_INST_RETIRED.4_FLOPS@ + 8 * cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@)", + "MetricGroup": "Flops;InsType", + "MetricName": "tma_info_inst_mix_ipflop", + "MetricThreshold": "tma_info_inst_mix_ipflop < 10", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per Load (lower number means higher occurrence rate)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@MEM_INST_RETIRED.ALL_LOADS@", + "MetricGroup": "InsType", + "MetricName": "tma_info_inst_mix_ipload", + "MetricThreshold": "tma_info_inst_mix_ipload < 3", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per PAUSE (lower number means higher occurrence rate)", + "MetricExpr": "tma_info_inst_mix_instructions / cpu_core@CPU_CLK_UNHALTED.PAUSE_INST@", + "MetricGroup": "Flops;FpVector;InsType", + "MetricName": "tma_info_inst_mix_ippause", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@MEM_INST_RETIRED.ALL_STORES@", + "MetricGroup": "InsType", + "MetricName": "tma_info_inst_mix_ipstore", + "MetricThreshold": "tma_info_inst_mix_ipstore < 8", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per Software prefetch instruction (of any type: NTA/T0/T1/T2/Prefetch) (lower number means higher occurrence rate)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@SW_PREFETCH_ACCESS.T0\\,umask\\=0xF@", + "MetricGroup": "Prefetches", + "MetricName": "tma_info_inst_mix_ipswpf", + "MetricThreshold": "tma_info_inst_mix_ipswpf < 100", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per taken branch", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@BR_INST_RETIRED.NEAR_TAKEN@", + "MetricGroup": "Branches;Fed;FetchBW;Frontend;PGO;tma_issueFB", + "MetricName": 
"tma_info_inst_mix_iptb", + "MetricThreshold": "tma_info_inst_mix_iptb < 13", + "PublicDescription": "Instructions per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_lcp", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", + "MetricExpr": "tma_info_memory_l1d_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw_2t", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "tma_info_memory_l2_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_core_l2_cache_fill_bw_2t", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "tma_info_memory_l3_cache_access_bw", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_core_l3_cache_access_bw_2t", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "tma_info_memory_l3_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_core_l3_cache_fill_bw_2t", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", + "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "CacheHits;Mem", + "MetricName": "tma_info_memory_fb_hpki", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", + "MetricExpr": "64 * cpu_core@L1D.REPLACEMENT@ / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l1d_cache_fill_bw", + "Unit": "cpu_core" + }, + { + "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "CacheHits;Mem", + "MetricName": "tma_info_memory_l1mpki", + "Unit": "cpu_core" + }, + { + "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", + "MetricExpr": "1e3 * cpu_core@L2_RQSTS.ALL_DEMAND_DATA_RD@ / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "CacheHits;Mem", + "MetricName": "tma_info_memory_l1mpki_load", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "64 * cpu_core@L2_LINES_IN.ALL@ / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l2_cache_fill_bw", + "Unit": "cpu_core" + }, + { + "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", + "MetricExpr": "1e3 * (cpu_core@L2_RQSTS.REFERENCES@ - cpu_core@L2_RQSTS.MISS@) / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "CacheHits;Mem", + "MetricName": "tma_info_memory_l2hpki_all", + "Unit": "cpu_core" + }, + { + "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", + "MetricExpr": "1e3 * cpu_core@L2_RQSTS.DEMAND_DATA_RD_HIT@ / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "CacheHits;Mem", + "MetricName": "tma_info_memory_l2hpki_load", + "Unit": "cpu_core" + }, + { + 
"BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.L2_MISS@ / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "Backend;CacheHits;Mem", + "MetricName": "tma_info_memory_l2mpki", + "Unit": "cpu_core" + }, + { + "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", + "MetricExpr": "1e3 * cpu_core@L2_RQSTS.MISS@ / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "CacheHits;Mem;Offcore", + "MetricName": "tma_info_memory_l2mpki_all", + "Unit": "cpu_core" + }, + { + "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", + "MetricExpr": "1e3 * cpu_core@L2_RQSTS.DEMAND_DATA_RD_MISS@ / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "CacheHits;Mem", + "MetricName": "tma_info_memory_l2mpki_load", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Offcore requests (L2 cache miss) per kilo instruction for demand RFOs", + "MetricExpr": "1e3 * cpu_core@L2_RQSTS.RFO_MISS@ / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "CacheMisses;Offcore", + "MetricName": "tma_info_memory_l2mpki_rfo", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * cpu_core@OFFCORE_REQUESTS.ALL_REQUESTS@ / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_l3_cache_access_bw", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * cpu_core@LONGEST_LAT_CACHE.MISS@ / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_l3_cache_fill_bw", + "Unit": "cpu_core" + }, + { + "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.L3_MISS@ / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_l3mpki", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average Parallel L2 cache miss data reads", + "MetricExpr": "cpu_core@OFFCORE_REQUESTS_OUTSTANDING.DATA_RD@ / cpu_core@OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD@", + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_latency_data_l2_mlp", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average Latency for L2 cache miss demand Loads", + "MetricExpr": "cpu_core@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD@ / cpu_core@OFFCORE_REQUESTS.DEMAND_DATA_RD@", + "MetricGroup": "Memory_Lat;Offcore", + "MetricName": "tma_info_memory_latency_load_l2_miss_latency", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average Parallel L2 cache miss demand Loads", + "MetricExpr": "cpu_core@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD@ / cpu_core@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,cmask\\=1@", + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_latency_load_l2_mlp", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average Latency for L3 cache miss demand Loads", + "MetricExpr": "cpu_core@OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD@ / cpu_core@OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD@", + "MetricGroup": "Memory_Lat;Offcore", + "MetricName": "tma_info_memory_latency_load_l3_miss_latency", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", + "MetricExpr": 
"cpu_core@L1D_PEND_MISS.PENDING@ / cpu_core@MEM_LOAD_COMPLETED.L1_MISS_ANY@", + "MetricGroup": "Mem;MemoryBound;MemoryLat", + "MetricName": "tma_info_memory_load_miss_real_latency", + "Unit": "cpu_core" + }, + { + "BriefDescription": "\"Bus lock\" per kilo instruction", + "MetricExpr": "1e3 * cpu_core@SQ_MISC.BUS_LOCK@ / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_mix_bus_lock_pki", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Un-cacheable retired load per kilo instruction", + "MetricExpr": "1e3 * cpu_core@MEM_LOAD_MISC_RETIRED.UC@ / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "Mem", + "MetricName": "tma_info_memory_mix_uc_load_pki", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", + "MetricExpr": "cpu_core@L1D_PEND_MISS.PENDING@ / cpu_core@L1D_PEND_MISS.PENDING_CYCLES@", + "MetricGroup": "Mem;MemoryBW;MemoryBound", + "MetricName": "tma_info_memory_mlp", + "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)", + "Unit": "cpu_core" + }, + { + "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", + "MetricExpr": "1e3 * cpu_core@ITLB_MISSES.WALK_COMPLETED@ / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "Fed;MemoryTLB", + "MetricName": "tma_info_memory_tlb_code_stlb_mpki", + "Unit": "cpu_core" + }, + { + "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)", + "MetricExpr": "1e3 * cpu_core@DTLB_LOAD_MISSES.WALK_COMPLETED@ / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_tlb_load_stlb_mpki", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", + "MetricExpr": "(cpu_core@ITLB_MISSES.WALK_PENDING@ + cpu_core@DTLB_LOAD_MISSES.WALK_PENDING@ + cpu_core@DTLB_STORE_MISSES.WALK_PENDING@) / (4 * tma_info_core_core_clks)", + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_tlb_page_walks_utilization", + "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5", + "Unit": "cpu_core" + }, + { + "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)", + "MetricExpr": "1e3 * cpu_core@DTLB_STORE_MISSES.WALK_COMPLETED@ / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_tlb_store_stlb_mpki", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per core", + "MetricExpr": "cpu_core@UOPS_EXECUTED.THREAD@ / (cpu_core@UOPS_EXECUTED.CORE_CYCLES_GE_1@ / 2 if #SMT_on else cpu_core@UOPS_EXECUTED.THREAD\\,cmask\\=1@)", + "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", + "MetricName": "tma_info_pipeline_execute", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average number of uops fetched from DSB per cycle", + "MetricExpr": "cpu_core@IDQ.DSB_UOPS@ / cpu_core@IDQ.DSB_CYCLES_ANY@", + "MetricGroup": "Fed;FetchBW", + "MetricName": "tma_info_pipeline_fetch_dsb", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average number of uops fetched from LSD per cycle", + "MetricExpr": 
"cpu_core@LSD.UOPS@ / cpu_core@LSD.CYCLES_ACTIVE@", + "MetricGroup": "Fed;FetchBW", + "MetricName": "tma_info_pipeline_fetch_lsd", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average number of uops fetched from MITE per cycle", + "MetricExpr": "cpu_core@IDQ.MITE_UOPS@ / cpu_core@IDQ.MITE_CYCLES_ANY@", + "MetricGroup": "Fed;FetchBW", + "MetricName": "tma_info_pipeline_fetch_mite", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per a microcode Assist invocation", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@ASSISTS.ANY@", + "MetricGroup": "MicroSeq;Pipeline;Ret;Retire", + "MetricName": "tma_info_pipeline_ipassist", + "MetricThreshold": "tma_info_pipeline_ipassist < 100e3", + "PublicDescription": "Instructions per a microcode Assist invocation. See Assists tree node for details (lower number means higher occurrence rate)", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", + "MetricExpr": "tma_retiring * tma_info_thread_slots / cpu_core@UOPS_RETIRED.SLOTS\\,cmask\\=1@", + "MetricGroup": "Pipeline;Ret", + "MetricName": "tma_info_pipeline_retire", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Estimated fraction of retirement-cycles dealing with repeat instructions", + "MetricExpr": "cpu_core@INST_RETIRED.REP_ITERATION@ / cpu_core@UOPS_RETIRED.SLOTS\\,cmask\\=1@", + "MetricGroup": "MicroSeq;Pipeline;Ret", + "MetricName": "tma_info_pipeline_strings_cycles", + "MetricThreshold": "tma_info_pipeline_strings_cycles > 0.1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Fraction of cycles the processor is waiting yet unhalted; covering legacy PAUSE instruction, as well as C0.1 / C0.2 power-performance optimized states", + "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.C0_WAIT@ / tma_info_thread_clks", + "MetricGroup": "C0Wait", + "MetricName": "tma_info_system_c0_wait", + "MetricThreshold": "tma_info_system_c0_wait > 0.05", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", + "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", + "MetricGroup": "Power;Summary", + "MetricName": "tma_info_system_core_frequency", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average CPU Utilization (percentage)", + "MetricExpr": "tma_info_system_cpus_utilized / #num_cpus_online", + "MetricGroup": "HPC;Summary", + "MetricName": "tma_info_system_cpu_utilization", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average number of utilized CPUs", + "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.REF_TSC@ / TSC", + "MetricGroup": "Summary", + "MetricName": "tma_info_system_cpus_utilized", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", + "MetricExpr": "64 * (UNC_HAC_ARB_TRK_REQUESTS.ALL + UNC_HAC_ARB_COH_TRK_REQUESTS.ALL) / 1e9 / duration_time", + "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW", + "MetricName": "tma_info_system_dram_bw_use", + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. 
Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_mem_bandwidth, tma_sq_full", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Giga Floating Point Operations Per Second", + "MetricExpr": "(cpu_core@FP_ARITH_INST_RETIRED.SCALAR@ + 2 * cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + 4 * cpu_core@FP_ARITH_INST_RETIRED.4_FLOPS@ + 8 * cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@) / 1e9 / duration_time", + "MetricGroup": "Cor;Flops;HPC", + "MetricName": "tma_info_system_gflops", + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@BR_INST_RETIRED.FAR_BRANCH@u", + "MetricGroup": "Branches;OS", + "MetricName": "tma_info_system_ipfarbranch", + "MetricThreshold": "tma_info_system_ipfarbranch < 1e6", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", + "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.THREAD_P@k / cpu_core@INST_RETIRED.ANY_P@k", + "MetricGroup": "OS", + "MetricName": "tma_info_system_kernel_cpi", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", + "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.THREAD_P@k / cpu_core@CPU_CLK_UNHALTED.THREAD@", + "MetricGroup": "OS", + "MetricName": "tma_info_system_kernel_utilization", + "MetricThreshold": "tma_info_system_kernel_utilization > 0.05", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average number of parallel data read requests to external memory", + "MetricExpr": "UNC_ARB_DAT_OCCUPANCY.RD / UNC_ARB_DAT_OCCUPANCY.RD@cmask\\=1@", + "MetricGroup": "Mem;MemoryBW;SoC", + "MetricName": "tma_info_system_mem_parallel_reads", + "PublicDescription": "Average number of parallel data read requests to external memory. 
Accounts for demand loads and L1/L2 prefetches", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", + "MetricExpr": "(1 - cpu_core@CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE@ / cpu_core@CPU_CLK_UNHALTED.REF_DISTRIBUTED@ if #SMT_on else 0)", + "MetricGroup": "SMT", + "MetricName": "tma_info_system_smt_2t_utilization", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Socket actual clocks when any core is active on that socket", + "MetricExpr": "UNC_CLOCK.SOCKET", + "MetricGroup": "SoC", + "MetricName": "tma_info_system_socket_clks", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Average Frequency Utilization relative to nominal frequency", + "MetricExpr": "tma_info_thread_clks / cpu_core@CPU_CLK_UNHALTED.REF_TSC@", + "MetricGroup": "Power", + "MetricName": "tma_info_system_turbo_utilization", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", + "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.THREAD@", + "MetricGroup": "Pipeline", + "MetricName": "tma_info_thread_clks", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles Per Instruction (per Logical Processor)", + "MetricExpr": "1 / tma_info_thread_ipc", + "MetricGroup": "Mem;Pipeline", + "MetricName": "tma_info_thread_cpi", + "Unit": "cpu_core" + }, + { + "BriefDescription": "The ratio of Executed- by Issued-Uops", + "MetricExpr": "cpu_core@UOPS_EXECUTED.THREAD@ / cpu_core@UOPS_ISSUED.ANY@", + "MetricGroup": "Cor;Pipeline", + "MetricName": "tma_info_thread_execute_per_issue", + "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggests high rate of \"execute\" at rename stage.", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instructions Per Cycle (per Logical Processor)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / tma_info_thread_clks", + "MetricGroup": "Ret;Summary", + "MetricName": "tma_info_thread_ipc", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", + "MetricExpr": "cpu_core@TOPDOWN.SLOTS@", + "MetricGroup": "TmaL1;tma_L1_group", + "MetricName": "tma_info_thread_slots", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Fraction of Physical Core issue-slots utilized by this Logical Processor", + "MetricExpr": "(tma_info_thread_slots / (cpu_core@TOPDOWN.SLOTS@ / 2) if #SMT_on else 1)", + "MetricGroup": "SMT;TmaL1;tma_L1_group", + "MetricName": "tma_info_thread_slots_utilization", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Uops Per Instruction", + "MetricExpr": "tma_retiring * tma_info_thread_slots / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "Pipeline;Ret;Retire", + "MetricName": "tma_info_thread_uoppi", + "MetricThreshold": "tma_info_thread_uoppi > 1.05", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Uops per taken branch", + "MetricExpr": "tma_retiring * tma_info_thread_slots / cpu_core@BR_INST_RETIRED.NEAR_TAKEN@", + "MetricGroup": "Branches;Fed;FetchBW", + "MetricName": "tma_info_thread_uptb", + "MetricThreshold": "tma_info_thread_uptb < 9", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents overall Integer (Int) select operations fraction the CPU has executed (retired)", + "MetricExpr": "tma_int_vector_128b + tma_int_vector_256b", + "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", + "MetricName": "tma_int_operations",
"MetricThreshold": "tma_int_operations > 0.1 & tma_light_operations > 0.6", + "PublicDescription": "This metric represents overall Integer (Int) select operations fraction the CPU has executed (retired). Vector/Matrix Int operations and shuffles are counted. Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents 128-bit vector Integer ADD/SUB/SAD or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired", + "MetricExpr": "(cpu_core@INT_VEC_RETIRED.ADD_128@ + cpu_core@INT_VEC_RETIRED.VNNI_128@) / (tma_retiring * tma_info_thread_slots)", + "MetricGroup": "Compute;IntVector;Pipeline;TopdownL4;tma_L4_group;tma_int_operations_group;tma_issue2P", + "MetricName": "tma_int_vector_128b", + "MetricThreshold": "tma_int_vector_128b > 0.1 & (tma_int_operations > 0.1 & tma_light_operations > 0.6)", + "PublicDescription": "This metric represents 128-bit vector Integer ADD/SUB/SAD or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_int_vector_256b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents 256-bit vector Integer ADD/SUB/SAD/MUL or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired", + "MetricExpr": "(cpu_core@INT_VEC_RETIRED.ADD_256@ + cpu_core@INT_VEC_RETIRED.MUL_256@ + cpu_core@INT_VEC_RETIRED.VNNI_256@) / (tma_retiring * tma_info_thread_slots)", + "MetricGroup": "Compute;IntVector;Pipeline;TopdownL4;tma_L4_group;tma_int_operations_group;tma_issue2P", + "MetricName": "tma_int_vector_256b", + "MetricThreshold": "tma_int_vector_256b > 0.1 & (tma_int_operations > 0.1 & tma_light_operations > 0.6)", + "PublicDescription": "This metric represents 256-bit vector Integer ADD/SUB/SAD/MUL or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_int_vector_128b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", + "MetricExpr": "cpu_core@ICACHE_TAG.STALLS@ / tma_info_thread_clks", + "MetricGroup": "BigFootprint;BvBC;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", + "MetricName": "tma_itlb_misses", + "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. 
Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", + "MetricExpr": "max((cpu_core@EXE_ACTIVITY.BOUND_ON_LOADS@ - cpu_core@MEMORY_ACTIVITY.STALLS_L1D_MISS@) / tma_info_thread_clks, 0)", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", + "MetricName": "tma_l1_bound", + "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", + "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads that miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_RETIRED.L1_HIT_PS;MEM_LOAD_RETIRED.FB_HIT_PS. Related metrics: tma_clears_resteers, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches, tma_ports_utilized_1", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric roughly estimates fraction of cycles with demand load accesses that hit the L1 cache", + "MetricExpr": "min(2 * (cpu_core@MEM_INST_RETIRED.ALL_LOADS@ - cpu_core@MEM_LOAD_RETIRED.FB_HIT@ - cpu_core@MEM_LOAD_RETIRED.L1_MISS@) * 20 / 100, max(cpu_core@CYCLE_ACTIVITY.CYCLES_MEM_ANY@ - cpu_core@MEMORY_ACTIVITY.CYCLES_L1D_MISS@, 0)) / tma_info_thread_clks", + "MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_l1_bound_group", + "MetricName": "tma_l1_hit_latency", + "MetricThreshold": "tma_l1_hit_latency > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric roughly estimates fraction of cycles with demand load accesses that hit the L1 cache. The short latency of the L1 data cache may be exposed in pointer-chasing memory access patterns as an example. Sample with: MEM_LOAD_RETIRED.L1_HIT", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", + "MetricExpr": "(cpu_core@MEMORY_ACTIVITY.STALLS_L1D_MISS@ - cpu_core@MEMORY_ACTIVITY.STALLS_L2_MISS@) / tma_info_thread_clks", + "MetricGroup": "BvML;CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricName": "tma_l2_bound", + "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", + "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance.
Sample with: MEM_LOAD_RETIRED.L2_HIT_PS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates how often the CPU was stalled due to load accesses to the L3 cache or contended with a sibling Core", + "MetricExpr": "(cpu_core@MEMORY_ACTIVITY.STALLS_L2_MISS@ - cpu_core@MEMORY_ACTIVITY.STALLS_L3_MISS@) / tma_info_thread_clks", + "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricName": "tma_l3_bound", + "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", + "PublicDescription": "This metric estimates how often the CPU was stalled due to load accesses to the L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", + "MetricExpr": "cpu_core@MEM_LOAD_RETIRED.L3_HIT@ * min(cpu_core@MEM_LOAD_RETIRED.L3_HIT@R, 9 * tma_info_system_core_frequency) * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_thread_clks", + "MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", + "MetricName": "tma_l3_hit_latency", + "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_cache_memory_latency, tma_mem_latency", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", + "MetricExpr": "cpu_core@DECODE.LCP@ / tma_info_thread_clks", + "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", + "MetricName": "tma_lcp", + "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs.
Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation)", + "MetricExpr": "max(0, tma_retiring - tma_heavy_operations)", + "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", + "MetricName": "tma_light_operations", + "MetricThreshold": "tma_light_operations > 0.6", + "MetricgroupNoGroup": "TopdownL2", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations", + "MetricExpr": "cpu_core@UOPS_DISPATCHED.PORT_2_3_10@ / (3 * tma_info_core_core_clks)", + "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", + "MetricName": "tma_load_op_utilization", + "MetricThreshold": "tma_load_op_utilization > 0.6", + "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations. 
Sample with: UOPS_DISPATCHED.PORT_2_3_10", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles where the (first level) DTLB was missed by load accesses, that later on hit in second-level TLB (STLB)", + "MetricExpr": "max(0, tma_dtlb_load - tma_load_stlb_miss)", + "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_load_group", + "MetricName": "tma_load_stlb_hit", + "MetricThreshold": "tma_load_stlb_hit > 0.05 & (tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles where the Second-level TLB (STLB) was missed by load accesses, performing a hardware page walk", + "MetricExpr": "cpu_core@DTLB_LOAD_MISSES.WALK_ACTIVE@ / tma_info_thread_clks", + "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_load_group", + "MetricName": "tma_load_stlb_miss", + "MetricThreshold": "tma_load_stlb_miss > 0.05 & (tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", + "MetricExpr": "cpu_core@MEM_INST_RETIRED.LOCK_LOADS@ * cpu_core@MEM_INST_RETIRED.LOCK_LOADS@R / tma_info_thread_clks", + "MetricGroup": "Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group", + "MetricName": "tma_lock_latency", + "MetricThreshold": "tma_lock_latency > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations. Due to the microarchitecture handling of locks; they are classified as L1_Bound regardless of what memory source satisfied them. Sample with: MEM_INST_RETIRED.LOCK_LOADS. Related metrics: tma_store_latency", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to LSD (Loop Stream Detector) unit", + "MetricExpr": "(cpu_core@LSD.CYCLES_ACTIVE@ - cpu_core@LSD.CYCLES_OK@) / tma_info_core_core_clks / 2", + "MetricGroup": "FetchBW;LSD;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", + "MetricName": "tma_lsd", + "MetricThreshold": "tma_lsd > 0.15 & tma_fetch_bandwidth > 0.2", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to LSD (Loop Stream Detector) unit. LSD typically does well sustaining Uop supply. However; in some rare cases; optimal uop-delivery could not be reached for small loops whose size (in terms of number of uops) does not suit well the LSD structure.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears", + "MetricExpr": "max(0, tma_bad_speculation - tma_branch_mispredicts)", + "MetricGroup": "BadSpec;BvMS;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn", + "MetricName": "tma_machine_clears", + "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15", + "MetricgroupNoGroup": "TopdownL2", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. 
These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)", + "MetricExpr": "min(cpu_core@CPU_CLK_UNHALTED.THREAD@, cpu_core@OFFCORE_REQUESTS_OUTSTANDING.DATA_RD\\,cmask\\=4@) / tma_info_thread_clks", + "MetricGroup": "BvMS;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", + "MetricName": "tma_mem_bandwidth", + "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)", + "MetricExpr": "min(cpu_core@CPU_CLK_UNHALTED.THREAD@, cpu_core@OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD@) / tma_info_thread_clks - tma_mem_bandwidth", + "MetricGroup": "BvML;MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", + "MetricName": "tma_mem_latency", + "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). 
Related metrics: tma_info_bottleneck_cache_memory_latency, tma_l3_hit_latency", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", + "MetricExpr": "cpu_core@topdown\\-mem\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", + "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", + "MetricName": "tma_memory_bound", + "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", + "MetricgroupNoGroup": "TopdownL2", + "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to LFENCE Instructions.", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", + "MetricExpr": "13 * cpu_core@MISC2_RETIRED.LFENCE@ / tma_info_thread_clks", + "MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group", + "MetricName": "tma_memory_fence", + "MetricThreshold": "tma_memory_fence > 0.05 & (tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring memory operations -- uops for memory load or store accesses.", + "MetricExpr": "tma_light_operations * cpu_core@MEM_UOP_RETIRED.ANY@ / (tma_retiring * tma_info_thread_slots)", + "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", + "MetricName": "tma_memory_operations", + "MetricThreshold": "tma_memory_operations > 0.1 & tma_light_operations > 0.6", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", + "MetricExpr": "cpu_core@UOPS_RETIRED.MS@ / tma_info_thread_slots", + "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS", + "MetricName": "tma_microcode_sequencer", + "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1", + "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: UOPS_RETIRED.MS. 
Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_ms_switches", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage", + "MetricExpr": "tma_branch_mispredicts / tma_bad_speculation * cpu_core@INT_MISC.CLEAR_RESTEER_CYCLES@ / tma_info_thread_clks", + "MetricGroup": "BadSpec;BrMispredicts;BvMP;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueBM", + "MetricName": "tma_mispredicts_resteers", + "MetricThreshold": "tma_mispredicts_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_info_bottleneck_mispredictions", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", + "MetricExpr": "(cpu_core@IDQ.MITE_CYCLES_ANY@ - cpu_core@IDQ.MITE_CYCLES_OK@) / tma_info_core_core_clks / 2", + "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", + "MetricName": "tma_mite", + "MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2", + "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates penalty in terms of percentage of ([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles)", + "MetricExpr": "160 * cpu_core@ASSISTS.SSE_AVX_MIX@ / tma_info_thread_clks", + "MetricGroup": "TopdownL5;tma_L5_group;tma_issueMV;tma_ports_utilized_0_group", + "MetricName": "tma_mixing_vectors", + "MetricThreshold": "tma_mixing_vectors > 0.05", + "PublicDescription": "This metric estimates penalty in terms of percentage of ([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles). Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic.
Related metrics: tma_ms_switches", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", + "MetricExpr": "3 * cpu_core@UOPS_RETIRED.MS\\,cmask\\=1\\,edge@ / (cpu_core@UOPS_RETIRED.SLOTS@ / cpu_core@UOPS_ISSUED.ANY@) / tma_info_thread_clks", + "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO", + "MetricName": "tma_ms_switches", + "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", + "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: FRONTEND_RETIRED.MS_FLOWS. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused", + "MetricExpr": "tma_light_operations * (cpu_core@BR_INST_RETIRED.ALL_BRANCHES@ - cpu_core@INST_RETIRED.MACRO_FUSED@) / (tma_retiring * tma_info_thread_slots)", + "MetricGroup": "Branches;BvBO;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", + "MetricName": "tma_non_fused_branches", + "MetricThreshold": "tma_non_fused_branches > 0.1 & tma_light_operations > 0.6", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused. Non-conditional branches like direct JMP or CALL would count here. Can be used to examine fusible conditional jumps that were not fused.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions", + "MetricExpr": "tma_light_operations * cpu_core@INST_RETIRED.NOP@ / (tma_retiring * tma_info_thread_slots)", + "MetricGroup": "BvBO;Pipeline;TopdownL4;tma_L4_group;tma_other_light_ops_group", + "MetricName": "tma_nop_instructions", + "MetricThreshold": "tma_nop_instructions > 0.1 & (tma_other_light_ops > 0.3 & tma_light_operations > 0.6)", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions. Compilers often use NOPs for certain address alignments - e.g. start address of a function or loop body. 
Sample with: INST_RETIRED.NOP", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes", + "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_int_operations + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches))", + "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", + "MetricName": "tma_other_light_ops", + "MetricThreshold": "tma_other_light_ops > 0.3 & tma_light_operations > 0.6", + "PublicDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes. May undercount due to FMA double counting", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates fraction of slots the CPU was stalled due to other cases of misprediction (non-retired x86 branches or other types).", + "MetricExpr": "max(tma_branch_mispredicts * (1 - cpu_core@BR_MISP_RETIRED.ALL_BRANCHES@ / (cpu_core@INT_MISC.CLEARS_COUNT@ - cpu_core@MACHINE_CLEARS.COUNT@)), 0.0001)", + "MetricGroup": "BrMispredicts;BvIO;TopdownL3;tma_L3_group;tma_branch_mispredicts_group", + "MetricName": "tma_other_mispredicts", + "MetricThreshold": "tma_other_mispredicts > 0.05 & (tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15)", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Nukes (Machine Clears) not related to memory ordering.", + "MetricExpr": "max(tma_machine_clears * (1 - cpu_core@MACHINE_CLEARS.MEMORY_ORDERING@ / cpu_core@MACHINE_CLEARS.COUNT@), 0.0001)", + "MetricGroup": "BvIO;Machine_Clears;TopdownL3;tma_L3_group;tma_machine_clears_group", + "MetricName": "tma_other_nukes", + "MetricThreshold": "tma_other_nukes > 0.05 & (tma_machine_clears > 0.1 & tma_bad_speculation > 0.15)", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Page Faults", + "MetricExpr": "99 * cpu_core@ASSISTS.PAGE_FAULT@ / tma_info_thread_slots", + "MetricGroup": "TopdownL5;tma_L5_group;tma_assists_group", + "MetricName": "tma_page_faults", + "MetricThreshold": "tma_page_faults > 0.05", + "PublicDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Page Faults. A Page Fault may apply on first application access to a memory page. Note operating system handling of page faults accounts for the majority of its cost.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch)", + "MetricExpr": "cpu_core@UOPS_DISPATCHED.PORT_0@ / tma_info_core_core_clks", + "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", + "MetricName": "tma_port_0", + "MetricThreshold": "tma_port_0 > 0.6", + "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch). Sample with: UOPS_DISPATCHED.PORT_0. 
Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_int_vector_128b, tma_int_vector_256b, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU)", + "MetricExpr": "cpu_core@UOPS_DISPATCHED.PORT_1@ / tma_info_core_core_clks", + "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", + "MetricName": "tma_port_1", + "MetricThreshold": "tma_port_1 > 0.6", + "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU). Sample with: UOPS_DISPATCHED.PORT_1. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_int_vector_128b, tma_int_vector_256b, tma_port_0, tma_port_5, tma_port_6, tma_ports_utilized_2", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU)", + "MetricExpr": "cpu_core@UOPS_DISPATCHED.PORT_6@ / tma_info_core_core_clks", + "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", + "MetricName": "tma_port_6", + "MetricThreshold": "tma_port_6 > 0.6", + "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_int_vector_128b, tma_int_vector_256b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", + "MetricExpr": "((tma_ports_utilized_0 * tma_info_thread_clks + (cpu_core@EXE_ACTIVITY.1_PORTS_UTIL@ + tma_retiring * cpu_core@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@)) / tma_info_thread_clks if cpu_core@ARITH.DIV_ACTIVE@ < cpu_core@CYCLE_ACTIVITY.STALLS_TOTAL@ - cpu_core@EXE_ACTIVITY.BOUND_ON_LOADS@ else (cpu_core@EXE_ACTIVITY.1_PORTS_UTIL@ + tma_retiring * cpu_core@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@) / tma_info_thread_clks)", + "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", + "MetricName": "tma_ports_utilization", + "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", + "PublicDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related). Two distinct categories can be attributed into this metric: (1) heavy data-dependency among contiguous instructions would manifest in this metric - such cases are often referred to as low Instruction Level Parallelism (ILP). (2) Contention on some hardware execution unit other than Divider. 
For example; when there are too many multiply operations.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "max((cpu_core@EXE_ACTIVITY.EXE_BOUND_0_PORTS@ + max(cpu_core@RS.EMPTY_RESOURCE@ - cpu_core@RESOURCE_STALLS.SCOREBOARD@, 0)) / tma_info_thread_clks, 1) * (cpu_core@CYCLE_ACTIVITY.STALLS_TOTAL@ - cpu_core@EXE_ACTIVITY.BOUND_ON_LOADS@) / tma_info_thread_clks", + "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_0", + "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise). Long-latency instructions like divides may contribute to this metric.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricExpr": "cpu_core@EXE_ACTIVITY.1_PORTS_UTIL@ / tma_info_thread_clks", + "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_1", + "MetricThreshold": "tma_ports_utilized_1 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). This can be due to heavy data-dependency among software instructions; or over oversubscribing a particular hardware resource. In some other cases with high 1_Port_Utilized and L1_Bound; this metric can point to L1 data-cache latency bottleneck that may not necessarily manifest with complete execution starvation (due to the short L1 latency e.g. walking a linked list) - looking at the assembly can be helpful. Sample with: EXE_ACTIVITY.1_PORTS_UTIL. Related metrics: tma_l1_bound", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", + "MetricExpr": "cpu_core@EXE_ACTIVITY.2_PORTS_UTIL@ / tma_info_thread_clks", + "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_2", + "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop. Sample with: EXE_ACTIVITY.2_PORTS_UTIL. 
Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_int_vector_128b, tma_int_vector_256b, tma_port_0, tma_port_1, tma_port_5, tma_port_6", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", + "MetricExpr": "cpu_core@UOPS_EXECUTED.CYCLES_GE_3@ / tma_info_thread_clks", + "MetricGroup": "BvCB;PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", + "MetricName": "tma_ports_utilized_3m", + "MetricThreshold": "tma_ports_utilized_3m > 0.4 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Sample with: UOPS_EXECUTED.CYCLES_GE_3", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", + "MetricExpr": "cpu_core@topdown\\-retiring@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", + "MetricGroup": "BvUW;TmaL1;TopdownL1;tma_L1_group", + "MetricName": "tma_retiring", + "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", + "MetricgroupNoGroup": "TopdownL1", + "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.SLOTS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations", + "MetricExpr": "cpu_core@RESOURCE_STALLS.SCOREBOARD@ / tma_info_thread_clks + tma_c02_wait", + "MetricGroup": "BvIO;PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group;tma_issueSO", + "MetricName": "tma_serializing_operation", + "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", + "PublicDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations. Instructions like CPUID; WRMSR or LFENCE serialize the out-of-order execution which may limit performance. Sample with: RESOURCE_STALLS.SCOREBOARD. 
Related metrics: tma_ms_switches", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of slots where the CPU was retiring Shuffle operations of 256-bit vector size (FP or Integer)", + "MetricExpr": "tma_light_operations * cpu_core@INT_VEC_RETIRED.SHUFFLES@ / (tma_retiring * tma_info_thread_slots)", + "MetricGroup": "HPC;Pipeline;TopdownL4;tma_L4_group;tma_other_light_ops_group", + "MetricName": "tma_shuffles_256b", + "MetricThreshold": "tma_shuffles_256b > 0.1 & (tma_other_light_ops > 0.3 & tma_light_operations > 0.6)", + "PublicDescription": "This metric represents fraction of slots where the CPU was retiring Shuffle operations of 256-bit vector size (FP or Integer). Shuffles may incur slow cross \"vector lane\" data transfers.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", + "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.PAUSE@ / tma_info_thread_clks", + "MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group", + "MetricName": "tma_slow_pause", + "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions. Sample with: CPU_CLK_UNHALTED.PAUSE_INST", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary", + "MetricExpr": "cpu_core@MEM_INST_RETIRED.SPLIT_LOADS@ * min(cpu_core@MEM_INST_RETIRED.SPLIT_LOADS@R, tma_info_memory_load_miss_real_latency) / tma_info_thread_clks", + "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", + "MetricName": "tma_split_loads", + "MetricThreshold": "tma_split_loads > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary. Sample with: MEM_INST_RETIRED.SPLIT_LOADS_PS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents rate of split store accesses", + "MetricExpr": "cpu_core@MEM_INST_RETIRED.SPLIT_STORES@ * min(cpu_core@MEM_INST_RETIRED.SPLIT_STORES@R, 1) / tma_info_thread_clks", + "MetricGroup": "TopdownL4;tma_L4_group;tma_issueSpSt;tma_store_bound_group", + "MetricName": "tma_split_stores", + "MetricThreshold": "tma_split_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric represents rate of split store accesses. Consider aligning your data to the 64-byte cache line granularity. Sample with: MEM_INST_RETIRED.SPLIT_STORES_PS. 
Related metrics: tma_port_4", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", + "MetricExpr": "(cpu_core@XQ.FULL_CYCLES@ + cpu_core@L1D_PEND_MISS.L2_STALLS@) / tma_info_thread_clks", + "MetricGroup": "BvMS;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", + "MetricName": "tma_sq_full", + "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", + "MetricExpr": "cpu_core@EXE_ACTIVITY.BOUND_ON_STORES@ / tma_info_thread_clks", + "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricName": "tma_store_bound", + "MetricThreshold": "tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", + "PublicDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should RFO stores be a bottleneck. Sample with: MEM_INST_RETIRED.ALL_STORES_PS", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", + "MetricExpr": "13 * cpu_core@LD_BLOCKS.STORE_FORWARD@ / tma_info_thread_clks", + "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", + "MetricName": "tma_store_fwd_blk", + "MetricThreshold": "tma_store_fwd_blk > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores. To streamline memory operations in the pipeline; a load can avoid waiting for memory if a prior in-flight store is writing the data that the load wants to read (store forwarding process). However; in some cases the load may be blocked for a significant time pending the store forward. 
For example; when the prior store is writing a smaller region than the load is reading.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", + "MetricExpr": "(cpu_core@MEM_STORE_RETIRED.L2_HIT@ * 10 * (1 - cpu_core@MEM_INST_RETIRED.LOCK_LOADS@ / cpu_core@MEM_INST_RETIRED.ALL_STORES@) + (1 - cpu_core@MEM_INST_RETIRED.LOCK_LOADS@ / cpu_core@MEM_INST_RETIRED.ALL_STORES@) * min(cpu_core@CPU_CLK_UNHALTED.THREAD@, cpu_core@OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO@)) / tma_info_thread_clks", + "MetricGroup": "BvML;MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_issueSL;tma_store_bound_group", + "MetricName": "tma_store_latency", + "MetricThreshold": "tma_store_latency > 0.1 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses. Store accesses usually less impact out-of-order core performance; however; holding resources for longer time can lead into undesired implications (e.g. contention on L1D fill-buffer entries - see FB_Full). Related metrics: tma_fb_full, tma_lock_latency", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations", + "MetricExpr": "(cpu_core@UOPS_DISPATCHED.PORT_4_9@ + cpu_core@UOPS_DISPATCHED.PORT_7_8@) / (4 * tma_info_core_core_clks)", + "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", + "MetricName": "tma_store_op_utilization", + "MetricThreshold": "tma_store_op_utilization > 0.6", + "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations. 
Sample with: UOPS_DISPATCHED.PORT_7_8", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric roughly estimates the fraction of cycles where the TLB was missed by store accesses, hitting in the second-level TLB (STLB)", + "MetricExpr": "max(0, tma_dtlb_store - tma_store_stlb_miss)", + "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_store_group", + "MetricName": "tma_store_stlb_hit", + "MetricThreshold": "tma_store_stlb_hit > 0.05 & (tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates the fraction of cycles where the STLB was missed by store accesses, performing a hardware page walk", + "MetricExpr": "cpu_core@DTLB_STORE_MISSES.WALK_ACTIVE@ / tma_info_core_core_clks", + "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_store_group", + "MetricName": "tma_store_stlb_miss", + "MetricThreshold": "tma_store_stlb_miss > 0.05 & (tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric estimates how often CPU was stalled due to Streaming store memory accesses; Streaming store optimize out a read request required by RFO stores", + "MetricExpr": "9 * cpu_core@OCR.STREAMING_WR.ANY_RESPONSE@ / tma_info_thread_clks", + "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueSmSt;tma_store_bound_group", + "MetricName": "tma_streaming_stores", + "MetricThreshold": "tma_streaming_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", + "PublicDescription": "This metric estimates how often CPU was stalled due to Streaming store memory accesses; Streaming store optimize out a read request required by RFO stores. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should Streaming stores be a bottleneck. Sample with: OCR.STREAMING_WR.ANY_RESPONSE. Related metrics: tma_fb_full", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears", + "MetricExpr": "cpu_core@INT_MISC.UNKNOWN_BRANCH_CYCLES@ / tma_info_thread_clks", + "MetricGroup": "BigFootprint;BvBC;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", + "MetricName": "tma_unknown_branches", + "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit) hence called Unknown Branches. 
Sample with: FRONTEND_RETIRED.UNKNOWN_BRANCH", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This metric serves as an approximation of legacy x87 usage", + "MetricExpr": "tma_retiring * cpu_core@UOPS_EXECUTED.X87@ / cpu_core@UOPS_EXECUTED.THREAD@", + "MetricGroup": "Compute;TopdownL4;tma_L4_group;tma_fp_arith_group", + "MetricName": "tma_x87_use", + "MetricThreshold": "tma_x87_use > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", + "PublicDescription": "This metric serves as an approximation of legacy x87 usage. It accounts for instructions beyond X87 FP arithmetic operations; hence may be used as a thermometer to avoid X87 high usage and preferably upgrade to modern ISA. See Tip under Tuning Hint.", + "ScaleUnit": "100%", + "Unit": "cpu_core" + } +] -- cgit v1.2.3 From d546e3acf3526eaf1d74902236c7219a424ac2e9 Mon Sep 17 00:00:00 2001 From: Weilin Wang Date: Sat, 20 Jul 2024 02:20:59 -0400 Subject: perf stat: Add command line option for enabling TPEBS recording With this command line option, TPEBS recording is turned off in 'perf stat' on default. It will only be turned on when this option is given in 'perf stat' command. Example with --record-tpebs: perf stat -M tma_split_loads -C1-4 --record-tpebs sleep 1 [ perf record: Woken up 2 times to write data ] [ perf record: Captured and wrote 0.044 MB - ] Performance counter stats for 'CPU(s) 1-4': 53,259,156,071 cpu_core/TOPDOWN.SLOTS/ # 1.6 % tma_split_loads (50.00%) 15,867,565,250 cpu_core/topdown-retiring/ (50.00%) 15,655,580,731 cpu_core/topdown-mem-bound/ (50.00%) 11,738,022,218 cpu_core/topdown-bad-spec/ (50.00%) 6,151,265,424 cpu_core/topdown-fe-bound/ (50.00%) 20,445,917,581 cpu_core/topdown-be-bound/ (50.00%) 6,925,098,013 cpu_core/L1D_PEND_MISS.PENDING/ (50.00%) 3,838,653,421 cpu_core/MEMORY_ACTIVITY.STALLS_L1D_MISS/ (50.00%) 4,797,059,783 cpu_core/EXE_ACTIVITY.BOUND_ON_LOADS/ (50.00%) 11,931,916,714 cpu_core/CPU_CLK_UNHALTED.THREAD/ (50.00%) 102,576,164 cpu_core/MEM_LOAD_COMPLETED.L1_MISS_ANY/ (50.00%) 64,071,854 cpu_core/MEM_INST_RETIRED.SPLIT_LOADS/ (50.00%) 3 cpu_core/MEM_INST_RETIRED.SPLIT_LOADS/R 1.003049679 seconds time elapsed Example without --record-tpebs: perf stat -M tma_contested_accesses -C1 sleep 1 Performance counter stats for 'CPU(s) 1': 50,203,891 cpu_core/TOPDOWN.SLOTS/ # 0.0 % tma_contested_accesses (63.60%) 10,040,777 cpu_core/topdown-retiring/ (63.60%) 6,890,729 cpu_core/topdown-mem-bound/ (63.60%) 2,756,463 cpu_core/topdown-bad-spec/ (63.60%) 10,828,288 cpu_core/topdown-fe-bound/ (63.60%) 28,350,432 cpu_core/topdown-be-bound/ (63.60%) 98 cpu_core/OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM/ (63.70%) 577,520 cpu_core/MEMORY_ACTIVITY.STALLS_L2_MISS/ (54.62%) 313,339 cpu_core/MEMORY_ACTIVITY.STALLS_L3_MISS/ (54.62%) 14,155 cpu_core/MEM_LOAD_RETIRED.L1_MISS/ (45.54%) 0 cpu_core/OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD/ (36.30%) 8,468,077 cpu_core/CPU_CLK_UNHALTED.THREAD/ (45.38%) 198 cpu_core/MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS/ (45.38%) 8,324 cpu_core/MEM_LOAD_RETIRED.FB_HIT/ (45.38%) 3,388,031,520 TSC 23,226,785 cpu_core/CPU_CLK_UNHALTED.REF_TSC/ (54.46%) 80 cpu_core/MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD/ (54.46%) 0 cpu_core/MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD/R 0 cpu_core/MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS/R 1,006,816,667 ns duration_time 1.002537737 seconds time elapsed Reviewed-by: Namhyung Kim Signed-off-by: Weilin Wang Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Caleb Biggers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Perry Taylor Cc: Peter Zijlstra Cc: 
Samantha Alt Link: https://lore.kernel.org/r/20240720062102.444578-7-weilin.wang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-stat.txt | 8 ++++++++ tools/perf/builtin-stat.c | 4 ++++ 2 files changed, 12 insertions(+) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index 29756a87ab6f..2bc063672486 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -498,6 +498,14 @@ To interpret the results it is usually needed to know on which CPUs the workload runs on. If needed the CPUs can be forced using taskset. +--record-tpebs:: +Enable automatic sampling on Intel TPEBS retire_latency events (event with :R +modifier). Without this option, perf would not capture dynamic retire_latency +at runtime. Currently, a zero value is assigned to the retire_latency event when +this option is not set. The TPEBS hardware feature starts from Intel Granite +Rapids microarchitecture. This option only exists in X86_64 and is meaningful on +Intel platforms with TPEBS feature. + --td-level:: Print the top-down statistics that equal the input level. It allows users to print the interested top-down metrics level instead of the diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 6162b98fc941..cf985cdb9a6e 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -2475,6 +2475,10 @@ int cmd_stat(int argc, const char **argv) "disable adding events for the metric threshold calculation"), OPT_BOOLEAN(0, "topdown", &topdown_run, "measure top-down statistics"), +#ifdef HAVE_ARCH_X86_64_SUPPORT + OPT_BOOLEAN(0, "record-tpebs", &tpebs_recording, + "enable recording for tpebs when retire_latency required"), +#endif OPT_UINTEGER(0, "td-level", &stat_config.topdown_level, "Set the metrics level for the top-down statistics (0: max level)"), OPT_BOOLEAN(0, "smi-cost", &smi_cost, -- cgit v1.2.3 From 169f18fd980cea90804bd8bc5a5614335fa0c8ae Mon Sep 17 00:00:00 2001 From: Weilin Wang Date: Sat, 20 Jul 2024 02:21:00 -0400 Subject: perf Document: Add TPEBS (Timed PEBS(Precise Event-Based Sampling)) to Documents TPEBS (Timed PEBS (Precise Event-Based Sampling)) is a new Intel PMU feature available from the Granite Rapids microarchitecture onward. It will be used in new TMA (Top-Down Microarchitecture Analysis) releases. Add a related introduction to the documents while adding the new code that supports it in 'perf stat'. Reviewed-by: Namhyung Kim Signed-off-by: Weilin Wang Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Caleb Biggers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Perry Taylor Cc: Peter Zijlstra Cc: Samantha Alt Link: https://lore.kernel.org/r/20240720062102.444578-8-weilin.wang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-list.txt | 1 + tools/perf/Documentation/topdown.txt | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt index 6bf2468f59d3..dea005410ec0 100644 --- a/tools/perf/Documentation/perf-list.txt +++ b/tools/perf/Documentation/perf-list.txt @@ -72,6 +72,7 @@ counted.
The following modifiers exist: W - group is weak and will fallback to non-group if not schedulable, e - group or event are exclusive and do not share the PMU b - use BPF aggregration (see perf stat --bpf-counters) + R - retire latency value of the event The 'p' modifier can be used for specifying how precise the instruction address should be. The 'p' modifier can be specified multiple times: diff --git a/tools/perf/Documentation/topdown.txt b/tools/perf/Documentation/topdown.txt index ae0aee86844f..5c17fff694ee 100644 --- a/tools/perf/Documentation/topdown.txt +++ b/tools/perf/Documentation/topdown.txt @@ -325,6 +325,36 @@ other four level 2 metrics by subtracting corresponding metrics as below. Fetch_Bandwidth = Frontend_Bound - Fetch_Latency Core_Bound = Backend_Bound - Memory_Bound +TPEBS in TopDown +================ + +TPEBS (Timed PEBS) is one of the new Intel PMU features provided since Granite +Rapids microarchitecture. The TPEBS feature adds a 16 bit retire_latency field +in the Basic Info group of the PEBS record. It records the Core cycles since the +retirement of the previous instruction to the retirement of current instruction. +Please refer to Section 8.4.1 of "Intel® Architecture Instruction Set Extensions +Programming Reference" for more details about this feature. Because this feature +extends PEBS record, sampling with weight option is required to get the +retire_latency value. + + perf record -e event_name -W ... + +In the most recent release of TMA, the metrics begin to use event retire_latency +values in some of the metrics’ formulas on processors that support TPEBS feature. +For previous generations that do not support TPEBS, the values are static and +predefined per processor family by the hardware architects. Due to the diversity +of workloads in execution environments, retire_latency values measured at real +time are more accurate. Therefore, new TMA metrics that use TPEBS will provide +more accurate performance analysis results. + +To support TPEBS in TMA metrics, a new modifier :R on event is added. Perf would +capture retire_latency value of required events(event with :R in metric formula) +with perf record. The retire_latency value would be used in metric calculation. +Currently, this feature is supported through perf stat + + perf stat -M metric_name --record-tpebs ... + + [1] https://software.intel.com/en-us/top-down-microarchitecture-analysis-method-win [2] https://sites.google.com/site/analysismethods/yasin-pubs -- cgit v1.2.3 From b2738fda24543777a623a7d1cc2a9985ab81b448 Mon Sep 17 00:00:00 2001 From: Weilin Wang Date: Sat, 20 Jul 2024 02:21:01 -0400 Subject: perf test: Add test for Intel TPEBS counting mode Intel TPEBS sampling mode is supported through perf record. The counting mode code uses perf record to capture retire_latency value and use it in metric calculation. This test checks the counting mode code on Intel platforms. 
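To make the retire_latency plumbing concrete, recall the tma_split_loads expression from the metric JSON earlier in this series:

  MEM_INST_RETIRED.SPLIT_LOADS * min(MEM_INST_RETIRED.SPLIT_LOADS:R, tma_info_memory_load_miss_real_latency) / tma_info_thread_clks

In the --record-tpebs example further above, perf record sampled a retire latency of 3 cycles for MEM_INST_RETIRED.SPLIT_LOADS, so the min() term of the metric is fed with that measured value. Without the option, the :R term is currently zero, and on pre-TPEBS parts it would be a static constant predefined per processor family.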
Committer testing: root@x1:~# perf test tpebs 123: test Intel TPEBS counting mode : Ok root@x1:~# set -o vi root@x1:~# perf test tpebs 123: test Intel TPEBS counting mode : Ok root@x1:~# perf test -v tpebs 123: test Intel TPEBS counting mode : Ok root@x1:~# perf test -vvv tpebs 123: test Intel TPEBS counting mode: --- start --- test child forked, pid 16603 Testing without --record-tpebs Testing with --record-tpebs ---- end(0) ---- 123: test Intel TPEBS counting mode : Ok root@x1:~# Reviewed-by: Namhyung Kim Signed-off-by: Weilin Wang Acked-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Caleb Biggers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Perry Taylor Cc: Peter Zijlstra Cc: Samantha Alt Link: https://lore.kernel.org/r/20240720062102.444578-9-weilin.wang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/test_stat_intel_tpebs.sh | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100755 tools/perf/tests/shell/test_stat_intel_tpebs.sh (limited to 'tools') diff --git a/tools/perf/tests/shell/test_stat_intel_tpebs.sh b/tools/perf/tests/shell/test_stat_intel_tpebs.sh new file mode 100755 index 000000000000..c60b29add980 --- /dev/null +++ b/tools/perf/tests/shell/test_stat_intel_tpebs.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# test Intel TPEBS counting mode +# SPDX-License-Identifier: GPL-2.0 + +set -e +grep -q GenuineIntel /proc/cpuinfo || { echo Skipping non-Intel; exit 2; } + +# Use this event for testing because it should exist in all platforms +event=cache-misses:R + +# Without this cmd option, default value or zero is returned +echo "Testing without --record-tpebs" +result=$(perf stat -e "$event" true 2>&1) +[[ "$result" =~ $event ]] || exit 1 + +# In platforms that do not support TPEBS, it should execute without error. +echo "Testing with --record-tpebs" +result=$(perf stat -e "$event" --record-tpebs -a sleep 0.01 2>&1) +[[ "$result" =~ "perf record" && "$result" =~ $event ]] || exit 1 -- cgit v1.2.3 From 1a9d080d19c3303569614ed4c3b43a39aacd90d7 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 15:41:19 -0700 Subject: perf callchain: Add a for_each callback style API Add a for_each callback style API to callchain with sample__for_each_callchain_node(). Possibly in the future such an API can avoid the overhead of constructing the call chain list. Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Casey Chen Cc: Colin Ian King Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Peter Zijlstra Cc: Tom Zanussi Link: https://lore.kernel.org/r/20240812224119.744968-1-irogers@google.com [ Split from a larger patch that introduced the API and use it ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/callchain.c | 35 +++++++++++++++++++++++++++++++++++ tools/perf/util/callchain.h | 6 ++++++ 2 files changed, 41 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 6d075648d2cc..0d608e875fe9 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -1797,3 +1797,38 @@ s64 callchain_avg_cycles(struct callchain_node *cnode) return cycles; } + +int sample__for_each_callchain_node(struct thread *thread, struct evsel *evsel, + struct perf_sample *sample, int max_stack, + callchain_iter_fn cb, void *data) +{ + struct callchain_cursor *cursor = get_tls_callchain_cursor(); + int ret; + + if (!cursor) + return -ENOMEM; + + /* Fill in the callchain. 
*/ + ret = thread__resolve_callchain(thread, cursor, evsel, sample, + /*parent=*/NULL, /*root_al=*/NULL, + max_stack); + if (ret) + return ret; + + /* Switch from writing the callchain to reading it. */ + callchain_cursor_commit(cursor); + + while (1) { + struct callchain_cursor_node *node = callchain_cursor_current(cursor); + + if (!node) + break; + + ret = cb(node, data); + if (ret) + return ret; + + callchain_cursor_advance(cursor); + } + return 0; +} diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index d5c66345ae31..76891f8e2373 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -311,4 +311,10 @@ u64 callchain_total_hits(struct hists *hists); s64 callchain_avg_cycles(struct callchain_node *cnode); +typedef int (*callchain_iter_fn)(struct callchain_cursor_node *node, void *data); + +int sample__for_each_callchain_node(struct thread *thread, struct evsel *evsel, + struct perf_sample *sample, int max_stack, + callchain_iter_fn cb, void *data); + #endif /* __PERF_CALLCHAIN_H */ -- cgit v1.2.3 From 3d557dd3f54e329556e880129d0181988893b00f Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 15:41:19 -0700 Subject: perf inject: Inject build ids for entire call chain The DSO build id is injected when the dso is first encountered but the checking for first encountered only looks at the sample->ip not the entire callchain. Use the callchain logic to ensure all build ids are inserted. Fixes: 454c407ec17a0c63 ("perf: add perf-inject builtin") Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Casey Chen Cc: Colin Ian King Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Peter Zijlstra Cc: Tom Zanussi Link: https://lore.kernel.org/r/20240812224119.744968-1-irogers@google.com [ Split from a larger patch that introduced the API and use it ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-inject.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'tools') diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index ef9cba173dd2..a35bde3f3c09 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -743,6 +743,29 @@ static int dso__inject_build_id(struct dso *dso, const struct perf_tool *tool, return 0; } +struct mark_dso_hit_args { + const struct perf_tool *tool; + struct machine *machine; + u8 cpumode; +}; + +static int mark_dso_hit_callback(struct callchain_cursor_node *node, void *data) +{ + struct mark_dso_hit_args *args = data; + struct map *map = node->ms.map; + + if (map) { + struct dso *dso = map__dso(map); + + if (dso && !dso__hit(dso)) { + dso__set_hit(dso); + dso__inject_build_id(dso, args->tool, args->machine, + args->cpumode, map__flags(map)); + } + } + return 0; +} + int perf_event__inject_buildid(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel __maybe_unused, @@ -750,6 +773,11 @@ int perf_event__inject_buildid(const struct perf_tool *tool, union perf_event *e { struct addr_location al; struct thread *thread; + struct mark_dso_hit_args args = { + .tool = tool, + .machine = machine, + .cpumode = sample->cpumode, + }; addr_location__init(&al); thread = machine__findnew_thread(machine, sample->pid, sample->tid); @@ -769,6 +797,9 @@ int perf_event__inject_buildid(const struct perf_tool *tool, union perf_event *e } } + sample__for_each_callchain_node(thread, evsel, sample, PERF_MAX_STACK_DEPTH, + mark_dso_hit_callback, &args); + 
thread__put(thread); repipe: perf_event__repipe(tool, event, sample, machine); -- cgit v1.2.3 From 92ec8b9367ab328cdd3d4409464137bc9775a2e5 Mon Sep 17 00:00:00 2001 From: Asbjørn Sloth Tønnesen Date: Thu, 8 Aug 2024 07:59:02 +0000 Subject: selftests/bpf: Avoid subtraction after htons() in ipip tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On little-endian systems, doing subtraction after htons() leads to interesting results: Given: MAGIC_BYTES = 123 = 0x007B aka. in big endian: 0x7B00 = 31488 sizeof(struct iphdr) = 20 Before this patch: __bpf_constant_htons(MAGIC_BYTES) - sizeof(struct iphdr) = 0x7AEC 0x7AEC = htons(0xEC7A) = htons(60538) So these were outer IP packets with a total length of 123 bytes, containing an inner IP packet with a total length of 60538 bytes. After this patch: __bpf_constant_htons(MAGIC_BYTES - sizeof(struct iphdr)) = htons(103) Now these packets are outer IP packets with a total length of 123 bytes, containing an inner IP packet with a total length of 103 bytes. Signed-off-by: Asbjørn Sloth Tønnesen Reviewed-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20240808075906.1849564-1-ast@fiberby.net Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/prog_tests/flow_dissector.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index 9e5f38739104..6b3078dd5645 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -378,8 +378,8 @@ struct test tests[] = { .iph_inner.ihl = 5, .iph_inner.protocol = IPPROTO_TCP, .iph_inner.tot_len = - __bpf_constant_htons(MAGIC_BYTES) - - sizeof(struct iphdr), + __bpf_constant_htons(MAGIC_BYTES - + sizeof(struct iphdr)), .tcp.doff = 5, .tcp.source = 80, .tcp.dest = 8080, @@ -407,8 +407,8 @@ struct test tests[] = { .iph_inner.ihl = 5, .iph_inner.protocol = IPPROTO_TCP, .iph_inner.tot_len = - __bpf_constant_htons(MAGIC_BYTES) - - sizeof(struct iphdr), + __bpf_constant_htons(MAGIC_BYTES - + sizeof(struct iphdr)), .tcp.doff = 5, .tcp.source = 80, .tcp.dest = 8080, @@ -436,8 +436,8 @@ struct test tests[] = { .iph_inner.ihl = 5, .iph_inner.protocol = IPPROTO_TCP, .iph_inner.tot_len = - __bpf_constant_htons(MAGIC_BYTES) - - sizeof(struct iphdr), + __bpf_constant_htons(MAGIC_BYTES - + sizeof(struct iphdr)), .tcp.doff = 5, .tcp.source = 99, .tcp.dest = 9090, -- cgit v1.2.3 From 1c5144a066fd17eba681b3d6b0f86bda8e06fbfa Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:37:50 -0700 Subject: torture: Add torture.sh --guest-cpu-limit argument for limited hosts Some servers have limitations on the number of CPUs a given guest OS can use. In my earlier experience, such limitations have been at least half of the host's CPUs, but in a recent example, this limit is less than 40%. This commit therefore adds a --guest-cpu-limit argument that allows such low limits to be made known to torture.sh. Signed-off-by: "Paul E. 
McKenney" Signed-off-by: Neeraj Upadhyay --- tools/testing/selftests/rcutorture/bin/torture.sh | 38 ++++++++++++++++------- 1 file changed, 27 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index 990d24696fd3..0447c4a00cc4 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -19,10 +19,10 @@ PATH=${RCUTORTURE}/bin:$PATH; export PATH TORTURE_ALLOTED_CPUS="`identify_qemu_vcpus`" MAKE_ALLOTED_CPUS=$((TORTURE_ALLOTED_CPUS*2)) -HALF_ALLOTED_CPUS=$((TORTURE_ALLOTED_CPUS/2)) -if test "$HALF_ALLOTED_CPUS" -lt 1 +SCALE_ALLOTED_CPUS=$((TORTURE_ALLOTED_CPUS/2)) +if test "$SCALE_ALLOTED_CPUS" -lt 1 then - HALF_ALLOTED_CPUS=1 + SCALE_ALLOTED_CPUS=1 fi VERBOSE_BATCH_CPUS=$((TORTURE_ALLOTED_CPUS/16)) if test "$VERBOSE_BATCH_CPUS" -lt 2 @@ -90,6 +90,7 @@ usage () { echo " --do-scftorture / --do-no-scftorture / --no-scftorture" echo " --do-srcu-lockdep / --do-no-srcu-lockdep / --no-srcu-lockdep" echo " --duration [ | h | d ]" + echo " --guest-cpu-limit N" echo " --kcsan-kmake-arg kernel-make-arguments" exit 1 } @@ -203,6 +204,21 @@ do duration_base=$(($ts*mult)) shift ;; + --guest-cpu-limit|--guest-cpu-lim) + checkarg --guest-cpu-limit "(number)" "$#" "$2" '^[0-9]*$' '^--' + if (("$2" <= "$TORTURE_ALLOTED_CPUS" / 2)) + then + SCALE_ALLOTED_CPUS="$2" + VERBOSE_BATCH_CPUS="$((SCALE_ALLOTED_CPUS/8))" + if (("$VERBOSE_BATCH_CPUS" < 2)) + then + VERBOSE_BATCH_CPUS=0 + fi + else + echo "Ignoring value of $2 for --guest-cpu-limit which is greater than (("$TORTURE_ALLOTED_CPUS" / 2))." + fi + shift + ;; --kcsan-kmake-arg|--kcsan-kmake-args) checkarg --kcsan-kmake-arg "(kernel make arguments)" $# "$2" '.*' '^error$' kcsan_kmake_args="`echo "$kcsan_kmake_args $2" | sed -e 's/^ *//' -e 's/ *$//'`" @@ -425,9 +441,9 @@ fi if test "$do_scftorture" = "yes" then # Scale memory based on the number of CPUs. 
- scfmem=$((3+HALF_ALLOTED_CPUS/16)) - torture_bootargs="scftorture.nthreads=$HALF_ALLOTED_CPUS torture.disable_onoff_at_boot csdlock_debug=1" - torture_set "scftorture" tools/testing/selftests/rcutorture/bin/kvm.sh --torture scf --allcpus --duration "$duration_scftorture" --configs "$configs_scftorture" --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory ${scfmem}G --trust-make + scfmem=$((3+SCALE_ALLOTED_CPUS/16)) + torture_bootargs="scftorture.nthreads=$SCALE_ALLOTED_CPUS torture.disable_onoff_at_boot csdlock_debug=1" + torture_set "scftorture" tools/testing/selftests/rcutorture/bin/kvm.sh --torture scf --allcpus --duration "$duration_scftorture" --configs "$configs_scftorture" --kconfig "CONFIG_NR_CPUS=$SCALE_ALLOTED_CPUS" --memory ${scfmem}G --trust-make fi if test "$do_rt" = "yes" @@ -471,8 +487,8 @@ for prim in $primlist do if test -n "$firsttime" then - torture_bootargs="refscale.scale_type="$prim" refscale.nreaders=$HALF_ALLOTED_CPUS refscale.loops=10000 refscale.holdoff=20 torture.disable_onoff_at_boot" - torture_set "refscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --kconfig "CONFIG_TASKS_TRACE_RCU=y CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --bootargs "refscale.verbose_batched=$VERBOSE_BATCH_CPUS torture.verbose_sleep_frequency=8 torture.verbose_sleep_duration=$VERBOSE_BATCH_CPUS" --trust-make + torture_bootargs="refscale.scale_type="$prim" refscale.nreaders=$SCALE_ALLOTED_CPUS refscale.loops=10000 refscale.holdoff=20 torture.disable_onoff_at_boot" + torture_set "refscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --kconfig "CONFIG_TASKS_TRACE_RCU=y CONFIG_NR_CPUS=$SCALE_ALLOTED_CPUS" --bootargs "refscale.verbose_batched=$VERBOSE_BATCH_CPUS torture.verbose_sleep_frequency=8 torture.verbose_sleep_duration=$VERBOSE_BATCH_CPUS" --trust-make mv $T/last-resdir-nodebug $T/first-resdir-nodebug || : if test -f "$T/last-resdir-kasan" then @@ -520,8 +536,8 @@ for prim in $primlist do if test -n "$firsttime" then - torture_bootargs="rcuscale.scale_type="$prim" rcuscale.nwriters=$HALF_ALLOTED_CPUS rcuscale.holdoff=20 torture.disable_onoff_at_boot" - torture_set "rcuscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 5 --kconfig "CONFIG_TASKS_TRACE_RCU=y CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --trust-make + torture_bootargs="rcuscale.scale_type="$prim" rcuscale.nwriters=$SCALE_ALLOTED_CPUS rcuscale.holdoff=20 torture.disable_onoff_at_boot" + torture_set "rcuscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 5 --kconfig "CONFIG_TASKS_TRACE_RCU=y CONFIG_NR_CPUS=$SCALE_ALLOTED_CPUS" --trust-make mv $T/last-resdir-nodebug $T/first-resdir-nodebug || : if test -f "$T/last-resdir-kasan" then @@ -559,7 +575,7 @@ do_kcsan="$do_kcsan_save" if test "$do_kvfree" = "yes" then torture_bootargs="rcuscale.kfree_rcu_test=1 rcuscale.kfree_nthreads=16 rcuscale.holdoff=20 rcuscale.kfree_loops=10000 torture.disable_onoff_at_boot" - torture_set "rcuscale-kvfree" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration $duration_rcutorture --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory 2G --trust-make + torture_set "rcuscale-kvfree" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration $duration_rcutorture --kconfig "CONFIG_NR_CPUS=$SCALE_ALLOTED_CPUS" --memory 2G --trust-make fi if test "$do_clocksourcewd" = "yes" -- cgit v1.2.3 From 
3e49aea71d5bac93b892f954d14bc8070e4cc651 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 5 Jul 2024 19:57:47 -0700 Subject: refscale: Add TINY scenario This commit adds a TINY scenario in order to support tests of Tiny RCU and Tiny SRCU. Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- .../selftests/rcutorture/configs/refscale/TINY | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 tools/testing/selftests/rcutorture/configs/refscale/TINY (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/configs/refscale/TINY b/tools/testing/selftests/rcutorture/configs/refscale/TINY new file mode 100644 index 000000000000..759343980b80 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/refscale/TINY @@ -0,0 +1,20 @@ +CONFIG_SMP=n +CONFIG_PREEMPT_NONE=y +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=n +CONFIG_PREEMPT_DYNAMIC=n +#CHECK#CONFIG_PREEMPT_RCU=n +CONFIG_HZ_PERIODIC=n +CONFIG_NO_HZ_IDLE=y +CONFIG_NO_HZ_FULL=n +CONFIG_HOTPLUG_CPU=n +CONFIG_SUSPEND=n +CONFIG_HIBERNATION=n +CONFIG_RCU_NOCB_CPU=n +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_PROVE_LOCKING=n +CONFIG_RCU_BOOST=n +CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_RCU_EXPERT=y +CONFIG_KPROBES=n +CONFIG_FTRACE=n -- cgit v1.2.3 From 41f37c852ac3fbfd072a00281b60dc7ba056be8c Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 13 Jun 2024 07:12:10 +0900 Subject: selftests/ftrace: Add required dependency for kprobe tests kprobe_args_{char,string}.tc use the available_filter_functions file, which is provided by the function tracer. Thus, if the function tracer is disabled, these tests fail on recent kernels because adding a dynamic event no longer raises tracefs_create_dir events. Add available_filter_functions to the requires line. Fixes: 7c1130ea5cae ("test: ftrace: Fix kprobe test for eventfs") Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Shuah Khan --- tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_char.tc | 2 +- tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_char.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_char.tc index e21c9c27ece4..77f4c07cdcb8 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_char.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_char.tc @@ -1,7 +1,7 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0 # description: Kprobe event char type argument -# requires: kprobe_events +# requires: kprobe_events available_filter_functions case `uname -m` in x86_64) diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc index 93217d459556..39001073f7ed 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc @@ -1,7 +1,7 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0 # description: Kprobe event string type argument -# requires: kprobe_events +# requires: kprobe_events available_filter_functions case `uname -m` in x86_64) -- cgit v1.2.3 From a05031713d460de5ff8500680458ed8f3fd946e6 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 21:06:12 -0700 Subject: perf disasm: Fix memory leak for locked operations lock__parse() calls disasm_line__parse() passing &ops->locked.ins.name that will use strdup() to populate it.
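In other words, the parse step hands the ops a heap-allocated string whose ownership the delete path must honour. A minimal standalone sketch of the pattern (toy names, not the actual perf structures):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct toy_ops {
	char *name;	/* populated by the parse step via strdup() */
};

static int toy_parse(struct toy_ops *ops, const char *line)
{
	ops->name = strdup(line);	/* heap copy: ops now owns the string */
	return ops->name ? 0 : -1;
}

static void toy_delete(struct toy_ops *ops)
{
	free(ops->name);	/* omitting this leaks once per parsed line */
	ops->name = NULL;	/* mirrors what perf's zfree() does */
}

int main(void)
{
	struct toy_ops ops;

	if (toy_parse(&ops, "lock cmpxchg %rdx,(%rsi)") == 0)
		printf("parsed: %s\n", ops.name);
	toy_delete(&ops);
	return 0;
}
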
Ensure ops->locked.ins.name is freed in lock__delete(). Found with address/leak sanitizer. Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kajol Jain Cc: Kan Liang Cc: Mark Rutland Cc: Peter Zijlstra Cc: Thomas Richter Link: https://lore.kernel.org/r/20240813040613.882075-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/disasm.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 22289003e16d..226d2181f694 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -566,6 +566,7 @@ static void lock__delete(struct ins_operands *ops) ins_ops__delete(ops->locked.ops); zfree(&ops->locked.ops); + zfree(&ops->locked.ins.name); zfree(&ops->target.raw); zfree(&ops->target.name); } -- cgit v1.2.3 From 653ac51f53032df8a17fd5f9d6c7540a5e70f85c Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 12 Aug 2024 15:41:19 -0700 Subject: perf test annotate: Dump trapping test in trap handler Help better identify the location of test failures by dumping the failing test in the trap handler. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kajol Jain Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Richter Link: https://lore.kernel.org/r/20240813040613.882075-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/annotate.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/annotate.sh b/tools/perf/tests/shell/annotate.sh index b072d9b97387..2ccf4f1d46b6 100755 --- a/tools/perf/tests/shell/annotate.sh +++ b/tools/perf/tests/shell/annotate.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # perf annotate basic tests # SPDX-License-Identifier: GPL-2.0 @@ -28,6 +28,7 @@ cleanup() { } trap_cleanup() { + echo "Unexpected signal in ${FUNCNAME[1]}" cleanup exit 1 } -- cgit v1.2.3 From 3ef44458071a19e5b5832cdfe6f75273aa521b6e Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 13 Aug 2024 09:02:00 -0700 Subject: perf report: Fix --total-cycles --stdio output error The --total-cycles option may output wrong information with --stdio. For example: # perf record -e "{cycles,instructions}",cache-misses -b sleep 1 # perf report --total-cycles --stdio The total cycles output of {cycles,instructions} and cache-misses is almost the same. # Samples: 938 of events 'anon group { cycles, instructions }' # Event count (approx.): 938 # # Sampled Cycles% Sampled Cycles Avg Cycles% Avg Cycles [Program Block Range] # ............... .............. ........... .......... ..................................................> # 11.19% 2.6K 0.10% 21 [perf_iterate_ctx+48 -> > 5.79% 1.4K 0.45% 97 [__intel_pmu_enable_all.constprop.0+80 -> __intel_> 5.11% 1.2K 0.33% 71 [native_write_msr+0 ->> # Samples: 293 of event 'cache-misses' # Event count (approx.): 293 # # Sampled Cycles% Sampled Cycles Avg Cycles% Avg Cycles [Program Block Range] # ............... .............. ........... .......... ..................................................> # 11.19% 2.6K 0.13% 21 [perf_iterate_ctx+48 -> > 5.79% 1.4K 0.59% 97 [__intel_pmu_enable_all.constprop.0+80 -> __intel_> 5.11% 1.2K 0.43% 71 [native_write_msr+0 ->> With symbol_conf.event_group, 'perf report' should only report the block information of the leader event in a group.
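A simplified, self-contained model of the intended iteration (toy types, not the actual perf code) makes the indexing requirement clear: the per-event index must advance for every event, even for group members that are skipped, so the block report array stays aligned with the event list.

#include <stdio.h>

struct toy_evsel {
	const char *name;
	int is_group_leader;
};

int main(void)
{
	/* a {cycles,instructions} group followed by a standalone event */
	struct toy_evsel evlist[] = {
		{ "cycles", 1 }, { "instructions", 0 }, { "cache-misses", 1 },
	};
	/* one block report per event, built in the same order */
	const char *block_reports[] = {
		"blocks for {cycles,instructions}",
		"unused member slot",
		"blocks for cache-misses",
	};
	int i = 0;

	for (int k = 0; k < 3; k++) {
		i++;			/* advance for every event... */
		if (!evlist[k].is_group_leader)
			continue;	/* ...even when a group member is skipped */
		printf("%s -> %s\n", evlist[k].name, block_reports[i - 1]);
	}
	return 0;
}
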
However, the current implementation retrieves the next event's block information, rather than the next group leader's block information. Make sure the index is updated even if the event is skipped. With the patch, # Samples: 293 of event 'cache-misses' # Event count (approx.): 293 # # Sampled Cycles% Sampled Cycles Avg Cycles% Avg Cycles [Program Block Range] # ............... .............. ........... .......... ..................................................> # 37.98% 9.0K 4.05% 299 [perf_event_addr_filters_exec+0 -> perf_event_a> 11.19% 2.6K 0.28% 21 [perf_iterate_ctx+48 -> > 5.79% 1.4K 1.32% 97 [__intel_pmu_enable_all.constprop.0+80 -> __intel_> Fixes: 6f7164fa231a5f36 ("perf report: Sort by sampled cycles percent per block for stdio") Reviewed-by: Andi Kleen Signed-off-by: Kan Liang Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jin Yao Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240813160208.2493643-2-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index dfb47fa85e5c..04b9a5c1bc7e 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -565,6 +565,7 @@ static int evlist__tty_browse_hists(struct evlist *evlist, struct report *rep, c struct hists *hists = evsel__hists(pos); const char *evname = evsel__name(pos); + i++; if (symbol_conf.event_group && !evsel__is_group_leader(pos)) continue; @@ -574,7 +575,7 @@ static int evlist__tty_browse_hists(struct evlist *evlist, struct report *rep, c hists__fprintf_nr_sample_events(hists, rep, evname, stdout); if (rep->total_cycles_mode) { - report__browse_block_hists(&rep->block_reports[i++].hist, + report__browse_block_hists(&rep->block_reports[i - 1].hist, rep->min_percent, pos, NULL); continue; } -- cgit v1.2.3 From 183212a45e5fe66e7f6cbd360d186174b013d023 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 13 Aug 2024 09:02:01 -0700 Subject: perf report: Remove the first overflow check for branch counters A false overflow warning is triggered if a sample doesn't have any LBRs recorded and the branch counters feature is enabled. The current code does OVERFLOW_CHECK_u64() at the very beginning when reading the information of branch counters. It assumes that there is at least one LBR in the PEBS record. But a sample with zero LBRs recorded is also valid, especially under heavy context switching. Remove the OVERFLOW_CHECK_u64(). The later OVERFLOW_CHECK() should be good enough to check the overflow when reading the information of the branch counters.
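The shape of the problem can be shown in a few lines. A standalone sketch (toy parser, not the actual evsel code): the record carries a count followed by that many values, so the only safe bound check is against the size the payload actually occupies; demanding one extra word up front rejects a valid, empty payload that ends exactly at the buffer.

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* Toy parse of "nr entries, then nr counter values", PEBS-record style */
static int toy_parse(const uint64_t *buf, size_t len_words)
{
	const uint64_t *p = buf;
	const uint64_t *end = buf + len_words;
	uint64_t nr = *p++;	/* like branch_stack->nr; may legitimately be 0 */

	/*
	 * Wrong: insisting on one more u64 here (the equivalent of the
	 * removed OVERFLOW_CHECK_u64) fails when nr == 0 and the sample
	 * ends right after the count:
	 *
	 *	if (p + 1 > end) return -1;
	 */
	if (p + nr > end)	/* right: check only what nr values occupy */
		return -1;
	p += nr;		/* consume the branch counter values */
	return 0;
}

int main(void)
{
	uint64_t empty_sample[] = { 0 };	/* nr = 0: no LBRs recorded */

	printf("zero-LBR sample: %s\n",
	       toy_parse(empty_sample, 1) ? "false overflow" : "ok");
	return 0;
}
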
Fixes: 9fbb4b02302b0ae6 ("perf tools: Add branch counter knob") Reviewed-by: Andi Kleen Signed-off-by: Kan Liang Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Kan Liang Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240813160208.2493643-3-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index d607056b73c9..f22f402d54cc 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -2884,8 +2884,6 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event, array = (void *)array + sz; if (evsel__has_branch_counters(evsel)) { - OVERFLOW_CHECK_u64(array); - data->branch_stack_cntr = (u64 *)array; sz = data->branch_stack->nr * sizeof(u64); -- cgit v1.2.3 From 3a867a6dadb2d67b85cbf9bc67eca2428075a109 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 13 Aug 2024 09:02:02 -0700 Subject: perf evlist: Save branch counters information The branch counters logging (A.K.A LBR event logging) introduces a per-counter indication of precise event occurrences in LBRs. The kernel only dumps the number of occurrences into a record. The perf tool has to map the number to the corresponding event. Add evlist__update_br_cntr() to go through the evlist to pick the events that are configured to be logged. Assign a logical idx to track them, and add the total number of the events in the leader event. The total number will be used to allocate the space to save the branch counters for a block. The logical idx will be used to locate the corresponding event quickly in the following patches. It only needs to iterate the evlist once. The evsel__has_branch_counters() is also optimized. 
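A hedged sketch of the bookkeeping this introduces (simplified structures, not perf's own types): one pass over the list assigns each counting event a logical index and accumulates the per-group total on the leader, so the has-branch-counters query can test a cached counter instead of rescanning the whole group for every sample.

	struct ev {
		struct ev *leader;	/* group leader (may be the event itself) */
		int counts_branches;	/* PERF_SAMPLE_BRANCH_COUNTERS set */
		int br_cntr_idx;	/* logical index among counting events */
		int br_cntr_nr;		/* leader only: counting events in group */
	};

	/* Single pass; the return value is cached as the evlist-wide total. */
	static int update_br_cntr(struct ev **evlist, int n)
	{
		int idx = 0;

		for (int i = 0; i < n; i++) {
			if (!evlist[i]->counts_branches)
				continue;
			evlist[i]->br_cntr_idx = idx++;
			evlist[i]->leader->br_cntr_nr++;
		}
		return idx;
	}

	/* O(1) replacement for the old per-sample group scan. */
	static int has_branch_counters(const struct ev *e)
	{
		return e->leader && e->leader->br_cntr_nr > 0;
	}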
Reviewed-by: Andi Kleen Signed-off-by: Kan Liang Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240813160208.2493643-4-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 15 +++++++++++++++ tools/perf/util/evlist.h | 2 ++ tools/perf/util/evsel.c | 13 +++++++------ tools/perf/util/evsel.h | 8 ++++++++ 4 files changed, 32 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 4b2538e4f679..68bbd3ea771b 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -79,6 +79,7 @@ void evlist__init(struct evlist *evlist, struct perf_cpu_map *cpus, evlist->ctl_fd.fd = -1; evlist->ctl_fd.ack = -1; evlist->ctl_fd.pos = -1; + evlist->nr_br_cntr = -1; } struct evlist *evlist__new(void) @@ -1264,6 +1265,20 @@ u64 evlist__combined_branch_type(struct evlist *evlist) return branch_type; } +void evlist__update_br_cntr(struct evlist *evlist) +{ + struct evsel *evsel; + int i = 0; + + evlist__for_each_entry(evlist, evsel) { + if (evsel->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS) { + evsel->br_cntr_idx = i++; + evsel__leader(evsel)->br_cntr_nr++; + } + } + evlist->nr_br_cntr = i; +} + bool evlist__valid_read_format(struct evlist *evlist) { struct evsel *first = evlist__first(evlist), *pos = first; diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index cccc34da5a02..b46f1a320783 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -57,6 +57,7 @@ struct evlist { bool enabled; int id_pos; int is_pos; + int nr_br_cntr; u64 combined_sample_type; enum bkw_mmap_state bkw_mmap_state; struct { @@ -219,6 +220,7 @@ int evlist__apply_filters(struct evlist *evlist, struct evsel **err_evsel, u64 __evlist__combined_sample_type(struct evlist *evlist); u64 evlist__combined_sample_type(struct evlist *evlist); u64 evlist__combined_branch_type(struct evlist *evlist); +void evlist__update_br_cntr(struct evlist *evlist); bool evlist__sample_id_all(struct evlist *evlist); u16 evlist__id_hdr_size(struct evlist *evlist); diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index f22f402d54cc..38a74d6dde49 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -2636,17 +2636,18 @@ u64 evsel__bitfield_swap_branch_flags(u64 value) static inline bool evsel__has_branch_counters(const struct evsel *evsel) { - struct evsel *cur, *leader = evsel__leader(evsel); + struct evsel *leader = evsel__leader(evsel); /* The branch counters feature only supports group */ if (!leader || !evsel->evlist) return false; - evlist__for_each_entry(evsel->evlist, cur) { - if ((leader == evsel__leader(cur)) && - (cur->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS)) - return true; - } + if (evsel->evlist->nr_br_cntr < 0) + evlist__update_br_cntr(evsel->evlist); + + if (leader->br_cntr_nr > 0) + return true; + return false; } diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index a5da4b03bb1c..a393dae1dc96 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -148,6 +148,14 @@ struct evsel { */ __u64 synth_sample_type; + /* + * Store the branch counter related information. 
+ * br_cntr_idx: The idx of the branch counter event in the evlist + * br_cntr_nr: The number of the branch counter event in the group + * (Only available for the leader event) + */ + int br_cntr_idx; + int br_cntr_nr; /* * bpf_counter_ops serves two use cases: * 1. perf-stat -b counting events used byBPF programs -- cgit v1.2.3 From 1f2b7fbb04f53540090ac7d320194654f142443f Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 13 Aug 2024 09:02:03 -0700 Subject: perf annotate: Save branch counters for each block When annotating a basic block, it's useful to display the occurrences of other events in the block. The branch counter feature is only available for newer Intel platforms. So a dedicated option to display the branch counters is not introduced. Reuse the existing --total-cycles option, which triggers the annotation of a basic block and displays the cycle-related annotation. When the branch counters information is available, the branch counters are automatically appended after all the cycle-related annotation. Accounting the branch counters as well when accounting the cycles in hist__account_cycles(). In 'struct annotated_branch', introduce a br_cntr array to save the accumulation of each branch counter. In a sample, all the branch counters for a branch are saved in a u64 space. Because the saturation of a branch counter is small, e.g., for Intel Sierra Forest, the saturation is only 3. Add ANNOTATION__BR_CNTR_SATURATED_FLAG to indicate if a branch counter once saturated. That can be used to indicate a potential event lost because of the saturation. Reviewed-by: Andi Kleen Signed-off-by: Kan Liang Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240813160208.2493643-5-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 3 +- tools/perf/builtin-diff.c | 4 +-- tools/perf/builtin-report.c | 2 +- tools/perf/builtin-top.c | 4 +-- tools/perf/util/annotate.c | 68 +++++++++++++++++++++++++++++++++++-------- tools/perf/util/annotate.h | 10 ++++++- tools/perf/util/branch.h | 1 + tools/perf/util/hist.c | 5 ++-- tools/perf/util/hist.h | 2 +- tools/perf/util/machine.c | 3 ++ 10 files changed, 80 insertions(+), 22 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index d6f6ba5a569d..9b4a8a379b5b 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -221,7 +221,8 @@ static int process_branch_callback(struct evsel *evsel, if (a.map != NULL) dso__set_hit(map__dso(a.map)); - hist__account_cycles(sample->branch_stack, al, sample, false, NULL); + hist__account_cycles(sample->branch_stack, al, sample, false, + NULL, evsel); ret = hist_entry_iter__add(&iter, &a, PERF_MAX_STACK_DEPTH, ann); out: diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index 28c5208fcdc9..6983b5373372 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -431,8 +431,8 @@ static int diff__process_sample_event(const struct perf_tool *tool, goto out; } - hist__account_cycles(sample->branch_stack, &al, sample, false, - NULL); + hist__account_cycles(sample->branch_stack, &al, sample, + false, NULL, evsel); break; case COMPUTE_STREAM: diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 04b9a5c1bc7e..ab450644e5b3 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -328,7 +328,7 @@ static int process_sample_event(const struct 
perf_tool *tool, if (ui__has_annotation() || rep->symbol_ipc || rep->total_cycles_mode) { hist__account_cycles(sample->branch_stack, &al, sample, rep->nonany_branch_mode, - &rep->total_cycles); + &rep->total_cycles, evsel); } ret = hist_entry_iter__add(&iter, &al, rep->max_stack, rep); diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 1cdac4e190e9..881b861c35ee 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -735,8 +735,8 @@ static int hist_iter__top_callback(struct hist_entry_iter *iter, perf_top__record_precise_ip(top, iter->he, iter->sample, evsel, al->addr); hist__account_cycles(iter->sample->branch_stack, al, iter->sample, - !(top->record_opts.branch_stack & PERF_SAMPLE_BRANCH_ANY), - NULL); + !(top->record_opts.branch_stack & PERF_SAMPLE_BRANCH_ANY), + NULL, evsel); return 0; } diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index a87d2e97e10d..c269758d47c8 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -266,22 +266,30 @@ struct annotated_branch *annotation__get_branch(struct annotation *notes) return notes->branch; } -static struct cyc_hist *symbol__cycles_hist(struct symbol *sym) +static struct annotated_branch *symbol__find_branch_hist(struct symbol *sym, + unsigned int br_cntr_nr) { struct annotation *notes = symbol__annotation(sym); struct annotated_branch *branch; + const size_t size = symbol__size(sym); branch = annotation__get_branch(notes); if (branch == NULL) return NULL; if (branch->cycles_hist == NULL) { - const size_t size = symbol__size(sym); - branch->cycles_hist = calloc(size, sizeof(struct cyc_hist)); + if (!branch->cycles_hist) + return NULL; + } + + if (br_cntr_nr && branch->br_cntr == NULL) { + branch->br_cntr = calloc(br_cntr_nr * size, sizeof(u64)); + if (!branch->br_cntr) + return NULL; } - return branch->cycles_hist; + return branch; } struct annotated_source *symbol__hists(struct symbol *sym, int nr_hists) @@ -316,16 +324,44 @@ static int symbol__inc_addr_samples(struct map_symbol *ms, return src ? 
__symbol__inc_addr_samples(ms, src, evsel->core.idx, addr, sample) : 0; } -static int symbol__account_cycles(u64 addr, u64 start, - struct symbol *sym, unsigned cycles) +static int symbol__account_br_cntr(struct annotated_branch *branch, + struct evsel *evsel, + unsigned offset, + u64 br_cntr) +{ + unsigned int br_cntr_nr = evsel__leader(evsel)->br_cntr_nr; + unsigned int base = evsel__leader(evsel)->br_cntr_idx; + unsigned int width = evsel__env(evsel)->br_cntr_width; + unsigned int off = offset * evsel->evlist->nr_br_cntr; + unsigned int i, mask = (1L << width) - 1; + u64 *branch_br_cntr = branch->br_cntr; + + if (!br_cntr || !branch_br_cntr) + return 0; + + for (i = 0; i < br_cntr_nr; i++) { + u64 cntr = (br_cntr >> i * width) & mask; + + branch_br_cntr[off + i + base] += cntr; + if (cntr == mask) + branch_br_cntr[off + i + base] |= ANNOTATION__BR_CNTR_SATURATED_FLAG; + } + + return 0; +} + +static int symbol__account_cycles(u64 addr, u64 start, struct symbol *sym, + unsigned cycles, struct evsel *evsel, + u64 br_cntr) { - struct cyc_hist *cycles_hist; + struct annotated_branch *branch; unsigned offset; + int ret; if (sym == NULL) return 0; - cycles_hist = symbol__cycles_hist(sym); - if (cycles_hist == NULL) + branch = symbol__find_branch_hist(sym, evsel->evlist->nr_br_cntr); + if (!branch) return -ENOMEM; if (addr < sym->start || addr >= sym->end) return -ERANGE; @@ -337,15 +373,22 @@ static int symbol__account_cycles(u64 addr, u64 start, start = 0; } offset = addr - sym->start; - return __symbol__account_cycles(cycles_hist, + ret = __symbol__account_cycles(branch->cycles_hist, start ? start - sym->start : 0, offset, cycles, !!start); + + if (ret) + return ret; + + return symbol__account_br_cntr(branch, evsel, offset, br_cntr); } int addr_map_symbol__account_cycles(struct addr_map_symbol *ams, struct addr_map_symbol *start, - unsigned cycles) + unsigned cycles, + struct evsel *evsel, + u64 br_cntr) { u64 saddr = 0; int err; @@ -371,7 +414,7 @@ int addr_map_symbol__account_cycles(struct addr_map_symbol *ams, start ? start->addr : 0, ams->ms.sym ? ams->ms.sym->start + map__start(ams->ms.map) : 0, saddr); - err = symbol__account_cycles(ams->al_addr, saddr, ams->ms.sym, cycles); + err = symbol__account_cycles(ams->al_addr, saddr, ams->ms.sym, cycles, evsel, br_cntr); if (err) pr_debug2("account_cycles failed %d\n", err); return err; @@ -412,6 +455,7 @@ static void annotated_branch__delete(struct annotated_branch *branch) { if (branch) { zfree(&branch->cycles_hist); + free(branch->br_cntr); free(branch); } } diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 0e52558baa01..6b3023f3b18f 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -14,6 +14,7 @@ #include "spark.h" #include "hashmap.h" #include "disasm.h" +#include "branch.h" struct hist_browser_timer; struct hist_entry; @@ -288,6 +289,9 @@ struct annotated_source { struct annotation_line *annotated_source__get_line(struct annotated_source *src, s64 offset); +/* A branch counter once saturated */ +#define ANNOTATION__BR_CNTR_SATURATED_FLAG (1ULL << 63) + /** * struct annotated_branch - basic block and IPC information for a symbol. * @@ -297,6 +301,7 @@ struct annotation_line *annotated_source__get_line(struct annotated_source *src, * @cover_insn: Number of distinct, actually executed instructions. * @cycles_hist: Array of cyc_hist for each instruction. * @max_coverage: Maximum number of covered basic block (used for block-range). 
+ * @br_cntr: Array of the occurrences of events (branch counters) during a block. * * This struct is used by two different codes when the sample has branch stack * and cycles information. annotation__compute_ipc() calculates average IPC @@ -313,6 +318,7 @@ struct annotated_branch { unsigned int cover_insn; struct cyc_hist *cycles_hist; u64 max_coverage; + u64 *br_cntr; }; struct LOCKABLE annotation { @@ -383,7 +389,9 @@ struct annotated_branch *annotation__get_branch(struct annotation *notes); int addr_map_symbol__account_cycles(struct addr_map_symbol *ams, struct addr_map_symbol *start, - unsigned cycles); + unsigned cycles, + struct evsel *evsel, + u64 br_cntr); int hist_entry__inc_addr_samples(struct hist_entry *he, struct perf_sample *sample, struct evsel *evsel, u64 addr); diff --git a/tools/perf/util/branch.h b/tools/perf/util/branch.h index 87704d713ff6..b80c12c74bbb 100644 --- a/tools/perf/util/branch.h +++ b/tools/perf/util/branch.h @@ -34,6 +34,7 @@ struct branch_info { struct addr_map_symbol from; struct addr_map_symbol to; struct branch_flags flags; + u64 branch_stack_cntr; char *srcline_from; char *srcline_to; }; diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index c8c1b511f8a7..0f554febf9a1 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -2677,7 +2677,7 @@ int hists__unlink(struct hists *hists) void hist__account_cycles(struct branch_stack *bs, struct addr_location *al, struct perf_sample *sample, bool nonany_branch_mode, - u64 *total_cycles) + u64 *total_cycles, struct evsel *evsel) { struct branch_info *bi; struct branch_entry *entries = perf_sample__branch_entries(sample); @@ -2701,7 +2701,8 @@ void hist__account_cycles(struct branch_stack *bs, struct addr_location *al, for (int i = bs->nr - 1; i >= 0; i--) { addr_map_symbol__account_cycles(&bi[i].from, nonany_branch_mode ? 
NULL : prev, - bi[i].flags.cycles); + bi[i].flags.cycles, evsel, + bi[i].branch_stack_cntr); prev = &bi[i].to; if (total_cycles) diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 5273f5c37050..30c13fc8cbe4 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -742,7 +742,7 @@ unsigned int hists__overhead_width(struct hists *hists); void hist__account_cycles(struct branch_stack *bs, struct addr_location *al, struct perf_sample *sample, bool nonany_branch_mode, - u64 *total_cycles); + u64 *total_cycles, struct evsel *evsel); struct option; int parse_filter_percentage(const struct option *opt, const char *arg, int unset); diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 706be5e4a076..c08774d06d14 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -2141,6 +2141,7 @@ struct branch_info *sample__resolve_bstack(struct perf_sample *sample, unsigned int i; const struct branch_stack *bs = sample->branch_stack; struct branch_entry *entries = perf_sample__branch_entries(sample); + u64 *branch_stack_cntr = sample->branch_stack_cntr; struct branch_info *bi = calloc(bs->nr, sizeof(struct branch_info)); if (!bi) @@ -2150,6 +2151,8 @@ struct branch_info *sample__resolve_bstack(struct perf_sample *sample, ip__resolve_ams(al->thread, &bi[i].to, entries[i].to); ip__resolve_ams(al->thread, &bi[i].from, entries[i].from); bi[i].flags = entries[i].flags; + if (branch_stack_cntr) + bi[i].branch_stack_cntr = branch_stack_cntr[i]; } return bi; } -- cgit v1.2.3 From 7398bf181d59a8c8d8d13cb555bf3f44b70f6221 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 13 Aug 2024 09:02:04 -0700 Subject: perf evsel: Assign abbr name for the branch counter events There could be several branch counter events. If perf tool output the result via the format "event name + a number", the line could be very long and hard to read. An abbreviation is introduced to replace the full event name in the display. The abbreviation starts from 'A' to 'Z9', which can support up to 286 events. The same abbreviation will be assigned if the same events are found in the evlist. The next patch will utilize the abbreviation name to show the branch counter events in the output. Reviewed-by: Andi Kleen Signed-off-by: Kan Liang Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240813160208.2493643-6-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++- tools/perf/util/evsel.h | 6 ++++++ 2 files changed, 60 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 68bbd3ea771b..c0379fa1ccfe 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -33,6 +33,7 @@ #include "util/bpf-filter.h" #include "util/stat.h" #include "util/util.h" +#include "util/env.h" #include "util/intel-tpebs.h" #include #include @@ -1265,15 +1266,67 @@ u64 evlist__combined_branch_type(struct evlist *evlist) return branch_type; } +static struct evsel * +evlist__find_dup_event_from_prev(struct evlist *evlist, struct evsel *event) +{ + struct evsel *pos; + + evlist__for_each_entry(evlist, pos) { + if (event == pos) + break; + if ((pos->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS) && + !strcmp(pos->name, event->name)) + return pos; + } + return NULL; +} + +#define MAX_NR_ABBR_NAME (26 * 11) + +/* + * The abbr name is from A to Z9. 
If the number of event + * which requires the branch counter > MAX_NR_ABBR_NAME, + * return NA. + */ +static void evlist__new_abbr_name(char *name) +{ + static int idx; + int i = idx / 26; + + if (idx >= MAX_NR_ABBR_NAME) { + name[0] = 'N'; + name[1] = 'A'; + name[2] = '\0'; + return; + } + + name[0] = 'A' + (idx % 26); + + if (!i) + name[1] = '\0'; + else { + name[1] = '0' + i - 1; + name[2] = '\0'; + } + + idx++; +} + void evlist__update_br_cntr(struct evlist *evlist) { - struct evsel *evsel; + struct evsel *evsel, *dup; int i = 0; evlist__for_each_entry(evlist, evsel) { if (evsel->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS) { evsel->br_cntr_idx = i++; evsel__leader(evsel)->br_cntr_nr++; + + dup = evlist__find_dup_event_from_prev(evlist, evsel); + if (dup) + memcpy(evsel->abbr_name, dup->abbr_name, 3 * sizeof(char)); + else + evlist__new_abbr_name(evsel->abbr_name); } } evlist->nr_br_cntr = i; diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index a393dae1dc96..4316992f6a69 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -153,9 +153,15 @@ struct evsel { * br_cntr_idx: The idx of the branch counter event in the evlist * br_cntr_nr: The number of the branch counter event in the group * (Only available for the leader event) + * abbr_name: The abbreviation name assigned to an event which is + * logged by the branch counter. + * The abbr name is from A to Z9. NA is applied if out + * of the range. */ int br_cntr_idx; int br_cntr_nr; + char abbr_name[3]; + /* * bpf_counter_ops serves two use cases: * 1. perf-stat -b counting events used byBPF programs -- cgit v1.2.3 From 20d6f555283915f24d52e29a982b547cf6517f06 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 13 Aug 2024 09:02:05 -0700 Subject: perf report: Display the branch counter histogram Reusing the existing --total-cycles option to display the branch counters. Add a new PERF_HPP_REPORT__BLOCK_BRANCH_COUNTER to display the logged branch counter events. They are shown right after all the cycle-related annotations. Extend the 'struct block_info' to store and pass the branch counter related information. The annotation_br_cntr_entry() is to print the histogram of each branch counter event. If the number of logged events is less than 4, the exact number of the abbr name is printed. Otherwise, using '+' to stands for more than 3 events. Assume the number of logged events is less than 4. The annotation_br_cntr_abbr_list() prints the branch counter's abbreviation list. Press 'B' to display the list in the TUI mode. $ perf record -e "{branch-instructions:ppp,branch-misses}:S" -j any,counter $ perf report --total-cycles --stdio # To display the perf.data header info, please use --header/--header-only options. # # # Total Lost Samples: 0 # # Samples: 1M of events 'anon group { branch-instructions:ppp, branch-misses }' # Event count (approx.): 1610046 # # Branch counter abbr list: # branch-instructions:ppp = A # branch-misses = B # '-' No event occurs # '+' Event occurrences may be lost due to branch counter saturated # # Sampled Cycles% Sampled Cycles Avg Cycles% Avg Cycles Branch Counter [Program Block Range] # ............... .............. ........... .......... .............. .................. # 57.55% 2.5M 0.00% 3 |A |- | ... 25.27% 1.1M 0.00% 2 |AA |- | ... 15.61% 667.2K 0.00% 1 |A |- | ... 0.16% 6.9K 0.81% 575 |A |- | ... 0.16% 6.8K 1.38% 977 |AA |- | ... 0.16% 6.8K 0.04% 28 |AA |B | ... 0.15% 6.6K 1.33% 946 |A |- | ... 0.11% 4.5K 0.06% 46 |AAA+|- | ... 0.10% 4.4K 0.88% 624 |A |- | ... 
0.09% 3.7K 0.74% 524 |AAA+|B | ... With -v applied, # Sampled Cycles% Sampled Cycles Avg Cycles% Avg Cycles Branch Counter [Program Block Range] # ............... .............. ........... .......... .............. .................. # 57.55% 2.5M 0.00% 3 A=1 ,B=- ... 25.27% 1.1M 0.00% 2 A=2 ,B=- ... 15.61% 667.2K 0.00% 1 A=1 ,B=- ... 0.16% 6.9K 0.81% 575 A=1 ,B=- ... 0.16% 6.8K 1.38% 977 A=2 ,B=- ... 0.16% 6.8K 0.04% 28 A=2 ,B=1 ... 0.15% 6.6K 1.33% 946 A=1 ,B=- ... 0.11% 4.5K 0.06% 46 A=3+,B=- ... 0.10% 4.4K 0.88% 624 A=1 ,B=- ... 0.09% 3.7K 0.74% 524 A=3+,B=1 ... Reviewed-by: Andi Kleen Signed-off-by: Kan Liang Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240813160208.2493643-7-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-report.txt | 1 + tools/perf/builtin-diff.c | 4 +- tools/perf/builtin-report.c | 20 ++++- tools/perf/ui/browsers/hists.c | 17 +++- tools/perf/util/annotate.c | 145 +++++++++++++++++++++++++++++++ tools/perf/util/annotate.h | 3 + tools/perf/util/block-info.c | 66 ++++++++++++-- tools/perf/util/block-info.h | 8 +- 8 files changed, 246 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index d2b1593ef700..7c66d81ab978 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -614,6 +614,7 @@ include::itrace.txt[] 'Avg Cycles%' - block average sampled cycles / sum of total block average sampled cycles 'Avg Cycles' - block average sampled cycles + 'Branch Counter' - block branch counter histogram (with -v showing the number) --skip-empty:: Do not print 0 results in the --stat output. 
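As a standalone illustration of the abbreviation scheme quoted in the previous patch (a hypothetical helper mirroring those rules, not the perf code itself): indices walk the A..Z9 namespace and fall back to NA once the 26 * 11 = 286 names are exhausted.

	#include <stdio.h>

	static void abbr_name(int idx, char name[3])
	{
		int round = idx / 26;

		if (idx >= 26 * 11) {			/* past Z9 */
			name[0] = 'N'; name[1] = 'A'; name[2] = '\0';
			return;
		}
		name[0] = 'A' + (idx % 26);
		if (round == 0)
			name[1] = '\0';			/* first round: A..Z */
		else {
			name[1] = '0' + round - 1;	/* later rounds: A0..Z9 */
			name[2] = '\0';
		}
	}

	int main(void)
	{
		const int idx[] = { 0, 25, 26, 285, 286 };
		char buf[3];

		for (unsigned int i = 0; i < sizeof(idx) / sizeof(idx[0]); i++) {
			abbr_name(idx[i], buf);
			printf("%3d -> %s\n", idx[i], buf);
		}
		return 0;	/* prints A, Z, A0, Z9, NA */
	}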
diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index 6983b5373372..23326dd20333 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -691,7 +691,7 @@ static void hists__precompute(struct hists *hists) if (compute == COMPUTE_CYCLES) { bh = container_of(he, struct block_hist, he); init_block_hist(bh); - block_info__process_sym(he, bh, NULL, 0); + block_info__process_sym(he, bh, NULL, 0, 0); } data__for_each_file_new(i, d) { @@ -714,7 +714,7 @@ static void hists__precompute(struct hists *hists) pair_bh = container_of(pair, struct block_hist, he); init_block_hist(pair_bh); - block_info__process_sym(pair, pair_bh, NULL, 0); + block_info__process_sym(pair, pair_bh, NULL, 0, 0); bh = container_of(he, struct block_hist, he); diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index ab450644e5b3..1643113616f4 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -575,6 +575,13 @@ static int evlist__tty_browse_hists(struct evlist *evlist, struct report *rep, c hists__fprintf_nr_sample_events(hists, rep, evname, stdout); if (rep->total_cycles_mode) { + char *buf; + + if (!annotation_br_cntr_abbr_list(&buf, pos, true)) { + fprintf(stdout, "%s", buf); + fprintf(stdout, "#\n"); + free(buf); + } report__browse_block_hists(&rep->block_reports[i - 1].hist, rep->min_percent, pos, NULL); continue; @@ -1119,18 +1126,23 @@ static int __cmd_report(struct report *rep) report__output_resort(rep); if (rep->total_cycles_mode) { - int block_hpps[6] = { + int nr_hpps = 4; + int block_hpps[PERF_HPP_REPORT__BLOCK_MAX_INDEX] = { PERF_HPP_REPORT__BLOCK_TOTAL_CYCLES_PCT, PERF_HPP_REPORT__BLOCK_LBR_CYCLES, PERF_HPP_REPORT__BLOCK_CYCLES_PCT, PERF_HPP_REPORT__BLOCK_AVG_CYCLES, - PERF_HPP_REPORT__BLOCK_RANGE, - PERF_HPP_REPORT__BLOCK_DSO, }; + if (session->evlist->nr_br_cntr > 0) + block_hpps[nr_hpps++] = PERF_HPP_REPORT__BLOCK_BRANCH_COUNTER; + + block_hpps[nr_hpps++] = PERF_HPP_REPORT__BLOCK_RANGE; + block_hpps[nr_hpps++] = PERF_HPP_REPORT__BLOCK_DSO; + rep->block_reports = block_info__create_report(session->evlist, rep->total_cycles, - block_hpps, 6, + block_hpps, nr_hpps, &rep->nr_block_reports); if (!rep->block_reports) return -1; diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index b7219df51236..970f7f349298 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -3684,8 +3684,10 @@ int block_hists_tui_browse(struct block_hist *bh, struct evsel *evsel, struct hist_browser *browser; int key = -1; struct popup_action action; + char *br_cntr_text = NULL; static const char help[] = - " q Quit \n"; + " q Quit \n" + " B Branch counter abbr list (Optional)\n"; browser = hist_browser__new(hists); if (!browser) @@ -3703,6 +3705,8 @@ int block_hists_tui_browse(struct block_hist *bh, struct evsel *evsel, memset(&action, 0, sizeof(action)); + annotation_br_cntr_abbr_list(&br_cntr_text, evsel, false); + while (1) { key = hist_browser__run(browser, "? 
- help", true, 0); @@ -3723,6 +3727,16 @@ int block_hists_tui_browse(struct block_hist *bh, struct evsel *evsel, action.ms.sym = browser->selection->sym; do_annotate(browser, &action); continue; + case 'B': + if (br_cntr_text) { + ui__question_window("Branch counter abbr list", + br_cntr_text, "Press any key...", 0); + } else { + ui__question_window("Branch counter abbr list", + "\n The branch counter is not available.\n", + "Press any key...", 0); + } + continue; default: break; } @@ -3730,5 +3744,6 @@ int block_hists_tui_browse(struct block_hist *bh, struct evsel *evsel, out: hist_browser__delete(browser); + free(br_cntr_text); return 0; } diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index c269758d47c8..5200cafe20e7 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -41,6 +41,7 @@ #include "namespaces.h" #include "thread.h" #include "hashmap.h" +#include "strbuf.h" #include #include #include @@ -48,6 +49,7 @@ #include #include #include +#include /* FIXME: For the HE_COLORSET */ #include "ui/browser.h" @@ -1719,6 +1721,149 @@ static void ipc_coverage_string(char *bf, int size, struct annotation *notes) ipc, coverage); } +int annotation_br_cntr_abbr_list(char **str, struct evsel *evsel, bool header) +{ + struct evsel *pos; + struct strbuf sb; + + if (evsel->evlist->nr_br_cntr <= 0) + return -ENOTSUP; + + strbuf_init(&sb, /*hint=*/ 0); + + if (header && strbuf_addf(&sb, "# Branch counter abbr list:\n")) + goto err; + + evlist__for_each_entry(evsel->evlist, pos) { + if (!(pos->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS)) + continue; + if (header && strbuf_addf(&sb, "#")) + goto err; + + if (strbuf_addf(&sb, " %s = %s\n", pos->name, pos->abbr_name)) + goto err; + } + + if (header && strbuf_addf(&sb, "#")) + goto err; + if (strbuf_addf(&sb, " '-' No event occurs\n")) + goto err; + + if (header && strbuf_addf(&sb, "#")) + goto err; + if (strbuf_addf(&sb, " '+' Event occurrences may be lost due to branch counter saturated\n")) + goto err; + + *str = strbuf_detach(&sb, NULL); + + return 0; +err: + strbuf_release(&sb); + return -ENOMEM; +} + +/* Assume the branch counter saturated at 3 */ +#define ANNOTATION_BR_CNTR_SATURATION 3 + +int annotation_br_cntr_entry(char **str, int br_cntr_nr, + u64 *br_cntr, int num_aggr, + struct evsel *evsel) +{ + struct evsel *pos = evsel ? evlist__first(evsel->evlist) : NULL; + bool saturated = false; + int i, j, avg, used; + struct strbuf sb; + + strbuf_init(&sb, /*hint=*/ 0); + for (i = 0; i < br_cntr_nr; i++) { + used = 0; + avg = ceil((double)(br_cntr[i] & ~ANNOTATION__BR_CNTR_SATURATED_FLAG) / + (double)num_aggr); + + /* + * A histogram with the abbr name is displayed by default. + * With -v, the exact number of branch counter is displayed. 
+ */ + if (verbose) { + evlist__for_each_entry_from(evsel->evlist, pos) { + if ((pos->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS) && + (pos->br_cntr_idx == i)) + break; + } + if (strbuf_addstr(&sb, pos->abbr_name)) + goto err; + + if (!br_cntr[i]) { + if (strbuf_addstr(&sb, "=-")) + goto err; + } else { + if (strbuf_addf(&sb, "=%d", avg)) + goto err; + } + if (br_cntr[i] & ANNOTATION__BR_CNTR_SATURATED_FLAG) { + if (strbuf_addch(&sb, '+')) + goto err; + } else { + if (strbuf_addch(&sb, ' ')) + goto err; + } + + if ((i < br_cntr_nr - 1) && strbuf_addch(&sb, ',')) + goto err; + continue; + } + + if (strbuf_addch(&sb, '|')) + goto err; + + if (!br_cntr[i]) { + if (strbuf_addch(&sb, '-')) + goto err; + used++; + } else { + evlist__for_each_entry_from(evsel->evlist, pos) { + if ((pos->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS) && + (pos->br_cntr_idx == i)) + break; + } + if (br_cntr[i] & ANNOTATION__BR_CNTR_SATURATED_FLAG) + saturated = true; + + for (j = 0; j < avg; j++, used++) { + /* Print + if the number of logged events > 3 */ + if (j >= ANNOTATION_BR_CNTR_SATURATION) { + saturated = true; + break; + } + if (strbuf_addstr(&sb, pos->abbr_name)) + goto err; + } + + if (saturated) { + if (strbuf_addch(&sb, '+')) + goto err; + used++; + } + pos = list_next_entry(pos, core.node); + } + + for (j = used; j < ANNOTATION_BR_CNTR_SATURATION + 1; j++) { + if (strbuf_addch(&sb, ' ')) + goto err; + } + } + + if (!verbose && strbuf_addch(&sb, br_cntr_nr ? '|' : ' ')) + goto err; + + *str = strbuf_detach(&sb, NULL); + + return 0; +err: + strbuf_release(&sb); + return -ENOMEM; +} + static void __annotation_line__write(struct annotation_line *al, struct annotation *notes, bool first_line, bool current_entry, bool change_color, int width, void *obj, unsigned int percent_type, diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 6b3023f3b18f..4fdaa3d5e8d2 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -553,4 +553,7 @@ int annotate_get_basic_blocks(struct symbol *sym, s64 src, s64 dst, void debuginfo_cache__delete(void); +int annotation_br_cntr_entry(char **str, int br_cntr_nr, u64 *br_cntr, + int num_aggr, struct evsel *evsel); +int annotation_br_cntr_abbr_list(char **str, struct evsel *evsel, bool header); #endif /* __PERF_ANNOTATE_H */ diff --git a/tools/perf/util/block-info.c b/tools/perf/util/block-info.c index 04068d48683f..649392bee7ed 100644 --- a/tools/perf/util/block-info.c +++ b/tools/perf/util/block-info.c @@ -40,16 +40,32 @@ static struct block_header_column { [PERF_HPP_REPORT__BLOCK_DSO] = { .name = "Shared Object", .width = 20, + }, + [PERF_HPP_REPORT__BLOCK_BRANCH_COUNTER] = { + .name = "Branch Counter", + .width = 30, } }; -struct block_info *block_info__new(void) +static struct block_info *block_info__new(unsigned int br_cntr_nr) { - return zalloc(sizeof(struct block_info)); + struct block_info *bi = zalloc(sizeof(struct block_info)); + + if (bi && br_cntr_nr) { + bi->br_cntr = calloc(br_cntr_nr, sizeof(u64)); + if (!bi->br_cntr) { + free(bi); + return NULL; + } + } + + return bi; } void block_info__delete(struct block_info *bi) { + if (bi) + free(bi->br_cntr); free(bi); } @@ -86,7 +102,8 @@ int64_t block_info__cmp(struct perf_hpp_fmt *fmt __maybe_unused, static void init_block_info(struct block_info *bi, struct symbol *sym, struct cyc_hist *ch, int offset, - u64 total_cycles) + u64 total_cycles, unsigned int br_cntr_nr, + u64 *br_cntr, struct evsel *evsel) { bi->sym = sym; bi->start = ch->start; @@ -99,10 
+116,18 @@ static void init_block_info(struct block_info *bi, struct symbol *sym, memcpy(bi->cycles_spark, ch->cycles_spark, NUM_SPARKS * sizeof(u64)); + + if (br_cntr && br_cntr_nr) { + bi->br_cntr_nr = br_cntr_nr; + memcpy(bi->br_cntr, &br_cntr[offset * br_cntr_nr], + br_cntr_nr * sizeof(u64)); + } + bi->evsel = evsel; } int block_info__process_sym(struct hist_entry *he, struct block_hist *bh, - u64 *block_cycles_aggr, u64 total_cycles) + u64 *block_cycles_aggr, u64 total_cycles, + unsigned int br_cntr_nr) { struct annotation *notes; struct cyc_hist *ch; @@ -125,12 +150,14 @@ int block_info__process_sym(struct hist_entry *he, struct block_hist *bh, struct block_info *bi; struct hist_entry *he_block; - bi = block_info__new(); + bi = block_info__new(br_cntr_nr); if (!bi) return -1; init_block_info(bi, he->ms.sym, &ch[i], i, - total_cycles); + total_cycles, br_cntr_nr, + notes->branch->br_cntr, + hists_to_evsel(he->hists)); cycles += bi->cycles_aggr / bi->num_aggr; he_block = hists__add_entry_block(&bh->block_hists, @@ -327,6 +354,24 @@ static void init_block_header(struct block_fmt *block_fmt) fmt->width = block_column_width; } +static int block_branch_counter_entry(struct perf_hpp_fmt *fmt, + struct perf_hpp *hpp, + struct hist_entry *he) +{ + struct block_fmt *block_fmt = container_of(fmt, struct block_fmt, fmt); + struct block_info *bi = he->block_info; + char *buf; + int ret; + + if (annotation_br_cntr_entry(&buf, bi->br_cntr_nr, bi->br_cntr, + bi->num_aggr, bi->evsel)) + return 0; + + ret = scnprintf(hpp->buf, hpp->size, "%*s", block_fmt->width, buf); + free(buf); + return ret; +} + static void hpp_register(struct block_fmt *block_fmt, int idx, struct perf_hpp_list *hpp_list) { @@ -357,6 +402,9 @@ static void hpp_register(struct block_fmt *block_fmt, int idx, case PERF_HPP_REPORT__BLOCK_DSO: fmt->entry = block_dso_entry; break; + case PERF_HPP_REPORT__BLOCK_BRANCH_COUNTER: + fmt->entry = block_branch_counter_entry; + break; default: return; } @@ -390,7 +438,7 @@ static void init_block_hist(struct block_hist *bh, struct block_fmt *block_fmts, static int process_block_report(struct hists *hists, struct block_report *block_report, u64 total_cycles, int *block_hpps, - int nr_hpps) + int nr_hpps, unsigned int br_cntr_nr) { struct rb_node *next = rb_first_cached(&hists->entries); struct block_hist *bh = &block_report->hist; @@ -405,7 +453,7 @@ static int process_block_report(struct hists *hists, while (next) { he = rb_entry(next, struct hist_entry, rb_node); block_info__process_sym(he, bh, &block_report->cycles, - total_cycles); + total_cycles, br_cntr_nr); next = rb_next(&he->rb_node); } @@ -435,7 +483,7 @@ struct block_report *block_info__create_report(struct evlist *evlist, struct hists *hists = evsel__hists(pos); process_block_report(hists, &block_reports[i], total_cycles, - block_hpps, nr_hpps); + block_hpps, nr_hpps, evlist->nr_br_cntr); i++; } diff --git a/tools/perf/util/block-info.h b/tools/perf/util/block-info.h index 0b9e1aad4c55..b9329dc3ab59 100644 --- a/tools/perf/util/block-info.h +++ b/tools/perf/util/block-info.h @@ -18,6 +18,9 @@ struct block_info { u64 total_cycles; int num; int num_aggr; + int br_cntr_nr; + u64 *br_cntr; + struct evsel *evsel; }; struct block_fmt { @@ -36,6 +39,7 @@ enum { PERF_HPP_REPORT__BLOCK_AVG_CYCLES, PERF_HPP_REPORT__BLOCK_RANGE, PERF_HPP_REPORT__BLOCK_DSO, + PERF_HPP_REPORT__BLOCK_BRANCH_COUNTER, PERF_HPP_REPORT__BLOCK_MAX_INDEX }; @@ -46,7 +50,6 @@ struct block_report { int nr_fmts; }; -struct block_info *block_info__new(void); void 
block_info__delete(struct block_info *bi); int64_t __block_info__cmp(struct hist_entry *left, struct hist_entry *right); @@ -55,7 +58,8 @@ int64_t block_info__cmp(struct perf_hpp_fmt *fmt __maybe_unused, struct hist_entry *left, struct hist_entry *right); int block_info__process_sym(struct hist_entry *he, struct block_hist *bh, - u64 *block_cycles_aggr, u64 total_cycles); + u64 *block_cycles_aggr, u64 total_cycles, + unsigned int br_cntr_nr); struct block_report *block_info__create_report(struct evlist *evlist, u64 total_cycles, -- cgit v1.2.3 From e6952dcec8302643a8b59751df250d03762429c3 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 13 Aug 2024 09:02:06 -0700 Subject: perf annotate: Display the branch counter histogram MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Display the branch counter histogram in the annotation view. Press 'B' to display the branch counter's abbreviation list as well. Samples: 1M of events 'anon group { branch-instructions:ppp, branch-misses }', 4000 Hz, Event count (approx.): f3 /home/sdp/test/tchain_edit [Percent: local period] Percent │ IPC Cycle Branch Counter (Average IPC: 1.39, IPC Coverage: 29.4%) │ 0000000000401755 : 0.00 0.00 │ endbr64 │ push %rbp │ mov %rsp,%rbp │ movl $0x0,-0x4(%rbp) 0.00 0.00 │1.33 3 |A |- | ↓ jmp 25 11.03 11.03 │ 11: mov -0x4(%rbp),%eax │ and $0x1,%eax │ test %eax,%eax 17.13 17.13 │2.41 1 |A |- | ↓ je 21 │ addl $0x1,-0x4(%rbp) 21.84 21.84 │2.22 2 |AA |- | ↓ jmp 25 17.13 17.13 │ 21: addl $0x1,-0x4(%rbp) 21.84 21.84 │ 25: cmpl $0x270f,-0x4(%rbp) 11.03 11.03 │0.61 3 |A |- | ↑ jle 11 │ nop │ pop %rbp 0.00 0.00 │0.24 20 |AA |B | ← ret Originally-by: Tinghao Zhang Reviewed-by: Andi Kleen Signed-off-by: Kan Liang Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240813160208.2493643-8-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 10 +++++++--- tools/perf/ui/browsers/annotate.c | 18 ++++++++++++++++-- tools/perf/ui/browsers/hists.c | 3 ++- tools/perf/util/annotate.c | 40 ++++++++++++++++++++++++++++++++++++--- tools/perf/util/annotate.h | 11 +++++++++++ tools/perf/util/disasm.c | 1 + 6 files changed, 74 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 9b4a8a379b5b..3dc6197ef3fa 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -927,11 +927,15 @@ int cmd_annotate(int argc, const char **argv) sort_order = "dso,symbol"; /* - * Set SORT_MODE__BRANCH so that annotate display IPC/Cycle - * if branch info is in perf data in TUI mode. + * Set SORT_MODE__BRANCH so that annotate displays IPC/Cycle and + * branch counters, if the corresponding branch info is available + * in the perf data in the TUI mode. 
*/ - if ((use_browser == 1 || annotate.use_stdio2) && annotate.has_br_stack) + if ((use_browser == 1 || annotate.use_stdio2) && annotate.has_br_stack) { sort__mode = SORT_MODE__BRANCH; + if (annotate.session->evlist->nr_br_cntr > 0) + annotate_opts.show_br_cntr = true; + } if (setup_sorting(NULL) < 0) usage_with_options(annotate_usage, options); diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index fe991a81256b..d7e727345dab 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -156,6 +156,7 @@ static void annotate_browser__draw_current_jump(struct ui_browser *browser) struct symbol *sym = ms->sym; struct annotation *notes = symbol__annotation(sym); u8 pcnt_width = annotation__pcnt_width(notes); + u8 cntr_width = annotation__br_cntr_width(); int width; int diff = 0; @@ -205,13 +206,13 @@ static void annotate_browser__draw_current_jump(struct ui_browser *browser) ui_browser__set_color(browser, HE_COLORSET_JUMP_ARROWS); __ui_browser__line_arrow(browser, - pcnt_width + 2 + notes->src->widths.addr + width, + pcnt_width + 2 + notes->src->widths.addr + width + cntr_width, from, to); diff = is_fused(ab, cursor); if (diff > 0) { ui_browser__mark_fused(browser, - pcnt_width + 3 + notes->src->widths.addr + width, + pcnt_width + 3 + notes->src->widths.addr + width + cntr_width, from - diff, diff, to > from); } } @@ -714,6 +715,7 @@ static int annotate_browser__run(struct annotate_browser *browser, struct annotation *notes = symbol__annotation(ms->sym); const char *help = "Press 'h' for help on key bindings"; int delay_secs = hbt ? hbt->refresh : 0; + char *br_cntr_text = NULL; char title[256]; int key; @@ -730,6 +732,8 @@ static int annotate_browser__run(struct annotate_browser *browser, nd = browser->curr_hot; + annotation_br_cntr_abbr_list(&br_cntr_text, evsel, false); + while (1) { key = ui_browser__run(&browser->b, delay_secs); @@ -796,6 +800,7 @@ static int annotate_browser__run(struct annotate_browser *browser, "r Run available scripts\n" "p Toggle percent type [local/global]\n" "b Toggle percent base [period/hits]\n" + "B Branch counter abbr list (Optional)\n" "? Search string backwards\n" "f Toggle showing offsets to full address\n"); continue; @@ -904,6 +909,14 @@ show_sup_ins: hists__scnprintf_title(hists, title, sizeof(title)); annotate_browser__show(&browser->b, title, help); continue; + case 'B': + if (br_cntr_text) + ui_browser__help_window(&browser->b, br_cntr_text); + else { + ui_browser__help_window(&browser->b, + "\n The branch counter is not available.\n"); + } + continue; case 'f': annotation__toggle_full_addr(notes, ms); continue; @@ -923,6 +936,7 @@ show_sup_ins: } out: ui_browser__hide(&browser->b); + free(br_cntr_text); return key; } diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 970f7f349298..49ba82bf3391 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -3705,7 +3705,8 @@ int block_hists_tui_browse(struct block_hist *bh, struct evsel *evsel, memset(&action, 0, sizeof(action)); - annotation_br_cntr_abbr_list(&br_cntr_text, evsel, false); + if (!annotation_br_cntr_abbr_list(&br_cntr_text, evsel, false)) + annotate_opts.show_br_cntr = true; while (1) { key = hist_browser__run(browser, "? 
- help", true, 0); diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 5200cafe20e7..4990c70b1794 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -501,8 +501,10 @@ static void annotation__count_and_fill(struct annotation *notes, u64 start, u64 } } -static int annotation__compute_ipc(struct annotation *notes, size_t size) +static int annotation__compute_ipc(struct annotation *notes, size_t size, + struct evsel *evsel) { + unsigned int br_cntr_nr = evsel->evlist->nr_br_cntr; int err = 0; s64 offset; @@ -537,6 +539,20 @@ static int annotation__compute_ipc(struct annotation *notes, size_t size) al->cycles->max = ch->cycles_max; al->cycles->min = ch->cycles_min; } + if (al && notes->branch->br_cntr) { + if (!al->br_cntr) { + al->br_cntr = calloc(br_cntr_nr, sizeof(u64)); + if (!al->br_cntr) { + err = ENOMEM; + break; + } + } + al->num_aggr = ch->num_aggr; + al->br_cntr_nr = br_cntr_nr; + al->evsel = evsel; + memcpy(al->br_cntr, ¬es->branch->br_cntr[offset * br_cntr_nr], + br_cntr_nr * sizeof(u64)); + } } } @@ -548,8 +564,10 @@ static int annotation__compute_ipc(struct annotation *notes, size_t size) struct annotation_line *al; al = annotated_source__get_line(notes->src, offset); - if (al) + if (al) { zfree(&al->cycles); + zfree(&al->br_cntr); + } } } } @@ -1960,6 +1978,22 @@ static void __annotation_line__write(struct annotation_line *al, struct annotati "Cycle(min/max)"); } + if (annotate_opts.show_br_cntr) { + if (show_title) { + obj__printf(obj, "%*s ", + ANNOTATION__BR_CNTR_WIDTH, + "Branch Counter"); + } else { + char *buf; + + if (!annotation_br_cntr_entry(&buf, al->br_cntr_nr, al->br_cntr, + al->num_aggr, al->evsel)) { + obj__printf(obj, "%*s ", ANNOTATION__BR_CNTR_WIDTH, buf); + free(buf); + } + } + } + if (show_title && !*al->line) { ipc_coverage_string(bf, sizeof(bf), notes); obj__printf(obj, "%*s", ANNOTATION__AVG_IPC_WIDTH, bf); @@ -2056,7 +2090,7 @@ int symbol__annotate2(struct map_symbol *ms, struct evsel *evsel, annotation__set_index(notes); annotation__mark_jump_targets(notes, sym); - err = annotation__compute_ipc(notes, size); + err = annotation__compute_ipc(notes, size, evsel); if (err) return err; diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 4fdaa3d5e8d2..8b9e05a1932f 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -31,6 +31,7 @@ struct annotated_data_type; #define ANNOTATION__CYCLES_WIDTH 6 #define ANNOTATION__MINMAX_CYCLES_WIDTH 19 #define ANNOTATION__AVG_IPC_WIDTH 36 +#define ANNOTATION__BR_CNTR_WIDTH 30 #define ANNOTATION_DUMMY_LEN 256 struct annotation_options { @@ -44,6 +45,7 @@ struct annotation_options { show_nr_jumps, show_minmax_cycle, show_asm_raw, + show_br_cntr, annotate_src, full_addr; u8 offset_level; @@ -104,6 +106,10 @@ struct annotation_line { char *fileloc; char *path; struct cycles_info *cycles; + int num_aggr; + int br_cntr_nr; + u64 *br_cntr; + struct evsel *evsel; int jump_sources; u32 idx; int idx_asm; @@ -353,6 +359,11 @@ static inline bool annotation_line__filter(struct annotation_line *al) return annotate_opts.hide_src_code && al->offset == -1; } +static inline u8 annotation__br_cntr_width(void) +{ + return annotate_opts.show_br_cntr ? 
ANNOTATION__BR_CNTR_WIDTH : 0; +} + void annotation__update_column_widths(struct annotation *notes); void annotation__toggle_full_addr(struct annotation *notes, struct map_symbol *ms); diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 226d2181f694..d11e75133b23 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -1015,6 +1015,7 @@ static void annotation_line__exit(struct annotation_line *al) zfree_srcline(&al->path); zfree(&al->line); zfree(&al->cycles); + zfree(&al->br_cntr); } static size_t disasm_line_size(int nr) -- cgit v1.2.3 From 6f9d8d1de2c61288edd7dfc2a7e7986c8b2ae927 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 13 Aug 2024 09:02:07 -0700 Subject: perf script: Add branch counters It's useful to print the branch counter information for each jump in the brstackinsn when it's available. Add a new field 'brcntr' to display the branch counter information. By default, the abbreviation will be used to indicate the branch counter. In the verbose mode, the real event name is shown. $ perf script -F +brstackinsn,+brcntr # Branch counter abbr list: # branch-instructions:ppp = A # branch-misses = B # '-' No event occurs # '+' Event occurrences may be lost due to branch counter saturated tchain_edit 332203 3366329.405674: 53030 branch-instructions:ppp: 401781 f3+0x2c (home/sdp/test/tchain_edit) f3+31: 0000000000401774 insn: eb 04 br_cntr: AA # PRED 5 cycles [5] 000000000040177a insn: 81 7d fc 0f 27 00 00 0000000000401781 insn: 7e e3 br_cntr: A # PRED 1 cycles [6] 2.00 IPC 0000000000401766 insn: 8b 45 fc 0000000000401769 insn: 83 e0 01 000000000040176c insn: 85 c0 000000000040176e insn: 74 06 br_cntr: A # PRED 1 cycles [7] 4.00 IPC 0000000000401776 insn: 83 45 fc 01 000000000040177a insn: 81 7d fc 0f 27 00 00 0000000000401781 insn: 7e e3 br_cntr: A # PRED 7 cycles [14] 0.43 IPC $ perf script -F +brstackinsn,+brcntr -v tchain_edit 332203 3366329.405674: 53030 branch-instructions:ppp: 401781 f3+0x2c (/home/sdp/os.linux.perf.test-suite/kernels/lbr_kernel/tchain_edit) f3+31: 0000000000401774 insn: eb 04 br_cntr: branch-instructions:ppp 2 branch-misses 0 # PRED 5 cycles [5] 000000000040177a insn: 81 7d fc 0f 27 00 00 0000000000401781 insn: 7e e3 br_cntr: branch-instructions:ppp 1 branch-misses 0 # PRED 1 cycles [6] 2.00 IPC 0000000000401766 insn: 8b 45 fc 0000000000401769 insn: 83 e0 01 000000000040176c insn: 85 c0 000000000040176e insn: 74 06 br_cntr: branch-instructions:ppp 1 branch-misses 0 # PRED 1 cycles [7] 4.00 IPC 0000000000401776 insn: 83 45 fc 01 000000000040177a insn: 81 7d fc 0f 27 00 00 0000000000401781 insn: 7e e3 br_cntr: branch-instructions:ppp 1 branch-misses 0 # PRED 7 cycles [14] 0.43 IPC Originally-by: Tinghao Zhang Reviewed-by: Andi Kleen Signed-off-by: Kan Liang Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240813160208.2493643-9-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-script.txt | 2 +- tools/perf/builtin-script.c | 69 ++++++++++++++++++++++++++++---- 2 files changed, 63 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index 5abb960c4960..b72866ef270b 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -134,7 +134,7 @@ OPTIONS srcline, period, iregs, uregs, brstack, brstacksym, flags, bpf-output, brstackinsn, brstackinsnlen, 
brstackdisasm, brstackoff, callindent, insn, disasm, insnlen, synth, phys_addr, metric, misc, srccode, ipc, data_page_size, - code_page_size, ins_lat, machine_pid, vcpu, cgroup, retire_lat, + code_page_size, ins_lat, machine_pid, vcpu, cgroup, retire_lat, brcntr, Field list can be prepended with the type, trace, sw or hw, to indicate to which event type the field list applies. diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 54598d1e815a..206b08426555 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -62,6 +62,7 @@ #include "util/record.h" #include "util/util.h" #include "util/cgroup.h" +#include "util/annotate.h" #include "perf.h" #include @@ -138,6 +139,7 @@ enum perf_output_field { PERF_OUTPUT_DSOFF = 1ULL << 41, PERF_OUTPUT_DISASM = 1ULL << 42, PERF_OUTPUT_BRSTACKDISASM = 1ULL << 43, + PERF_OUTPUT_BRCNTR = 1ULL << 44, }; struct perf_script { @@ -213,6 +215,7 @@ struct output_option { {.str = "cgroup", .field = PERF_OUTPUT_CGROUP}, {.str = "retire_lat", .field = PERF_OUTPUT_RETIRE_LAT}, {.str = "brstackdisasm", .field = PERF_OUTPUT_BRSTACKDISASM}, + {.str = "brcntr", .field = PERF_OUTPUT_BRCNTR}, }; enum { @@ -520,6 +523,12 @@ static int evsel__check_attr(struct evsel *evsel, struct perf_session *session) "Hint: run 'perf record -b ...'\n"); return -EINVAL; } + if (PRINT_FIELD(BRCNTR) && + !(evlist__combined_branch_type(session->evlist) & PERF_SAMPLE_BRANCH_COUNTERS)) { + pr_err("Display of branch counter requested but it's not enabled\n" + "Hint: run 'perf record -j any,counter ...'\n"); + return -EINVAL; + } if ((PRINT_FIELD(PID) || PRINT_FIELD(TID)) && evsel__check_stype(evsel, PERF_SAMPLE_TID, "TID", PERF_OUTPUT_TID|PERF_OUTPUT_PID)) return -EINVAL; @@ -789,6 +798,19 @@ static int perf_sample__fprintf_start(struct perf_script *script, int printed = 0; char tstr[128]; + /* + * Print the branch counter's abbreviation list, + * if the branch counter is available. + */ + if (PRINT_FIELD(BRCNTR) && !verbose) { + char *buf; + + if (!annotation_br_cntr_abbr_list(&buf, evsel, true)) { + printed += fprintf(stdout, "%s", buf); + free(buf); + } + } + if (PRINT_FIELD(MACHINE_PID) && sample->machine_pid) printed += fprintf(fp, "VM:%5d ", sample->machine_pid); @@ -1195,7 +1217,9 @@ static int ip__fprintf_jump(uint64_t ip, struct branch_entry *en, struct perf_insn *x, u8 *inbuf, int len, int insn, FILE *fp, int *total_cycles, struct perf_event_attr *attr, - struct thread *thread) + struct thread *thread, + struct evsel *evsel, + u64 br_cntr) { int ilen = 0; int printed = fprintf(fp, "\t%016" PRIx64 "\t", ip); @@ -1216,6 +1240,28 @@ static int ip__fprintf_jump(uint64_t ip, struct branch_entry *en, addr_location__exit(&al); } + if (PRINT_FIELD(BRCNTR)) { + unsigned int width = evsel__env(evsel)->br_cntr_width; + unsigned int i = 0, j, num, mask = (1L << width) - 1; + struct evsel *pos = evsel__leader(evsel); + + printed += fprintf(fp, "br_cntr: "); + evlist__for_each_entry_from(evsel->evlist, pos) { + if (!(pos->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS)) + continue; + if (evsel__leader(pos) != evsel__leader(evsel)) + break; + + num = (br_cntr >> (i++ * width)) & mask; + if (!verbose) { + for (j = 0; j < num; j++) + printed += fprintf(fp, "%s", pos->abbr_name); + } else + printed += fprintf(fp, "%s %d ", pos->name, num); + } + printed += fprintf(fp, "\t"); + } + printed += fprintf(fp, "#%s%s%s%s", en->flags.predicted ? " PRED" : "", en->flags.mispred ? 
" MISPRED" : "", @@ -1272,6 +1318,7 @@ out: } static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, + struct evsel *evsel, struct thread *thread, struct perf_event_attr *attr, struct machine *machine, FILE *fp) @@ -1285,6 +1332,7 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, unsigned off; struct symbol *lastsym = NULL; int total_cycles = 0; + u64 br_cntr = 0; if (!(br && br->nr)) return 0; @@ -1296,6 +1344,9 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, x.machine = machine; x.cpu = sample->cpu; + if (PRINT_FIELD(BRCNTR) && sample->branch_stack_cntr) + br_cntr = sample->branch_stack_cntr[nr - 1]; + printed += fprintf(fp, "%c", '\n'); /* Handle first from jump, of which we don't know the entry. */ @@ -1307,7 +1358,7 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, x.cpumode, x.cpu, &lastsym, attr, fp); printed += ip__fprintf_jump(entries[nr - 1].from, &entries[nr - 1], &x, buffer, len, 0, fp, &total_cycles, - attr, thread); + attr, thread, evsel, br_cntr); if (PRINT_FIELD(SRCCODE)) printed += print_srccode(thread, x.cpumode, entries[nr - 1].from); } @@ -1337,8 +1388,10 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, printed += ip__fprintf_sym(ip, thread, x.cpumode, x.cpu, &lastsym, attr, fp); if (ip == end) { + if (PRINT_FIELD(BRCNTR) && sample->branch_stack_cntr) + br_cntr = sample->branch_stack_cntr[i]; printed += ip__fprintf_jump(ip, &entries[i], &x, buffer + off, len - off, ++insn, fp, - &total_cycles, attr, thread); + &total_cycles, attr, thread, evsel, br_cntr); if (PRINT_FIELD(SRCCODE)) printed += print_srccode(thread, x.cpumode, ip); break; @@ -1547,6 +1600,7 @@ void script_fetch_insn(struct perf_sample *sample, struct thread *thread, } static int perf_sample__fprintf_insn(struct perf_sample *sample, + struct evsel *evsel, struct perf_event_attr *attr, struct thread *thread, struct machine *machine, FILE *fp, @@ -1567,7 +1621,7 @@ static int perf_sample__fprintf_insn(struct perf_sample *sample, printed += sample__fprintf_insn_asm(sample, thread, machine, fp, al); } if (PRINT_FIELD(BRSTACKINSN) || PRINT_FIELD(BRSTACKINSNLEN) || PRINT_FIELD(BRSTACKDISASM)) - printed += perf_sample__fprintf_brstackinsn(sample, thread, attr, machine, fp); + printed += perf_sample__fprintf_brstackinsn(sample, evsel, thread, attr, machine, fp); return printed; } @@ -1639,7 +1693,7 @@ static int perf_sample__fprintf_bts(struct perf_sample *sample, if (print_srcline_last) printed += map__fprintf_srcline(al->map, al->addr, "\n ", fp); - printed += perf_sample__fprintf_insn(sample, attr, thread, machine, fp, al); + printed += perf_sample__fprintf_insn(sample, evsel, attr, thread, machine, fp, al); printed += fprintf(fp, "\n"); if (PRINT_FIELD(SRCCODE)) { int ret = map__fprintf_srccode(al->map, al->addr, stdout, @@ -2297,7 +2351,7 @@ static void process_event(struct perf_script *script, if (evsel__is_bpf_output(evsel) && PRINT_FIELD(BPF_OUTPUT)) perf_sample__fprintf_bpf_output(sample, fp); - perf_sample__fprintf_insn(sample, attr, thread, machine, fp, al); + perf_sample__fprintf_insn(sample, evsel, attr, thread, machine, fp, al); if (PRINT_FIELD(PHYS_ADDR)) fprintf(fp, "%16" PRIx64, sample->phys_addr); @@ -3947,7 +4001,8 @@ int cmd_script(int argc, const char **argv) "brstacksym,flags,data_src,weight,bpf-output,brstackinsn," "brstackinsnlen,brstackdisasm,brstackoff,callindent,insn,disasm,insnlen,synth," "phys_addr,metric,misc,srccode,ipc,tod,data_page_size," - 
"code_page_size,ins_lat,machine_pid,vcpu,cgroup,retire_lat", + "code_page_size,ins_lat,machine_pid,vcpu,cgroup,retire_lat," + "brcntr", parse_output_fields), OPT_BOOLEAN('a', "all-cpus", &system_wide, "system-wide collection from all CPUs"), -- cgit v1.2.3 From dab5b6cb0d405425502e60450ecf1c8e5ea6b845 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 13 Aug 2024 09:02:08 -0700 Subject: perf test: Add new test cases for the branch counter feature Enhance the test case for the branch counter feature. Now, the test verifies: - The new filter can be successfully applied on the supported platforms. - The counter value can be outputted via the perf report -D - The counter value and the abbr name can be outputted via the perf script (New) Reviewed-by: Andi Kleen Signed-off-by: Kan Liang Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240813160208.2493643-10-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/record.sh | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/record.sh b/tools/perf/tests/shell/record.sh index 3d1a7759a7b2..7964ebd9007d 100755 --- a/tools/perf/tests/shell/record.sh +++ b/tools/perf/tests/shell/record.sh @@ -21,6 +21,7 @@ testprog="perf test -w thloop" cpu_pmu_dir="/sys/bus/event_source/devices/cpu*" br_cntr_file="/caps/branch_counter_nr" br_cntr_output="branch stack counters" +br_cntr_script_output="br_cntr: A" cleanup() { rm -rf "${perfdata}" @@ -165,7 +166,7 @@ test_workload() { } test_branch_counter() { - echo "Basic branch counter test" + echo "Branch counter test" # Check if the branch counter feature is supported for dir in $cpu_pmu_dir do @@ -175,19 +176,25 @@ test_branch_counter() { return fi done - if ! perf record -o "${perfdata}" -j any,counter ${testprog} 2> /dev/null + if ! perf record -o "${perfdata}" -e "{branches:p,instructions}" -j any,counter ${testprog} 2> /dev/null then - echo "Basic branch counter test [Failed record]" + echo "Branch counter record test [Failed record]" err=1 return fi if ! perf report -i "${perfdata}" -D -q | grep -q "$br_cntr_output" then - echo "Basic branch record test [Failed missing output]" + echo "Branch counter report test [Failed missing output]" err=1 return fi - echo "Basic branch counter test [Success]" + if ! perf script -i "${perfdata}" -F +brstackinsn,+brcntr | grep -q "$br_cntr_script_output" + then + echo " Branch counter script test [Failed missing output]" + err=1 + return + fi + echo "Branch counter test [Success]" } test_per_thread -- cgit v1.2.3 From 4c55560f23d19051adc7e76818687a88448bef83 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 14 Aug 2024 10:36:24 -0300 Subject: perf build: Fix up broken capstone feature detection fast path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The capstone devel headers define 'struct bpf_insn' in a way that clashes with what is in the libbpf devel headers, so we so far need to avoid including both. 
This is happening on the tools/build/feature/test-all.c file, where we try building against all the libraries expected to be normally available on a system: ⬢[acme@toolbox perf-tools-next]$ cat /tmp/build/perf-tools-next/feature/test-all.make.output In file included from test-bpf.c:3, from test-all.c:150: /home/acme/git/perf-tools-next/tools/include/uapi/linux/bpf.h:77:8: error: ‘bpf_insn’ defined as wrong kind of tag 77 | struct bpf_insn { | ^~~~~~~~ ⬢[acme@toolbox perf-tools-next]$ cat /tmp/build/perf-tools-next/feature/test-all.make.output When doing so there is a trick where we define main to be main_test_libcapstone, then include the individual tools/build/feature/test-libcapstone.c capability query test, and then we undef 'main' because we'll do it all over again with the next expected library to be tested (at this time 'lzma'). To complete this mechanism we need, in the test-all.c 'main' routine, to call main_test_libcapstone(), which isn't being done, so the effect of adding references to capstone in test-all.c is not achieved. The only thing that is happening is that test-all.c is failing to build and thus all the tests will have to be done individually, which nullifies the test-all.c single build speedup. So let's remove references to capstone from test-all.c to see if this makes it build again so that we get faster builds, or go on fixing up whatever is preventing us from getting that benefit. Nothing: after this fix we get a clean test-all.c build and get the build speedup back: ⬢[acme@toolbox perf-tools-next]$ cat /tmp/build/perf-tools-next/feature/test-all.make.output ⬢[acme@toolbox perf-tools-next]$ cat /tmp/build/perf-tools-next/feature/test-all. test-all.bin test-all.d test-all.make.output ⬢[acme@toolbox perf-tools-next]$ cat /tmp/build/perf-tools-next/feature/test-all.make.output ⬢[acme@toolbox perf-tools-next]$ ldd /tmp/build/perf-tools-next/feature/test-all.bin linux-vdso.so.1 (0x00007f13277a1000) libpython3.12.so.1.0 => /lib64/libpython3.12.so.1.0 (0x00007f1326e00000) libm.so.6 => /lib64/libm.so.6 (0x00007f13274be000) libtraceevent.so.1 => /lib64/libtraceevent.so.1 (0x00007f1327496000) libtracefs.so.1 => /lib64/libtracefs.so.1 (0x00007f132746f000) libcrypto.so.3 => /lib64/libcrypto.so.3 (0x00007f1326800000) libunwind-x86_64.so.8 => /lib64/libunwind-x86_64.so.8 (0x00007f1327452000) libunwind.so.8 => /lib64/libunwind.so.8 (0x00007f1327436000) liblzma.so.5 => /lib64/liblzma.so.5 (0x00007f1327403000) libdw.so.1 => /lib64/libdw.so.1 (0x00007f1326d6f000) libz.so.1 => /lib64/libz.so.1 (0x00007f13273e2000) libelf.so.1 => /lib64/libelf.so.1 (0x00007f1326d53000) libnuma.so.1 => /lib64/libnuma.so.1 (0x00007f13273d4000) libslang.so.2 => /lib64/libslang.so.2 (0x00007f1326400000) libperl.so.5.38 => /lib64/libperl.so.5.38 (0x00007f1326000000) libc.so.6 => /lib64/libc.so.6 (0x00007f1325e0f000) libzstd.so.1 => /lib64/libzstd.so.1 (0x00007f1326741000) /lib64/ld-linux-x86-64.so.2 (0x00007f13277a3000) libbz2.so.1 => /lib64/libbz2.so.1 (0x00007f1326d3f000) libcrypt.so.2 => /lib64/libcrypt.so.2 (0x00007f1326d07000) ⬢[acme@toolbox perf-tools-next]$ And when capstone-devel is installed, we get it detected and linked with perf, allowing us to benefit from the features that it enables: ⬢[acme@toolbox perf-tools-next]$ rpm -q capstone-devel capstone-devel-5.0.1-3.fc40.x86_64 ⬢[acme@toolbox perf-tools-next]$ ldd /tmp/build/perf-tools-next/perf | grep capstone libcapstone.so.5 => /lib64/libcapstone.so.5 (0x00007fe6a5c00000) ⬢[acme@toolbox perf-tools-next]$ /tmp/build/perf-tools-next/perf -vv | grep cap
libcapstone: [ on ] # HAVE_LIBCAPSTONE_SUPPORT ⬢[acme@toolbox perf-tools-next]$ Fixes: 8b767db3309595a2 ("perf: build: introduce the libcapstone") Cc: Adrian Hunter Cc: Andi Kleen Cc: Changbin Du Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/Zry0sepD5Ppa5YKP@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/build/feature/test-all.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'tools') diff --git a/tools/build/feature/test-all.c b/tools/build/feature/test-all.c index dd0a18c2ef8f..6f4bf386a3b5 100644 --- a/tools/build/feature/test-all.c +++ b/tools/build/feature/test-all.c @@ -134,10 +134,6 @@ #undef main #endif -#define main main_test_libcapstone -# include "test-libcapstone.c" -#undef main - #define main main_test_lzma # include "test-lzma.c" #undef main -- cgit v1.2.3 From 27ac597c0e2f596f53963db4a2cf53cd84657cda Mon Sep 17 00:00:00 2001 From: Veronika Molnarova Date: Mon, 29 Apr 2024 10:53:07 +0200 Subject: perf test record.sh: Raise limit of open file descriptors The subtest for system-wide record with the '--threads=cpu' option fails due to a limit of open file descriptors on systems with 128 or more CPUs, as the default limit is set to 1024. The number of open file descriptors should be slightly above nmb_events*nmb_cpus + nmb_cpus(for perf.data.n) + 4*nmb_cpus(for pipes), which equals 8*nmb_cpus. Therefore, temporarily raise the limit to 16*nmb_cpus for the test. Committer notes: Instead of disabling ShellCheck warnings on all the uses of 'ulimit -Sn', i.e. those: In tests/shell/record.sh line 35: default_fd_limit=$(ulimit -Sn) ^-^ SC3045 (warning): In POSIX sh, ulimit -S is undefined. We can just switch from using '/bin/sh' to '/bin/bash' for this test, as bash _has_ 'ulimit -S', so ShellCheck will not emit that warning. There are dozens of 'perf test' shell tests that do just that; '/bin/bash' is a reasonable expectation for those tests.
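Condensed, the change the patch makes to record.sh is a save/raise/restore of the soft limit (the full hunks follow below); e.g. on a 128-CPU machine the test needs about 8 * 128 = 1024 descriptors, exactly the default soft limit, hence the failures:

    #!/bin/bash
    # sketch of the fd-limit handling added to record.sh
    default_fd_limit=$(ulimit -Sn)
    min_fd_limit=$(($(getconf _NPROCESSORS_ONLN) * 16))

    # raise the soft limit only when it is too low for --threads=cpu
    if [[ $default_fd_limit -lt $min_fd_limit ]]; then
        ulimit -Sn $min_fd_limit
    fi

    # ... run the record subtests ...

    # restore the default value
    ulimit -Sn $default_fd_limit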
Signed-off-by: Veronika Molnarova Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Jiri Olsa Cc: Kajol Jain Cc: Michael Petlan Cc: Radostin Stoyanov Link: https://lore.kernel.org/linux-perf-users/20240429085721.10122-1-vmolnaro@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/record.sh | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/record.sh b/tools/perf/tests/shell/record.sh index 7964ebd9007d..36883b03169f 100755 --- a/tools/perf/tests/shell/record.sh +++ b/tools/perf/tests/shell/record.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # perf record tests # SPDX-License-Identifier: GPL-2.0 @@ -23,6 +23,15 @@ br_cntr_file="/caps/branch_counter_nr" br_cntr_output="branch stack counters" br_cntr_script_output="br_cntr: A" +default_fd_limit=$(ulimit -Sn) +# With option --threads=cpu the number of open file descriptors should be +# equal to sum of: nmb_cpus * nmb_events (2+dummy), +# nmb_threads for perf.data.n (equal to nmb_cpus) and +# 2*nmb_cpus of pipes = 4*nmb_cpus (each pipe has 2 ends) +# All together it needs 8*nmb_cpus file descriptors plus some are also used +# outside of testing, thus raising the limit to 16*nmb_cpus +min_fd_limit=$(($(getconf _NPROCESSORS_ONLN) * 16)) + cleanup() { rm -rf "${perfdata}" rm -rf "${perfdata}".old @@ -197,11 +206,19 @@ test_branch_counter() { echo "Branch counter test [Success]" } +# raise the limit of file descriptors to minimum +if [[ $default_fd_limit -lt $min_fd_limit ]]; then + ulimit -Sn $min_fd_limit +fi + test_per_thread test_register_capture test_system_wide test_workload test_branch_counter +# restore the default value +ulimit -Sn $default_fd_limit + cleanup exit $err -- cgit v1.2.3 From b4ed2c67d275b85b2ab07d54f88bebd5998d61d8 Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Wed, 3 Jul 2024 10:11:19 +0800 Subject: KVM: selftests: Test slot move/delete with slot zap quirk enabled/disabled Update set_memory_region_test to make sure memslot move and deletion function correctly both when slot zap quirk KVM_X86_QUIRK_SLOT_ZAP_ALL is enabled and disabled. 
Signed-off-by: Yan Zhao Message-ID: <20240703021119.13904-1-yan.y.zhao@intel.com> Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/set_memory_region_test.c | 29 ++++++++++++++++------ 1 file changed, 21 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index bb8002084f52..a8267628e9ed 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -175,7 +175,7 @@ static void guest_code_move_memory_region(void) GUEST_DONE(); } -static void test_move_memory_region(void) +static void test_move_memory_region(bool disable_slot_zap_quirk) { pthread_t vcpu_thread; struct kvm_vcpu *vcpu; @@ -184,6 +184,9 @@ static void test_move_memory_region(void) vm = spawn_vm(&vcpu, &vcpu_thread, guest_code_move_memory_region); + if (disable_slot_zap_quirk) + vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_SLOT_ZAP_ALL); + hva = addr_gpa2hva(vm, MEM_REGION_GPA); /* @@ -266,7 +269,7 @@ static void guest_code_delete_memory_region(void) GUEST_ASSERT(0); } -static void test_delete_memory_region(void) +static void test_delete_memory_region(bool disable_slot_zap_quirk) { pthread_t vcpu_thread; struct kvm_vcpu *vcpu; @@ -276,6 +279,9 @@ static void test_delete_memory_region(void) vm = spawn_vm(&vcpu, &vcpu_thread, guest_code_delete_memory_region); + if (disable_slot_zap_quirk) + vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_SLOT_ZAP_ALL); + /* Delete the memory region, the guest should not die. */ vm_mem_region_delete(vm, MEM_REGION_SLOT); wait_for_vcpu(); @@ -553,7 +559,10 @@ int main(int argc, char *argv[]) { #ifdef __x86_64__ int i, loops; + int j, disable_slot_zap_quirk = 0; + if (kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) & KVM_X86_QUIRK_SLOT_ZAP_ALL) + disable_slot_zap_quirk = 1; /* * FIXME: the zero-memslot test fails on aarch64 and s390x because * KVM_RUN fails with ENOEXEC or EFAULT. @@ -579,13 +588,17 @@ int main(int argc, char *argv[]) else loops = 10; - pr_info("Testing MOVE of in-use region, %d loops\n", loops); - for (i = 0; i < loops; i++) - test_move_memory_region(); + for (j = 0; j <= disable_slot_zap_quirk; j++) { + pr_info("Testing MOVE of in-use region, %d loops, slot zap quirk %s\n", + loops, j ? "disabled" : "enabled"); + for (i = 0; i < loops; i++) + test_move_memory_region(!!j); - pr_info("Testing DELETE of in-use region, %d loops\n", loops); - for (i = 0; i < loops; i++) - test_delete_memory_region(); + pr_info("Testing DELETE of in-use region, %d loops, slot zap quirk %s\n", + loops, j ? "disabled" : "enabled"); + for (i = 0; i < loops; i++) + test_delete_memory_region(!!j); + } #endif return 0; -- cgit v1.2.3 From 218f6415004a881d116e254eeb837358aced55ab Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Wed, 3 Jul 2024 10:12:06 +0800 Subject: KVM: selftests: Allow slot modification stress test with quirk disabled Add a new user option to memslot_modification_stress_test to allow testing with slot zap quirk KVM_X86_QUIRK_SLOT_ZAP_ALL disabled. 
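A hypothetical invocation with the new option (flag names per the help text in the diff below; the iteration and delay values are made up for the example):

    # run with the memslot zap quirk disabled; requires
    # KVM_X86_QUIRK_SLOT_ZAP_ALL support in the host KVM
    ./memslot_modification_stress_test -q -i 10 -d 10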
Signed-off-by: Yan Zhao Message-ID: <20240703021206.13923-1-yan.y.zhao@intel.com> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/memslot_modification_stress_test.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c index 49f162573126..e3343f0df9e1 100644 --- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c +++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c @@ -79,6 +79,7 @@ struct test_params { useconds_t delay; uint64_t nr_iterations; bool partition_vcpu_memory_access; + bool disable_slot_zap_quirk; }; static void run_test(enum vm_guest_mode mode, void *arg) @@ -89,6 +90,13 @@ static void run_test(enum vm_guest_mode mode, void *arg) vm = memstress_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1, VM_MEM_SRC_ANONYMOUS, p->partition_vcpu_memory_access); +#ifdef __x86_64__ + if (p->disable_slot_zap_quirk) + vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_SLOT_ZAP_ALL); + + pr_info("Memslot zap quirk %s\n", p->disable_slot_zap_quirk ? + "disabled" : "enabled"); +#endif pr_info("Finished creating vCPUs\n"); @@ -107,11 +115,12 @@ static void run_test(enum vm_guest_mode mode, void *arg) static void help(char *name) { puts(""); - printf("usage: %s [-h] [-m mode] [-d delay_usec]\n" + printf("usage: %s [-h] [-m mode] [-d delay_usec] [-q]\n" " [-b memory] [-v vcpus] [-o] [-i iterations]\n", name); guest_modes_help(); printf(" -d: add a delay between each iteration of adding and\n" " deleting a memslot in usec.\n"); + printf(" -q: Disable memslot zap quirk.\n"); printf(" -b: specify the size of the memory region which should be\n" " accessed by each vCPU. e.g. 10M or 3G.\n" " Default: 1G\n"); @@ -137,7 +146,7 @@ int main(int argc, char *argv[]) guest_modes_append_default(); - while ((opt = getopt(argc, argv, "hm:d:b:v:oi:")) != -1) { + while ((opt = getopt(argc, argv, "hm:d:qb:v:oi:")) != -1) { switch (opt) { case 'm': guest_modes_cmdline(optarg); @@ -160,6 +169,12 @@ int main(int argc, char *argv[]) case 'i': p.nr_iterations = atoi_positive("Number of iterations", optarg); break; + case 'q': + p.disable_slot_zap_quirk = true; + + TEST_REQUIRE(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) & + KVM_X86_QUIRK_SLOT_ZAP_ALL); + break; case 'h': default: help(argv[0]); -- cgit v1.2.3 From 61de4c34b51c5b9c7ef8229eb246346254638446 Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Wed, 3 Jul 2024 10:12:19 +0800 Subject: KVM: selftests: Test memslot move in memslot_perf_test with quirk disabled Add a new user option to memslot_perf_test to allow testing memslot move with quirk KVM_X86_QUIRK_SLOT_ZAP_ALL disabled. Signed-off-by: Yan Zhao Message-ID: <20240703021219.13939-1-yan.y.zhao@intel.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/memslot_perf_test.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c index 579a64f97333..893366982f77 100644 --- a/tools/testing/selftests/kvm/memslot_perf_test.c +++ b/tools/testing/selftests/kvm/memslot_perf_test.c @@ -113,6 +113,7 @@ static_assert(ATOMIC_BOOL_LOCK_FREE == 2, "atomic bool is not lockless"); static sem_t vcpu_ready; static bool map_unmap_verify; +static bool disable_slot_zap_quirk; static bool verbose; #define pr_info_v(...) 
\ @@ -578,6 +579,9 @@ static bool test_memslot_move_prepare(struct vm_data *data, uint32_t guest_page_size = data->vm->page_size; uint64_t movesrcgpa, movetestgpa; + if (disable_slot_zap_quirk) + vm_enable_cap(data->vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_SLOT_ZAP_ALL); + movesrcgpa = vm_slot2gpa(data, data->nslots - 1); if (isactive) { @@ -896,6 +900,7 @@ static void help(char *name, struct test_args *targs) pr_info(" -h: print this help screen.\n"); pr_info(" -v: enable verbose mode (not for benchmarking).\n"); pr_info(" -d: enable extra debug checks.\n"); + pr_info(" -q: Disable memslot zap quirk during memslot move.\n"); pr_info(" -s: specify memslot count cap (-1 means no cap; currently: %i)\n", targs->nslots); pr_info(" -f: specify the first test to run (currently: %i; max %zu)\n", @@ -954,7 +959,7 @@ static bool parse_args(int argc, char *argv[], uint32_t max_mem_slots; int opt; - while ((opt = getopt(argc, argv, "hvds:f:e:l:r:")) != -1) { + while ((opt = getopt(argc, argv, "hvdqs:f:e:l:r:")) != -1) { switch (opt) { case 'h': default: @@ -966,6 +971,11 @@ static bool parse_args(int argc, char *argv[], case 'd': map_unmap_verify = true; break; + case 'q': + disable_slot_zap_quirk = true; + TEST_REQUIRE(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) & + KVM_X86_QUIRK_SLOT_ZAP_ALL); + break; case 's': targs->nslots = atoi_paranoid(optarg); if (targs->nslots <= 1 && targs->nslots != -1) { -- cgit v1.2.3 From a4ae5c31e0f28d2c737583a06d4b29ffd6bbd622 Mon Sep 17 00:00:00 2001 From: "Alexis Lothoré (eBPF Foundation)" Date: Tue, 13 Aug 2024 14:45:05 +0200 Subject: selftests/bpf: convert get_current_cgroup_id_user to test_progs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit get_current_cgroup_id_user allows testing for bpf_get_current_cgroup_id() bpf API but is not integrated into test_progs, and so is not tested automatically in CI. Convert it to the test_progs framework to allow running it automatically. 
The most notable differences from the old test are the following: - the new test relies on autoattach instead of manually hooking/enabling the targeted tracepoint through perf_event, which considerably reduces the test code size - it also accesses bpf prog data through global variables instead of maps - the sleep duration passed to the nanosleep syscall has been reduced to its minimum so as not to impact overall CI duration (we only care about the syscall being properly triggered, not about the passed duration) Signed-off-by: Alexis Lothoré (eBPF Foundation) Link: https://lore.kernel.org/r/20240813-convert_cgroup_tests-v4-1-a33c03458cf6@bootlin.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/.gitignore | 1 - tools/testing/selftests/bpf/Makefile | 3 +- tools/testing/selftests/bpf/get_cgroup_id_user.c | 151 --------------------- .../bpf/prog_tests/cgroup_get_current_cgroup_id.c | 46 +++++++ .../selftests/bpf/progs/get_cgroup_id_kern.c | 26 +--- 5 files changed, 51 insertions(+), 176 deletions(-) delete mode 100644 tools/testing/selftests/bpf/get_cgroup_id_user.c create mode 100644 tools/testing/selftests/bpf/prog_tests/cgroup_get_current_cgroup_id.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 8f14d8faeb0b..7d4d5d3e0210 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -19,7 +19,6 @@ test_sock urandom_read test_sockmap test_lirc_mode2_user -get_cgroup_id_user test_skb_cgroup_id_user test_cgroup_storage test_flow_dissector diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index f54185e96a95..8beade034caf 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -67,7 +67,7 @@ endif # Order correspond to 'make run_tests' order TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ - test_sock test_sockmap get_cgroup_id_user \ + test_sock test_sockmap \ test_cgroup_storage \ test_tcpnotify_user test_sysctl \ test_progs-no_alu32 @@ -295,7 +295,6 @@ $(OUTPUT)/test_skb_cgroup_id_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sock: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sockmap: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_tcpnotify_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(TRACE_HELPERS) -$(OUTPUT)/get_cgroup_id_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_cgroup_storage: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sock_fields: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sysctl: $(CGROUP_HELPERS) $(TESTING_HELPERS) diff --git a/tools/testing/selftests/bpf/get_cgroup_id_user.c b/tools/testing/selftests/bpf/get_cgroup_id_user.c deleted file mode 100644 index aefd83ebdcd7..000000000000 --- a/tools/testing/selftests/bpf/get_cgroup_id_user.c +++ /dev/null @@ -1,151 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -// Copyright (c) 2018 Facebook - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "cgroup_helpers.h" -#include "testing_helpers.h" - -#define CHECK(condition, tag, format...)
({ \ - int __ret = !!(condition); \ - if (__ret) { \ - printf("%s:FAIL:%s ", __func__, tag); \ - printf(format); \ - } else { \ - printf("%s:PASS:%s\n", __func__, tag); \ - } \ - __ret; \ -}) - -static int bpf_find_map(const char *test, struct bpf_object *obj, - const char *name) -{ - struct bpf_map *map; - - map = bpf_object__find_map_by_name(obj, name); - if (!map) - return -1; - return bpf_map__fd(map); -} - -#define TEST_CGROUP "/test-bpf-get-cgroup-id/" - -int main(int argc, char **argv) -{ - const char *probe_name = "syscalls/sys_enter_nanosleep"; - const char *file = "get_cgroup_id_kern.bpf.o"; - int err, bytes, efd, prog_fd, pmu_fd; - int cgroup_fd, cgidmap_fd, pidmap_fd; - struct perf_event_attr attr = {}; - struct bpf_object *obj; - __u64 kcgid = 0, ucgid; - __u32 key = 0, pid; - int exit_code = 1; - char buf[256]; - const struct timespec req = { - .tv_sec = 1, - .tv_nsec = 0, - }; - - cgroup_fd = cgroup_setup_and_join(TEST_CGROUP); - if (CHECK(cgroup_fd < 0, "cgroup_setup_and_join", "err %d errno %d\n", cgroup_fd, errno)) - return 1; - - /* Use libbpf 1.0 API mode */ - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - - err = bpf_prog_test_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd); - if (CHECK(err, "bpf_prog_test_load", "err %d errno %d\n", err, errno)) - goto cleanup_cgroup_env; - - cgidmap_fd = bpf_find_map(__func__, obj, "cg_ids"); - if (CHECK(cgidmap_fd < 0, "bpf_find_map", "err %d errno %d\n", - cgidmap_fd, errno)) - goto close_prog; - - pidmap_fd = bpf_find_map(__func__, obj, "pidmap"); - if (CHECK(pidmap_fd < 0, "bpf_find_map", "err %d errno %d\n", - pidmap_fd, errno)) - goto close_prog; - - pid = getpid(); - bpf_map_update_elem(pidmap_fd, &key, &pid, 0); - - if (access("/sys/kernel/tracing/trace", F_OK) == 0) { - snprintf(buf, sizeof(buf), - "/sys/kernel/tracing/events/%s/id", probe_name); - } else { - snprintf(buf, sizeof(buf), - "/sys/kernel/debug/tracing/events/%s/id", probe_name); - } - efd = open(buf, O_RDONLY, 0); - if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno)) - goto close_prog; - bytes = read(efd, buf, sizeof(buf)); - close(efd); - if (CHECK(bytes <= 0 || bytes >= sizeof(buf), "read", - "bytes %d errno %d\n", bytes, errno)) - goto close_prog; - - attr.config = strtol(buf, NULL, 0); - attr.type = PERF_TYPE_TRACEPOINT; - attr.sample_type = PERF_SAMPLE_RAW; - attr.sample_period = 1; - attr.wakeup_events = 1; - - /* attach to this pid so the all bpf invocations will be in the - * cgroup associated with this pid. 
- */ - pmu_fd = syscall(__NR_perf_event_open, &attr, getpid(), -1, -1, 0); - if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n", pmu_fd, - errno)) - goto close_prog; - - err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0); - if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n", err, - errno)) - goto close_pmu; - - err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd); - if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n", err, - errno)) - goto close_pmu; - - /* trigger some syscalls */ - syscall(__NR_nanosleep, &req, NULL); - - err = bpf_map_lookup_elem(cgidmap_fd, &key, &kcgid); - if (CHECK(err, "bpf_map_lookup_elem", "err %d errno %d\n", err, errno)) - goto close_pmu; - - ucgid = get_cgroup_id(TEST_CGROUP); - if (CHECK(kcgid != ucgid, "compare_cgroup_id", - "kern cgid %llx user cgid %llx", kcgid, ucgid)) - goto close_pmu; - - exit_code = 0; - printf("%s:PASS\n", argv[0]); - -close_pmu: - close(pmu_fd); -close_prog: - bpf_object__close(obj); -cleanup_cgroup_env: - cleanup_cgroup_environment(); - return exit_code; -} diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_get_current_cgroup_id.c b/tools/testing/selftests/bpf/prog_tests/cgroup_get_current_cgroup_id.c new file mode 100644 index 000000000000..7a1643b03bf3 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_get_current_cgroup_id.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "test_progs.h" +#include "cgroup_helpers.h" +#include "get_cgroup_id_kern.skel.h" + +#define TEST_CGROUP "/test-bpf-get-cgroup-id/" + +void test_cgroup_get_current_cgroup_id(void) +{ + struct get_cgroup_id_kern *skel; + const struct timespec req = { + .tv_sec = 0, + .tv_nsec = 1, + }; + int cgroup_fd; + __u64 ucgid; + + cgroup_fd = cgroup_setup_and_join(TEST_CGROUP); + if (!ASSERT_OK_FD(cgroup_fd, "cgroup switch")) + return; + + skel = get_cgroup_id_kern__open_and_load(); + if (!ASSERT_OK_PTR(skel, "load program")) + goto cleanup_cgroup; + + if (!ASSERT_OK(get_cgroup_id_kern__attach(skel), "attach bpf program")) + goto cleanup_progs; + + skel->bss->expected_pid = getpid(); + /* trigger the syscall on which is attached the tested prog */ + if (!ASSERT_OK(syscall(__NR_nanosleep, &req, NULL), "nanosleep")) + goto cleanup_progs; + + ucgid = get_cgroup_id(TEST_CGROUP); + + ASSERT_EQ(skel->bss->cg_id, ucgid, "compare cgroup ids"); + +cleanup_progs: + get_cgroup_id_kern__destroy(skel); +cleanup_cgroup: + close(cgroup_fd); + cleanup_cgroup_environment(); +} diff --git a/tools/testing/selftests/bpf/progs/get_cgroup_id_kern.c b/tools/testing/selftests/bpf/progs/get_cgroup_id_kern.c index 68587b1de34e..30fd504856c7 100644 --- a/tools/testing/selftests/bpf/progs/get_cgroup_id_kern.c +++ b/tools/testing/selftests/bpf/progs/get_cgroup_id_kern.c @@ -4,34 +4,16 @@ #include #include -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 1); - __type(key, __u32); - __type(value, __u64); -} cg_ids SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 1); - __type(key, __u32); - __type(value, __u32); -} pidmap SEC(".maps"); +__u64 cg_id; +__u64 expected_pid; SEC("tracepoint/syscalls/sys_enter_nanosleep") int trace(void *ctx) { __u32 pid = bpf_get_current_pid_tgid(); - __u32 key = 0, *expected_pid; - __u64 *val; - - expected_pid = bpf_map_lookup_elem(&pidmap, &key); - if (!expected_pid || *expected_pid != pid) - return 0; - val = bpf_map_lookup_elem(&cg_ids, &key); - if (val) - *val = bpf_get_current_cgroup_id(); + if (expected_pid == pid) + cg_id = 
bpf_get_current_cgroup_id(); return 0; } -- cgit v1.2.3 From 37a14cfd667a22662867b986925f1ad4205c17df Mon Sep 17 00:00:00 2001 From: "Alexis Lothoré (eBPF Foundation)" Date: Tue, 13 Aug 2024 14:45:06 +0200 Subject: selftests/bpf: convert test_cgroup_storage to test_progs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_cgroup_storage is currently a standalone program which is not run when executing test_progs. Convert it to the test_progs framework so it can be automatically executed in CI. The conversion led to the following changes: - converted the raw bpf program in the userspace test file into a dedicated test program in progs/ dir - reduced the scope of cgroup_storage test: the content from this test overlaps with some other tests already present in test_progs, most notably netcnt and cgroup_storage_multi*. Those tests already check extensively local storage, per-cpu local storage, cgroups interaction, etc. So the new test only keep the part testing that the program return code (based on map content) properly leads to packet being passed or dropped. Reviewed-by: Alan Maguire Signed-off-by: Alexis Lothoré (eBPF Foundation) Link: https://lore.kernel.org/r/20240813-convert_cgroup_tests-v4-2-a33c03458cf6@bootlin.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/.gitignore | 1 - tools/testing/selftests/bpf/Makefile | 2 - .../selftests/bpf/prog_tests/cgroup_storage.c | 96 ++++++++++++ tools/testing/selftests/bpf/progs/cgroup_storage.c | 24 +++ tools/testing/selftests/bpf/test_cgroup_storage.c | 174 --------------------- 5 files changed, 120 insertions(+), 177 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/cgroup_storage.c create mode 100644 tools/testing/selftests/bpf/progs/cgroup_storage.c delete mode 100644 tools/testing/selftests/bpf/test_cgroup_storage.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 7d4d5d3e0210..fd7ae37024e2 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -20,7 +20,6 @@ urandom_read test_sockmap test_lirc_mode2_user test_skb_cgroup_id_user -test_cgroup_storage test_flow_dissector flow_dissector_load test_tcpnotify_user diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 8beade034caf..99f8d6329694 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -68,7 +68,6 @@ endif # Order correspond to 'make run_tests' order TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ test_sock test_sockmap \ - test_cgroup_storage \ test_tcpnotify_user test_sysctl \ test_progs-no_alu32 TEST_INST_SUBDIRS := no_alu32 @@ -295,7 +294,6 @@ $(OUTPUT)/test_skb_cgroup_id_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sock: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sockmap: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_tcpnotify_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(TRACE_HELPERS) -$(OUTPUT)/test_cgroup_storage: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sock_fields: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sysctl: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_tag: $(TESTING_HELPERS) diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_storage.c b/tools/testing/selftests/bpf/prog_tests/cgroup_storage.c new file mode 100644 index 000000000000..cf395715ced4 --- /dev/null +++ 
b/tools/testing/selftests/bpf/prog_tests/cgroup_storage.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include "cgroup_helpers.h" +#include "network_helpers.h" +#include "cgroup_storage.skel.h" + +#define TEST_CGROUP "/test-bpf-cgroup-storage-buf/" +#define TEST_NS "cgroup_storage_ns" +#define PING_CMD "ping localhost -c 1 -W 1 -q" + +static int setup_network(struct nstoken **token) +{ + SYS(fail, "ip netns add %s", TEST_NS); + *token = open_netns(TEST_NS); + if (!ASSERT_OK_PTR(*token, "open netns")) + goto cleanup_ns; + SYS(cleanup_ns, "ip link set lo up"); + + return 0; + +cleanup_ns: + SYS_NOFAIL("ip netns del %s", TEST_NS); +fail: + return -1; +} + +static void cleanup_network(struct nstoken *ns) +{ + close_netns(ns); + SYS_NOFAIL("ip netns del %s", TEST_NS); +} + +void test_cgroup_storage(void) +{ + struct bpf_cgroup_storage_key key; + struct cgroup_storage *skel; + struct nstoken *ns = NULL; + unsigned long long value; + int cgroup_fd; + int err; + + cgroup_fd = cgroup_setup_and_join(TEST_CGROUP); + if (!ASSERT_OK_FD(cgroup_fd, "create cgroup")) + return; + + if (!ASSERT_OK(setup_network(&ns), "setup network")) + goto cleanup_cgroup; + + skel = cgroup_storage__open_and_load(); + if (!ASSERT_OK_PTR(skel, "load program")) + goto cleanup_network; + + skel->links.bpf_prog = + bpf_program__attach_cgroup(skel->progs.bpf_prog, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links.bpf_prog, "attach program")) + goto cleanup_progs; + + /* Check that one out of every two packets is dropped */ + err = SYS_NOFAIL(PING_CMD); + ASSERT_OK(err, "first ping"); + err = SYS_NOFAIL(PING_CMD); + ASSERT_NEQ(err, 0, "second ping"); + err = SYS_NOFAIL(PING_CMD); + ASSERT_OK(err, "third ping"); + + err = bpf_map__get_next_key(skel->maps.cgroup_storage, NULL, &key, + sizeof(key)); + if (!ASSERT_OK(err, "get first key")) + goto cleanup_progs; + err = bpf_map__lookup_elem(skel->maps.cgroup_storage, &key, sizeof(key), + &value, sizeof(value), 0); + if (!ASSERT_OK(err, "first packet count read")) + goto cleanup_progs; + + /* Add one to the packet counter, check again packet filtering */ + value++; + err = bpf_map__update_elem(skel->maps.cgroup_storage, &key, sizeof(key), + &value, sizeof(value), 0); + if (!ASSERT_OK(err, "increment packet counter")) + goto cleanup_progs; + err = SYS_NOFAIL(PING_CMD); + ASSERT_OK(err, "fourth ping"); + err = SYS_NOFAIL(PING_CMD); + ASSERT_NEQ(err, 0, "fifth ping"); + err = SYS_NOFAIL(PING_CMD); + ASSERT_OK(err, "sixth ping"); + +cleanup_progs: + cgroup_storage__destroy(skel); +cleanup_network: + cleanup_network(ns); +cleanup_cgroup: + close(cgroup_fd); + cleanup_cgroup_environment(); +} diff --git a/tools/testing/selftests/bpf/progs/cgroup_storage.c b/tools/testing/selftests/bpf/progs/cgroup_storage.c new file mode 100644 index 000000000000..db1e4d2d3281 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/cgroup_storage.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +struct { + __uint(type, BPF_MAP_TYPE_CGROUP_STORAGE); + __type(key, struct bpf_cgroup_storage_key); + __type(value, __u64); +} cgroup_storage SEC(".maps"); + +SEC("cgroup_skb/egress") +int bpf_prog(struct __sk_buff *skb) +{ + __u64 *counter; + + counter = bpf_get_local_storage(&cgroup_storage, 0); + __sync_fetch_and_add(counter, 1); + + /* Drop one out of every two packets */ + return (*counter & 1); +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_cgroup_storage.c b/tools/testing/selftests/bpf/test_cgroup_storage.c deleted 
file mode 100644 index 0861ea60dcdd..000000000000 --- a/tools/testing/selftests/bpf/test_cgroup_storage.c +++ /dev/null @@ -1,174 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include - -#include "bpf_util.h" -#include "cgroup_helpers.h" -#include "testing_helpers.h" - -char bpf_log_buf[BPF_LOG_BUF_SIZE]; - -#define TEST_CGROUP "/test-bpf-cgroup-storage-buf/" - -int main(int argc, char **argv) -{ - struct bpf_insn prog[] = { - BPF_LD_MAP_FD(BPF_REG_1, 0), /* percpu map fd */ - BPF_MOV64_IMM(BPF_REG_2, 0), /* flags, not used */ - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, - BPF_FUNC_get_local_storage), - BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 0x1), - BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_3, 0), - - BPF_LD_MAP_FD(BPF_REG_1, 0), /* map fd */ - BPF_MOV64_IMM(BPF_REG_2, 0), /* flags, not used */ - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, - BPF_FUNC_get_local_storage), - BPF_MOV64_IMM(BPF_REG_1, 1), - BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_0, BPF_REG_1, 0), - BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0), - BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0x1), - BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), - BPF_EXIT_INSN(), - }; - size_t insns_cnt = ARRAY_SIZE(prog); - int error = EXIT_FAILURE; - int map_fd, percpu_map_fd, prog_fd, cgroup_fd; - struct bpf_cgroup_storage_key key; - unsigned long long value; - unsigned long long *percpu_value; - int cpu, nproc; - - nproc = bpf_num_possible_cpus(); - percpu_value = malloc(sizeof(*percpu_value) * nproc); - if (!percpu_value) { - printf("Not enough memory for per-cpu area (%d cpus)\n", nproc); - goto err; - } - - /* Use libbpf 1.0 API mode */ - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - - map_fd = bpf_map_create(BPF_MAP_TYPE_CGROUP_STORAGE, NULL, sizeof(key), - sizeof(value), 0, NULL); - if (map_fd < 0) { - printf("Failed to create map: %s\n", strerror(errno)); - goto out; - } - - percpu_map_fd = bpf_map_create(BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, NULL, - sizeof(key), sizeof(value), 0, NULL); - if (percpu_map_fd < 0) { - printf("Failed to create map: %s\n", strerror(errno)); - goto out; - } - - prog[0].imm = percpu_map_fd; - prog[7].imm = map_fd; - prog_fd = bpf_test_load_program(BPF_PROG_TYPE_CGROUP_SKB, - prog, insns_cnt, "GPL", 0, - bpf_log_buf, BPF_LOG_BUF_SIZE); - if (prog_fd < 0) { - printf("Failed to load bpf program: %s\n", bpf_log_buf); - goto out; - } - - cgroup_fd = cgroup_setup_and_join(TEST_CGROUP); - - /* Attach the bpf program */ - if (bpf_prog_attach(prog_fd, cgroup_fd, BPF_CGROUP_INET_EGRESS, 0)) { - printf("Failed to attach bpf program\n"); - goto err; - } - - if (bpf_map_get_next_key(map_fd, NULL, &key)) { - printf("Failed to get the first key in cgroup storage\n"); - goto err; - } - - if (bpf_map_lookup_elem(map_fd, &key, &value)) { - printf("Failed to lookup cgroup storage 0\n"); - goto err; - } - - for (cpu = 0; cpu < nproc; cpu++) - percpu_value[cpu] = 1000; - - if (bpf_map_update_elem(percpu_map_fd, &key, percpu_value, 0)) { - printf("Failed to update the data in the cgroup storage\n"); - goto err; - } - - /* Every second packet should be dropped */ - assert(system("ping localhost -c 1 -W 1 -q > /dev/null") == 0); - assert(system("ping localhost -c 1 -W 1 -q > /dev/null")); - assert(system("ping localhost -c 1 -W 1 -q > /dev/null") == 0); - - /* Check the counter in the cgroup local storage */ - if (bpf_map_lookup_elem(map_fd, &key, &value)) { - printf("Failed to lookup cgroup storage\n"); - goto err; - } - - if (value != 3) { - printf("Unexpected data 
in the cgroup storage: %llu\n", value); - goto err; - } - - /* Bump the counter in the cgroup local storage */ - value++; - if (bpf_map_update_elem(map_fd, &key, &value, 0)) { - printf("Failed to update the data in the cgroup storage\n"); - goto err; - } - - /* Every second packet should be dropped */ - assert(system("ping localhost -c 1 -W 1 -q > /dev/null") == 0); - assert(system("ping localhost -c 1 -W 1 -q > /dev/null")); - assert(system("ping localhost -c 1 -W 1 -q > /dev/null") == 0); - - /* Check the final value of the counter in the cgroup local storage */ - if (bpf_map_lookup_elem(map_fd, &key, &value)) { - printf("Failed to lookup the cgroup storage\n"); - goto err; - } - - if (value != 7) { - printf("Unexpected data in the cgroup storage: %llu\n", value); - goto err; - } - - /* Check the final value of the counter in the percpu local storage */ - - for (cpu = 0; cpu < nproc; cpu++) - percpu_value[cpu] = 0; - - if (bpf_map_lookup_elem(percpu_map_fd, &key, percpu_value)) { - printf("Failed to lookup the per-cpu cgroup storage\n"); - goto err; - } - - value = 0; - for (cpu = 0; cpu < nproc; cpu++) - value += percpu_value[cpu]; - - if (value != nproc * 1000 + 6) { - printf("Unexpected data in the per-cpu cgroup storage\n"); - goto err; - } - - error = 0; - printf("test_cgroup_storage:PASS\n"); - -err: - cleanup_cgroup_environment(); - free(percpu_value); - -out: - return error; -} -- cgit v1.2.3 From 7b4400a0a69b8255a2afed0bead0ce2c2c457545 Mon Sep 17 00:00:00 2001 From: "Alexis Lothoré (eBPF Foundation)" Date: Tue, 13 Aug 2024 14:45:07 +0200 Subject: selftests/bpf: add proper section name to bpf prog and rename it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_skb_cgroup_id_kern.c is currently involved in a manual test. In its current form, it cannot be used with the auto-generated skeleton APIs, because the section name is not valid to allow libbpf to deduce the program type. Update the section name to allow skeleton API usage. Also rename the program to make its name shorter and more straightforward regarding the API it is testing.
While doing so, make sure that test_skb_cgroup_id.sh passes to get a working reference before converting it to test_progs - update the obj name - fix loading issue (verifier rejecting the program when loaded through tc, because of map not found), by preloading the whole obj with bpftool Reviewed-by: Alan Maguire Signed-off-by: Alexis Lothoré (eBPF Foundation) Link: https://lore.kernel.org/r/20240813-convert_cgroup_tests-v4-3-a33c03458cf6@bootlin.com Signed-off-by: Martin KaFai Lau --- .../testing/selftests/bpf/progs/cgroup_ancestor.c | 45 ++++++++++++++++++++++ .../selftests/bpf/progs/test_skb_cgroup_id_kern.c | 45 ---------------------- tools/testing/selftests/bpf/test_skb_cgroup_id.sh | 12 ++++-- 3 files changed, 53 insertions(+), 49 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/cgroup_ancestor.c delete mode 100644 tools/testing/selftests/bpf/progs/test_skb_cgroup_id_kern.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/cgroup_ancestor.c b/tools/testing/selftests/bpf/progs/cgroup_ancestor.c new file mode 100644 index 000000000000..4879645f5827 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/cgroup_ancestor.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2018 Facebook + +#include +#include + +#include + +#include + +#define NUM_CGROUP_LEVELS 4 + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, __u32); + __type(value, __u64); + __uint(max_entries, NUM_CGROUP_LEVELS); +} cgroup_ids SEC(".maps"); + +static __always_inline void log_nth_level(struct __sk_buff *skb, __u32 level) +{ + __u64 id; + + /* [1] &level passed to external function that may change it, it's + * incompatible with loop unroll. + */ + id = bpf_skb_ancestor_cgroup_id(skb, level); + bpf_map_update_elem(&cgroup_ids, &level, &id, 0); +} + +SEC("tc") +int log_cgroup_id(struct __sk_buff *skb) +{ + /* Loop unroll can't be used here due to [1]. Unrolling manually. + * Number of calls should be in sync with NUM_CGROUP_LEVELS. + */ + log_nth_level(skb, 0); + log_nth_level(skb, 1); + log_nth_level(skb, 2); + log_nth_level(skb, 3); + + return TC_ACT_OK; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_skb_cgroup_id_kern.c b/tools/testing/selftests/bpf/progs/test_skb_cgroup_id_kern.c deleted file mode 100644 index 37aacc66cd68..000000000000 --- a/tools/testing/selftests/bpf/progs/test_skb_cgroup_id_kern.c +++ /dev/null @@ -1,45 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -// Copyright (c) 2018 Facebook - -#include -#include - -#include - -#include - -#define NUM_CGROUP_LEVELS 4 - -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __type(key, __u32); - __type(value, __u64); - __uint(max_entries, NUM_CGROUP_LEVELS); -} cgroup_ids SEC(".maps"); - -static __always_inline void log_nth_level(struct __sk_buff *skb, __u32 level) -{ - __u64 id; - - /* [1] &level passed to external function that may change it, it's - * incompatible with loop unroll. - */ - id = bpf_skb_ancestor_cgroup_id(skb, level); - bpf_map_update_elem(&cgroup_ids, &level, &id, 0); -} - -SEC("cgroup_id_logger") -int log_cgroup_id(struct __sk_buff *skb) -{ - /* Loop unroll can't be used here due to [1]. Unrolling manually. - * Number of calls should be in sync with NUM_CGROUP_LEVELS. 
- */ - log_nth_level(skb, 0); - log_nth_level(skb, 1); - log_nth_level(skb, 2); - log_nth_level(skb, 3); - - return TC_ACT_OK; -} - -char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_skb_cgroup_id.sh b/tools/testing/selftests/bpf/test_skb_cgroup_id.sh index 515c2eafc97f..d7dad49175c2 100755 --- a/tools/testing/selftests/bpf/test_skb_cgroup_id.sh +++ b/tools/testing/selftests/bpf/test_skb_cgroup_id.sh @@ -30,8 +30,10 @@ setup() wait_for_ip tc qdisc add dev ${TEST_IF} clsact - tc filter add dev ${TEST_IF} egress bpf obj ${BPF_PROG_OBJ} \ sec ${BPF_PROG_SECTION} da + mkdir -p /sys/fs/bpf/${BPF_PROG_PIN} + bpftool prog loadall ${BPF_PROG_OBJ} /sys/fs/bpf/${BPF_PROG_PIN} type tc + tc filter add dev ${TEST_IF} egress bpf da object-pinned \ /sys/fs/bpf/${BPF_PROG_PIN}/${BPF_PROG_NAME} BPF_PROG_ID=$(tc filter show dev ${TEST_IF} egress | \ awk '/ id / {sub(/.* id /, "", $0); print($1)}') @@ -41,6 +43,7 @@ cleanup() { ip link del ${TEST_IF} 2>/dev/null || : ip link del ${TEST_IF_PEER} 2>/dev/null || : + rm -rf /sys/fs/bpf/${BPF_PROG_PIN} } main() { trap cleanup EXIT 2 3 6 15 setup ${PROG} ${TEST_IF} ${BPF_PROG_ID} } DIR=$(dirname $0) TEST_IF="test_cgid_1" TEST_IF_PEER="test_cgid_2" MAX_PING_TRIES=5 -BPF_PROG_OBJ="${DIR}/test_skb_cgroup_id_kern.bpf.o" -BPF_PROG_SECTION="cgroup_id_logger" +BPF_PROG_PIN="cgroup_ancestor" +BPF_PROG_OBJ="${DIR}/${BPF_PROG_PIN}.bpf.o" +BPF_PROG_NAME="log_cgroup_id" BPF_PROG_ID=0 PROG="${DIR}/test_skb_cgroup_id_user" type ping6 >/dev/null 2>&1 && PING6="ping6" || PING6="ping -6" -- cgit v1.2.3 From f957c230e173cf227566686015016d05f7102d27 Mon Sep 17 00:00:00 2001 From: "Alexis Lothoré (eBPF Foundation)" Date: Tue, 13 Aug 2024 14:45:08 +0200 Subject: selftests/bpf: convert test_skb_cgroup_id_user to test_progs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_skb_cgroup_id_user allows testing skb cgroup id retrieval at different levels, but is not integrated in test_progs, so it is not run automatically in CI. The test overlaps a bit with cgroup_skb_sk_lookup_kern, which is integrated in test_progs and tests skb cgroup helpers extensively, but there is still one major difference between the two tests which justifies the conversion: cgroup_skb_sk_lookup_kern deals with a BPF_PROG_TYPE_CGROUP_SKB (attached on a cgroup), while test_skb_cgroup_id_user deals with a BPF_PROG_TYPE_SCHED_CLS (attached on a qdisc). Convert test_skb_cgroup_id_user into the test_progs framework in order to run it automatically in CI.
The main differences with the original test are the following: - rename the test to make it shorter and more straightforward regarding tested feature - the wrapping shell script has been dropped since every setup step is now handled in the main C test file - the test has been renamed for a shorter name and reflecting the tested API - add dedicated assert log per level to ease test failure debugging - use global variables instead of maps to access bpf prog data Signed-off-by: Alexis Lothoré (eBPF Foundation) Link: https://lore.kernel.org/r/20240813-convert_cgroup_tests-v4-4-a33c03458cf6@bootlin.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/.gitignore | 1 - tools/testing/selftests/bpf/Makefile | 3 +- .../selftests/bpf/prog_tests/cgroup_ancestor.c | 141 ++++++++++++++++ .../testing/selftests/bpf/progs/cgroup_ancestor.c | 41 ++--- tools/testing/selftests/bpf/test_skb_cgroup_id.sh | 67 -------- .../selftests/bpf/test_skb_cgroup_id_user.c | 183 --------------------- 6 files changed, 160 insertions(+), 276 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/cgroup_ancestor.c delete mode 100755 tools/testing/selftests/bpf/test_skb_cgroup_id.sh delete mode 100644 tools/testing/selftests/bpf/test_skb_cgroup_id_user.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index fd7ae37024e2..99ffea1fa5c6 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -19,7 +19,6 @@ test_sock urandom_read test_sockmap test_lirc_mode2_user -test_skb_cgroup_id_user test_flow_dissector flow_dissector_load test_tcpnotify_user diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 99f8d6329694..163b11014158 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -137,7 +137,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \ test_xdp_vlan.sh test_bpftool.py # Compile but not part of 'make run_tests' -TEST_GEN_PROGS_EXTENDED = test_skb_cgroup_id_user \ +TEST_GEN_PROGS_EXTENDED = \ flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \ xskxceiver xdp_redirect_multi xdp_synproxy veristat xdp_hw_metadata \ @@ -290,7 +290,6 @@ JSON_WRITER := $(OUTPUT)/json_writer.o CAP_HELPERS := $(OUTPUT)/cap_helpers.o NETWORK_HELPERS := $(OUTPUT)/network_helpers.o -$(OUTPUT)/test_skb_cgroup_id_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sock: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sockmap: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_tcpnotify_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(TRACE_HELPERS) diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_ancestor.c b/tools/testing/selftests/bpf/prog_tests/cgroup_ancestor.c new file mode 100644 index 000000000000..9250a1e9f9af --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_ancestor.c @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "test_progs.h" +#include "network_helpers.h" +#include "cgroup_helpers.h" +#include "cgroup_ancestor.skel.h" + +#define CGROUP_PATH "/skb_cgroup_test" +#define TEST_NS "cgroup_ancestor_ns" +#define NUM_CGROUP_LEVELS 4 +#define WAIT_AUTO_IP_MAX_ATTEMPT 10 +#define DST_ADDR "::1" +#define DST_PORT 1234 +#define MAX_ASSERT_NAME 32 + +struct test_data { + struct cgroup_ancestor *skel; + struct bpf_tc_hook qdisc; + struct bpf_tc_opts tc_attach; + struct nstoken *ns; +}; + +static int 
send_datagram(void) +{ + unsigned char buf[] = "some random test data"; + struct sockaddr_in6 addr = { .sin6_family = AF_INET6, + .sin6_port = htons(DST_PORT), }; + int sock, n; + + if (!ASSERT_EQ(inet_pton(AF_INET6, DST_ADDR, &addr.sin6_addr), 1, + "inet_pton")) + return -1; + + sock = socket(AF_INET6, SOCK_DGRAM, 0); + if (!ASSERT_OK_FD(sock, "create socket")) + return sock; + + if (!ASSERT_OK(connect(sock, &addr, sizeof(addr)), "connect")) { + close(sock); + return -1; + } + + n = sendto(sock, buf, sizeof(buf), 0, (const struct sockaddr *)&addr, + sizeof(addr)); + close(sock); + return ASSERT_EQ(n, sizeof(buf), "send data") ? 0 : -1; +} + +static int setup_network(struct test_data *t) +{ + SYS(fail, "ip netns add %s", TEST_NS); + t->ns = open_netns(TEST_NS); + if (!ASSERT_OK_PTR(t->ns, "open netns")) + goto cleanup_ns; + + SYS(close_ns, "ip link set lo up"); + + memset(&t->qdisc, 0, sizeof(t->qdisc)); + t->qdisc.sz = sizeof(t->qdisc); + t->qdisc.attach_point = BPF_TC_EGRESS; + t->qdisc.ifindex = if_nametoindex("lo"); + if (!ASSERT_NEQ(t->qdisc.ifindex, 0, "if_nametoindex")) + goto close_ns; + if (!ASSERT_OK(bpf_tc_hook_create(&t->qdisc), "qdisc add")) + goto close_ns; + + memset(&t->tc_attach, 0, sizeof(t->tc_attach)); + t->tc_attach.sz = sizeof(t->tc_attach); + t->tc_attach.prog_fd = bpf_program__fd(t->skel->progs.log_cgroup_id); + if (!ASSERT_OK(bpf_tc_attach(&t->qdisc, &t->tc_attach), "filter add")) + goto cleanup_qdisc; + + return 0; + +cleanup_qdisc: + bpf_tc_hook_destroy(&t->qdisc); +close_ns: + close_netns(t->ns); +cleanup_ns: + SYS_NOFAIL("ip netns del %s", TEST_NS); +fail: + return 1; +} + +static void cleanup_network(struct test_data *t) +{ + bpf_tc_detach(&t->qdisc, &t->tc_attach); + bpf_tc_hook_destroy(&t->qdisc); + close_netns(t->ns); + SYS_NOFAIL("ip netns del %s", TEST_NS); +} + +static void check_ancestors_ids(struct test_data *t) +{ + __u64 expected_ids[NUM_CGROUP_LEVELS]; + char assert_name[MAX_ASSERT_NAME]; + __u32 level; + + expected_ids[0] = get_cgroup_id("/.."); /* root cgroup */ + expected_ids[1] = get_cgroup_id(""); + expected_ids[2] = get_cgroup_id(CGROUP_PATH); + expected_ids[3] = 0; /* non-existent cgroup */ + + for (level = 0; level < NUM_CGROUP_LEVELS; level++) { + snprintf(assert_name, MAX_ASSERT_NAME, + "ancestor id at level %d", level); + ASSERT_EQ(t->skel->bss->cgroup_ids[level], expected_ids[level], + assert_name); + } +} + +void test_cgroup_ancestor(void) +{ + struct test_data t; + int cgroup_fd; + + t.skel = cgroup_ancestor__open_and_load(); + if (!ASSERT_OK_PTR(t.skel, "open and load")) + return; + + t.skel->bss->dport = htons(DST_PORT); + cgroup_fd = cgroup_setup_and_join(CGROUP_PATH); + if (cgroup_fd < 0) + goto cleanup_progs; + + if (setup_network(&t)) + goto cleanup_cgroups; + + if (send_datagram()) + goto cleanup_network; + + check_ancestors_ids(&t); + +cleanup_network: + cleanup_network(&t); +cleanup_cgroups: + close(cgroup_fd); + cleanup_cgroup_environment(); +cleanup_progs: + cgroup_ancestor__destroy(t.skel); +} diff --git a/tools/testing/selftests/bpf/progs/cgroup_ancestor.c b/tools/testing/selftests/bpf/progs/cgroup_ancestor.c index 4879645f5827..8c2deb4fc493 100644 --- a/tools/testing/selftests/bpf/progs/cgroup_ancestor.c +++ b/tools/testing/selftests/bpf/progs/cgroup_ancestor.c @@ -1,43 +1,38 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2018 Facebook -#include -#include - -#include - +#include #include - +#include +#include "bpf_tracing_net.h" #define NUM_CGROUP_LEVELS 4 -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - 
__type(key, __u32); - __type(value, __u64); - __uint(max_entries, NUM_CGROUP_LEVELS); -} cgroup_ids SEC(".maps"); +__u64 cgroup_ids[NUM_CGROUP_LEVELS]; +__u16 dport; static __always_inline void log_nth_level(struct __sk_buff *skb, __u32 level) { - __u64 id; - /* [1] &level passed to external function that may change it, it's * incompatible with loop unroll. */ - id = bpf_skb_ancestor_cgroup_id(skb, level); - bpf_map_update_elem(&cgroup_ids, &level, &id, 0); + cgroup_ids[level] = bpf_skb_ancestor_cgroup_id(skb, level); } SEC("tc") int log_cgroup_id(struct __sk_buff *skb) { - /* Loop unroll can't be used here due to [1]. Unrolling manually. - * Number of calls should be in sync with NUM_CGROUP_LEVELS. - */ - log_nth_level(skb, 0); - log_nth_level(skb, 1); - log_nth_level(skb, 2); - log_nth_level(skb, 3); + struct sock *sk = (void *)skb->sk; + + if (!sk) + return TC_ACT_OK; + + sk = bpf_core_cast(sk, struct sock); + if (sk->sk_protocol == IPPROTO_UDP && sk->sk_dport == dport) { + log_nth_level(skb, 0); + log_nth_level(skb, 1); + log_nth_level(skb, 2); + log_nth_level(skb, 3); + } return TC_ACT_OK; } diff --git a/tools/testing/selftests/bpf/test_skb_cgroup_id.sh b/tools/testing/selftests/bpf/test_skb_cgroup_id.sh deleted file mode 100755 index d7dad49175c2..000000000000 --- a/tools/testing/selftests/bpf/test_skb_cgroup_id.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -# Copyright (c) 2018 Facebook - -set -eu - -wait_for_ip() -{ - local _i - echo -n "Wait for testing link-local IP to become available " - for _i in $(seq ${MAX_PING_TRIES}); do - echo -n "." - if $PING6 -c 1 -W 1 ff02::1%${TEST_IF} >/dev/null 2>&1; then - echo " OK" - return - fi - sleep 1 - done - echo 1>&2 "ERROR: Timeout waiting for test IP to become available." - exit 1 -} - -setup() -{ - # Create testing interfaces not to interfere with current environment. 
- ip link add dev ${TEST_IF} type veth peer name ${TEST_IF_PEER} - ip link set ${TEST_IF} up - ip link set ${TEST_IF_PEER} up - - wait_for_ip - - tc qdisc add dev ${TEST_IF} clsact - mkdir -p /sys/fs/bpf/${BPF_PROG_PIN} - bpftool prog loadall ${BPF_PROG_OBJ} /sys/fs/bpf/${BPF_PROG_PIN} type tc - tc filter add dev ${TEST_IF} egress bpf da object-pinned \ - /sys/fs/bpf/${BPF_PROG_PIN}/${BPF_PROG_NAME} - - BPF_PROG_ID=$(tc filter show dev ${TEST_IF} egress | \ - awk '/ id / {sub(/.* id /, "", $0); print($1)}') -} - -cleanup() -{ - ip link del ${TEST_IF} 2>/dev/null || : - ip link del ${TEST_IF_PEER} 2>/dev/null || : - rm -rf /sys/fs/bpf/${BPF_PROG_PIN} -} - -main() -{ - trap cleanup EXIT 2 3 6 15 - setup - ${PROG} ${TEST_IF} ${BPF_PROG_ID} -} - -DIR=$(dirname $0) -TEST_IF="test_cgid_1" -TEST_IF_PEER="test_cgid_2" -MAX_PING_TRIES=5 -BPF_PROG_PIN="cgroup_ancestor" -BPF_PROG_OBJ="${DIR}/${BPF_PROG_PIN}.bpf.o" -BPF_PROG_NAME="log_cgroup_id" -BPF_PROG_ID=0 -PROG="${DIR}/test_skb_cgroup_id_user" -type ping6 >/dev/null 2>&1 && PING6="ping6" || PING6="ping -6" - -main diff --git a/tools/testing/selftests/bpf/test_skb_cgroup_id_user.c b/tools/testing/selftests/bpf/test_skb_cgroup_id_user.c deleted file mode 100644 index ed518d075d1d..000000000000 --- a/tools/testing/selftests/bpf/test_skb_cgroup_id_user.c +++ /dev/null @@ -1,183 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -// Copyright (c) 2018 Facebook - -#include -#include -#include - -#include -#include -#include -#include -#include - - -#include -#include - -#include "cgroup_helpers.h" - -#define CGROUP_PATH "/skb_cgroup_test" -#define NUM_CGROUP_LEVELS 4 - -/* RFC 4291, Section 2.7.1 */ -#define LINKLOCAL_MULTICAST "ff02::1" - -static int mk_dst_addr(const char *ip, const char *iface, - struct sockaddr_in6 *dst) -{ - memset(dst, 0, sizeof(*dst)); - - dst->sin6_family = AF_INET6; - dst->sin6_port = htons(1025); - - if (inet_pton(AF_INET6, ip, &dst->sin6_addr) != 1) { - log_err("Invalid IPv6: %s", ip); - return -1; - } - - dst->sin6_scope_id = if_nametoindex(iface); - if (!dst->sin6_scope_id) { - log_err("Failed to get index of iface: %s", iface); - return -1; - } - - return 0; -} - -static int send_packet(const char *iface) -{ - struct sockaddr_in6 dst; - char msg[] = "msg"; - int err = 0; - int fd = -1; - - if (mk_dst_addr(LINKLOCAL_MULTICAST, iface, &dst)) - goto err; - - fd = socket(AF_INET6, SOCK_DGRAM, 0); - if (fd == -1) { - log_err("Failed to create UDP socket"); - goto err; - } - - if (sendto(fd, &msg, sizeof(msg), 0, (const struct sockaddr *)&dst, - sizeof(dst)) == -1) { - log_err("Failed to send datagram"); - goto err; - } - - goto out; -err: - err = -1; -out: - if (fd >= 0) - close(fd); - return err; -} - -int get_map_fd_by_prog_id(int prog_id) -{ - struct bpf_prog_info info = {}; - __u32 info_len = sizeof(info); - __u32 map_ids[1]; - int prog_fd = -1; - int map_fd = -1; - - prog_fd = bpf_prog_get_fd_by_id(prog_id); - if (prog_fd < 0) { - log_err("Failed to get fd by prog id %d", prog_id); - goto err; - } - - info.nr_map_ids = 1; - info.map_ids = (__u64) (unsigned long) map_ids; - - if (bpf_prog_get_info_by_fd(prog_fd, &info, &info_len)) { - log_err("Failed to get info by prog fd %d", prog_fd); - goto err; - } - - if (!info.nr_map_ids) { - log_err("No maps found for prog fd %d", prog_fd); - goto err; - } - - map_fd = bpf_map_get_fd_by_id(map_ids[0]); - if (map_fd < 0) - log_err("Failed to get fd by map id %d", map_ids[0]); -err: - if (prog_fd >= 0) - close(prog_fd); - return map_fd; -} - -int check_ancestor_cgroup_ids(int prog_id) -{ - 
__u64 actual_ids[NUM_CGROUP_LEVELS], expected_ids[NUM_CGROUP_LEVELS]; - __u32 level; - int err = 0; - int map_fd; - - expected_ids[0] = get_cgroup_id("/.."); /* root cgroup */ - expected_ids[1] = get_cgroup_id(""); - expected_ids[2] = get_cgroup_id(CGROUP_PATH); - expected_ids[3] = 0; /* non-existent cgroup */ - - map_fd = get_map_fd_by_prog_id(prog_id); - if (map_fd < 0) - goto err; - - for (level = 0; level < NUM_CGROUP_LEVELS; ++level) { - if (bpf_map_lookup_elem(map_fd, &level, &actual_ids[level])) { - log_err("Failed to lookup key %d", level); - goto err; - } - if (actual_ids[level] != expected_ids[level]) { - log_err("%llx (actual) != %llx (expected), level: %u\n", - actual_ids[level], expected_ids[level], level); - goto err; - } - } - - goto out; -err: - err = -1; -out: - if (map_fd >= 0) - close(map_fd); - return err; -} - -int main(int argc, char **argv) -{ - int cgfd = -1; - int err = 0; - - if (argc < 3) { - fprintf(stderr, "Usage: %s iface prog_id\n", argv[0]); - exit(EXIT_FAILURE); - } - - /* Use libbpf 1.0 API mode */ - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - - cgfd = cgroup_setup_and_join(CGROUP_PATH); - if (cgfd < 0) - goto err; - - if (send_packet(argv[1])) - goto err; - - if (check_ancestor_cgroup_ids(atoi(argv[2]))) - goto err; - - goto out; -err: - err = -1; -out: - close(cgfd); - cleanup_cgroup_environment(); - printf("[%s]\n", err ? "FAIL" : "PASS"); - return err; -} -- cgit v1.2.3 From ac01c8c4246546fd8340a232f3ada1921dc0ee48 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Thu, 15 Aug 2024 15:22:12 +0100 Subject: perf hist: Update hist symbol when updating maps AddressSanitizer found a use-after-free bug in the symbol code which manifested as 'perf top' segfaulting. ==1238389==ERROR: AddressSanitizer: heap-use-after-free on address 0x60b00c48844b at pc 0x5650d8035961 bp 0x7f751aaecc90 sp 0x7f751aaecc80 READ of size 1 at 0x60b00c48844b thread T193 #0 0x5650d8035960 in _sort__sym_cmp util/sort.c:310 #1 0x5650d8043744 in hist_entry__cmp util/hist.c:1286 #2 0x5650d8043951 in hists__findnew_entry util/hist.c:614 #3 0x5650d804568f in __hists__add_entry util/hist.c:754 #4 0x5650d8045bf9 in hists__add_entry util/hist.c:772 #5 0x5650d8045df1 in iter_add_single_normal_entry util/hist.c:997 #6 0x5650d8043326 in hist_entry_iter__add util/hist.c:1242 #7 0x5650d7ceeefe in perf_event__process_sample /home/matt/src/linux/tools/perf/builtin-top.c:845 #8 0x5650d7ceeefe in deliver_event /home/matt/src/linux/tools/perf/builtin-top.c:1208 #9 0x5650d7fdb51b in do_flush util/ordered-events.c:245 #10 0x5650d7fdb51b in __ordered_events__flush util/ordered-events.c:324 #11 0x5650d7ced743 in process_thread /home/matt/src/linux/tools/perf/builtin-top.c:1120 #12 0x7f757ef1f133 in start_thread nptl/pthread_create.c:442 #13 0x7f757ef9f7db in clone3 ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81 When updating hist maps it's also necessary to update the hist symbol reference because the old one gets freed in map__put(). While this bug was probably introduced with 5c24b67aae72f54c ("perf tools: Replace map->referenced & maps->removed_maps with map->refcnt"), the symbol objects were leaked until c087e9480cf33672 ("perf machine: Fix refcount usage when processing PERF_RECORD_KSYMBOL") was merged so the bug was masked. 
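For illustration, the hazard reduces to a toy refcount model (hypothetical types and helpers, not perf's actual API): a borrowed symbol pointer has to be re-resolved against the incoming map before the last reference to the old map is dropped.

	#include <stdlib.h>

	struct sym { unsigned long start; };

	struct map {
		int refcnt;
		struct sym *syms;	/* owned; freed together with the map */
		unsigned long nr_syms;
	};

	static struct map *map_get(struct map *m) { m->refcnt++; return m; }

	static void map_put(struct map *m)
	{
		if (m && --m->refcnt == 0) {
			free(m->syms);	/* borrowed sym pointers die here */
			free(m);
		}
	}

	static struct sym *map_find_sym(struct map *m, unsigned long addr)
	{
		unsigned long i;

		for (i = 0; i < m->nr_syms; i++)
			if (m->syms[i].start == addr)
				return &m->syms[i];
		return NULL;
	}

	struct ms { struct map *map; struct sym *sym; };

	/* Swap ms->map for new_map without leaving ms->sym dangling. */
	void ms_update_map(struct ms *ms, struct map *new_map)
	{
		if (ms->sym)	/* re-resolve against the new map first ... */
			ms->sym = map_find_sym(new_map, ms->sym->start);
		map_put(ms->map);	/* ... only then drop the old reference */
		ms->map = map_get(new_map);
	}

Dropping the old reference first, as the code did before this fix, leaves the borrowed symbol pointer pointing into freed memory; that is exactly the read _sort__sym_cmp() trips over in the ASan report above.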
Fixes: c087e9480cf33672 ("perf machine: Fix refcount usage when processing PERF_RECORD_KSYMBOL") Reported-by: Yunzhao Li Signed-off-by: Matt Fleming (Cloudflare) Cc: Ian Rogers Cc: kernel-team@cloudflare.com Cc: Namhyung Kim Cc: Riccardo Mancini Cc: stable@vger.kernel.org # v5.13+ Link: https://lore.kernel.org/r/20240815142212.3834625-1-matt@readmodwrite.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/hist.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 0f554febf9a1..0f9ce2ee2c31 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -639,6 +639,11 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists, * the history counter to increment. */ if (he->ms.map != entry->ms.map) { + if (he->ms.sym) { + u64 addr = he->ms.sym->start; + he->ms.sym = map__find_symbol(entry->ms.map, addr); + } + map__put(he->ms.map); he->ms.map = map__get(entry->ms.map); } -- cgit v1.2.3 From f52403b6bfea6994b017c58367aee9acdb9d9fd4 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 14 Aug 2024 22:32:49 -0700 Subject: selftests/bpf: Add traffic monitor functions. Add functions that capture packets and print log in the background. They are supposed to be used for debugging flaky network test cases. A monitored test case should call traffic_monitor_start() to start a thread to capture packets in the background for a given namespace and call traffic_monitor_stop() to stop capturing. (Or, option '-m' implemented by the later patches.) lo In IPv4 127.0.0.1:40265 > 127.0.0.1:55907: TCP, length 68, SYN lo In IPv4 127.0.0.1:55907 > 127.0.0.1:40265: TCP, length 60, SYN, ACK lo In IPv4 127.0.0.1:40265 > 127.0.0.1:55907: TCP, length 60, ACK lo In IPv4 127.0.0.1:55907 > 127.0.0.1:40265: TCP, length 52, ACK lo In IPv4 127.0.0.1:40265 > 127.0.0.1:55907: TCP, length 52, FIN, ACK lo In IPv4 127.0.0.1:55907 > 127.0.0.1:40265: TCP, length 52, RST, ACK Packet file: packets-2173-86-select_reuseport:sockhash_IPv4_TCP_LOOPBACK_test_detach_bpf-test.log #280/87 select_reuseport/sockhash IPv4/TCP LOOPBACK test_detach_bpf:OK The above is the output of an example. It shows the packets of a connection and the name of the file that contains captured packets in the directory /tmp/tmon_pcap. The file can be loaded by tcpdump or wireshark. This feature only works if libpcap is available. (Could be found by pkg-config) Acked-by: Stanislav Fomichev Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240815053254.470944-2-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/Makefile | 4 + tools/testing/selftests/bpf/network_helpers.c | 454 ++++++++++++++++++++++++++ tools/testing/selftests/bpf/network_helpers.h | 18 + 3 files changed, 476 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 163b11014158..4eceb491a8ae 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -41,6 +41,10 @@ CFLAGS += -g $(OPT_FLAGS) -rdynamic \ LDFLAGS += $(SAN_LDFLAGS) LDLIBS += $(LIBELF_LIBS) -lz -lrt -lpthread +LDLIBS += $(shell $(PKG_CONFIG) --libs libpcap 2>/dev/null) +CFLAGS += $(shell $(PKG_CONFIG) --cflags libpcap 2>/dev/null) +CFLAGS += $(shell $(PKG_CONFIG) --exists libpcap 2>/dev/null && echo "-DTRAFFIC_MONITOR=1") + # The following tests perform type punning and they may break strict # aliasing rules, which are exploited by both GCC and clang by default # while optimizing. 
This can lead to broken programs. diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index a3f0a49fb26f..e0b0a69f066e 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -11,17 +11,31 @@ #include #include #include +#include #include +#include #include #include #include #include +#include +#include +#include +#include + #include "bpf_util.h" #include "network_helpers.h" #include "test_progs.h" +#ifdef TRAFFIC_MONITOR +/* Prevent pcap.h from including pcap/bpf.h and causing conflicts */ +#define PCAP_DONT_INCLUDE_PCAP_BPF_H 1 +#include +#include +#endif + #ifndef IPPROTO_MPTCP #define IPPROTO_MPTCP 262 #endif @@ -660,3 +674,443 @@ int send_recv_data(int lfd, int fd, uint32_t total_bytes) return err; } + +#ifdef TRAFFIC_MONITOR +struct tmonitor_ctx { + pcap_t *pcap; + pcap_dumper_t *dumper; + pthread_t thread; + int wake_fd; + + volatile bool done; + char pkt_fname[PATH_MAX]; + int pcap_fd; +}; + +/* Is this packet captured with a Ethernet protocol type? */ +static bool is_ethernet(const u_char *packet) +{ + u16 arphdr_type; + + memcpy(&arphdr_type, packet + 8, 2); + arphdr_type = ntohs(arphdr_type); + + /* Except the following cases, the protocol type contains the + * Ethernet protocol type for the packet. + * + * https://www.tcpdump.org/linktypes/LINKTYPE_LINUX_SLL2.html + */ + switch (arphdr_type) { + case 770: /* ARPHRD_FRAD */ + case 778: /* ARPHDR_IPGRE */ + case 803: /* ARPHRD_IEEE80211_RADIOTAP */ + printf("Packet captured: arphdr_type=%d\n", arphdr_type); + return false; + } + return true; +} + +static const char * const pkt_types[] = { + "In", + "B", /* Broadcast */ + "M", /* Multicast */ + "C", /* Captured with the promiscuous mode */ + "Out", +}; + +static const char *pkt_type_str(u16 pkt_type) +{ + if (pkt_type < ARRAY_SIZE(pkt_types)) + return pkt_types[pkt_type]; + return "Unknown"; +} + +/* Show the information of the transport layer in the packet */ +static void show_transport(const u_char *packet, u16 len, u32 ifindex, + const char *src_addr, const char *dst_addr, + u16 proto, bool ipv6, u8 pkt_type) +{ + char *ifname, _ifname[IF_NAMESIZE]; + const char *transport_str; + u16 src_port, dst_port; + struct udphdr *udp; + struct tcphdr *tcp; + + ifname = if_indextoname(ifindex, _ifname); + if (!ifname) { + snprintf(_ifname, sizeof(_ifname), "unknown(%d)", ifindex); + ifname = _ifname; + } + + if (proto == IPPROTO_UDP) { + udp = (struct udphdr *)packet; + src_port = ntohs(udp->source); + dst_port = ntohs(udp->dest); + transport_str = "UDP"; + } else if (proto == IPPROTO_TCP) { + tcp = (struct tcphdr *)packet; + src_port = ntohs(tcp->source); + dst_port = ntohs(tcp->dest); + transport_str = "TCP"; + } else if (proto == IPPROTO_ICMP) { + printf("%-7s %-3s IPv4 %s > %s: ICMP, length %d, type %d, code %d\n", + ifname, pkt_type_str(pkt_type), src_addr, dst_addr, len, + packet[0], packet[1]); + return; + } else if (proto == IPPROTO_ICMPV6) { + printf("%-7s %-3s IPv6 %s > %s: ICMPv6, length %d, type %d, code %d\n", + ifname, pkt_type_str(pkt_type), src_addr, dst_addr, len, + packet[0], packet[1]); + return; + } else { + printf("%-7s %-3s %s %s > %s: protocol %d\n", + ifname, pkt_type_str(pkt_type), ipv6 ? 
"IPv6" : "IPv4", + src_addr, dst_addr, proto); + return; + } + + /* TCP or UDP*/ + + flockfile(stdout); + if (ipv6) + printf("%-7s %-3s IPv6 %s.%d > %s.%d: %s, length %d", + ifname, pkt_type_str(pkt_type), src_addr, src_port, + dst_addr, dst_port, transport_str, len); + else + printf("%-7s %-3s IPv4 %s:%d > %s:%d: %s, length %d", + ifname, pkt_type_str(pkt_type), src_addr, src_port, + dst_addr, dst_port, transport_str, len); + + if (proto == IPPROTO_TCP) { + if (tcp->fin) + printf(", FIN"); + if (tcp->syn) + printf(", SYN"); + if (tcp->rst) + printf(", RST"); + if (tcp->ack) + printf(", ACK"); + } + + printf("\n"); + funlockfile(stdout); +} + +static void show_ipv6_packet(const u_char *packet, u32 ifindex, u8 pkt_type) +{ + char src_buf[INET6_ADDRSTRLEN], dst_buf[INET6_ADDRSTRLEN]; + struct ipv6hdr *pkt = (struct ipv6hdr *)packet; + const char *src, *dst; + u_char proto; + + src = inet_ntop(AF_INET6, &pkt->saddr, src_buf, sizeof(src_buf)); + if (!src) + src = ""; + dst = inet_ntop(AF_INET6, &pkt->daddr, dst_buf, sizeof(dst_buf)); + if (!dst) + dst = ""; + proto = pkt->nexthdr; + show_transport(packet + sizeof(struct ipv6hdr), + ntohs(pkt->payload_len), + ifindex, src, dst, proto, true, pkt_type); +} + +static void show_ipv4_packet(const u_char *packet, u32 ifindex, u8 pkt_type) +{ + char src_buf[INET_ADDRSTRLEN], dst_buf[INET_ADDRSTRLEN]; + struct iphdr *pkt = (struct iphdr *)packet; + const char *src, *dst; + u_char proto; + + src = inet_ntop(AF_INET, &pkt->saddr, src_buf, sizeof(src_buf)); + if (!src) + src = ""; + dst = inet_ntop(AF_INET, &pkt->daddr, dst_buf, sizeof(dst_buf)); + if (!dst) + dst = ""; + proto = pkt->protocol; + show_transport(packet + sizeof(struct iphdr), + ntohs(pkt->tot_len), + ifindex, src, dst, proto, false, pkt_type); +} + +static void *traffic_monitor_thread(void *arg) +{ + char *ifname, _ifname[IF_NAMESIZE]; + const u_char *packet, *payload; + struct tmonitor_ctx *ctx = arg; + pcap_dumper_t *dumper = ctx->dumper; + int fd = ctx->pcap_fd, nfds, r; + int wake_fd = ctx->wake_fd; + struct pcap_pkthdr header; + pcap_t *pcap = ctx->pcap; + u32 ifindex; + fd_set fds; + u16 proto; + u8 ptype; + + nfds = (fd > wake_fd ? fd : wake_fd) + 1; + FD_ZERO(&fds); + + while (!ctx->done) { + FD_SET(fd, &fds); + FD_SET(wake_fd, &fds); + r = select(nfds, &fds, NULL, NULL, NULL); + if (!r) + continue; + if (r < 0) { + if (errno == EINTR) + continue; + log_err("Fail to select on pcap fd and wake fd"); + break; + } + + /* This instance of pcap is non-blocking */ + packet = pcap_next(pcap, &header); + if (!packet) + continue; + + /* According to the man page of pcap_dump(), first argument + * is the pcap_dumper_t pointer even it's argument type is + * u_char *. + */ + pcap_dump((u_char *)dumper, &header, packet); + + /* Not sure what other types of packets look like. Here, we + * parse only Ethernet and compatible packets. + */ + if (!is_ethernet(packet)) + continue; + + /* Skip SLL2 header + * https://www.tcpdump.org/linktypes/LINKTYPE_LINUX_SLL2.html + * + * Although the document doesn't mention that, the payload + * doesn't include the Ethernet header. The payload starts + * from the first byte of the network layer header. 
+ */ + payload = packet + 20; + + memcpy(&proto, packet, 2); + proto = ntohs(proto); + memcpy(&ifindex, packet + 4, 4); + ifindex = ntohl(ifindex); + ptype = packet[10]; + + if (proto == ETH_P_IPV6) { + show_ipv6_packet(payload, ifindex, ptype); + } else if (proto == ETH_P_IP) { + show_ipv4_packet(payload, ifindex, ptype); + } else { + ifname = if_indextoname(ifindex, _ifname); + if (!ifname) { + snprintf(_ifname, sizeof(_ifname), "unknown(%d)", ifindex); + ifname = _ifname; + } + + printf("%-7s %-3s Unknown network protocol type 0x%x\n", + ifname, pkt_type_str(ptype), proto); + } + } + + return NULL; +} + +/* Prepare the pcap handle to capture packets. + * + * This pcap is non-blocking and immediate mode is enabled to receive + * captured packets as soon as possible. The snaplen is set to 1024 bytes + * to limit the size of captured content. The format of the link-layer + * header is set to DLT_LINUX_SLL2 to enable handling various link-layer + * technologies. + */ +static pcap_t *traffic_monitor_prepare_pcap(void) +{ + char errbuf[PCAP_ERRBUF_SIZE]; + pcap_t *pcap; + int r; + + /* Listen on all NICs in the namespace */ + pcap = pcap_create("any", errbuf); + if (!pcap) { + log_err("Failed to open pcap: %s", errbuf); + return NULL; + } + /* Limit the size of the packet (first N bytes) */ + r = pcap_set_snaplen(pcap, 1024); + if (r) { + log_err("Failed to set snaplen: %s", pcap_geterr(pcap)); + goto error; + } + /* To receive packets as fast as possible */ + r = pcap_set_immediate_mode(pcap, 1); + if (r) { + log_err("Failed to set immediate mode: %s", pcap_geterr(pcap)); + goto error; + } + r = pcap_setnonblock(pcap, 1, errbuf); + if (r) { + log_err("Failed to set nonblock: %s", errbuf); + goto error; + } + r = pcap_activate(pcap); + if (r) { + log_err("Failed to activate pcap: %s", pcap_geterr(pcap)); + goto error; + } + /* Determine the format of the link-layer header */ + r = pcap_set_datalink(pcap, DLT_LINUX_SLL2); + if (r) { + log_err("Failed to set datalink: %s", pcap_geterr(pcap)); + goto error; + } + + return pcap; +error: + pcap_close(pcap); + return NULL; +} + +static void encode_test_name(char *buf, size_t len, const char *test_name, const char *subtest_name) +{ + char *p; + + if (subtest_name) + snprintf(buf, len, "%s__%s", test_name, subtest_name); + else + snprintf(buf, len, "%s", test_name); + while ((p = strchr(buf, '/'))) + *p = '_'; + while ((p = strchr(buf, ' '))) + *p = '_'; +} + +#define PCAP_DIR "/tmp/tmon_pcap" + +/* Start to monitor the network traffic in the given network namespace. + * + * netns: the name of the network namespace to monitor. If NULL, the + * current network namespace is monitored. + * test_name: the name of the running test. + * subtest_name: the name of the running subtest if there is. It should be + * NULL if it is not a subtest. + * + * This function will start a thread to capture packets going through NICs + * in the give network namespace. 
+ */ +struct tmonitor_ctx *traffic_monitor_start(const char *netns, const char *test_name, + const char *subtest_name) +{ + struct nstoken *nstoken = NULL; + struct tmonitor_ctx *ctx; + char test_name_buf[64]; + static int tmon_seq; + int r; + + if (netns) { + nstoken = open_netns(netns); + if (!nstoken) + return NULL; + } + ctx = malloc(sizeof(*ctx)); + if (!ctx) { + log_err("Failed to malloc ctx"); + goto fail_ctx; + } + memset(ctx, 0, sizeof(*ctx)); + + encode_test_name(test_name_buf, sizeof(test_name_buf), test_name, subtest_name); + snprintf(ctx->pkt_fname, sizeof(ctx->pkt_fname), + PCAP_DIR "/packets-%d-%d-%s-%s.log", getpid(), tmon_seq++, + test_name_buf, netns ? netns : "unknown"); + + r = mkdir(PCAP_DIR, 0755); + if (r && errno != EEXIST) { + log_err("Failed to create " PCAP_DIR); + goto fail_pcap; + } + + ctx->pcap = traffic_monitor_prepare_pcap(); + if (!ctx->pcap) + goto fail_pcap; + ctx->pcap_fd = pcap_get_selectable_fd(ctx->pcap); + if (ctx->pcap_fd < 0) { + log_err("Failed to get pcap fd"); + goto fail_dumper; + } + + /* Create a packet file */ + ctx->dumper = pcap_dump_open(ctx->pcap, ctx->pkt_fname); + if (!ctx->dumper) { + log_err("Failed to open pcap dump: %s", ctx->pkt_fname); + goto fail_dumper; + } + + /* Create an eventfd to wake up the monitor thread */ + ctx->wake_fd = eventfd(0, 0); + if (ctx->wake_fd < 0) { + log_err("Failed to create eventfd"); + goto fail_eventfd; + } + + r = pthread_create(&ctx->thread, NULL, traffic_monitor_thread, ctx); + if (r) { + log_err("Failed to create thread"); + goto fail; + } + + close_netns(nstoken); + + return ctx; + +fail: + close(ctx->wake_fd); + +fail_eventfd: + pcap_dump_close(ctx->dumper); + unlink(ctx->pkt_fname); + +fail_dumper: + pcap_close(ctx->pcap); + +fail_pcap: + free(ctx); + +fail_ctx: + close_netns(nstoken); + + return NULL; +} + +static void traffic_monitor_release(struct tmonitor_ctx *ctx) +{ + pcap_close(ctx->pcap); + pcap_dump_close(ctx->dumper); + + close(ctx->wake_fd); + + free(ctx); +} + +/* Stop the network traffic monitor. + * + * ctx: the context returned by traffic_monitor_start() + */ +void traffic_monitor_stop(struct tmonitor_ctx *ctx) +{ + __u64 w = 1; + + if (!ctx) + return; + + /* Stop the monitor thread */ + ctx->done = true; + /* Wake up the background thread. */ + write(ctx->wake_fd, &w, sizeof(w)); + pthread_join(ctx->thread, NULL); + + printf("Packet file: %s\n", strrchr(ctx->pkt_fname, '/') + 1); + + traffic_monitor_release(ctx); +} +#endif /* TRAFFIC_MONITOR */ diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index cce56955371f..0d032ae706c6 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -136,4 +136,22 @@ static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr, return csum_fold((__u32)s); } +struct tmonitor_ctx; + +#ifdef TRAFFIC_MONITOR +struct tmonitor_ctx *traffic_monitor_start(const char *netns, const char *test_name, + const char *subtest_name); +void traffic_monitor_stop(struct tmonitor_ctx *ctx); +#else +static inline struct tmonitor_ctx *traffic_monitor_start(const char *netns, const char *test_name, + const char *subtest_name) +{ + return NULL; +} + +static inline void traffic_monitor_stop(struct tmonitor_ctx *ctx) +{ +} +#endif + #endif -- cgit v1.2.3 From f5281aacec856e7e0beb643d74122595fc1fb4be Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 14 Aug 2024 22:32:50 -0700 Subject: selftests/bpf: Add the traffic monitor option to test_progs. 
Add option '-m' to test_progs to accept names and patterns of test cases. This option will be used later to enable traffic monitor that capture network packets generated by test cases. Acked-by: Stanislav Fomichev Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240815053254.470944-3-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/test_progs.c | 86 +++++++++++++++++++++++--------- tools/testing/selftests/bpf/test_progs.h | 2 + 2 files changed, 64 insertions(+), 24 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 60fafa2f1ed7..f8ed1a16a884 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -155,6 +155,7 @@ struct prog_test_def { void (*run_serial_test)(void); bool should_run; bool need_cgroup_cleanup; + bool should_tmon; }; /* Override C runtime library's usleep() implementation to ensure nanosleep() @@ -192,46 +193,59 @@ static bool should_run(struct test_selector *sel, int num, const char *name) return num < sel->num_set_len && sel->num_set[num]; } -static bool should_run_subtest(struct test_selector *sel, - struct test_selector *subtest_sel, - int subtest_num, - const char *test_name, - const char *subtest_name) +static bool match_subtest(struct test_filter_set *filter, + const char *test_name, + const char *subtest_name) { int i, j; - for (i = 0; i < sel->blacklist.cnt; i++) { - if (glob_match(test_name, sel->blacklist.tests[i].name)) { - if (!sel->blacklist.tests[i].subtest_cnt) - return false; - - for (j = 0; j < sel->blacklist.tests[i].subtest_cnt; j++) { - if (glob_match(subtest_name, - sel->blacklist.tests[i].subtests[j])) - return false; - } - } - } - - for (i = 0; i < sel->whitelist.cnt; i++) { - if (glob_match(test_name, sel->whitelist.tests[i].name)) { - if (!sel->whitelist.tests[i].subtest_cnt) + for (i = 0; i < filter->cnt; i++) { + if (glob_match(test_name, filter->tests[i].name)) { + if (!filter->tests[i].subtest_cnt) return true; - for (j = 0; j < sel->whitelist.tests[i].subtest_cnt; j++) { + for (j = 0; j < filter->tests[i].subtest_cnt; j++) { if (glob_match(subtest_name, - sel->whitelist.tests[i].subtests[j])) + filter->tests[i].subtests[j])) return true; } } } + return false; +} + +static bool should_run_subtest(struct test_selector *sel, + struct test_selector *subtest_sel, + int subtest_num, + const char *test_name, + const char *subtest_name) +{ + if (match_subtest(&sel->blacklist, test_name, subtest_name)) + return false; + + if (match_subtest(&sel->whitelist, test_name, subtest_name)) + return true; + if (!sel->whitelist.cnt && !subtest_sel->num_set) return true; return subtest_num < subtest_sel->num_set_len && subtest_sel->num_set[subtest_num]; } +static bool should_tmon(struct test_selector *sel, const char *name) +{ + int i; + + for (i = 0; i < sel->whitelist.cnt; i++) { + if (glob_match(name, sel->whitelist.tests[i].name) && + !sel->whitelist.tests[i].subtest_cnt) + return true; + } + + return false; +} + static char *test_result(bool failed, bool skipped) { return failed ? "FAIL" : (skipped ? 
"SKIP" : "OK"); @@ -488,6 +502,10 @@ bool test__start_subtest(const char *subtest_name) return false; } + subtest_state->should_tmon = match_subtest(&env.tmon_selector.whitelist, + test->test_name, + subtest_name); + env.subtest_state = subtest_state; stdio_hijack_init(&subtest_state->log_buf, &subtest_state->log_cnt); @@ -667,7 +685,8 @@ enum ARG_KEYS { ARG_TEST_NAME_GLOB_DENYLIST = 'd', ARG_NUM_WORKERS = 'j', ARG_DEBUG = -1, - ARG_JSON_SUMMARY = 'J' + ARG_JSON_SUMMARY = 'J', + ARG_TRAFFIC_MONITOR = 'm', }; static const struct argp_option opts[] = { @@ -694,6 +713,10 @@ static const struct argp_option opts[] = { { "debug", ARG_DEBUG, NULL, 0, "print extra debug information for test_progs." }, { "json-summary", ARG_JSON_SUMMARY, "FILE", 0, "Write report in json format to this file."}, +#ifdef TRAFFIC_MONITOR + { "traffic-monitor", ARG_TRAFFIC_MONITOR, "NAMES", 0, + "Monitor network traffic of tests with name matching the pattern (supports '*' wildcard)." }, +#endif {}, }; @@ -905,6 +928,18 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) break; case ARGP_KEY_END: break; +#ifdef TRAFFIC_MONITOR + case ARG_TRAFFIC_MONITOR: + if (arg[0] == '@') + err = parse_test_list_file(arg + 1, + &env->tmon_selector.whitelist, + true); + else + err = parse_test_list(arg, + &env->tmon_selector.whitelist, + true); + break; +#endif default: return ARGP_ERR_UNKNOWN; } @@ -1736,6 +1771,8 @@ int main(int argc, char **argv) test->test_num, test->test_name, test->test_name, test->test_name); exit(EXIT_ERR_SETUP_INFRA); } + if (test->should_run) + test->should_tmon = should_tmon(&env.tmon_selector, test->test_name); } /* ignore workers if we are just listing */ @@ -1820,6 +1857,7 @@ out: free_test_selector(&env.test_selector); free_test_selector(&env.subtest_selector); + free_test_selector(&env.tmon_selector); free_test_states(); if (env.succ_cnt + env.fail_cnt + env.skip_cnt == 0) diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index cb9d6d46826b..966011eb7ec8 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -74,6 +74,7 @@ struct subtest_state { int error_cnt; bool skipped; bool filtered; + bool should_tmon; FILE *stdout_saved; }; @@ -98,6 +99,7 @@ struct test_state { struct test_env { struct test_selector test_selector; struct test_selector subtest_selector; + struct test_selector tmon_selector; bool verifier_stats; bool debug; enum verbosity verbosity; -- cgit v1.2.3 From 1e115a58be0ffca63727dc0495dae924a19f8cd4 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 14 Aug 2024 22:32:51 -0700 Subject: selftests/bpf: netns_new() and netns_free() helpers. netns_new()/netns_free() create/delete network namespaces. They support the option '-m' of test_progs to start/stop traffic monitor for the network namespace being created for matched tests. 
Acked-by: Stanislav Fomichev Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240815053254.470944-4-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/network_helpers.c | 46 ++++++++++++++ tools/testing/selftests/bpf/network_helpers.h | 2 + tools/testing/selftests/bpf/test_progs.c | 88 +++++++++++++++++++++++++++ tools/testing/selftests/bpf/test_progs.h | 4 ++ 4 files changed, 140 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index e0b0a69f066e..27784946b01b 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -446,6 +446,52 @@ char *ping_command(int family) return "ping"; } +int remove_netns(const char *name) +{ + char *cmd; + int r; + + r = asprintf(&cmd, "ip netns del %s >/dev/null 2>&1", name); + if (r < 0) { + log_err("Failed to malloc cmd"); + return -1; + } + + r = system(cmd); + free(cmd); + return r; +} + +int make_netns(const char *name) +{ + char *cmd; + int r; + + r = asprintf(&cmd, "ip netns add %s", name); + if (r < 0) { + log_err("Failed to malloc cmd"); + return -1; + } + + r = system(cmd); + free(cmd); + + if (r) + return r; + + r = asprintf(&cmd, "ip -n %s link set lo up", name); + if (r < 0) { + log_err("Failed to malloc cmd for setting up lo"); + remove_netns(name); + return -1; + } + + r = system(cmd); + free(cmd); + + return r; +} + struct nstoken { int orig_netns_fd; }; diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index 0d032ae706c6..c72c16e1aff8 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -93,6 +93,8 @@ struct nstoken; struct nstoken *open_netns(const char *name); void close_netns(struct nstoken *token); int send_recv_data(int lfd, int fd, uint32_t total_bytes); +int make_netns(const char *name); +int remove_netns(const char *name); static __u16 csum_fold(__u32 csum) { diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index f8ed1a16a884..f45b06791444 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -18,6 +18,8 @@ #include #include "json_writer.h" +#include "network_helpers.h" + #ifdef __GLIBC__ #include /* backtrace */ #endif @@ -642,6 +644,92 @@ out: return err; } +struct netns_obj { + char *nsname; + struct tmonitor_ctx *tmon; + struct nstoken *nstoken; +}; + +/* Create a new network namespace with the given name. + * + * Create a new network namespace and set the network namespace of the + * current process to the new network namespace if the argument "open" is + * true. This function should be paired with netns_free() to release the + * resource and delete the network namespace. + * + * It also implements the functionality of the option "-m" by starting + * traffic monitor on the background to capture the packets in this network + * namespace if the current test or subtest matching the pattern. + * + * nsname: the name of the network namespace to create. + * open: open the network namespace if true. + * + * Return: the network namespace object on success, NULL on failure. 
+ */ +struct netns_obj *netns_new(const char *nsname, bool open) +{ + struct netns_obj *netns_obj = malloc(sizeof(*netns_obj)); + const char *test_name, *subtest_name; + int r; + + if (!netns_obj) + return NULL; + memset(netns_obj, 0, sizeof(*netns_obj)); + + netns_obj->nsname = strdup(nsname); + if (!netns_obj->nsname) + goto fail; + + /* Create the network namespace */ + r = make_netns(nsname); + if (r) + goto fail; + + /* Start traffic monitor */ + if (env.test->should_tmon || + (env.subtest_state && env.subtest_state->should_tmon)) { + test_name = env.test->test_name; + subtest_name = env.subtest_state ? env.subtest_state->name : NULL; + netns_obj->tmon = traffic_monitor_start(nsname, test_name, subtest_name); + if (!netns_obj->tmon) { + fprintf(stderr, "Failed to start traffic monitor for %s\n", nsname); + goto fail; + } + } else { + netns_obj->tmon = NULL; + } + + if (open) { + netns_obj->nstoken = open_netns(nsname); + if (!netns_obj->nstoken) + goto fail; + } + + return netns_obj; +fail: + traffic_monitor_stop(netns_obj->tmon); + remove_netns(nsname); + free(netns_obj->nsname); + free(netns_obj); + return NULL; +} + +/* Delete the network namespace. + * + * This function should be paired with netns_new() to delete the namespace + * created by netns_new(). + */ +void netns_free(struct netns_obj *netns_obj) +{ + if (!netns_obj) + return; + traffic_monitor_stop(netns_obj->tmon); + close_netns(netns_obj->nstoken); + remove_netns(netns_obj->nsname); + free(netns_obj->nsname); + free(netns_obj); +} + /* extern declarations for test funcs */ #define DEFINE_TEST(name) \ extern void test_##name(void) __weak; \ diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index 966011eb7ec8..3ad131de14c6 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -430,6 +430,10 @@ int write_sysctl(const char *sysctl, const char *value); int get_bpf_max_tramp_links_from(struct btf *btf); int get_bpf_max_tramp_links(void); +struct netns_obj; +struct netns_obj *netns_new(const char *name, bool open); +void netns_free(struct netns_obj *netns); + #ifdef __x86_64__ #define SYS_NANOSLEEP_KPROBE_NAME "__x64_sys_nanosleep" #elif defined(__s390x__) -- cgit v1.2.3 From 52a5b8a30fa882c4d363f7679b7db4a093550ac1 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 14 Aug 2024 22:32:52 -0700 Subject: selftests/bpf: Monitor traffic for tc_redirect. Enable traffic monitoring for the test case tc_redirect. 
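As a sketch of the pattern these conversions follow (the test and namespace names below are invented; netns_new()/netns_free() and ASSERT_OK_PTR() are the real helpers), a converted test reads roughly:

	#include "test_progs.h"
	#include "network_helpers.h"

	void test_my_feature(void)
	{
		struct netns_obj *netns;

		/* Creates and enters the netns; when test_progs was run
		 * with a matching pattern, e.g. -m 'my_feature*', this
		 * also starts the background traffic monitor for it.
		 */
		netns = netns_new("my_feature_ns", true);
		if (!ASSERT_OK_PTR(netns, "netns_new"))
			return;

		/* ... exercise the networking code under test ... */

		/* Stops the monitor (if any), leaves and deletes the netns. */
		netns_free(netns);
	}

tc_redirect manages several namespaces at once and does not enter them at creation time, so it passes open=false and keeps the returned objects in an array, but the pairing discipline is the same.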
Acked-by: Stanislav Fomichev Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240815053254.470944-5-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- .../testing/selftests/bpf/prog_tests/tc_redirect.c | 29 ++++++++++++++++------ 1 file changed, 22 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c index 53b8ffc943dc..974f9d6269c9 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c @@ -68,6 +68,7 @@ __FILE__, __LINE__, strerror(errno), ##__VA_ARGS__) static const char * const namespaces[] = {NS_SRC, NS_FWD, NS_DST, NULL}; +static struct netns_obj *netns_objs[3]; static int write_file(const char *path, const char *newval) { @@ -87,27 +88,41 @@ static int write_file(const char *path, const char *newval) static int netns_setup_namespaces(const char *verb) { + struct netns_obj **ns_obj = netns_objs; const char * const *ns = namespaces; - char cmd[128]; while (*ns) { - snprintf(cmd, sizeof(cmd), "ip netns %s %s", verb, *ns); - if (!ASSERT_OK(system(cmd), cmd)) - return -1; + if (strcmp(verb, "add") == 0) { + *ns_obj = netns_new(*ns, false); + if (!ASSERT_OK_PTR(*ns_obj, "netns_new")) + return -1; + } else { + if (!ASSERT_OK_PTR(*ns_obj, "netns_obj is NULL")) + return -1; + netns_free(*ns_obj); + *ns_obj = NULL; + } ns++; + ns_obj++; } return 0; } static void netns_setup_namespaces_nofail(const char *verb) { + struct netns_obj **ns_obj = netns_objs; const char * const *ns = namespaces; - char cmd[128]; while (*ns) { - snprintf(cmd, sizeof(cmd), "ip netns %s %s > /dev/null 2>&1", verb, *ns); - system(cmd); + if (strcmp(verb, "add") == 0) { + *ns_obj = netns_new(*ns, false); + } else { + if (*ns_obj) + netns_free(*ns_obj); + *ns_obj = NULL; + } ns++; + ns_obj++; } } -- cgit v1.2.3 From b407b52b18505a7c09a77cbab022ec6cf82dc5f0 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 14 Aug 2024 22:32:53 -0700 Subject: selftests/bpf: Monitor traffic for sockmap_listen. Enable traffic monitor for each subtest of sockmap_listen. 
Acked-by: Stanislav Fomichev Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240815053254.470944-6-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/prog_tests/sockmap_listen.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index 9ce0e0e0b7da..ebb5f6336052 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -1925,6 +1925,7 @@ static void test_udp_unix_redir(struct test_sockmap_listen *skel, struct bpf_map int family) { const char *family_name, *map_name; + struct netns_obj *netns; char s[MAX_TEST_NAME]; family_name = family_str(family); @@ -1932,8 +1933,15 @@ static void test_udp_unix_redir(struct test_sockmap_listen *skel, struct bpf_map snprintf(s, sizeof(s), "%s %s %s", map_name, family_name, __func__); if (!test__start_subtest(s)) return; + + netns = netns_new("sockmap_listen", true); + if (!ASSERT_OK_PTR(netns, "netns_new")) + return; + inet_unix_skb_redir_to_connected(skel, map, family); unix_inet_skb_redir_to_connected(skel, map, family); + + netns_free(netns); } static void run_tests(struct test_sockmap_listen *skel, struct bpf_map *map, -- cgit v1.2.3 From 69354085975ac61632ebceedebbeeeb0272620b8 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 14 Aug 2024 22:32:54 -0700 Subject: selftests/bpf: Monitor traffic for select_reuseport. Enable traffic monitoring for the subtests of select_reuseport. Acked-by: Stanislav Fomichev Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240815053254.470944-7-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/prog_tests/select_reuseport.c | 37 ++++++++-------------- 1 file changed, 13 insertions(+), 24 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c index 64c5f5eb2994..036d4760d2c1 100644 --- a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c +++ b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c @@ -37,9 +37,7 @@ static int sk_fds[REUSEPORT_ARRAY_SIZE]; static int reuseport_array = -1, outer_map = -1; static enum bpf_map_type inner_map_type; static int select_by_skb_data_prog; -static int saved_tcp_syncookie = -1; static struct bpf_object *obj; -static int saved_tcp_fo = -1; static __u32 index_zero; static int epfd; @@ -193,14 +191,6 @@ static int write_int_sysctl(const char *sysctl, int v) return 0; } -static void restore_sysctls(void) -{ - if (saved_tcp_fo != -1) - write_int_sysctl(TCP_FO_SYSCTL, saved_tcp_fo); - if (saved_tcp_syncookie != -1) - write_int_sysctl(TCP_SYNCOOKIE_SYSCTL, saved_tcp_syncookie); -} - static int enable_fastopen(void) { int fo; @@ -793,6 +783,7 @@ static void test_config(int sotype, sa_family_t family, bool inany) TEST_INIT(test_pass_on_err), TEST_INIT(test_detach_bpf), }; + struct netns_obj *netns; char s[MAX_TEST_NAME]; const struct test *t; @@ -808,9 +799,21 @@ static void test_config(int sotype, sa_family_t family, bool inany) if (!test__start_subtest(s)) continue; + netns = netns_new("select_reuseport", true); + if (!ASSERT_OK_PTR(netns, "netns_new")) + continue; + + if (CHECK_FAIL(enable_fastopen())) + goto out; + if (CHECK_FAIL(disable_syncookie())) + goto out; + setup_per_test(sotype, family, inany, t->no_inner_map); t->fn(sotype, family); 
cleanup_per_test(t->no_inner_map); + +out: + netns_free(netns); } } @@ -850,21 +853,7 @@ out: void serial_test_select_reuseport(void) { - saved_tcp_fo = read_int_sysctl(TCP_FO_SYSCTL); - if (saved_tcp_fo < 0) - goto out; - saved_tcp_syncookie = read_int_sysctl(TCP_SYNCOOKIE_SYSCTL); - if (saved_tcp_syncookie < 0) - goto out; - - if (enable_fastopen()) - goto out; - if (disable_syncookie()) - goto out; - test_map_type(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY); test_map_type(BPF_MAP_TYPE_SOCKMAP); test_map_type(BPF_MAP_TYPE_SOCKHASH); -out: - restore_sysctls(); } -- cgit v1.2.3 From b53f20b323ee3c5daa5494ad69e3ae0e77cfd82a Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 15 Aug 2024 09:20:46 +0200 Subject: tools build: Provide consistent build options for fixdep The fixdep binary is being compiled and linked in one step. While the host linker flags are passed to the compiler the host compiler flags are missed. That leads to build errors at least on x86_64, arm64 and s390 as result of the compiler vs linker flags inconsistency. For example, during RPM package build redhat-hardened-ld script is provided to gcc, while redhat-hardened-cc1 script is missed. Provide both KBUILD_HOSTCFLAGS and KBUILD_HOSTLDFLAGS to avoid that. Fixes: ea974028a049f2ce ("tools build: Avoid circular .fixdep-in.o.cmd issues") Closes: https://lore.kernel.org/lkml/99ae0d34-ed76-4ca0-a9fd-c337da33c9f9@leemhuis.info/ Reported-by: Thorsten Leemhuis Reviewed-by: Brian Norris Signed-off-by: Alexander Gordeev Tested-by: Thorsten Leemhuis Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20240815072046.1002837-1-agordeev@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/build/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/build/Makefile b/tools/build/Makefile index fea3cf647f5b..18ad131f6ea7 100644 --- a/tools/build/Makefile +++ b/tools/build/Makefile @@ -44,4 +44,4 @@ ifneq ($(wildcard $(TMP_O)),) endif $(OUTPUT)fixdep: $(srctree)/tools/build/fixdep.c - $(QUIET_CC)$(HOSTCC) $(KBUILD_HOSTLDFLAGS) -o $@ $< + $(QUIET_CC)$(HOSTCC) $(KBUILD_HOSTCFLAGS) $(KBUILD_HOSTLDFLAGS) -o $@ $< -- cgit v1.2.3 From fab45b962749184e1a1a57c7c583782b78fad539 Mon Sep 17 00:00:00 2001 From: Sam James Date: Tue, 13 Aug 2024 20:49:06 +0100 Subject: libbpf: Workaround -Wmaybe-uninitialized false positive MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In `elf_close`, we get this with GCC 15 -O3 (at least): ``` In function ‘elf_close’, inlined from ‘elf_close’ at elf.c:53:6, inlined from ‘elf_find_func_offset_from_file’ at elf.c:384:2: elf.c:57:9: warning: ‘elf_fd.elf’ may be used uninitialized [-Wmaybe-uninitialized] 57 | elf_end(elf_fd->elf); | ^~~~~~~~~~~~~~~~~~~~ elf.c: In function ‘elf_find_func_offset_from_file’: elf.c:377:23: note: ‘elf_fd.elf’ was declared here 377 | struct elf_fd elf_fd; | ^~~~~~ In function ‘elf_close’, inlined from ‘elf_close’ at elf.c:53:6, inlined from ‘elf_find_func_offset_from_file’ at elf.c:384:2: elf.c:58:9: warning: ‘elf_fd.fd’ may be used uninitialized [-Wmaybe-uninitialized] 58 | close(elf_fd->fd); | ^~~~~~~~~~~~~~~~~ elf.c: In function ‘elf_find_func_offset_from_file’: elf.c:377:23: note: ‘elf_fd.fd’ was declared here 377 | struct elf_fd elf_fd; | ^~~~~~ ``` In reality, our use is fine, it's just that GCC doesn't model errno here (see linked GCC bug). Suppress -Wmaybe-uninitialized accordingly by initializing elf_fd.fd to -1 and elf_fd.elf to NULL. 
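Reduced to a hedged sketch (helper names invented; struct elf_fd simplified from libbpf's), the shape of the false positive is:

	#include <fcntl.h>
	#include <unistd.h>

	struct elf_fd { void *elf; int fd; };

	/* Fills *out; reports failure via a non-zero return. */
	static int open_it(const char *path, struct elf_fd *out)
	{
		int fd;

		out->elf = NULL;	/* the workaround: unconditionally */
		out->fd = -1;		/* define *out on every path */

		fd = open(path, O_RDONLY);
		if (fd < 0)
			return -1;	/* errno-dependent error path */
		out->fd = fd;
		return 0;
	}

	static void close_it(struct elf_fd *efd)
	{
		if (efd->fd >= 0)
			close(efd->fd);	/* the use GCC flagged */
	}

	long find_offset(const char *path)
	{
		struct elf_fd efd;

		if (open_it(path, &efd))
			return -1;
		/* Reached only after open_it() filled efd, but after
		 * inlining GCC does not model the errno-dependent error
		 * paths well enough to prove that, hence the warning
		 * absent the unconditional init above.
		 */
		close_it(&efd);
		return 0;
	}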
I've done this in two other functions as well given it could easily occur there too (same access/use pattern). Signed-off-by: Sam James Signed-off-by: Andrii Nakryiko Link: https://gcc.gnu.org/PR114952 Link: https://lore.kernel.org/bpf/14ec488a1cac02794c2fa2b83ae0cef1bce2cb36.1723578546.git.sam@gentoo.org --- tools/lib/bpf/elf.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/elf.c b/tools/lib/bpf/elf.c index c92e02394159..b5ab1cb13e5e 100644 --- a/tools/lib/bpf/elf.c +++ b/tools/lib/bpf/elf.c @@ -28,6 +28,9 @@ int elf_open(const char *binary_path, struct elf_fd *elf_fd) int fd, ret; Elf *elf; + elf_fd->elf = NULL; + elf_fd->fd = -1; + if (elf_version(EV_CURRENT) == EV_NONE) { pr_warn("elf: failed to init libelf for %s\n", binary_path); return -LIBBPF_ERRNO__LIBELF; -- cgit v1.2.3 From e7d731326ef0622f103e5ed47d3405f71cdcd7f6 Mon Sep 17 00:00:00 2001 From: Abhash Jha Date: Wed, 14 Aug 2024 23:01:21 +0530 Subject: selftests/net/pmtu.sh: Fix typo in error message The word 'expected' was spelled as 'exepcted'. Fixed the typo in this patch. Signed-off-by: Abhash Jha Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240814173121.33590-1-abhashkumarjha123@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/pmtu.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh index 5175c0c83a23..24a50622406c 100755 --- a/tools/testing/selftests/net/pmtu.sh +++ b/tools/testing/selftests/net/pmtu.sh @@ -1447,7 +1447,7 @@ test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception() { size=$(du -sb $tmpoutfile) size=${size%%/tmp/*} - [ $size -ne 1048576 ] && err "File size $size mismatches exepcted value in locally bridged vxlan test" && return 1 + [ $size -ne 1048576 ] && err "File size $size mismatches expected value in locally bridged vxlan test" && return 1 done rm -f "$tmpoutfile" -- cgit v1.2.3 From 8d019b15ddd55d6dc5685b1f51902c4aa8e01939 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 15 Aug 2024 03:06:54 +0300 Subject: selftests: net: local_termination: refactor macvlan creation/deletion This will be used in other subtests as well; make new macvlan_create() and macvlan_destroy() functions. Signed-off-by: Vladimir Oltean Signed-off-by: David S. 
Miller --- .../selftests/net/forwarding/local_termination.sh | 30 +++++++++++++--------- 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh index 4b364cdf3ef0..36f3d577d0be 100755 --- a/tools/testing/selftests/net/forwarding/local_termination.sh +++ b/tools/testing/selftests/net/forwarding/local_termination.sh @@ -247,19 +247,29 @@ bridge_destroy() ip link del br0 } -standalone() +macvlan_create() { - h1_create - h2_create + local lower=$1 - ip link add link $h2 name macvlan0 type macvlan mode private + ip link add link $lower name macvlan0 type macvlan mode private ip link set macvlan0 address $MACVLAN_ADDR ip link set macvlan0 up +} - run_test $h2 - +macvlan_destroy() +{ ip link del macvlan0 +} + +standalone() +{ + h1_create + h2_create + macvlan_create $h2 + + run_test $h2 + macvlan_destroy h2_destroy h1_destroy } @@ -268,15 +278,11 @@ bridge() { h1_create bridge_create - - ip link add link br0 name macvlan0 type macvlan mode private - ip link set macvlan0 address $MACVLAN_ADDR - ip link set macvlan0 up + macvlan_create br0 run_test br0 - ip link del macvlan0 - + macvlan_destroy bridge_destroy h1_destroy } -- cgit v1.2.3 From 4261fa35185c0112acca0496d3732c8fcfe1dcf2 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 15 Aug 2024 03:06:55 +0300 Subject: selftests: net: local_termination: parameterize sending interface In future changes we will want to subject the DUT, $h2, to additional VLAN-tagged traffic. For that, we need to run the tests using $h1.100 as a sending interface, rather than the currently hardcoded $h1. Add a parameter to run_test() and modify its 2 callers to explicitly pass $h1, as was implicit before. Signed-off-by: Vladimir Oltean Signed-off-by: David S. 
Miller --- .../selftests/net/forwarding/local_termination.sh | 39 +++++++++++----------- 1 file changed, 20 insertions(+), 19 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh index 36f3d577d0be..92f0e242d119 100755 --- a/tools/testing/selftests/net/forwarding/local_termination.sh +++ b/tools/testing/selftests/net/forwarding/local_termination.sh @@ -104,44 +104,45 @@ mc_route_destroy() run_test() { - local rcv_if_name=$1 - local smac=$(mac_get $h1) + local send_if_name=$1; shift + local rcv_if_name=$1; shift + local smac=$(mac_get $send_if_name) local rcv_dmac=$(mac_get $rcv_if_name) tcpdump_start $rcv_if_name - mc_route_prepare $h1 + mc_route_prepare $send_if_name mc_route_prepare $rcv_if_name - send_uc_ipv4 $h1 $rcv_dmac - send_uc_ipv4 $h1 $MACVLAN_ADDR - send_uc_ipv4 $h1 $UNKNOWN_UC_ADDR1 + send_uc_ipv4 $send_if_name $rcv_dmac + send_uc_ipv4 $send_if_name $MACVLAN_ADDR + send_uc_ipv4 $send_if_name $UNKNOWN_UC_ADDR1 ip link set dev $rcv_if_name promisc on - send_uc_ipv4 $h1 $UNKNOWN_UC_ADDR2 - mc_send $h1 $UNKNOWN_IPV4_MC_ADDR2 - mc_send $h1 $UNKNOWN_IPV6_MC_ADDR2 + send_uc_ipv4 $send_if_name $UNKNOWN_UC_ADDR2 + mc_send $send_if_name $UNKNOWN_IPV4_MC_ADDR2 + mc_send $send_if_name $UNKNOWN_IPV6_MC_ADDR2 ip link set dev $rcv_if_name promisc off mc_join $rcv_if_name $JOINED_IPV4_MC_ADDR - mc_send $h1 $JOINED_IPV4_MC_ADDR + mc_send $send_if_name $JOINED_IPV4_MC_ADDR mc_leave mc_join $rcv_if_name $JOINED_IPV6_MC_ADDR - mc_send $h1 $JOINED_IPV6_MC_ADDR + mc_send $send_if_name $JOINED_IPV6_MC_ADDR mc_leave - mc_send $h1 $UNKNOWN_IPV4_MC_ADDR1 - mc_send $h1 $UNKNOWN_IPV6_MC_ADDR1 + mc_send $send_if_name $UNKNOWN_IPV4_MC_ADDR1 + mc_send $send_if_name $UNKNOWN_IPV6_MC_ADDR1 ip link set dev $rcv_if_name allmulticast on - send_uc_ipv4 $h1 $UNKNOWN_UC_ADDR3 - mc_send $h1 $UNKNOWN_IPV4_MC_ADDR3 - mc_send $h1 $UNKNOWN_IPV6_MC_ADDR3 + send_uc_ipv4 $send_if_name $UNKNOWN_UC_ADDR3 + mc_send $send_if_name $UNKNOWN_IPV4_MC_ADDR3 + mc_send $send_if_name $UNKNOWN_IPV6_MC_ADDR3 ip link set dev $rcv_if_name allmulticast off mc_route_destroy $rcv_if_name - mc_route_destroy $h1 + mc_route_destroy $send_if_name sleep 1 @@ -267,7 +268,7 @@ standalone() h2_create macvlan_create $h2 - run_test $h2 + run_test $h1 $h2 macvlan_destroy h2_destroy @@ -280,7 +281,7 @@ bridge() bridge_create macvlan_create br0 - run_test br0 + run_test $h1 br0 macvlan_destroy bridge_destroy -- cgit v1.2.3 From df7cf5cc551c7c0a92520e91e1184993784c6386 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 15 Aug 2024 03:06:56 +0300 Subject: selftests: net: local_termination: parameterize test name There are upcoming tests which verify the RX filtering of a bridge (or bridge port), but under differing vlan_filtering conditions. Since we currently print $h2 (the DUT) in the log_test() output, it becomes necessary to make a further distinction between tests, to not give the user the impression that the exact same thing is run twice. Signed-off-by: Vladimir Oltean Signed-off-by: David S. 
Miller --- .../selftests/net/forwarding/local_termination.sh | 38 ++++++++++++---------- 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh index 92f0e242d119..af284edaf401 100755 --- a/tools/testing/selftests/net/forwarding/local_termination.sh +++ b/tools/testing/selftests/net/forwarding/local_termination.sh @@ -68,10 +68,11 @@ send_uc_ipv4() check_rcv() { - local if_name=$1 - local type=$2 - local pattern=$3 - local should_receive=$4 + local if_name=$1; shift + local type=$1; shift + local pattern=$1; shift + local should_receive=$1; shift + local test_name="$1"; shift local should_fail= [ $should_receive = true ] && should_fail=0 || should_fail=1 @@ -81,7 +82,7 @@ check_rcv() check_err_fail "$should_fail" "$?" "reception" - log_test "$if_name: $type" + log_test "$test_name: $type" } mc_route_prepare() @@ -106,6 +107,7 @@ run_test() { local send_if_name=$1; shift local rcv_if_name=$1; shift + local test_name="$1"; shift local smac=$(mac_get $send_if_name) local rcv_dmac=$(mac_get $rcv_if_name) @@ -150,61 +152,61 @@ run_test() check_rcv $rcv_if_name "Unicast IPv4 to primary MAC address" \ "$smac > $rcv_dmac, ethertype IPv4 (0x0800)" \ - true + true "$test_name" check_rcv $rcv_if_name "Unicast IPv4 to macvlan MAC address" \ "$smac > $MACVLAN_ADDR, ethertype IPv4 (0x0800)" \ - true + true "$test_name" xfail_on_veth $h1 \ check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address" \ "$smac > $UNKNOWN_UC_ADDR1, ethertype IPv4 (0x0800)" \ - false + false "$test_name" check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address, promisc" \ "$smac > $UNKNOWN_UC_ADDR2, ethertype IPv4 (0x0800)" \ - true + true "$test_name" xfail_on_veth $h1 \ check_rcv $rcv_if_name \ "Unicast IPv4 to unknown MAC address, allmulti" \ "$smac > $UNKNOWN_UC_ADDR3, ethertype IPv4 (0x0800)" \ - false + false "$test_name" check_rcv $rcv_if_name "Multicast IPv4 to joined group" \ "$smac > $JOINED_MACV4_MC_ADDR, ethertype IPv4 (0x0800)" \ - true + true "$test_name" xfail_on_veth $h1 \ check_rcv $rcv_if_name \ "Multicast IPv4 to unknown group" \ "$smac > $UNKNOWN_MACV4_MC_ADDR1, ethertype IPv4 (0x0800)" \ - false + false "$test_name" check_rcv $rcv_if_name "Multicast IPv4 to unknown group, promisc" \ "$smac > $UNKNOWN_MACV4_MC_ADDR2, ethertype IPv4 (0x0800)" \ - true + true "$test_name" check_rcv $rcv_if_name "Multicast IPv4 to unknown group, allmulti" \ "$smac > $UNKNOWN_MACV4_MC_ADDR3, ethertype IPv4 (0x0800)" \ - true + true "$test_name" check_rcv $rcv_if_name "Multicast IPv6 to joined group" \ "$smac > $JOINED_MACV6_MC_ADDR, ethertype IPv6 (0x86dd)" \ - true + true "$test_name" xfail_on_veth $h1 \ check_rcv $rcv_if_name "Multicast IPv6 to unknown group" \ "$smac > $UNKNOWN_MACV6_MC_ADDR1, ethertype IPv6 (0x86dd)" \ - false + false "$test_name" check_rcv $rcv_if_name "Multicast IPv6 to unknown group, promisc" \ "$smac > $UNKNOWN_MACV6_MC_ADDR2, ethertype IPv6 (0x86dd)" \ - true + true "$test_name" check_rcv $rcv_if_name "Multicast IPv6 to unknown group, allmulti" \ "$smac > $UNKNOWN_MACV6_MC_ADDR3, ethertype IPv6 (0x86dd)" \ - true + true "$test_name" tcpdump_cleanup $rcv_if_name } -- cgit v1.2.3 From 5b8e74182ed3d4f1c38c626e6120275ca9d92bee Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 15 Aug 2024 03:06:57 +0300 Subject: selftests: net: local_termination: add one more test for VLAN-aware bridges The current bridge() test is for packet reception on a 
VLAN-unaware bridge. Some things are different enough with VLAN-aware bridges that it's worth renaming this test into vlan_unaware_bridge(), and add a new vlan_aware_bridge() test. The two will share the same implementation: bridge() becomes a common function, which receives $vlan_filtering as an argument. Rename it to test_bridge() at the same time, because just bridge() pollutes the global namespace and we cannot invoke the binary with the same name from the iproute2 package currently. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- .../selftests/net/forwarding/local_termination.sh | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh index af284edaf401..5aa364b40e33 100755 --- a/tools/testing/selftests/net/forwarding/local_termination.sh +++ b/tools/testing/selftests/net/forwarding/local_termination.sh @@ -1,7 +1,7 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -ALL_TESTS="standalone bridge" +ALL_TESTS="standalone vlan_unaware_bridge vlan_aware_bridge" NUM_NETIFS=2 PING_COUNT=1 REQUIRE_MTOOLS=yes @@ -233,7 +233,9 @@ h2_destroy() bridge_create() { - ip link add br0 type bridge + local vlan_filtering=$1 + + ip link add br0 type bridge vlan_filtering $vlan_filtering ip link set br0 address $BRIDGE_ADDR ip link set br0 up @@ -277,10 +279,12 @@ standalone() h1_destroy } -bridge() +test_bridge() { + local vlan_filtering=$1 + h1_create - bridge_create + bridge_create $vlan_filtering macvlan_create br0 run_test $h1 br0 @@ -290,6 +294,16 @@ bridge() h1_destroy } +vlan_unaware_bridge() +{ + test_bridge 0 +} + +vlan_aware_bridge() +{ + test_bridge 1 +} + cleanup() { pre_cleanup -- cgit v1.2.3 From 5fea8bb009744bbb90b3f6ca41c558429ee4c849 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 15 Aug 2024 03:06:58 +0300 Subject: selftests: net: local_termination: introduce new tests which capture VLAN behavior Add more coverage to the local termination selftest as follows: - 8021q upper of $h2 - 8021q upper of $h2, where $h2 is a port of a VLAN-unaware bridge - 8021q upper of $h2, where $h2 is a port of a VLAN-aware bridge - 8021q upper of VLAN-unaware br0, which is the upper of $h2 - 8021q upper of VLAN-aware br0, which is the upper of $h2 Especially the cases with traffic sent through the VLAN upper of a VLAN-aware bridge port will be immediately relevant when we will start transmitting PTP packets as an additional kind of traffic. Signed-off-by: Vladimir Oltean Signed-off-by: David S. 
Miller --- .../selftests/net/forwarding/local_termination.sh | 117 +++++++++++++++++++-- 1 file changed, 110 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh index 5aa364b40e33..e22c6a693bef 100755 --- a/tools/testing/selftests/net/forwarding/local_termination.sh +++ b/tools/testing/selftests/net/forwarding/local_termination.sh @@ -1,7 +1,9 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -ALL_TESTS="standalone vlan_unaware_bridge vlan_aware_bridge" +ALL_TESTS="standalone vlan_unaware_bridge vlan_aware_bridge test_vlan \ + vlan_over_vlan_unaware_bridged_port vlan_over_vlan_aware_bridged_port \ + vlan_over_vlan_unaware_bridge vlan_over_vlan_aware_bridge" NUM_NETIFS=2 PING_COUNT=1 REQUIRE_MTOOLS=yes @@ -231,6 +233,30 @@ h2_destroy() simple_if_fini $h2 $H2_IPV4/24 $H2_IPV6/64 } +h1_vlan_create() +{ + simple_if_init $h1 + vlan_create $h1 100 v$h1 $H1_IPV4/24 $H1_IPV6/64 +} + +h1_vlan_destroy() +{ + vlan_destroy $h1 100 + simple_if_fini $h1 +} + +h2_vlan_create() +{ + simple_if_init $h2 + vlan_create $h2 100 v$h2 $H2_IPV4/24 $H2_IPV6/64 +} + +h2_vlan_destroy() +{ + vlan_destroy $h2 100 + simple_if_fini $h2 +} + bridge_create() { local vlan_filtering=$1 @@ -241,14 +267,10 @@ bridge_create() ip link set $h2 master br0 ip link set $h2 up - - simple_if_init br0 $H2_IPV4/24 $H2_IPV6/64 } bridge_destroy() { - simple_if_fini br0 $H2_IPV4/24 $H2_IPV6/64 - ip link del br0 } @@ -272,7 +294,7 @@ standalone() h2_create macvlan_create $h2 - run_test $h1 $h2 + run_test $h1 $h2 "$h2" macvlan_destroy h2_destroy @@ -285,11 +307,13 @@ test_bridge() h1_create bridge_create $vlan_filtering + simple_if_init br0 $H2_IPV4/24 $H2_IPV6/64 macvlan_create br0 - run_test $h1 br0 + run_test $h1 br0 "vlan_filtering=$vlan_filtering bridge" macvlan_destroy + simple_if_fini br0 $H2_IPV4/24 $H2_IPV6/64 bridge_destroy h1_destroy } @@ -304,6 +328,85 @@ vlan_aware_bridge() test_bridge 1 } +test_vlan() +{ + h1_vlan_create + h2_vlan_create + macvlan_create $h2.100 + + run_test $h1.100 $h2.100 "VLAN upper" + + macvlan_destroy + h2_vlan_destroy + h1_vlan_destroy +} + +vlan_over_bridged_port() +{ + local vlan_filtering=$1 + + h1_vlan_create + h2_vlan_create + bridge_create $vlan_filtering + macvlan_create $h2.100 + + run_test $h1.100 $h2.100 "VLAN over vlan_filtering=$vlan_filtering bridged port" + + macvlan_destroy + bridge_destroy + h2_vlan_destroy + h1_vlan_destroy +} + +vlan_over_vlan_unaware_bridged_port() +{ + vlan_over_bridged_port 0 +} + +vlan_over_vlan_aware_bridged_port() +{ + vlan_over_bridged_port 1 +} + +vlan_over_bridge() +{ + local vlan_filtering=$1 + + h1_vlan_create + bridge_create $vlan_filtering + simple_if_init br0 + vlan_create br0 100 vbr0 $H2_IPV4/24 $H2_IPV6/64 + macvlan_create br0.100 + + if [ $vlan_filtering = 1 ]; then + bridge vlan add dev $h2 vid 100 master + bridge vlan add dev br0 vid 100 self + fi + + run_test $h1.100 br0.100 "VLAN over vlan_filtering=$vlan_filtering bridge" + + if [ $vlan_filtering = 1 ]; then + bridge vlan del dev br0 vid 100 self + bridge vlan del dev $h2 vid 100 master + fi + + macvlan_destroy + vlan_destroy br0 100 + simple_if_fini br0 + bridge_destroy + h1_vlan_destroy +} + +vlan_over_vlan_unaware_bridge() +{ + vlan_over_bridge 0 +} + +vlan_over_vlan_aware_bridge() +{ + vlan_over_bridge 1 +} + cleanup() { pre_cleanup -- cgit v1.2.3 From 9aa3749ca4a880c1a59720aab3eacf344ed8d68d Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 15 Aug 2024 
03:06:59 +0300 Subject: selftests: net: local_termination: don't use xfail_on_veth() xfail_on_veth() for this test is an incorrect approximation which gives false positives and false negatives. When local_termination fails with "reception succeeded, but should have failed", it is because the DUT ($h2) accepts packets even when not configured as promiscuous. This is not something specific to veth; even the bridge behaves that way, but this is not captured by the xfail_on_veth test. The IFF_UNICAST_FLT flag is not explicitly exported to user space, but it can somewhat be determined from the interface's behavior. We have to create a macvlan upper with a different MAC address. This forces a dev_uc_add() call in the kernel. When the unicast filtering list is not empty, but the device doesn't support IFF_UNICAST_FLT, __dev_set_rx_mode() force-enables promiscuity on the interface, to ensure correct behavior (that the requested address is received). We can monitor the change in the promiscuity flag and infer from it whether the device supports unicast filtering. There is no equivalent thing for allmulti, unfortunately. We never know what's hiding behind a device which has allmulti=off. Whether it will actually perform RX multicast filtering of unknown traffic is a strong "maybe". The bridge driver, for example, completely ignores the flag. We'll have to keep the xfail behavior, but instead of XFAIL on just veth, always XFAIL. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- tools/testing/selftests/net/forwarding/lib.sh | 57 +++++++++++++++++++++ .../selftests/net/forwarding/local_termination.sh | 58 ++++++++++++++++------ 2 files changed, 99 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index ff96bb7535ff..718d04a4f72d 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -500,6 +500,11 @@ check_err_fail() fi } +xfail() +{ + FAIL_TO_XFAIL=yes "$@" +} + xfail_on_slow() { if [[ $KSFT_MACHINE_SLOW = yes ]]; then @@ -1113,6 +1118,39 @@ mac_get() ip -j link show dev $if_name | jq -r '.[]["address"]' } +ether_addr_to_u64() +{ + local addr="$1" + local order="$((1 << 40))" + local val=0 + local byte + + addr="${addr//:/ }" + + for byte in $addr; do + byte="0x$byte" + val=$((val + order * byte)) + order=$((order >> 8)) + done + + printf "0x%x" $val +} + +u64_to_ether_addr() +{ + local val=$1 + local byte + local i + + for ((i = 40; i >= 0; i -= 8)); do + byte=$(((val & (0xff << i)) >> i)) + printf "%02x" $byte + if [ $i -ne 0 ]; then + printf ":" + fi + done +} + ipv6_lladdr_get() { local if_name=$1 @@ -2229,3 +2267,22 @@ absval() echo $((v > 0 ? 
v : -v)) } + +has_unicast_flt() +{ + local dev=$1; shift + local mac_addr=$(mac_get $dev) + local tmp=$(ether_addr_to_u64 $mac_addr) + local promisc + + ip link set $dev up + ip link add link $dev name macvlan-tmp type macvlan mode private + ip link set macvlan-tmp address $(u64_to_ether_addr $((tmp + 1))) + ip link set macvlan-tmp up + + promisc=$(ip -j -d link show dev $dev | jq -r '.[].promiscuity') + + ip link del macvlan-tmp + + [[ $promisc == 1 ]] && echo "no" || echo "yes" +} diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh index e22c6a693bef..80ea4c10d764 100755 --- a/tools/testing/selftests/net/forwarding/local_termination.sh +++ b/tools/testing/selftests/net/forwarding/local_termination.sh @@ -109,9 +109,11 @@ run_test() { local send_if_name=$1; shift local rcv_if_name=$1; shift + local no_unicast_flt=$1; shift local test_name="$1"; shift local smac=$(mac_get $send_if_name) local rcv_dmac=$(mac_get $rcv_if_name) + local should_receive tcpdump_start $rcv_if_name @@ -160,26 +162,26 @@ run_test() "$smac > $MACVLAN_ADDR, ethertype IPv4 (0x0800)" \ true "$test_name" - xfail_on_veth $h1 \ - check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address" \ - "$smac > $UNKNOWN_UC_ADDR1, ethertype IPv4 (0x0800)" \ - false "$test_name" + [ $no_unicast_flt = true ] && should_receive=true || should_receive=false + check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address" \ + "$smac > $UNKNOWN_UC_ADDR1, ethertype IPv4 (0x0800)" \ + $should_receive "$test_name" check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address, promisc" \ "$smac > $UNKNOWN_UC_ADDR2, ethertype IPv4 (0x0800)" \ true "$test_name" - xfail_on_veth $h1 \ - check_rcv $rcv_if_name \ - "Unicast IPv4 to unknown MAC address, allmulti" \ - "$smac > $UNKNOWN_UC_ADDR3, ethertype IPv4 (0x0800)" \ - false "$test_name" + [ $no_unicast_flt = true ] && should_receive=true || should_receive=false + check_rcv $rcv_if_name \ + "Unicast IPv4 to unknown MAC address, allmulti" \ + "$smac > $UNKNOWN_UC_ADDR3, ethertype IPv4 (0x0800)" \ + $should_receive "$test_name" check_rcv $rcv_if_name "Multicast IPv4 to joined group" \ "$smac > $JOINED_MACV4_MC_ADDR, ethertype IPv4 (0x0800)" \ true "$test_name" - xfail_on_veth $h1 \ + xfail \ check_rcv $rcv_if_name \ "Multicast IPv4 to unknown group" \ "$smac > $UNKNOWN_MACV4_MC_ADDR1, ethertype IPv4 (0x0800)" \ @@ -197,7 +199,7 @@ run_test() "$smac > $JOINED_MACV6_MC_ADDR, ethertype IPv6 (0x86dd)" \ true "$test_name" - xfail_on_veth $h1 \ + xfail \ check_rcv $rcv_if_name "Multicast IPv6 to unknown group" \ "$smac > $UNKNOWN_MACV6_MC_ADDR1, ethertype IPv6 (0x86dd)" \ false "$test_name" @@ -290,11 +292,17 @@ macvlan_destroy() standalone() { + local no_unicast_flt=true + + if [ $(has_unicast_flt $h2) = yes ]; then + no_unicast_flt=false + fi + h1_create h2_create macvlan_create $h2 - run_test $h1 $h2 "$h2" + run_test $h1 $h2 $no_unicast_flt "$h2" macvlan_destroy h2_destroy @@ -303,6 +311,7 @@ standalone() test_bridge() { + local no_unicast_flt=true local vlan_filtering=$1 h1_create @@ -310,7 +319,7 @@ test_bridge() simple_if_init br0 $H2_IPV4/24 $H2_IPV6/64 macvlan_create br0 - run_test $h1 br0 "vlan_filtering=$vlan_filtering bridge" + run_test $h1 br0 $no_unicast_flt "vlan_filtering=$vlan_filtering bridge" macvlan_destroy simple_if_fini br0 $H2_IPV4/24 $H2_IPV6/64 @@ -330,11 +339,17 @@ vlan_aware_bridge() test_vlan() { + local no_unicast_flt=true + + if [ $(has_unicast_flt $h2) = yes ]; then + no_unicast_flt=false + fi + 
h1_vlan_create h2_vlan_create macvlan_create $h2.100 - run_test $h1.100 $h2.100 "VLAN upper" + run_test $h1.100 $h2.100 $no_unicast_flt "VLAN upper" macvlan_destroy h2_vlan_destroy @@ -343,14 +358,23 @@ test_vlan() vlan_over_bridged_port() { + local no_unicast_flt=true local vlan_filtering=$1 + # br_manage_promisc() will not force a single vlan_filtering port to + # promiscuous mode, so we should still expect unicast filtering to take + # place if the device can do it. + if [ $(has_unicast_flt $h2) = yes ] && [ $vlan_filtering = 1 ]; then + no_unicast_flt=false + fi + h1_vlan_create h2_vlan_create bridge_create $vlan_filtering macvlan_create $h2.100 - run_test $h1.100 $h2.100 "VLAN over vlan_filtering=$vlan_filtering bridged port" + run_test $h1.100 $h2.100 $no_unicast_flt \ + "VLAN over vlan_filtering=$vlan_filtering bridged port" macvlan_destroy bridge_destroy @@ -370,6 +394,7 @@ vlan_over_vlan_aware_bridged_port() vlan_over_bridge() { + local no_unicast_flt=true local vlan_filtering=$1 h1_vlan_create @@ -383,7 +408,8 @@ vlan_over_bridge() bridge vlan add dev br0 vid 100 self fi - run_test $h1.100 br0.100 "VLAN over vlan_filtering=$vlan_filtering bridge" + run_test $h1.100 br0.100 $no_unicast_flt \ + "VLAN over vlan_filtering=$vlan_filtering bridge" if [ $vlan_filtering = 1 ]; then bridge vlan del dev br0 vid 100 self -- cgit v1.2.3 From 237979504264912a9797dabc0db35126e705fe0d Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 15 Aug 2024 03:07:00 +0300 Subject: selftests: net: local_termination: add PTP frames to the mix A breakage in the felix DSA driver shows we do not have enough test coverage. More generally, PTP traffic is sufficiently special that drivers are likely to treat it differently. This is not meant to be a full PTP test, it just makes sure that PTP packets sent to the different addresses corresponding to their profiles are received correctly. The local_termination selftest seemed like the most appropriate place for this addition. PTP RX/TX in some cases makes no sense (over a bridge) and this is why $skip_ptp exists. In others - PTP over a bridge port - the IP stack needs convincing through the available bridge netfilter hooks to leave the PTP packets alone rather than have them stolen by the bridge rx_handler. It is safe to assume that users have that figured out already. This is a driver-level test, and by using tcpdump, all that extra setup is out of scope here. send_non_ip() was an unfinished idea; written but never used. Replace it with a more generic send_raw(), and send 3 PTP packet types times 3 transports. Signed-off-by: Vladimir Oltean Signed-off-by: David S.
Miller --- .../selftests/net/forwarding/local_termination.sh | 161 +++++++++++++++++++-- 1 file changed, 148 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh index 80ea4c10d764..648868f74604 100755 --- a/tools/testing/selftests/net/forwarding/local_termination.sh +++ b/tools/testing/selftests/net/forwarding/local_termination.sh @@ -39,9 +39,68 @@ UNKNOWN_MACV6_MC_ADDR1="33:33:01:02:03:05" UNKNOWN_MACV6_MC_ADDR2="33:33:01:02:03:06" UNKNOWN_MACV6_MC_ADDR3="33:33:01:02:03:07" -NON_IP_MC="01:02:03:04:05:06" -NON_IP_PKT="00:04 48:45:4c:4f" -BC="ff:ff:ff:ff:ff:ff" +PTP_1588_L2_SYNC=" \ +01:1b:19:00:00:00 00:00:de:ad:be:ef 88:f7 00 02 \ +00 2c 00 00 02 00 00 00 00 00 00 00 00 00 00 00 \ +00 00 3e 37 63 ff fe cf 17 0e 00 01 00 00 00 00 \ +00 00 00 00 00 00 00 00 00 00" +PTP_1588_L2_FOLLOW_UP=" \ +01:1b:19:00:00:00 00:00:de:ad:be:ef 88:f7 08 02 \ +00 2c 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \ +00 00 3e 37 63 ff fe cf 17 0e 00 01 00 00 02 00 \ +00 00 66 83 c5 f1 17 97 ed f0" +PTP_1588_L2_PDELAY_REQ=" \ +01:80:c2:00:00:0e 00:00:de:ad:be:ef 88:f7 02 02 \ +00 36 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \ +00 00 3e 37 63 ff fe cf 17 0e 00 01 00 06 05 7f \ +00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \ +00 00 00 00" +PTP_1588_IPV4_SYNC=" \ +01:00:5e:00:01:81 00:00:de:ad:be:ef 08:00 45 00 \ +00 48 0a 9a 40 00 01 11 cb 88 c0 00 02 01 e0 00 \ +01 81 01 3f 01 3f 00 34 a3 c8 00 02 00 2c 00 00 \ +02 00 00 00 00 00 00 00 00 00 00 00 00 00 3e 37 \ +63 ff fe cf 17 0e 00 01 00 00 00 00 00 00 00 00 \ +00 00 00 00 00 00" +PTP_1588_IPV4_FOLLOW_UP=" +01:00:5e:00:01:81 00:00:de:ad:be:ef 08:00 45 00 \ +00 48 0a 9b 40 00 01 11 cb 87 c0 00 02 01 e0 00 \ +01 81 01 40 01 40 00 34 a3 c8 08 02 00 2c 00 00 \ +00 00 00 00 00 00 00 00 00 00 00 00 00 00 3e 37 \ +63 ff fe cf 17 0e 00 01 00 00 02 00 00 00 66 83 \ +c6 0f 1d 9a 61 87" +PTP_1588_IPV4_PDELAY_REQ=" \ +01:00:5e:00:00:6b 00:00:de:ad:be:ef 08:00 45 00 \ +00 52 35 a9 40 00 01 11 a1 85 c0 00 02 01 e0 00 \ +00 6b 01 3f 01 3f 00 3e a2 bc 02 02 00 36 00 00 \ +00 00 00 00 00 00 00 00 00 00 00 00 00 00 3e 37 \ +63 ff fe cf 17 0e 00 01 00 01 05 7f 00 00 00 00 \ +00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00" +PTP_1588_IPV6_SYNC=" \ +33:33:00:00:01:81 00:00:de:ad:be:ef 86:dd 60 06 \ +7c 2f 00 36 11 01 20 01 0d b8 00 01 00 00 00 00 \ +00 00 00 00 00 01 ff 0e 00 00 00 00 00 00 00 00 \ +00 00 00 00 01 81 01 3f 01 3f 00 36 2e 92 00 02 \ +00 2c 00 00 02 00 00 00 00 00 00 00 00 00 00 00 \ +00 00 3e 37 63 ff fe cf 17 0e 00 01 00 00 00 00 \ +00 00 00 00 00 00 00 00 00 00 00 00" +PTP_1588_IPV6_FOLLOW_UP=" \ +33:33:00:00:01:81 00:00:de:ad:be:ef 86:dd 60 0a \ +00 bc 00 36 11 01 20 01 0d b8 00 01 00 00 00 00 \ +00 00 00 00 00 01 ff 0e 00 00 00 00 00 00 00 00 \ +00 00 00 00 01 81 01 40 01 40 00 36 2e 92 08 02 \ +00 2c 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \ +00 00 3e 37 63 ff fe cf 17 0e 00 01 00 00 02 00 \ +00 00 66 83 c6 2a 32 09 bd 74 00 00" +PTP_1588_IPV6_PDELAY_REQ=" \ +33:33:00:00:00:6b 00:00:de:ad:be:ef 86:dd 60 0c \ +5c fd 00 40 11 01 fe 80 00 00 00 00 00 00 3c 37 \ +63 ff fe cf 17 0e ff 02 00 00 00 00 00 00 00 00 \ +00 00 00 00 00 6b 01 3f 01 3f 00 40 b4 54 02 02 \ +00 36 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \ +00 00 3e 37 63 ff fe cf 17 0e 00 01 00 01 05 7f \ +00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \ +00 00 00 00 00 00" # Disable promisc to ensure we don't receive unknown MAC DA packets export TCPDUMP_EXTRA_FLAGS="-pl" 
@@ -49,13 +108,15 @@ export TCPDUMP_EXTRA_FLAGS="-pl" h1=${NETIFS[p1]} h2=${NETIFS[p2]} -send_non_ip() +send_raw() { - local if_name=$1 - local smac=$2 - local dmac=$3 + local if_name=$1; shift + local pkt="$1"; shift + local smac=$(mac_get $if_name) - $MZ -q $if_name "$dmac $smac $NON_IP_PKT" + pkt="${pkt/00:00:de:ad:be:ef/$smac}" + + $MZ -q $if_name "$pkt" } send_uc_ipv4() @@ -109,6 +170,7 @@ run_test() { local send_if_name=$1; shift local rcv_if_name=$1; shift + local skip_ptp=$1; shift local no_unicast_flt=$1; shift local test_name="$1"; shift local smac=$(mac_get $send_if_name) @@ -150,6 +212,35 @@ run_test() mc_route_destroy $rcv_if_name mc_route_destroy $send_if_name + if [ $skip_ptp = false ]; then + ip maddress add 01:1b:19:00:00:00 dev $rcv_if_name + send_raw $send_if_name "$PTP_1588_L2_SYNC" + send_raw $send_if_name "$PTP_1588_L2_FOLLOW_UP" + ip maddress del 01:1b:19:00:00:00 dev $rcv_if_name + + ip maddress add 01:80:c2:00:00:0e dev $rcv_if_name + send_raw $send_if_name "$PTP_1588_L2_PDELAY_REQ" + ip maddress del 01:80:c2:00:00:0e dev $rcv_if_name + + mc_join $rcv_if_name 224.0.1.129 + send_raw $send_if_name "$PTP_1588_IPV4_SYNC" + send_raw $send_if_name "$PTP_1588_IPV4_FOLLOW_UP" + mc_leave + + mc_join $rcv_if_name 224.0.0.107 + send_raw $send_if_name "$PTP_1588_IPV4_PDELAY_REQ" + mc_leave + + mc_join $rcv_if_name ff0e::181 + send_raw $send_if_name "$PTP_1588_IPV6_SYNC" + send_raw $send_if_name "$PTP_1588_IPV6_FOLLOW_UP" + mc_leave + + mc_join $rcv_if_name ff02::6b + send_raw $send_if_name "$PTP_1588_IPV6_PDELAY_REQ" + mc_leave + fi + sleep 1 tcpdump_stop $rcv_if_name @@ -212,6 +303,44 @@ run_test() "$smac > $UNKNOWN_MACV6_MC_ADDR3, ethertype IPv6 (0x86dd)" \ true "$test_name" + if [ $skip_ptp = false ]; then + check_rcv $rcv_if_name "1588v2 over L2 transport, Sync" \ + "ethertype PTP (0x88f7).* PTPv2.* msg type : sync msg" \ + true "$test_name" + + check_rcv $rcv_if_name "1588v2 over L2 transport, Follow-Up" \ + "ethertype PTP (0x88f7).* PTPv2.* msg type : follow up msg" \ + true "$test_name" + + check_rcv $rcv_if_name "1588v2 over L2 transport, Peer Delay Request" \ + "ethertype PTP (0x88f7).* PTPv2.* msg type : peer delay req msg" \ + true "$test_name" + + check_rcv $rcv_if_name "1588v2 over IPv4, Sync" \ + "ethertype IPv4 (0x0800).* PTPv2.* msg type : sync msg" \ + true "$test_name" + + check_rcv $rcv_if_name "1588v2 over IPv4, Follow-Up" \ + "ethertype IPv4 (0x0800).* PTPv2.* msg type : follow up msg" \ + true "$test_name" + + check_rcv $rcv_if_name "1588v2 over IPv4, Peer Delay Request" \ + "ethertype IPv4 (0x0800).* PTPv2.* msg type : peer delay req msg" \ + true "$test_name" + + check_rcv $rcv_if_name "1588v2 over IPv6, Sync" \ + "ethertype IPv6 (0x86dd).* PTPv2.* msg type : sync msg" \ + true "$test_name" + + check_rcv $rcv_if_name "1588v2 over IPv6, Follow-Up" \ + "ethertype IPv6 (0x86dd).* PTPv2.* msg type : follow up msg" \ + true "$test_name" + + check_rcv $rcv_if_name "1588v2 over IPv6, Peer Delay Request" \ + "ethertype IPv6 (0x86dd).* PTPv2.* msg type : peer delay req msg" \ + true "$test_name" + fi + tcpdump_cleanup $rcv_if_name } @@ -293,6 +422,7 @@ macvlan_destroy() standalone() { local no_unicast_flt=true + local skip_ptp=false if [ $(has_unicast_flt $h2) = yes ]; then no_unicast_flt=false @@ -302,7 +432,7 @@ standalone() h2_create macvlan_create $h2 - run_test $h1 $h2 $no_unicast_flt "$h2" + run_test $h1 $h2 $skip_ptp $no_unicast_flt "$h2" macvlan_destroy h2_destroy @@ -313,13 +443,15 @@ test_bridge() { local no_unicast_flt=true local vlan_filtering=$1 + 
local skip_ptp=true h1_create bridge_create $vlan_filtering simple_if_init br0 $H2_IPV4/24 $H2_IPV6/64 macvlan_create br0 - run_test $h1 br0 $no_unicast_flt "vlan_filtering=$vlan_filtering bridge" + run_test $h1 br0 $skip_ptp $no_unicast_flt \ + "vlan_filtering=$vlan_filtering bridge" macvlan_destroy simple_if_fini br0 $H2_IPV4/24 $H2_IPV6/64 @@ -340,6 +472,7 @@ vlan_aware_bridge() test_vlan() { local no_unicast_flt=true + local skip_ptp=false if [ $(has_unicast_flt $h2) = yes ]; then no_unicast_flt=false @@ -349,7 +482,7 @@ test_vlan() h2_vlan_create macvlan_create $h2.100 - run_test $h1.100 $h2.100 $no_unicast_flt "VLAN upper" + run_test $h1.100 $h2.100 $skip_ptp $no_unicast_flt "VLAN upper" macvlan_destroy h2_vlan_destroy @@ -360,6 +493,7 @@ vlan_over_bridged_port() { local no_unicast_flt=true local vlan_filtering=$1 + local skip_ptp=false # br_manage_promisc() will not force a single vlan_filtering port to # promiscuous mode, so we should still expect unicast filtering to take @@ -373,7 +507,7 @@ vlan_over_bridged_port() bridge_create $vlan_filtering macvlan_create $h2.100 - run_test $h1.100 $h2.100 $no_unicast_flt \ + run_test $h1.100 $h2.100 $skip_ptp $no_unicast_flt \ "VLAN over vlan_filtering=$vlan_filtering bridged port" macvlan_destroy @@ -396,6 +530,7 @@ vlan_over_bridge() { local no_unicast_flt=true local vlan_filtering=$1 + local skip_ptp=true h1_vlan_create bridge_create $vlan_filtering @@ -408,7 +543,7 @@ vlan_over_bridge() bridge vlan add dev br0 vid 100 self fi - run_test $h1.100 br0.100 $no_unicast_flt \ + run_test $h1.100 br0.100 $skip_ptp $no_unicast_flt \ "VLAN over vlan_filtering=$vlan_filtering bridge" if [ $vlan_filtering = 1 ]; then -- cgit v1.2.3 From e29b82ef27616777e21c07dc263a8769cbdaa358 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 15 Aug 2024 03:07:01 +0300 Subject: selftests: net: bridge_vlan_aware: test that other TPIDs are seen as untagged The bridge VLAN implementation w.r.t. VLAN protocol is described in merge commit 1a0b20b25732 ("Merge branch 'bridge-next'"). We are only sensitive to those VLAN tags whose TPID is equal to the bridge's vlan_protocol. Thus, an 802.1ad VLAN should be treated as 802.1Q-untagged. Add 3 tests which validate that: - 802.1ad-tagged traffic is learned into the PVID of an 802.1Q-aware bridge - Double-tagged traffic is forwarded when just the PVID of the port is present in the VLAN group of the ports - Double-tagged traffic is not forwarded when the PVID of the port is absent from the VLAN group of the ports The test passes with both veth and ocelot. Signed-off-by: Vladimir Oltean Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Signed-off-by: David S. 
Miller --- .../selftests/net/forwarding/bridge_vlan_aware.sh | 54 +++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh b/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh index 64bd00fe9a4f..90f8a244ea90 100755 --- a/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh +++ b/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh @@ -1,7 +1,7 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -ALL_TESTS="ping_ipv4 ping_ipv6 learning flooding vlan_deletion extern_learn" +ALL_TESTS="ping_ipv4 ping_ipv6 learning flooding vlan_deletion extern_learn other_tpid" NUM_NETIFS=4 CHECK_TC="yes" source lib.sh @@ -142,6 +142,58 @@ extern_learn() bridge fdb del de:ad:be:ef:13:37 dev $swp1 master vlan 1 &> /dev/null } +other_tpid() +{ + local mac=de:ad:be:ef:13:37 + + # Test that packets with TPID 802.1ad VID 3 + TPID 802.1Q VID 5 are + # classified as untagged by a bridge with vlan_protocol 802.1Q, and + # are processed in the PVID of the ingress port (here 1). Not VID 3, + # and not VID 5. + RET=0 + + tc qdisc add dev $h2 clsact + tc filter add dev $h2 ingress protocol all pref 1 handle 101 \ + flower dst_mac $mac action drop + ip link set $h2 promisc on + ethtool -K $h2 rx-vlan-filter off rx-vlan-stag-filter off + + $MZ -q $h1 -c 1 -b $mac -a own "88:a8 00:03 81:00 00:05 08:00 aa-aa-aa-aa-aa-aa-aa-aa-aa" + sleep 1 + + # Match on 'self' addresses as well, for those drivers which + # do not push their learned addresses to the bridge software + # database + bridge -j fdb show $swp1 | \ + jq -e ".[] | select(.mac == \"$(mac_get $h1)\") | select(.vlan == 1)" &> /dev/null + check_err $? "FDB entry was not learned when it should" + + log_test "FDB entry in PVID for VLAN-tagged with other TPID" + + RET=0 + tc -j -s filter show dev $h2 ingress \ + | jq -e ".[] | select(.options.handle == 101) \ + | select(.options.actions[0].stats.packets == 1)" &> /dev/null + check_err $? "Packet was not forwarded when it should" + log_test "Reception of VLAN with other TPID as untagged" + + bridge vlan del dev $swp1 vid 1 + + $MZ -q $h1 -c 1 -b $mac -a own "88:a8 00:03 81:00 00:05 08:00 aa-aa-aa-aa-aa-aa-aa-aa-aa" + sleep 1 + + RET=0 + tc -j -s filter show dev $h2 ingress \ + | jq -e ".[] | select(.options.handle == 101) \ + | select(.options.actions[0].stats.packets == 1)" &> /dev/null + check_err $? "Packet was forwarded when should not" + log_test "Reception of VLAN with other TPID as untagged (no PVID)" + + bridge vlan add dev $swp1 vid 1 pvid untagged + ip link set $h2 promisc off + tc qdisc del dev $h2 clsact +} + trap cleanup EXIT setup_prepare -- cgit v1.2.3 From 0021d6670d1a997549a39bf629da9940bf4068ed Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Mon, 12 Aug 2024 22:50:17 +0200 Subject: tools/nolibc: crt: mark _start_c() as used MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During LTO the reference from the asm startup code to the _start_c() function is not visible and _start_c() is removed. This will then lead to errors during linking. As _start_c() is indeed always used, mark it as such. 
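To make the LTO failure mode concrete, here is a minimal standalone sketch (not the nolibc sources; x86-64 Linux with a GNU toolchain is assumed, and my_start_c is a hypothetical stand-in for _start_c). The only reference to the C function is from hand-written assembly, which link-time optimization cannot see, so without __attribute__((used)) the function may be dropped and the final link then fails:

/* entry.c - sketch; build with: gcc -flto -nostdlib -static entry.c */
__attribute__((weak, used))		/* "used" keeps it alive under LTO */
void my_start_c(long *sp)
{
	long argc = *sp;			/* argc sits at the stack pointer */
	char **argv = (char **)(sp + 1);	/* argv[] follows it */

	(void)argc; (void)argv;
	/* exit(0) via a raw syscall, since there is no libc here */
	__asm__ volatile ("syscall" : : "a"(60L), "D"(0L) : "rcx", "r11", "memory");
}

__asm__ (
".global _start\n"
"_start:\n"
"\tmovq %rsp, %rdi\n"	/* hand the initial stack pointer to C */
"\tcall my_start_c\n"
);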
Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20240812-nolibc-lto-v2-1-736af7bbefa8@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/crt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/include/nolibc/crt.h b/tools/include/nolibc/crt.h index ac291574f6c0..bbcd5fd09806 100644 --- a/tools/include/nolibc/crt.h +++ b/tools/include/nolibc/crt.h @@ -22,7 +22,7 @@ extern void (*const __init_array_end[])(int, char **, char**) __attribute__((wea extern void (*const __fini_array_start[])(void) __attribute__((weak)); extern void (*const __fini_array_end[])(void) __attribute__((weak)); -__attribute__((weak)) +__attribute__((weak,used)) void _start_c(long *sp) { long argc; -- cgit v1.2.3 From ff7b9abbfce985b92f71c855246508edb0980cd6 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Mon, 12 Aug 2024 22:50:18 +0200 Subject: tools/nolibc: stackprotector: mark implicitly used symbols as used MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During LTO the references from the compiler-generated prologue and epilogues to the stack protector symbols are not visible and the symbols are removed. This will then lead to errors during linking. As those symbols are already #ifdeffed-out if unused mark them as "used" to prevent their removal. Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20240812-nolibc-lto-v2-2-736af7bbefa8@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/stackprotector.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/include/nolibc/stackprotector.h b/tools/include/nolibc/stackprotector.h index 13f1d0e60387..1d0d5259ec41 100644 --- a/tools/include/nolibc/stackprotector.h +++ b/tools/include/nolibc/stackprotector.h @@ -18,7 +18,7 @@ * triggering stack protector errors themselves */ -__attribute__((weak,noreturn,section(".text.nolibc_stack_chk"))) +__attribute__((weak,used,noreturn,section(".text.nolibc_stack_chk"))) void __stack_chk_fail(void) { pid_t pid; @@ -34,7 +34,7 @@ void __stack_chk_fail_local(void) __stack_chk_fail(); } -__attribute__((weak,section(".data.nolibc_stack_chk"))) +__attribute__((weak,used,section(".data.nolibc_stack_chk"))) uintptr_t __stack_chk_guard; static __no_stack_protector void __stack_chk_init(void) -- cgit v1.2.3 From 25fb329a23c78d59a055a7b1329d18f30a2be92d Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Mon, 12 Aug 2024 22:50:19 +0200 Subject: tools/nolibc: x86_64: use local label in memcpy/memmove MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compiling arch-x86_64.h with clang and binutils LD yields duplicate label errors: .../gcc-13.2.0-nolibc/x86_64-linux/bin/x86_64-linux-ld: error: LLVM gold plugin: :44:1: symbol '.Lbackward_copy' is already defined .Lbackward_copy:leaq -1(%rdi, %rcx, 1), %rdi Instead of a local symbol use a local label which can be defined multiple times and therefore avoids the error. 
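For reference, numeric local labels behave as follows (hedged standalone sketch, x86-64 GNU toolchain assumed; the function names are invented): "1:" may be defined any number of times, and "1b"/"1f" bind to the nearest definition backward/forward, so merging many compilation units into one assembly stream during LTO cannot create duplicates the way a named ".L" symbol can:

/* local_label.c - both asm blocks define "1:" without clashing */
static unsigned long busy_count(unsigned long n)
{
	__asm__ volatile ("1:\n\tdecq %0\n\tjnz 1b"	/* "1b": nearest "1:" above */
			  : "+r" (n) : : "cc");
	return n;
}

static unsigned long busy_count2(unsigned long n)
{
	__asm__ volatile ("1:\n\tdecq %0\n\tjnz 1b"	/* reuses "1" safely */
			  : "+r" (n) : : "cc");
	return n;
}

int main(void)
{
	return (int)(busy_count(5) + busy_count2(3));	/* both loops end at 0 */
}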
Reviewed-by: Ammar Faizi Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20240812-nolibc-lto-v2-3-736af7bbefa8@weissschuh.net Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/arch-x86_64.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/include/nolibc/arch-x86_64.h b/tools/include/nolibc/arch-x86_64.h index 65252c005a30..1e40620a2b33 100644 --- a/tools/include/nolibc/arch-x86_64.h +++ b/tools/include/nolibc/arch-x86_64.h @@ -193,10 +193,10 @@ __asm__ ( "movq %rdi, %rdx\n\t" "subq %rsi, %rdx\n\t" "cmpq %rcx, %rdx\n\t" - "jb .Lbackward_copy\n\t" + "jb 1f\n\t" "rep movsb\n\t" "retq\n" -".Lbackward_copy:" +"1:" /* backward copy */ "leaq -1(%rdi, %rcx, 1), %rdi\n\t" "leaq -1(%rsi, %rcx, 1), %rsi\n\t" "std\n\t" -- cgit v1.2.3 From 30dcdd6a3a6caa27d75196bcc4f6ea9c7a51ad19 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 14 Aug 2024 14:10:01 +0300 Subject: selftests: fib_rule_tests: Remove unused functions The functions are unused since commit 816cda9ae531 ("selftests: net: fib_rule_tests: add support to select a test to run"). Remove them. Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20240814111005.955359-2-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fib_rule_tests.sh | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh index 7c01f58a20de..c821d91b9ee8 100755 --- a/tools/testing/selftests/net/fib_rule_tests.sh +++ b/tools/testing/selftests/net/fib_rule_tests.sh @@ -56,14 +56,6 @@ log_test() fi } -log_section() -{ - echo - echo "######################################################################" - echo "TEST SECTION: $*" - echo "######################################################################" -} - check_nettest() { if which nettest > /dev/null 2>&1; then @@ -453,14 +445,6 @@ fib_rule4_connect_test() $IP -4 rule del dsfield 0x04 table $RTABLE_PEER cleanup_peer } - -run_fibrule_tests() -{ - log_section "IPv4 fib rule" - fib_rule4_test - log_section "IPv6 fib rule" - fib_rule6_test -} ################################################################################ # usage -- cgit v1.2.3 From b1487d6abeb5ccfc23f61d600ca63d2b09964f9a Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 14 Aug 2024 14:10:02 +0300 Subject: selftests: fib_rule_tests: Clarify test results Clarify the test results by grouping the output of test cases belonging to the same test under a common title. This is consistent with the output of fib_tests.sh. Before: # ./fib_rule_tests.sh TEST: rule6 check: oif redirect to table [ OK ] TEST: rule6 del by pref: oif redirect to table [ OK ] [...] TEST: rule4 check: oif redirect to table [ OK ] TEST: rule4 del by pref: oif redirect to table [ OK ] [...] Tests passed: 116 Tests failed: 0 After: # ./fib_rule_tests.sh IPv6 FIB rule tests TEST: rule6 check: oif redirect to table [ OK ] TEST: rule6 del by pref: oif redirect to table [ OK ] [...] IPv4 FIB rule tests TEST: rule4 check: oif redirect to table [ OK ] TEST: rule4 del by pref: oif redirect to table [ OK ] [...] 
Tests passed: 116 Tests failed: 0 Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20240814111005.955359-3-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fib_rule_tests.sh | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh index c821d91b9ee8..9100dd4d0382 100755 --- a/tools/testing/selftests/net/fib_rule_tests.sh +++ b/tools/testing/selftests/net/fib_rule_tests.sh @@ -35,18 +35,13 @@ log_test() local expected=$2 local msg="$3" - $IP rule show | grep -q l3mdev - if [ $? -eq 0 ]; then - msg="$msg (VRF)" - fi - if [ ${rc} -eq ${expected} ]; then nsuccess=$((nsuccess+1)) - printf "\n TEST: %-60s [ OK ]\n" "${msg}" + printf " TEST: %-60s [ OK ]\n" "${msg}" else ret=1 nfail=$((nfail+1)) - printf "\n TEST: %-60s [FAIL]\n" "${msg}" + printf " TEST: %-60s [FAIL]\n" "${msg}" if [ "${PAUSE_ON_FAIL}" = "yes" ]; then echo echo "hit enter to continue, 'q' to quit" @@ -205,10 +200,14 @@ fib_rule6_test_reject() fib_rule6_test() { + local ext_name=$1; shift local getmatch local match local cnt + echo + echo "IPv6 FIB rule tests $ext_name" + # setup the fib rule redirect route $IP -6 route add table $RTABLE default via $GW_IP6 dev $DEV onlink @@ -267,7 +266,7 @@ fib_rule6_test() fib_rule6_vrf_test() { setup_vrf - fib_rule6_test + fib_rule6_test "- with VRF" cleanup_vrf } @@ -277,6 +276,9 @@ fib_rule6_connect_test() { local dsfield + echo + echo "IPv6 FIB rule connect tests" + if ! check_nettest; then echo "SKIP: Could not run test without nettest tool" return @@ -344,10 +346,14 @@ fib_rule4_test_reject() fib_rule4_test() { + local ext_name=$1; shift local getmatch local match local cnt + echo + echo "IPv4 FIB rule tests $ext_name" + # setup the fib rule redirect route $IP route add table $RTABLE default via $GW_IP4 dev $DEV onlink @@ -411,7 +417,7 @@ fib_rule4_test() fib_rule4_vrf_test() { setup_vrf - fib_rule4_test + fib_rule4_test "- with VRF" cleanup_vrf } @@ -421,6 +427,9 @@ fib_rule4_connect_test() { local dsfield + echo + echo "IPv4 FIB rule connect tests" + if ! check_nettest; then echo "SKIP: Could not run test without nettest tool" return -- cgit v1.2.3 From 9b6dcef32c2d8fe357592da5f6f52ef2edf1652d Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 14 Aug 2024 14:10:03 +0300 Subject: selftests: fib_rule_tests: Add negative match tests The fib_rule{4,6} tests verify the behavior of a given FIB rule selector (e.g., dport, sport) by redirecting to a routing table with a default route using a FIB rule with the given selector and checking that a route lookup using the selector matches this default route. Add negative tests to verify that a FIB rule is not hit when it should not. 
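The pattern behind these negative tests, reduced to a hedged shell sketch (the table number, addresses, ports and the eth0 device are arbitrary examples, not the selftest's actual values): install a rule with a selector, then check that a lookup with a matching selector resolves via the custom table while a non-matching one does not:

# sketch only; table 100 and all values below are illustrative
ip -6 rule add sport 666 dport 777 table 100
ip -6 route add table 100 default via 2001:db8::1 dev eth0 onlink

ip -6 route get 2001:db8::2 sport 666 dport 777 | grep -q "table 100" \
	&& echo "positive match: OK"
ip -6 route get 2001:db8::2 sport 667 dport 778 | grep -q "table 100" \
	|| echo "negative match: OK"

ip -6 rule del sport 666 dport 777 table 100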
Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20240814111005.955359-4-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fib_rule_tests.sh | 87 +++++++++++++++++++++------ 1 file changed, 69 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh index 9100dd4d0382..085e21ed9fc3 100755 --- a/tools/testing/selftests/net/fib_rule_tests.sh +++ b/tools/testing/selftests/net/fib_rule_tests.sh @@ -174,12 +174,17 @@ fib_rule6_test_match_n_redirect() { local match="$1" local getmatch="$2" - local description="$3" + local getnomatch="$3" + local description="$4" + local nomatch_description="$5" $IP -6 rule add $match table $RTABLE $IP -6 route get $GW_IP6 $getmatch | grep -q "table $RTABLE" log_test $? 0 "rule6 check: $description" + $IP -6 route get $GW_IP6 $getnomatch 2>&1 | grep -q "table $RTABLE" + log_test $? 1 "rule6 check: $nomatch_description" + fib_rule6_del_by_pref "$match" log_test $? 0 "rule6 del by pref: $description" } @@ -201,6 +206,7 @@ fib_rule6_test_reject() fib_rule6_test() { local ext_name=$1; shift + local getnomatch local getmatch local match local cnt @@ -212,10 +218,14 @@ fib_rule6_test() $IP -6 route add table $RTABLE default via $GW_IP6 dev $DEV onlink match="oif $DEV" - fib_rule6_test_match_n_redirect "$match" "$match" "oif redirect to table" + getnomatch="oif lo" + fib_rule6_test_match_n_redirect "$match" "$match" "$getnomatch" \ + "oif redirect to table" "oif no redirect to table" match="from $SRC_IP6 iif $DEV" - fib_rule6_test_match_n_redirect "$match" "$match" "iif redirect to table" + getnomatch="from $SRC_IP6 iif lo" + fib_rule6_test_match_n_redirect "$match" "$match" "$getnomatch" \ + "iif redirect to table" "iif no redirect to table" # Reject dsfield (tos) options which have ECN bits set for cnt in $(seq 1 3); do @@ -229,37 +239,52 @@ fib_rule6_test() # Using option 'tos' instead of 'dsfield' as old iproute2 # versions don't support 'dsfield' in ip rule show. getmatch="tos $cnt" + getnomatch="tos 0x20" fib_rule6_test_match_n_redirect "$match" "$getmatch" \ - "$getmatch redirect to table" + "$getnomatch" "$getmatch redirect to table" \ + "$getnomatch no redirect to table" done match="fwmark 0x64" getmatch="mark 0x64" - fib_rule6_test_match_n_redirect "$match" "$getmatch" "fwmark redirect to table" + getnomatch="mark 0x63" + fib_rule6_test_match_n_redirect "$match" "$getmatch" "$getnomatch" \ + "fwmark redirect to table" "fwmark no redirect to table" fib_check_iproute_support "uidrange" "uid" if [ $? -eq 0 ]; then match="uidrange 100-100" getmatch="uid 100" - fib_rule6_test_match_n_redirect "$match" "$getmatch" "uid redirect to table" + getnomatch="uid 101" + fib_rule6_test_match_n_redirect "$match" "$getmatch" \ + "$getnomatch" "uid redirect to table" \ + "uid no redirect to table" fi fib_check_iproute_support "sport" "sport" if [ $? -eq 0 ]; then match="sport 666 dport 777" - fib_rule6_test_match_n_redirect "$match" "$match" "sport and dport redirect to table" + getnomatch="sport 667 dport 778" + fib_rule6_test_match_n_redirect "$match" "$match" \ + "$getnomatch" "sport and dport redirect to table" \ + "sport and dport no redirect to table" fi fib_check_iproute_support "ipproto" "ipproto" if [ $? 
-eq 0 ]; then match="ipproto tcp" - fib_rule6_test_match_n_redirect "$match" "$match" "ipproto match" + getnomatch="ipproto udp" + fib_rule6_test_match_n_redirect "$match" "$match" \ + "$getnomatch" "ipproto tcp match" "ipproto udp no match" fi fib_check_iproute_support "ipproto" "ipproto" if [ $? -eq 0 ]; then match="ipproto ipv6-icmp" - fib_rule6_test_match_n_redirect "$match" "$match" "ipproto ipv6-icmp match" + getnomatch="ipproto tcp" + fib_rule6_test_match_n_redirect "$match" "$match" \ + "$getnomatch" "ipproto ipv6-icmp match" \ + "ipproto ipv6-tcp no match" fi } @@ -320,12 +345,17 @@ fib_rule4_test_match_n_redirect() { local match="$1" local getmatch="$2" - local description="$3" + local getnomatch="$3" + local description="$4" + local nomatch_description="$5" $IP rule add $match table $RTABLE $IP route get $GW_IP4 $getmatch | grep -q "table $RTABLE" log_test $? 0 "rule4 check: $description" + $IP route get $GW_IP4 $getnomatch 2>&1 | grep -q "table $RTABLE" + log_test $? 1 "rule4 check: $nomatch_description" + fib_rule4_del_by_pref "$match" log_test $? 0 "rule4 del by pref: $description" } @@ -347,6 +377,7 @@ fib_rule4_test_reject() fib_rule4_test() { local ext_name=$1; shift + local getnomatch local getmatch local match local cnt @@ -358,14 +389,18 @@ fib_rule4_test() $IP route add table $RTABLE default via $GW_IP4 dev $DEV onlink match="oif $DEV" - fib_rule4_test_match_n_redirect "$match" "$match" "oif redirect to table" + getnomatch="oif lo" + fib_rule4_test_match_n_redirect "$match" "$match" "$getnomatch" \ + "oif redirect to table" "oif no redirect to table" # need enable forwarding and disable rp_filter temporarily as all the # addresses are in the same subnet and egress device == ingress device. ip netns exec $testns sysctl -qw net.ipv4.ip_forward=1 ip netns exec $testns sysctl -qw net.ipv4.conf.$DEV.rp_filter=0 match="from $SRC_IP iif $DEV" - fib_rule4_test_match_n_redirect "$match" "$match" "iif redirect to table" + getnomatch="from $SRC_IP iif lo" + fib_rule4_test_match_n_redirect "$match" "$match" "$getnomatch" \ + "iif redirect to table" "iif no redirect to table" ip netns exec $testns sysctl -qw net.ipv4.ip_forward=0 # Reject dsfield (tos) options which have ECN bits set @@ -380,37 +415,53 @@ fib_rule4_test() # Using option 'tos' instead of 'dsfield' as old iproute2 # versions don't support 'dsfield' in ip rule show. getmatch="tos $cnt" + getnomatch="tos 0x20" fib_rule4_test_match_n_redirect "$match" "$getmatch" \ - "$getmatch redirect to table" + "$getnomatch" "$getmatch redirect to table" \ + "$getnomatch no redirect to table" done match="fwmark 0x64" getmatch="mark 0x64" - fib_rule4_test_match_n_redirect "$match" "$getmatch" "fwmark redirect to table" + getnomatch="mark 0x63" + fib_rule4_test_match_n_redirect "$match" "$getmatch" "$getnomatch" \ + "fwmark redirect to table" "fwmark no redirect to table" fib_check_iproute_support "uidrange" "uid" if [ $? -eq 0 ]; then match="uidrange 100-100" getmatch="uid 100" - fib_rule4_test_match_n_redirect "$match" "$getmatch" "uid redirect to table" + getnomatch="uid 101" + fib_rule4_test_match_n_redirect "$match" "$getmatch" \ + "$getnomatch" "uid redirect to table" \ + "uid no redirect to table" fi fib_check_iproute_support "sport" "sport" if [ $? 
-eq 0 ]; then match="sport 666 dport 777" - fib_rule4_test_match_n_redirect "$match" "$match" "sport and dport redirect to table" + getnomatch="sport 667 dport 778" + fib_rule4_test_match_n_redirect "$match" "$match" \ + "$getnomatch" "sport and dport redirect to table" \ + "sport and dport no redirect to table" fi fib_check_iproute_support "ipproto" "ipproto" if [ $? -eq 0 ]; then match="ipproto tcp" - fib_rule4_test_match_n_redirect "$match" "$match" "ipproto tcp match" + getnomatch="ipproto udp" + fib_rule4_test_match_n_redirect "$match" "$match" \ + "$getnomatch" "ipproto tcp match" \ + "ipproto udp no match" fi fib_check_iproute_support "ipproto" "ipproto" if [ $? -eq 0 ]; then match="ipproto icmp" - fib_rule4_test_match_n_redirect "$match" "$match" "ipproto icmp match" + getnomatch="ipproto tcp" + fib_rule4_test_match_n_redirect "$match" "$match" \ + "$getnomatch" "ipproto icmp match" \ + "ipproto tcp no match" fi } -- cgit v1.2.3 From 53f88ed85bdd194b8a30605a264692b28a3ee665 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 14 Aug 2024 14:10:04 +0300 Subject: selftests: fib_rule_tests: Add negative connect tests The fib_rule{4,6}_connect tests verify that locally generated traffic from a socket that specifies a DS Field using the IP_TOS / IPV6_TCLASS socket options is correctly redirected using a FIB rule that matches on the given DS Field. Add negative tests to verify that the FIB rule is not hit when the socket specifies a DS Field that differs from the one used by the FIB rule. Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20240814111005.955359-5-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fib_rule_tests.sh | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh index 085e21ed9fc3..a3b2c833f050 100755 --- a/tools/testing/selftests/net/fib_rule_tests.sh +++ b/tools/testing/selftests/net/fib_rule_tests.sh @@ -325,6 +325,16 @@ fib_rule6_connect_test() log_test $? 0 "rule6 dsfield tcp connect (dsfield ${dsfield})" done + # Check that UDP and TCP connections fail when using a DS Field that + # does not match the previously configured FIB rule. + nettest -q -6 -B -t 5 -N $testns -O $peerns -U -D \ + -Q 0x20 -l 2001:db8::1:11 -r 2001:db8::1:11 + log_test $? 1 "rule6 dsfield udp no connect (dsfield 0x20)" + + nettest -q -6 -B -t 5 -N $testns -O $peerns -Q 0x20 \ + -l 2001:db8::1:11 -r 2001:db8::1:11 + log_test $? 1 "rule6 dsfield tcp no connect (dsfield 0x20)" + $IP -6 rule del dsfield 0x04 table $RTABLE_PEER cleanup_peer } @@ -502,6 +512,16 @@ fib_rule4_connect_test() log_test $? 0 "rule4 dsfield tcp connect (dsfield ${dsfield})" done + # Check that UDP and TCP connections fail when using a DS Field that + # does not match the previously configured FIB rule. + nettest -q -B -t 5 -N $testns -O $peerns -D -U -Q 0x20 \ + -l 198.51.100.11 -r 198.51.100.11 + log_test $? 1 "rule4 dsfield udp no connect (dsfield 0x20)" + + nettest -q -B -t 5 -N $testns -O $peerns -Q 0x20 \ + -l 198.51.100.11 -r 198.51.100.11 + log_test $? 
1 "rule4 dsfield tcp no connect (dsfield 0x20)" + $IP -4 rule del dsfield 0x04 table $RTABLE_PEER cleanup_peer } -- cgit v1.2.3 From 5f1b4f1be2d2b8d85dd07352b097cfc7064ad2e5 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 14 Aug 2024 14:10:05 +0300 Subject: selftests: fib_rule_tests: Test TOS matching with input routes The TOS value reaches the FIB rule core via different call paths when an input route is looked up compared to an output route. Re-test TOS matching with input routes to exercise these code paths. Pass the 'iif' and 'from' selectors separately from the 'get{,no}match' variables as otherwise the test name is too long to be printed without misalignments. Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20240814111005.955359-6-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fib_rule_tests.sh | 31 ++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh index a3b2c833f050..89034c5b69dc 100755 --- a/tools/testing/selftests/net/fib_rule_tests.sh +++ b/tools/testing/selftests/net/fib_rule_tests.sh @@ -245,6 +245,19 @@ fib_rule6_test() "$getnomatch no redirect to table" done + # Re-test TOS matching, but with input routes since they are handled + # differently from output routes. + match="tos 0x10" + for cnt in "0x10" "0x11" "0x12" "0x13"; do + getmatch="tos $cnt" + getnomatch="tos 0x20" + fib_rule6_test_match_n_redirect "$match" \ + "from $SRC_IP6 iif $DEV $getmatch" \ + "from $SRC_IP6 iif $DEV $getnomatch" \ + "iif $getmatch redirect to table" \ + "iif $getnomatch no redirect to table" + done + match="fwmark 0x64" getmatch="mark 0x64" getnomatch="mark 0x63" @@ -403,15 +416,14 @@ fib_rule4_test() fib_rule4_test_match_n_redirect "$match" "$match" "$getnomatch" \ "oif redirect to table" "oif no redirect to table" - # need enable forwarding and disable rp_filter temporarily as all the - # addresses are in the same subnet and egress device == ingress device. + # Enable forwarding and disable rp_filter as all the addresses are in + # the same subnet and egress device == ingress device. ip netns exec $testns sysctl -qw net.ipv4.ip_forward=1 ip netns exec $testns sysctl -qw net.ipv4.conf.$DEV.rp_filter=0 match="from $SRC_IP iif $DEV" getnomatch="from $SRC_IP iif lo" fib_rule4_test_match_n_redirect "$match" "$match" "$getnomatch" \ "iif redirect to table" "iif no redirect to table" - ip netns exec $testns sysctl -qw net.ipv4.ip_forward=0 # Reject dsfield (tos) options which have ECN bits set for cnt in $(seq 1 3); do @@ -431,6 +443,19 @@ fib_rule4_test() "$getnomatch no redirect to table" done + # Re-test TOS matching, but with input routes since they are handled + # differently from output routes. 
+ match="tos 0x10" + for cnt in "0x10" "0x11" "0x12" "0x13"; do + getmatch="tos $cnt" + getnomatch="tos 0x20" + fib_rule4_test_match_n_redirect "$match" \ + "from $SRC_IP iif $DEV $getmatch" \ + "from $SRC_IP iif $DEV $getnomatch" \ + "iif $getmatch redirect to table" \ + "iif $getnomatch no redirect to table" + done + match="fwmark 0x64" getmatch="mark 0x64" getnomatch="mark 0x63" -- cgit v1.2.3 From 6bdf5168b6fb19541b0c1862bdaa596d116c7bfb Mon Sep 17 00:00:00 2001 From: Yang Jihong Date: Tue, 6 Aug 2024 10:35:33 +0800 Subject: perf sched timehist: Fix missing free of session in perf_sched__timehist() When perf_time__parse_str() fails in perf_sched__timehist(), need to free session that was previously created, fix it. Fixes: 853b74071110bed3 ("perf sched timehist: Add option to specify time window of interest") Signed-off-by: Yang Jihong Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: David Ahern Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240806023533.1316348-1-yangjihong@bytedance.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 34fe8e540c43..9c1276dc8ef6 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -3120,7 +3120,8 @@ static int perf_sched__timehist(struct perf_sched *sched) if (perf_time__parse_str(&sched->ptime, sched->time_str) != 0) { pr_err("Invalid time string\n"); - return -EINVAL; + err = -EINVAL; + goto out; } if (timehist_check_attr(sched, evlist) != 0) -- cgit v1.2.3 From 2615639352420e6e3115952c5b8f46846e1c6d0e Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Fri, 2 Aug 2024 14:58:00 +0800 Subject: perf stat: Display iostat headers correctly Currently we'll only print metric headers for metric leader in aggregration mode. This will make `perf iostat` header not shown since it'll aggregrated globally but don't have metric events: root@ubuntu204:/home/yang/linux/tools/perf# ./perf stat --iostat --timeout 1000 Performance counter stats for 'system wide': port 0000:00 0 0 0 0 0000:80 0 0 0 0 [...] Fix this by excluding the iostat in the check of printing metric headers. Then we can see the headers: root@ubuntu204:/home/yang/linux/tools/perf# ./perf stat --iostat --timeout 1000 Performance counter stats for 'system wide': port Inbound Read(MB) Inbound Write(MB) Outbound Read(MB) Outbound Write(MB) 0000:00 0 0 0 0 0000:80 0 0 0 0 [...] 
Fixes: 193a9e30207f5477 ("perf stat: Don't display metric header for non-leader uncore events") Signed-off-by: Yicong Yang Acked-by: Namhyung Kim Cc: Ian Rogers Cc: Ingo Molnar Cc: Jonathan Cameron Cc: Junhao He Cc: linuxarm@huawei.com Cc: Mark Rutland Cc: Peter Zijlstra Cc: Shameerali Kolothum Thodi Cc: Zeng Tao Link: https://lore.kernel.org/r/20240802065800.48774-1-yangyicong@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/stat-display.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index c38bcb6f4c78..ea96e4ebad8c 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -1237,7 +1237,8 @@ static void print_metric_headers(struct perf_stat_config *config, /* Print metrics headers only */ evlist__for_each_entry(evlist, counter) { - if (config->aggr_mode != AGGR_NONE && counter->metric_leader != counter) + if (!config->iostat_run && + config->aggr_mode != AGGR_NONE && counter->metric_leader != counter) continue; os.evsel = counter; -- cgit v1.2.3 From a0c9fe5eecc97680323ee83780ea3eaf440ba1b7 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Thu, 15 Aug 2024 16:37:13 +0100 Subject: tc-testing: don't access non-existent variable on exception Since commit 255c1c7279ab ("tc-testing: Allow test cases to be skipped") the variable test_ordinal doesn't exist in call_pre_case(). So it should not be accessed when an exception occurs. This resolves the following splat: ... During handling of the above exception, another exception occurred: Traceback (most recent call last): File ".../tdc.py", line 1028, in main() File ".../tdc.py", line 1022, in main set_operation_mode(pm, parser, args, remaining) File ".../tdc.py", line 966, in set_operation_mode catresults = test_runner_serial(pm, args, alltests) File ".../tdc.py", line 642, in test_runner_serial (index, tsr) = test_runner(pm, args, alltests) File ".../tdc.py", line 536, in test_runner res = run_one_test(pm, args, index, tidx) File ".../tdc.py", line 419, in run_one_test pm.call_pre_case(tidx) File ".../tdc.py", line 146, in call_pre_case print('test_ordinal is {}'.format(test_ordinal)) NameError: name 'test_ordinal' is not defined Fixes: 255c1c7279ab ("tc-testing: Allow test cases to be skipped") Signed-off-by: Simon Horman Acked-by: Jamal Hadi Salim Link: https://patch.msgid.link/20240815-tdc-test-ordinal-v1-1-0255c122a427@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/tc-testing/tdc.py | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/tc-testing/tdc.py b/tools/testing/selftests/tc-testing/tdc.py index ee349187636f..4f255cec0c22 100755 --- a/tools/testing/selftests/tc-testing/tdc.py +++ b/tools/testing/selftests/tc-testing/tdc.py @@ -143,7 +143,6 @@ class PluginMgr: except Exception as ee: print('exception {} in call to pre_case for {} plugin'. format(ee, pgn_inst.__class__)) - print('test_ordinal is {}'.format(test_ordinal)) print('testid is {}'.format(caseinfo['id'])) raise -- cgit v1.2.3 From 56d680dd23c38067a32fb8aeb74d6ce838fcf26c Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Thu, 25 Jul 2024 20:33:22 +0200 Subject: objtool/rust: list `noreturn` Rust functions Rust functions may be `noreturn` (i.e. diverging) by returning the "never" type, `!`, e.g. fn f() -> ! { loop {} } Thus list the known `noreturn` functions to avoid such warnings. 
Without this, `objtool` would complain if enabled for Rust, e.g.: rust/core.o: warning: objtool: _R...9panic_fmt() falls through to next function _R...18panic_nounwind_fmt() rust/alloc.o: warning: objtool: .text: unexpected end of section In order to do so, we cannot match symbols' names exactly, for two reasons: - Rust mangling scheme [1] contains disambiguators [2] which we cannot predict (e.g. they may vary depending on the compiler version). One possibility to solve this would be to parse v0 and ignore/zero those before comparison. - Some of the diverging functions come from `core`, i.e. the Rust standard library, which may change with each compiler version since they are implementation details (e.g. `panic_internals`). Thus, to workaround both issues, only part of the symbols are matched, instead of using the `NORETURN` macro in `noreturns.h`. Ideally, just like for the C side, we should have a better solution. For instance, the compiler could give us the list via something like: $ rustc --emit=noreturns ... [ Kees agrees this should be automated and Peter says: So it would be fairly simple to make objtool consume a magic section emitted by the compiler.. I think we've asked the compiler folks for that at some point even, but I don't have clear recollections. We will ask upstream Rust about it. And if they agree, then perhaps we can get Clang/GCC to implement something similar too -- for this sort of thing we can take advantage of the shorter cycles of `rustc` as well as their unstable features concept to experiment. Gary proposed using DWARF (though it would need to be available), and wrote a proof of concept script using the `object` and `gimli` crates: https://gist.github.com/nbdd0121/449692570622c2f46a29ad9f47c3379a - Miguel ] Link: https://rust-lang.github.io/rfcs/2603-rust-symbol-name-mangling-v0.html [1] Link: https://doc.rust-lang.org/rustc/symbol-mangling/v0.html#disambiguator [2] Acked-by: Peter Zijlstra (Intel) Tested-by: Alice Ryhl Reviewed-by: Kees Cook Tested-by: Benno Lossin Link: https://lore.kernel.org/r/20240725183325.122827-6-ojeda@kernel.org [ Added `len_mismatch_fail` symbol for new `kernel` crate code merged since then as well as 3 more `core::panicking` symbols that appear in `RUST_DEBUG_ASSERTIONS=y` builds. - Miguel ] Signed-off-by: Miguel Ojeda --- tools/objtool/check.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++- tools/objtool/noreturns.h | 2 ++ 2 files changed, 53 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 01237d167223..d086f207a3d3 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -177,6 +177,52 @@ static bool is_sibling_call(struct instruction *insn) return (is_static_jump(insn) && insn_call_dest(insn)); } +/* + * Checks if a string ends with another. + */ +static bool str_ends_with(const char *s, const char *sub) +{ + const int slen = strlen(s); + const int sublen = strlen(sub); + + if (sublen > slen) + return 0; + + return !memcmp(s + slen - sublen, sub, sublen); +} + +/* + * Checks if a function is a Rust "noreturn" one. + */ +static bool is_rust_noreturn(const struct symbol *func) +{ + /* + * If it does not start with "_R", then it is not a Rust symbol. 
+ */ + if (strncmp(func->name, "_R", 2)) + return false; + + /* + * These are just heuristics -- we do not control the precise symbol + * name, due to the crate disambiguators (which depend on the compiler) + * as well as changes to the source code itself between versions (since + * these come from the Rust standard library). + */ + return str_ends_with(func->name, "_4core5sliceSp15copy_from_slice17len_mismatch_fail") || + str_ends_with(func->name, "_4core6option13unwrap_failed") || + str_ends_with(func->name, "_4core6result13unwrap_failed") || + str_ends_with(func->name, "_4core9panicking5panic") || + str_ends_with(func->name, "_4core9panicking9panic_fmt") || + str_ends_with(func->name, "_4core9panicking14panic_explicit") || + str_ends_with(func->name, "_4core9panicking14panic_nounwind") || + str_ends_with(func->name, "_4core9panicking18panic_bounds_check") || + str_ends_with(func->name, "_4core9panicking19assert_failed_inner") || + str_ends_with(func->name, "_4core9panicking36panic_misaligned_pointer_dereference") || + strstr(func->name, "_4core9panicking11panic_const24panic_const_") || + (strstr(func->name, "_4core5slice5index24slice_") && + str_ends_with(func->name, "_fail")); +} + /* * This checks to see if the given function is a "noreturn" function. * @@ -202,10 +248,14 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func, if (!func) return false; - if (func->bind == STB_GLOBAL || func->bind == STB_WEAK) + if (func->bind == STB_GLOBAL || func->bind == STB_WEAK) { + if (is_rust_noreturn(func)) + return true; + for (i = 0; i < ARRAY_SIZE(global_noreturns); i++) if (!strcmp(func->name, global_noreturns[i])) return true; + } if (func->bind == STB_WEAK) return false; diff --git a/tools/objtool/noreturns.h b/tools/objtool/noreturns.h index 1e8141ef1b15..e7da92489167 100644 --- a/tools/objtool/noreturns.h +++ b/tools/objtool/noreturns.h @@ -39,6 +39,8 @@ NORETURN(panic) NORETURN(panic_smp_self_stop) NORETURN(rest_init) NORETURN(rewind_stack_and_make_dead) +NORETURN(rust_begin_unwind) +NORETURN(rust_helper_BUG) NORETURN(sev_es_terminate) NORETURN(snp_abort) NORETURN(start_kernel) -- cgit v1.2.3 From 252b6fd2e186b793b960fa28ffccb07ccc4d5f51 Mon Sep 17 00:00:00 2001 From: Christoph Schlameuss Date: Wed, 7 Aug 2024 17:45:03 +0200 Subject: selftests: kvm: s390: Define page sizes in shared header Multiple test cases need page size and shift definitions. By moving the definitions to a single architecture specific header we limit the repetition. Make use of PAGE_SIZE, PAGE_SHIFT and PAGE_MASK defines in existing code. 
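As a quick illustration of what the shared definitions provide, a self-contained userspace sketch (mirroring the 4 KiB s390 base page size from the header; the guest address is an arbitrary demo value):

#include <stdint.h>
#include <stdio.h>

#define BIT_ULL(nr)	(1ULL << (nr))
#define PAGE_SHIFT	12
#define PAGE_SIZE	BIT_ULL(PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))

int main(void)
{
	uint64_t gva = 0x123456;	/* arbitrary guest virtual address */

	/* 0x123000: page-aligned base */
	printf("base   0x%llx\n", (unsigned long long)(gva & PAGE_MASK));
	/* 0x456: offset within the page */
	printf("offset 0x%llx\n", (unsigned long long)(gva & ~PAGE_MASK));
	/* 0x23: the 8-bit PTE index, as used in lib/s390x/processor.c in this patch */
	printf("index  0x%llx\n", (unsigned long long)((gva >> PAGE_SHIFT) & 0x0ffu));
	return 0;
}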
Signed-off-by: Christoph Schlameuss Reviewed-by: Claudio Imbrenda Reviewed-by: Janosch Frank Link: https://lore.kernel.org/r/20240807154512.316936-2-schlameuss@linux.ibm.com Signed-off-by: Janosch Frank Message-ID: <20240807154512.316936-2-schlameuss@linux.ibm.com> --- tools/testing/selftests/kvm/include/s390x/processor.h | 5 +++++ tools/testing/selftests/kvm/lib/s390x/processor.c | 10 +++++----- tools/testing/selftests/kvm/s390x/cmma_test.c | 7 ++++--- tools/testing/selftests/kvm/s390x/memop.c | 4 +--- tools/testing/selftests/kvm/s390x/tprot.c | 5 ++--- 5 files changed, 17 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/s390x/processor.h b/tools/testing/selftests/kvm/include/s390x/processor.h index 255c9b990f4c..481bd2fd6a32 100644 --- a/tools/testing/selftests/kvm/include/s390x/processor.h +++ b/tools/testing/selftests/kvm/include/s390x/processor.h @@ -21,6 +21,11 @@ #define PAGE_PROTECT 0x200 /* HW read-only bit */ #define PAGE_NOEXEC 0x100 /* HW no-execute bit */ +/* Page size definitions */ +#define PAGE_SHIFT 12 +#define PAGE_SIZE BIT_ULL(PAGE_SHIFT) +#define PAGE_MASK (~(PAGE_SIZE - 1)) + /* Is there a portable way to do this? */ static inline void cpu_relax(void) { diff --git a/tools/testing/selftests/kvm/lib/s390x/processor.c b/tools/testing/selftests/kvm/lib/s390x/processor.c index 4ad4492eea1d..20cfe970e3e3 100644 --- a/tools/testing/selftests/kvm/lib/s390x/processor.c +++ b/tools/testing/selftests/kvm/lib/s390x/processor.c @@ -14,7 +14,7 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) { vm_paddr_t paddr; - TEST_ASSERT(vm->page_size == 4096, "Unsupported page size: 0x%x", + TEST_ASSERT(vm->page_size == PAGE_SIZE, "Unsupported page size: 0x%x", vm->page_size); if (vm->pgd_created) @@ -79,7 +79,7 @@ void virt_arch_pg_map(struct kvm_vm *vm, uint64_t gva, uint64_t gpa) } /* Fill in page table entry */ - idx = (gva >> 12) & 0x0ffu; /* page index */ + idx = (gva >> PAGE_SHIFT) & 0x0ffu; /* page index */ if (!(entry[idx] & PAGE_INVALID)) fprintf(stderr, "WARNING: PTE for gpa=0x%"PRIx64" already set!\n", gpa); @@ -91,7 +91,7 @@ vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) int ri, idx; uint64_t *entry; - TEST_ASSERT(vm->page_size == 4096, "Unsupported page size: 0x%x", + TEST_ASSERT(vm->page_size == PAGE_SIZE, "Unsupported page size: 0x%x", vm->page_size); entry = addr_gpa2hva(vm, vm->pgd); @@ -103,7 +103,7 @@ vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) entry = addr_gpa2hva(vm, entry[idx] & REGION_ENTRY_ORIGIN); } - idx = (gva >> 12) & 0x0ffu; /* page index */ + idx = (gva >> PAGE_SHIFT) & 0x0ffu; /* page index */ TEST_ASSERT(!(entry[idx] & PAGE_INVALID), "No page mapping for vm virtual address 0x%lx", gva); @@ -168,7 +168,7 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) struct kvm_sregs sregs; struct kvm_vcpu *vcpu; - TEST_ASSERT(vm->page_size == 4096, "Unsupported page size: 0x%x", + TEST_ASSERT(vm->page_size == PAGE_SIZE, "Unsupported page size: 0x%x", vm->page_size); stack_vaddr = __vm_vaddr_alloc(vm, stack_size, diff --git a/tools/testing/selftests/kvm/s390x/cmma_test.c b/tools/testing/selftests/kvm/s390x/cmma_test.c index b39033844756..e32dd59703a0 100644 --- a/tools/testing/selftests/kvm/s390x/cmma_test.c +++ b/tools/testing/selftests/kvm/s390x/cmma_test.c @@ -17,16 +17,17 @@ #include "kvm_util.h" #include "kselftest.h" #include "ucall_common.h" +#include "processor.h" #define MAIN_PAGE_COUNT 512 #define TEST_DATA_PAGE_COUNT 512 #define TEST_DATA_MEMSLOT 1 -#define 
TEST_DATA_START_GFN 4096 +#define TEST_DATA_START_GFN PAGE_SIZE #define TEST_DATA_TWO_PAGE_COUNT 256 #define TEST_DATA_TWO_MEMSLOT 2 -#define TEST_DATA_TWO_START_GFN 8192 +#define TEST_DATA_TWO_START_GFN (2 * PAGE_SIZE) static char cmma_value_buf[MAIN_PAGE_COUNT + TEST_DATA_PAGE_COUNT]; @@ -66,7 +67,7 @@ static void guest_dirty_test_data(void) " lghi 5,%[page_count]\n" /* r5 += r1 */ "2: agfr 5,1\n" - /* r2 = r1 << 12 */ + /* r2 = r1 << PAGE_SHIFT */ "1: sllg 2,1,12(0)\n" /* essa(r4, r2, SET_STABLE) */ " .insn rrf,0xb9ab0000,4,2,1,0\n" diff --git a/tools/testing/selftests/kvm/s390x/memop.c b/tools/testing/selftests/kvm/s390x/memop.c index f2df7416be84..4374b4cd2a80 100644 --- a/tools/testing/selftests/kvm/s390x/memop.c +++ b/tools/testing/selftests/kvm/s390x/memop.c @@ -16,6 +16,7 @@ #include "kvm_util.h" #include "kselftest.h" #include "ucall_common.h" +#include "processor.h" enum mop_target { LOGICAL, @@ -226,9 +227,6 @@ static void memop_ioctl(struct test_info info, struct kvm_s390_mem_op *ksmo, #define CHECK_N_DO(f, ...) ({ f(__VA_ARGS__, CHECK_ONLY); f(__VA_ARGS__); }) -#define PAGE_SHIFT 12 -#define PAGE_SIZE (1ULL << PAGE_SHIFT) -#define PAGE_MASK (~(PAGE_SIZE - 1)) #define CR0_FETCH_PROTECTION_OVERRIDE (1UL << (63 - 38)) #define CR0_STORAGE_PROTECTION_OVERRIDE (1UL << (63 - 39)) diff --git a/tools/testing/selftests/kvm/s390x/tprot.c b/tools/testing/selftests/kvm/s390x/tprot.c index 7a742a673b7c..12d5e1cb62e3 100644 --- a/tools/testing/selftests/kvm/s390x/tprot.c +++ b/tools/testing/selftests/kvm/s390x/tprot.c @@ -9,9 +9,8 @@ #include "kvm_util.h" #include "kselftest.h" #include "ucall_common.h" +#include "processor.h" -#define PAGE_SHIFT 12 -#define PAGE_SIZE (1 << PAGE_SHIFT) #define CR0_FETCH_PROTECTION_OVERRIDE (1UL << (63 - 38)) #define CR0_STORAGE_PROTECTION_OVERRIDE (1UL << (63 - 39)) @@ -151,7 +150,7 @@ static enum stage perform_next_stage(int *i, bool mapped_0) * instead. * In order to skip these tests we detect this inside the guest */ - skip = tests[*i].addr < (void *)4096 && + skip = tests[*i].addr < (void *)PAGE_SIZE && tests[*i].expected != TRANSL_UNAVAIL && !mapped_0; if (!skip) { -- cgit v1.2.3 From 845482188e3890269927c47316bcbacee9f71a3f Mon Sep 17 00:00:00 2001 From: Christoph Schlameuss Date: Wed, 7 Aug 2024 17:45:04 +0200 Subject: selftests: kvm: s390: Add kvm_s390_sie_block definition for userspace tests Subsequent tests do require direct manipulation of the SIE control block. This commit introduces the SIE control block definition for use within the selftests. There are already definitions of this within the kernel. This differs in two ways. * This is the first definition of this in userspace. * In the context of the selftests this does not require atomicity for the flags. With the userspace definition of the SIE block layout now being present we can reuse the values in other tests where applicable. 
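As an illustration of how later tests can use this definition (a minimal sketch, not part of this patch, assuming an already created ucontrol vcpu fd; error handling omitted):

	#include <sys/mman.h>
	#include <linux/kvm.h>
	#include "sie.h"

	/* Map the vcpu's SIE control block for direct manipulation. */
	static struct kvm_s390_sie_block *map_sie_block(int vcpu_fd)
	{
		/* ucontrol vcpu fds expose the SIE block at this page offset */
		return mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
			    vcpu_fd, KVM_S390_SIE_PAGE_OFFSET << 12);
	}

The returned pointer can then be used to inspect fields like icptcode or to clear CPUSTAT_STOPPED in cpuflags before running the vcpu.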
Signed-off-by: Christoph Schlameuss Reviewed-by: Janosch Frank Link: https://lore.kernel.org/r/20240807154512.316936-3-schlameuss@linux.ibm.com Signed-off-by: Janosch Frank Message-ID: <20240807154512.316936-3-schlameuss@linux.ibm.com> --- tools/testing/selftests/kvm/include/s390x/sie.h | 240 ++++++++++++++++++++++++ tools/testing/selftests/kvm/s390x/debug_test.c | 4 +- 2 files changed, 242 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/kvm/include/s390x/sie.h (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/s390x/sie.h b/tools/testing/selftests/kvm/include/s390x/sie.h new file mode 100644 index 000000000000..160acd4a1db9 --- /dev/null +++ b/tools/testing/selftests/kvm/include/s390x/sie.h @@ -0,0 +1,240 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Definition for kernel virtual machines on s390. + * + * Adapted copy of struct definition kvm_s390_sie_block from + * arch/s390/include/asm/kvm_host.h for use in userspace selftest programs. + * + * Copyright IBM Corp. 2008, 2024 + * + * Authors: + * Christoph Schlameuss + * Carsten Otte + */ + +#ifndef SELFTEST_KVM_SIE_H +#define SELFTEST_KVM_SIE_H + +#include + +struct kvm_s390_sie_block { +#define CPUSTAT_STOPPED 0x80000000 +#define CPUSTAT_WAIT 0x10000000 +#define CPUSTAT_ECALL_PEND 0x08000000 +#define CPUSTAT_STOP_INT 0x04000000 +#define CPUSTAT_IO_INT 0x02000000 +#define CPUSTAT_EXT_INT 0x01000000 +#define CPUSTAT_RUNNING 0x00800000 +#define CPUSTAT_RETAINED 0x00400000 +#define CPUSTAT_TIMING_SUB 0x00020000 +#define CPUSTAT_SIE_SUB 0x00010000 +#define CPUSTAT_RRF 0x00008000 +#define CPUSTAT_SLSV 0x00004000 +#define CPUSTAT_SLSR 0x00002000 +#define CPUSTAT_ZARCH 0x00000800 +#define CPUSTAT_MCDS 0x00000100 +#define CPUSTAT_KSS 0x00000200 +#define CPUSTAT_SM 0x00000080 +#define CPUSTAT_IBS 0x00000040 +#define CPUSTAT_GED2 0x00000010 +#define CPUSTAT_G 0x00000008 +#define CPUSTAT_GED 0x00000004 +#define CPUSTAT_J 0x00000002 +#define CPUSTAT_P 0x00000001 + __u32 cpuflags; /* 0x0000 */ + __u32: 1; /* 0x0004 */ + __u32 prefix : 18; + __u32: 1; + __u32 ibc : 12; + __u8 reserved08[4]; /* 0x0008 */ +#define PROG_IN_SIE BIT(0) + __u32 prog0c; /* 0x000c */ + union { + __u8 reserved10[16]; /* 0x0010 */ + struct { + __u64 pv_handle_cpu; + __u64 pv_handle_config; + }; + }; +#define PROG_BLOCK_SIE BIT(0) +#define PROG_REQUEST BIT(1) + __u32 prog20; /* 0x0020 */ + __u8 reserved24[4]; /* 0x0024 */ + __u64 cputm; /* 0x0028 */ + __u64 ckc; /* 0x0030 */ + __u64 epoch; /* 0x0038 */ + __u32 svcc; /* 0x0040 */ +#define LCTL_CR0 0x8000 +#define LCTL_CR6 0x0200 +#define LCTL_CR9 0x0040 +#define LCTL_CR10 0x0020 +#define LCTL_CR11 0x0010 +#define LCTL_CR14 0x0002 + __u16 lctl; /* 0x0044 */ + __s16 icpua; /* 0x0046 */ +#define ICTL_OPEREXC 0x80000000 +#define ICTL_PINT 0x20000000 +#define ICTL_LPSW 0x00400000 +#define ICTL_STCTL 0x00040000 +#define ICTL_ISKE 0x00004000 +#define ICTL_SSKE 0x00002000 +#define ICTL_RRBE 0x00001000 +#define ICTL_TPROT 0x00000200 + __u32 ictl; /* 0x0048 */ +#define ECA_CEI 0x80000000 +#define ECA_IB 0x40000000 +#define ECA_SIGPI 0x10000000 +#define ECA_MVPGI 0x01000000 +#define ECA_AIV 0x00200000 +#define ECA_VX 0x00020000 +#define ECA_PROTEXCI 0x00002000 +#define ECA_APIE 0x00000008 +#define ECA_SII 0x00000001 + __u32 eca; /* 0x004c */ +#define ICPT_INST 0x04 +#define ICPT_PROGI 0x08 +#define ICPT_INSTPROGI 0x0C +#define ICPT_EXTREQ 0x10 +#define ICPT_EXTINT 0x14 +#define ICPT_IOREQ 0x18 +#define ICPT_WAIT 0x1c +#define ICPT_VALIDITY 0x20 +#define ICPT_STOP 0x28 +#define ICPT_OPEREXC 
0x2C +#define ICPT_PARTEXEC 0x38 +#define ICPT_IOINST 0x40 +#define ICPT_KSS 0x5c +#define ICPT_MCHKREQ 0x60 +#define ICPT_INT_ENABLE 0x64 +#define ICPT_PV_INSTR 0x68 +#define ICPT_PV_NOTIFY 0x6c +#define ICPT_PV_PREF 0x70 + __u8 icptcode; /* 0x0050 */ + __u8 icptstatus; /* 0x0051 */ + __u16 ihcpu; /* 0x0052 */ + __u8 reserved54; /* 0x0054 */ +#define IICTL_CODE_NONE 0x00 +#define IICTL_CODE_MCHK 0x01 +#define IICTL_CODE_EXT 0x02 +#define IICTL_CODE_IO 0x03 +#define IICTL_CODE_RESTART 0x04 +#define IICTL_CODE_SPECIFICATION 0x10 +#define IICTL_CODE_OPERAND 0x11 + __u8 iictl; /* 0x0055 */ + __u16 ipa; /* 0x0056 */ + __u32 ipb; /* 0x0058 */ + __u32 scaoh; /* 0x005c */ +#define FPF_BPBC 0x20 + __u8 fpf; /* 0x0060 */ +#define ECB_GS 0x40 +#define ECB_TE 0x10 +#define ECB_SPECI 0x08 +#define ECB_SRSI 0x04 +#define ECB_HOSTPROTINT 0x02 +#define ECB_PTF 0x01 + __u8 ecb; /* 0x0061 */ +#define ECB2_CMMA 0x80 +#define ECB2_IEP 0x20 +#define ECB2_PFMFI 0x08 +#define ECB2_ESCA 0x04 +#define ECB2_ZPCI_LSI 0x02 + __u8 ecb2; /* 0x0062 */ +#define ECB3_AISI 0x20 +#define ECB3_AISII 0x10 +#define ECB3_DEA 0x08 +#define ECB3_AES 0x04 +#define ECB3_RI 0x01 + __u8 ecb3; /* 0x0063 */ +#define ESCA_SCAOL_MASK ~0x3fU + __u32 scaol; /* 0x0064 */ + __u8 sdf; /* 0x0068 */ + __u8 epdx; /* 0x0069 */ + __u8 cpnc; /* 0x006a */ + __u8 reserved6b; /* 0x006b */ + __u32 todpr; /* 0x006c */ +#define GISA_FORMAT1 0x00000001 + __u32 gd; /* 0x0070 */ + __u8 reserved74[12]; /* 0x0074 */ + __u64 mso; /* 0x0080 */ + __u64 msl; /* 0x0088 */ + __u64 psw_mask; /* 0x0090 */ + __u64 psw_addr; /* 0x0098 */ + __u64 gg14; /* 0x00a0 */ + __u64 gg15; /* 0x00a8 */ + __u8 reservedb0[8]; /* 0x00b0 */ +#define HPID_KVM 0x4 +#define HPID_VSIE 0x5 + __u8 hpid; /* 0x00b8 */ + __u8 reservedb9[7]; /* 0x00b9 */ + union { + struct { + __u32 eiparams; /* 0x00c0 */ + __u16 extcpuaddr; /* 0x00c4 */ + __u16 eic; /* 0x00c6 */ + }; + __u64 mcic; /* 0x00c0 */ + } __packed; + __u32 reservedc8; /* 0x00c8 */ + union { + struct { + __u16 pgmilc; /* 0x00cc */ + __u16 iprcc; /* 0x00ce */ + }; + __u32 edc; /* 0x00cc */ + } __packed; + union { + struct { + __u32 dxc; /* 0x00d0 */ + __u16 mcn; /* 0x00d4 */ + __u8 perc; /* 0x00d6 */ + __u8 peratmid; /* 0x00d7 */ + }; + __u64 faddr; /* 0x00d0 */ + } __packed; + __u64 peraddr; /* 0x00d8 */ + __u8 eai; /* 0x00e0 */ + __u8 peraid; /* 0x00e1 */ + __u8 oai; /* 0x00e2 */ + __u8 armid; /* 0x00e3 */ + __u8 reservede4[4]; /* 0x00e4 */ + union { + __u64 tecmc; /* 0x00e8 */ + struct { + __u16 subchannel_id; /* 0x00e8 */ + __u16 subchannel_nr; /* 0x00ea */ + __u32 io_int_parm; /* 0x00ec */ + __u32 io_int_word; /* 0x00f0 */ + }; + } __packed; + __u8 reservedf4[8]; /* 0x00f4 */ +#define CRYCB_FORMAT_MASK 0x00000003 +#define CRYCB_FORMAT0 0x00000000 +#define CRYCB_FORMAT1 0x00000001 +#define CRYCB_FORMAT2 0x00000003 + __u32 crycbd; /* 0x00fc */ + __u64 gcr[16]; /* 0x0100 */ + union { + __u64 gbea; /* 0x0180 */ + __u64 sidad; + }; + __u8 reserved188[8]; /* 0x0188 */ + __u64 sdnxo; /* 0x0190 */ + __u8 reserved198[8]; /* 0x0198 */ + __u32 fac; /* 0x01a0 */ + __u8 reserved1a4[20]; /* 0x01a4 */ + __u64 cbrlo; /* 0x01b8 */ + __u8 reserved1c0[8]; /* 0x01c0 */ +#define ECD_HOSTREGMGMT 0x20000000 +#define ECD_MEF 0x08000000 +#define ECD_ETOKENF 0x02000000 +#define ECD_ECC 0x00200000 + __u32 ecd; /* 0x01c8 */ + __u8 reserved1cc[18]; /* 0x01cc */ + __u64 pp; /* 0x01de */ + __u8 reserved1e6[2]; /* 0x01e6 */ + __u64 itdba; /* 0x01e8 */ + __u64 riccbd; /* 0x01f0 */ + __u64 gvrd; /* 0x01f8 */ +} __packed __aligned(512); + +#endif /* 
SELFTEST_KVM_SIE_H */ diff --git a/tools/testing/selftests/kvm/s390x/debug_test.c b/tools/testing/selftests/kvm/s390x/debug_test.c index 84313fb27529..ad8095968601 100644 --- a/tools/testing/selftests/kvm/s390x/debug_test.c +++ b/tools/testing/selftests/kvm/s390x/debug_test.c @@ -2,12 +2,12 @@ /* Test KVM debugging features. */ #include "kvm_util.h" #include "test_util.h" +#include "sie.h" #include #define __LC_SVC_NEW_PSW 0x1c0 #define __LC_PGM_NEW_PSW 0x1d0 -#define ICPT_INSTRUCTION 0x04 #define IPA0_DIAG 0x8300 #define PGM_SPECIFICATION 0x06 @@ -85,7 +85,7 @@ static void test_step_pgm_diag(void) vm = test_step_int_1(&vcpu, test_step_pgm_diag_guest_code, __LC_PGM_NEW_PSW, new_psw); TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); - TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_INSTRUCTION); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_INST); TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa & 0xff00, IPA0_DIAG); vcpu_ioctl(vcpu, KVM_S390_IRQ, &irq); vcpu_run(vcpu); -- cgit v1.2.3 From 011901fc222435f5bf2d6accb392f749ddebdfe7 Mon Sep 17 00:00:00 2001 From: Christoph Schlameuss Date: Wed, 7 Aug 2024 17:45:05 +0200 Subject: selftests: kvm: s390: Add s390x ucontrol test suite with hpage test Add test suite to validate the s390x architecture specific ucontrol KVM interface. Make use of the selftest test harness. * uc_cap_hpage testcase verifies that a ucontrol VM cannot be run with hugepages. To allow testing of the ucontrol interface the kernel needs a non-default config containing CONFIG_KVM_S390_UCONTROL. This config needs to be set to built-in (y) as this cannot be built as module. Signed-off-by: Christoph Schlameuss Reviewed-by: Janosch Frank Link: https://lore.kernel.org/r/20240807154512.316936-4-schlameuss@linux.ibm.com Signed-off-by: Janosch Frank Message-ID: <20240807154512.316936-4-schlameuss@linux.ibm.com> --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 1 + tools/testing/selftests/kvm/s390x/config | 2 + tools/testing/selftests/kvm/s390x/ucontrol_test.c | 76 +++++++++++++++++++++++ 4 files changed, 80 insertions(+) create mode 100644 tools/testing/selftests/kvm/s390x/config create mode 100644 tools/testing/selftests/kvm/s390x/ucontrol_test.c (limited to 'tools') diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 6d9381d60172..f2a30a58cd71 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -5,3 +5,4 @@ !*.h !*.S !*.sh +!config diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 48d32c5aa3eb..b3dc0a5bf0d4 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -186,6 +186,7 @@ TEST_GEN_PROGS_s390x += s390x/tprot TEST_GEN_PROGS_s390x += s390x/cmma_test TEST_GEN_PROGS_s390x += s390x/debug_test TEST_GEN_PROGS_s390x += s390x/shared_zeropage_test +TEST_GEN_PROGS_s390x += s390x/ucontrol_test TEST_GEN_PROGS_s390x += demand_paging_test TEST_GEN_PROGS_s390x += dirty_log_test TEST_GEN_PROGS_s390x += guest_print_test diff --git a/tools/testing/selftests/kvm/s390x/config b/tools/testing/selftests/kvm/s390x/config new file mode 100644 index 000000000000..23270f2d679f --- /dev/null +++ b/tools/testing/selftests/kvm/s390x/config @@ -0,0 +1,2 @@ +CONFIG_KVM=y +CONFIG_KVM_S390_UCONTROL=y diff --git a/tools/testing/selftests/kvm/s390x/ucontrol_test.c b/tools/testing/selftests/kvm/s390x/ucontrol_test.c new file mode 100644 index 000000000000..cd68e7e37d35 --- /dev/null +++ 
b/tools/testing/selftests/kvm/s390x/ucontrol_test.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Test code for the s390x kvm ucontrol interface + * + * Copyright IBM Corp. 2024 + * + * Authors: + * Christoph Schlameuss + */ +#include "kselftest_harness.h" +#include "kvm_util.h" + +#include +#include + +/* so directly declare capget to check caps without libcap */ +int capget(cap_user_header_t header, cap_user_data_t data); + +/** + * In order to create user controlled virtual machines on S390, + * check KVM_CAP_S390_UCONTROL and use the flag KVM_VM_S390_UCONTROL + * as privileged user (SYS_ADMIN). + */ +void require_ucontrol_admin(void) +{ + struct __user_cap_data_struct data[_LINUX_CAPABILITY_U32S_3]; + struct __user_cap_header_struct hdr = { + .version = _LINUX_CAPABILITY_VERSION_3, + }; + int rc; + + rc = capget(&hdr, data); + TEST_ASSERT_EQ(0, rc); + TEST_REQUIRE((data->effective & CAP_TO_MASK(CAP_SYS_ADMIN)) > 0); + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_S390_UCONTROL)); +} + +/** + * Assert HPAGE CAP cannot be enabled on UCONTROL VM + */ +TEST(uc_cap_hpage) +{ + int rc, kvm_fd, vm_fd, vcpu_fd; + struct kvm_enable_cap cap = { + .cap = KVM_CAP_S390_HPAGE_1M, + }; + + require_ucontrol_admin(); + + kvm_fd = open_kvm_dev_path_or_exit(); + vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, KVM_VM_S390_UCONTROL); + ASSERT_GE(vm_fd, 0); + + /* assert hpages are not supported on ucontrol vm */ + rc = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_S390_HPAGE_1M); + EXPECT_EQ(0, rc); + + /* Test that KVM_CAP_S390_HPAGE_1M can't be enabled for a ucontrol vm */ + rc = ioctl(vm_fd, KVM_ENABLE_CAP, cap); + EXPECT_EQ(-1, rc); + EXPECT_EQ(EINVAL, errno); + + /* assert HPAGE CAP is rejected after vCPU creation */ + vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0); + ASSERT_GE(vcpu_fd, 0); + rc = ioctl(vm_fd, KVM_ENABLE_CAP, cap); + EXPECT_EQ(-1, rc); + EXPECT_EQ(EBUSY, errno); + + close(vcpu_fd); + close(vm_fd); + close(kvm_fd); +} + +TEST_HARNESS_MAIN -- cgit v1.2.3 From d4f8592f6c42a6df7f570be6c4ccf8bd40838aab Mon Sep 17 00:00:00 2001 From: Christoph Schlameuss Date: Wed, 7 Aug 2024 17:45:06 +0200 Subject: selftests: kvm: s390: Add test fixture and simple VM setup tests Add a uc_kvm fixture to create and destroy a ucontrol VM. * uc_sie_assertions asserts basic settings in the SIE as setup by the kernel. * uc_attr_mem_limit asserts the memory limit is max value and cannot be set (not supported). * uc_no_dirty_log asserts dirty log is not supported. 
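Stripped of the harness plumbing, the setup that the fixture automates boils down to a handful of ioctls; a rough sketch (assuming guest_mem and guest_mem_size describe suitably aligned backing memory, error handling omitted):

	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Create a user controlled VM with one vcpu and map its memory. */
	static int setup_ucontrol_vm(void *guest_mem, __u64 guest_mem_size)
	{
		int kvm_fd = open("/dev/kvm", O_RDWR);
		int vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, KVM_VM_S390_UCONTROL);
		int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);
		struct kvm_s390_ucas_mapping map = {
			.user_addr = (__u64)guest_mem,	/* userspace backing */
			.vcpu_addr = 0,			/* guest address */
			.length = guest_mem_size,
		};

		/* ucontrol VMs have no memslots; memory is wired per vcpu */
		ioctl(vcpu_fd, KVM_S390_UCAS_MAP, &map);
		return vcpu_fd;
	}

The fixture below wraps exactly this kind of sequence and tears it down again after each test.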
Signed-off-by: Christoph Schlameuss Reviewed-by: Claudio Imbrenda Link: https://lore.kernel.org/r/20240807154512.316936-5-schlameuss@linux.ibm.com Signed-off-by: Janosch Frank Message-ID: <20240807154512.316936-5-schlameuss@linux.ibm.com> --- tools/testing/selftests/kvm/s390x/ucontrol_test.c | 131 ++++++++++++++++++++++ 1 file changed, 131 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/s390x/ucontrol_test.c b/tools/testing/selftests/kvm/s390x/ucontrol_test.c index cd68e7e37d35..d103a92e7495 100644 --- a/tools/testing/selftests/kvm/s390x/ucontrol_test.c +++ b/tools/testing/selftests/kvm/s390x/ucontrol_test.c @@ -9,10 +9,14 @@ */ #include "kselftest_harness.h" #include "kvm_util.h" +#include "processor.h" +#include "sie.h" #include #include +#define VM_MEM_SIZE (4 * SZ_1M) + /* so directly declare capget to check caps without libcap */ int capget(cap_user_header_t header, cap_user_data_t data); @@ -36,6 +40,133 @@ void require_ucontrol_admin(void) TEST_REQUIRE(kvm_has_cap(KVM_CAP_S390_UCONTROL)); } +FIXTURE(uc_kvm) +{ + struct kvm_s390_sie_block *sie_block; + struct kvm_run *run; + uintptr_t base_gpa; + uintptr_t code_gpa; + uintptr_t base_hva; + uintptr_t code_hva; + int kvm_run_size; + void *vm_mem; + int vcpu_fd; + int kvm_fd; + int vm_fd; +}; + +/** + * create VM with single vcpu, map kvm_run and SIE control block for easy access + */ +FIXTURE_SETUP(uc_kvm) +{ + struct kvm_s390_vm_cpu_processor info; + int rc; + + require_ucontrol_admin(); + + self->kvm_fd = open_kvm_dev_path_or_exit(); + self->vm_fd = ioctl(self->kvm_fd, KVM_CREATE_VM, KVM_VM_S390_UCONTROL); + ASSERT_GE(self->vm_fd, 0); + + kvm_device_attr_get(self->vm_fd, KVM_S390_VM_CPU_MODEL, + KVM_S390_VM_CPU_PROCESSOR, &info); + TH_LOG("create VM 0x%llx", info.cpuid); + + self->vcpu_fd = ioctl(self->vm_fd, KVM_CREATE_VCPU, 0); + ASSERT_GE(self->vcpu_fd, 0); + + self->kvm_run_size = ioctl(self->kvm_fd, KVM_GET_VCPU_MMAP_SIZE, NULL); + ASSERT_GE(self->kvm_run_size, sizeof(struct kvm_run)) + TH_LOG(KVM_IOCTL_ERROR(KVM_GET_VCPU_MMAP_SIZE, self->kvm_run_size)); + self->run = (struct kvm_run *)mmap(NULL, self->kvm_run_size, + PROT_READ | PROT_WRITE, MAP_SHARED, self->vcpu_fd, 0); + ASSERT_NE(self->run, MAP_FAILED); + /** + * For virtual cpus that have been created with S390 user controlled + * virtual machines, the resulting vcpu fd can be memory mapped at page + * offset KVM_S390_SIE_PAGE_OFFSET in order to obtain a memory map of + * the virtual cpu's hardware control block. 
+ */ + self->sie_block = (struct kvm_s390_sie_block *)mmap(NULL, PAGE_SIZE, + PROT_READ | PROT_WRITE, MAP_SHARED, + self->vcpu_fd, KVM_S390_SIE_PAGE_OFFSET << PAGE_SHIFT); + ASSERT_NE(self->sie_block, MAP_FAILED); + + TH_LOG("VM created %p %p", self->run, self->sie_block); + + self->base_gpa = 0; + self->code_gpa = self->base_gpa + (3 * SZ_1M); + + self->vm_mem = aligned_alloc(SZ_1M, VM_MEM_SIZE); + ASSERT_NE(NULL, self->vm_mem) TH_LOG("malloc failed %u", errno); + self->base_hva = (uintptr_t)self->vm_mem; + self->code_hva = self->base_hva - self->base_gpa + self->code_gpa; + struct kvm_s390_ucas_mapping map = { + .user_addr = self->base_hva, + .vcpu_addr = self->base_gpa, + .length = VM_MEM_SIZE, + }; + TH_LOG("ucas map %p %p 0x%llx", + (void *)map.user_addr, (void *)map.vcpu_addr, map.length); + rc = ioctl(self->vcpu_fd, KVM_S390_UCAS_MAP, &map); + ASSERT_EQ(0, rc) TH_LOG("ucas map result %d not expected, %s", + rc, strerror(errno)); + + TH_LOG("page in %p", (void *)self->base_gpa); + rc = ioctl(self->vcpu_fd, KVM_S390_VCPU_FAULT, self->base_gpa); + ASSERT_EQ(0, rc) TH_LOG("vcpu fault (%p) result %d not expected, %s", + (void *)self->base_hva, rc, strerror(errno)); + + self->sie_block->cpuflags &= ~CPUSTAT_STOPPED; +} + +FIXTURE_TEARDOWN(uc_kvm) +{ + munmap(self->sie_block, PAGE_SIZE); + munmap(self->run, self->kvm_run_size); + close(self->vcpu_fd); + close(self->vm_fd); + close(self->kvm_fd); + free(self->vm_mem); +} + +TEST_F(uc_kvm, uc_sie_assertions) +{ + /* assert interception of Code 08 (Program Interruption) is set */ + EXPECT_EQ(0, self->sie_block->ecb & ECB_SPECI); +} + +TEST_F(uc_kvm, uc_attr_mem_limit) +{ + u64 limit; + struct kvm_device_attr attr = { + .group = KVM_S390_VM_MEM_CTRL, + .attr = KVM_S390_VM_MEM_LIMIT_SIZE, + .addr = (unsigned long)&limit, + }; + int rc; + + rc = ioctl(self->vm_fd, KVM_GET_DEVICE_ATTR, &attr); + EXPECT_EQ(0, rc); + EXPECT_EQ(~0UL, limit); + + /* assert set not supported */ + rc = ioctl(self->vm_fd, KVM_SET_DEVICE_ATTR, &attr); + EXPECT_EQ(-1, rc); + EXPECT_EQ(EINVAL, errno); +} + +TEST_F(uc_kvm, uc_no_dirty_log) +{ + struct kvm_dirty_log dlog; + int rc; + + rc = ioctl(self->vm_fd, KVM_GET_DIRTY_LOG, &dlog); + EXPECT_EQ(-1, rc); + EXPECT_EQ(EINVAL, errno); +} + /** * Assert HPAGE CAP cannot be enabled on UCONTROL VM */ -- cgit v1.2.3 From 100932fc37d468bfa80b5675b653ff65f7bafba7 Mon Sep 17 00:00:00 2001 From: Christoph Schlameuss Date: Wed, 7 Aug 2024 17:45:07 +0200 Subject: selftests: kvm: s390: Add debug print functions Add functions to simply print some basic state information in selftests. The output can be enabled by setting: #define TH_LOG_ENABLED 1 #define DEBUG 1 * print_psw: current SIE state description and VM run state * print_hex_bytes: print memory with some counting markers * print_hex: PRINT_HEX with 512 bytes * print_run: use print_psw and print_hex to print contents of VM run state and SIE state description * print_regs: print content of general and control registers All prints use pr_debug for the output and can be configured using DEBUG. 
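For example, a test built on the uc_kvm fixture could enable and use the helpers like this (usage sketch only):

	#define TH_LOG_ENABLED 1
	#define DEBUG 1
	#include "debug_print.h"

	/* ... inside a test, e.g. after returning from KVM_RUN ... */
	print_run(self->run, self->sie_block);	/* kvm_run and SIE block */
	print_regs(self->run);			/* GPRs, ACRs and CRs */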
Signed-off-by: Christoph Schlameuss Acked-by: Janosch Frank Link: https://lore.kernel.org/r/20240807154512.316936-6-schlameuss@linux.ibm.com Signed-off-by: Janosch Frank Message-ID: <20240807154512.316936-6-schlameuss@linux.ibm.com> --- .../selftests/kvm/include/s390x/debug_print.h | 69 ++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 tools/testing/selftests/kvm/include/s390x/debug_print.h (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/s390x/debug_print.h b/tools/testing/selftests/kvm/include/s390x/debug_print.h new file mode 100644 index 000000000000..1bf275631cc6 --- /dev/null +++ b/tools/testing/selftests/kvm/include/s390x/debug_print.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Definition for kernel virtual machines on s390x + * + * Copyright IBM Corp. 2024 + * + * Authors: + * Christoph Schlameuss + */ + +#ifndef SELFTEST_KVM_DEBUG_PRINT_H +#define SELFTEST_KVM_DEBUG_PRINT_H + +#include "asm/ptrace.h" +#include "kvm_util.h" +#include "sie.h" + +static inline void print_hex_bytes(const char *name, u64 addr, size_t len) +{ + u64 pos; + + pr_debug("%s (%p)\n", name, (void *)addr); + pr_debug(" 0/0x00---------|"); + if (len > 8) + pr_debug(" 8/0x08---------|"); + if (len > 16) + pr_debug(" 16/0x10--------|"); + if (len > 24) + pr_debug(" 24/0x18--------|"); + for (pos = 0; pos < len; pos += 8) { + if ((pos % 32) == 0) + pr_debug("\n %3lu 0x%.3lx ", pos, pos); + pr_debug(" %16lx", *((u64 *)(addr + pos))); + } + pr_debug("\n"); } + +static inline void print_hex(const char *name, u64 addr) +{ + print_hex_bytes(name, addr, 512); +} + +static inline void print_psw(struct kvm_run *run, struct kvm_s390_sie_block *sie_block) +{ + pr_debug("flags:0x%x psw:0x%.16llx:0x%.16llx exit:%u %s\n", + run->flags, + run->psw_mask, run->psw_addr, + run->exit_reason, exit_reason_str(run->exit_reason)); + pr_debug("sie_block psw:0x%.16llx:0x%.16llx\n", + sie_block->psw_mask, sie_block->psw_addr); +} + +static inline void print_run(struct kvm_run *run, struct kvm_s390_sie_block *sie_block) +{ + print_hex_bytes("run", (u64)run, 0x150); + print_hex("sie_block", (u64)sie_block); + print_psw(run, sie_block); +} + +static inline void print_regs(struct kvm_run *run) +{ + struct kvm_sync_regs *sync_regs = &run->s.regs; + + print_hex_bytes("GPRS", (u64)sync_regs->gprs, 8 * NUM_GPRS); + print_hex_bytes("ACRS", (u64)sync_regs->acrs, 4 * NUM_ACRS); + print_hex_bytes("CRS", (u64)sync_regs->crs, 8 * NUM_CRS); +} + +#endif /* SELFTEST_KVM_DEBUG_PRINT_H */ -- cgit v1.2.3 From 7167395a4be7930ecac6a33b4e54d7e3dd9ee209 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 15 Aug 2024 15:59:50 +0800 Subject: selftests: udpgro: report error when receive failed Currently, we only check the last sender's exit code. If the receiver reports a failure, it is not recorded. Fix it by checking the exit codes of all the involved processes. Before: bad GRO lookup ok multiple GRO socks ./udpgso_bench_rx: recv: bad packet len, got 1452, expected 14520 ./udpgso_bench_rx: recv: bad packet len, got 1452, expected 14520 failed $ echo $? 0 After: bad GRO lookup ok multiple GRO socks ./udpgso_bench_rx: recv: bad packet len, got 1452, expected 14520 ./udpgso_bench_rx: recv: bad packet len, got 1452, expected 14520 failed $ echo $? 1 Fixes: 3327a9c46352 ("selftests: add functionals test for UDP GRO") Suggested-by: Paolo Abeni Signed-off-by: Hangbin Liu Signed-off-by: David S. 
Miller --- tools/testing/selftests/net/udpgro.sh | 44 +++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 17 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/udpgro.sh b/tools/testing/selftests/net/udpgro.sh index 11a1ebda564f..4659cf01e438 100755 --- a/tools/testing/selftests/net/udpgro.sh +++ b/tools/testing/selftests/net/udpgro.sh @@ -46,17 +46,19 @@ run_one() { local -r all="$@" local -r tx_args=${all%rx*} local -r rx_args=${all#*rx} + local ret=0 cfg_veth - ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${rx_args} && \ - echo "ok" || \ - echo "failed" & + ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${rx_args} & + local PID1=$! wait_local_port_listen ${PEER_NS} 8000 udp ./udpgso_bench_tx ${tx_args} - ret=$? - wait $(jobs -p) + check_err $? + wait ${PID1} + check_err $? + [ "$ret" -eq 0 ] && echo "ok" || echo "failed" return $ret } @@ -73,6 +75,7 @@ run_one_nat() { local -r all="$@" local -r tx_args=${all%rx*} local -r rx_args=${all#*rx} + local ret=0 if [[ ${tx_args} = *-4* ]]; then ipt_cmd=iptables @@ -93,16 +96,17 @@ run_one_nat() { # ... so that GRO will match the UDP_GRO enabled socket, but packets # will land on the 'plain' one ip netns exec "${PEER_NS}" ./udpgso_bench_rx -G ${family} -b ${addr1} -n 0 & - pid=$! - ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${family} -b ${addr2%/*} ${rx_args} && \ - echo "ok" || \ - echo "failed"& + local PID1=$! + ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${family} -b ${addr2%/*} ${rx_args} & + local PID2=$! wait_local_port_listen "${PEER_NS}" 8000 udp ./udpgso_bench_tx ${tx_args} - ret=$? - kill -INT $pid - wait $(jobs -p) + check_err $? + kill -INT ${PID1} + wait ${PID2} + check_err $? + [ "$ret" -eq 0 ] && echo "ok" || echo "failed" return $ret } @@ -111,20 +115,26 @@ run_one_2sock() { local -r all="$@" local -r tx_args=${all%rx*} local -r rx_args=${all#*rx} + local ret=0 cfg_veth ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${rx_args} -p 12345 & - ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 2000 -R 10 ${rx_args} && \ - echo "ok" || \ - echo "failed" & + local PID1=$! + ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 2000 -R 10 ${rx_args} & + local PID2=$! wait_local_port_listen "${PEER_NS}" 12345 udp ./udpgso_bench_tx ${tx_args} -p 12345 + check_err $? wait_local_port_listen "${PEER_NS}" 8000 udp ./udpgso_bench_tx ${tx_args} - ret=$? - wait $(jobs -p) + check_err $? + wait ${PID1} + check_err $? + wait ${PID2} + check_err $? + [ "$ret" -eq 0 ] && echo "ok" || echo "failed" return $ret } -- cgit v1.2.3 From d7818402b1d80347c764001583f6d63fa68c2e1a Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 15 Aug 2024 15:59:51 +0800 Subject: selftests: udpgro: no need to load xdp for gro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After commit d7db7775ea2e ("net: veth: do not manipulate GRO when using XDP"), there is no need to load an XDP program to enable GRO. On the other hand, the current test fails because it loads the XDP program. e.g. # selftests: net: udpgro.sh # ipv4 # no GRO ok # no GRO chk cmsg ok # GRO ./udpgso_bench_rx: recv: bad packet len, got 1472, expected 14720 # # failed [...] # bad GRO lookup ok # multiple GRO socks ./udpgso_bench_rx: recv: bad packet len, got 1452, expected 14520 # # ./udpgso_bench_rx: recv: bad packet len, got 1452, expected 14520 # # failed ok 1 selftests: net: udpgro.sh After the fix, all the tests pass. # ./udpgro.sh ipv4 no GRO ok [...] 
multiple GRO socks ok Fixes: d7db7775ea2e ("net: veth: do not manipulate GRO when using XDP") Reported-by: Yi Chen Closes: https://issues.redhat.com/browse/RHEL-53858 Reviewed-by: Toke Høiland-Jørgensen Acked-by: Paolo Abeni Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- tools/testing/selftests/net/udpgro.sh | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/udpgro.sh b/tools/testing/selftests/net/udpgro.sh index 4659cf01e438..d5ffd8c9172e 100755 --- a/tools/testing/selftests/net/udpgro.sh +++ b/tools/testing/selftests/net/udpgro.sh @@ -7,8 +7,6 @@ source net_helper.sh readonly PEER_NS="ns-peer-$(mktemp -u XXXXXX)" -BPF_FILE="xdp_dummy.bpf.o" - # set global exit status, but never reset nonzero one. check_err() { @@ -38,7 +36,7 @@ cfg_veth() { ip -netns "${PEER_NS}" addr add dev veth1 192.168.1.1/24 ip -netns "${PEER_NS}" addr add dev veth1 2001:db8::1/64 nodad ip -netns "${PEER_NS}" link set dev veth1 up - ip -n "${PEER_NS}" link set veth1 xdp object ${BPF_FILE} section xdp + ip netns exec "${PEER_NS}" ethtool -K veth1 gro on } run_one() { @@ -206,11 +204,6 @@ run_all() { return $ret } -if [ ! -f ${BPF_FILE} ]; then - echo "Missing ${BPF_FILE}. Run 'make' first" - exit -1 -fi - if [[ $# -eq 0 ]]; then run_all elif [[ $1 == "__subprocess" ]]; then -- cgit v1.2.3 From d0fe8920cbe42547798fd806f078eeaaba05df18 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 24 Jul 2024 15:15:36 +0200 Subject: selftests: add F_CREATED_QUERY tests Add simple selftests for fcntl(fd, F_CREATED_QUERY, 0). Link: https://lore.kernel.org/r/20240724-work-fcntl-v1-2-e8153a2f1991@kernel.org Signed-off-by: Christian Brauner --- tools/testing/selftests/core/close_range_test.c | 39 +++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/core/close_range_test.c b/tools/testing/selftests/core/close_range_test.c index 12b4eb9d0434..e0d9851fe1c9 100644 --- a/tools/testing/selftests/core/close_range_test.c +++ b/tools/testing/selftests/core/close_range_test.c @@ -26,6 +26,10 @@ #define F_DUPFD_QUERY (F_LINUX_SPECIFIC_BASE + 3) #endif +#ifndef F_CREATED_QUERY +#define F_CREATED_QUERY (F_LINUX_SPECIFIC_BASE + 4) +#endif + static inline int sys_close_range(unsigned int fd, unsigned int max_fd, unsigned int flags) { @@ -624,4 +628,39 @@ TEST(close_range_bitmap_corruption) EXPECT_EQ(0, WEXITSTATUS(status)); } +TEST(fcntl_created) +{ + for (int i = 0; i < 101; i++) { + int fd; + char path[PATH_MAX]; + + fd = open("/dev/null", O_RDONLY | O_CLOEXEC); + ASSERT_GE(fd, 0) { + if (errno == ENOENT) + SKIP(return, + "Skipping test since /dev/null does not exist"); + } + + /* We didn't create "/dev/null". */ + EXPECT_EQ(fcntl(fd, F_CREATED_QUERY, 0), 0); + close(fd); + + sprintf(path, "aaaa_%d", i); + fd = open(path, O_CREAT | O_RDONLY | O_CLOEXEC, 0600); + ASSERT_GE(fd, 0); + + /* We created "aaaa_%d". */ + EXPECT_EQ(fcntl(fd, F_CREATED_QUERY, 0), 1); + close(fd); + + fd = open(path, O_RDONLY | O_CLOEXEC); + ASSERT_GE(fd, 0); + + /* We're opening it again, so no positive creation check. 
*/ + EXPECT_EQ(fcntl(fd, F_CREATED_QUERY, 0), 0); + close(fd); + unlink(path); + } +} + TEST_HARNESS_MAIN -- cgit v1.2.3 From e8bb03ed6850c6ed4ce2f1600ea73401fc2ebd95 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 16 Aug 2024 16:58:31 -0700 Subject: perf dwarf-aux: Check allowed location expressions when collecting variables It failed to call check_allowed_ops() in __die_collect_vars_cb(), so it could incorrectly accept variables with complex location expressions. For example, I found a variable with this expression. 015d8df8 ffffffff81aacfb3 (base address) 015d8e01 v000000000000004 v000000000000000 views at 015d8df2 for: ffffffff81aacfb3 ffffffff81aacfd2 (DW_OP_fbreg: -176; DW_OP_deref; DW_OP_plus_uconst: 332; DW_OP_deref_size: 4; DW_OP_lit1; DW_OP_shra; DW_OP_const1u: 64; DW_OP_minus; DW_OP_stack_value) 015d8e14 v000000000000000 v000000000000000 views at 015d8df4 for: ffffffff81aacfd2 ffffffff81aacfd7 (DW_OP_reg3 (rbx)) 015d8e19 v000000000000000 v000000000000000 views at 015d8df6 for: ffffffff81aacfd7 ffffffff81aad020 (DW_OP_fbreg: -176; DW_OP_deref; DW_OP_plus_uconst: 332; DW_OP_deref_size: 4; DW_OP_lit1; DW_OP_shra; DW_OP_const1u: 64; DW_OP_minus; DW_OP_stack_value) 015d8e2c It looks like '((int *)(-176(%rbp) + 332) >> 1) - 64' but the current code treated it as just -176(%rbp) and processed the variable incorrectly. It should reject such a complex expression if check_allowed_ops() doesn't like it. :) Fixes: 932dcc2c39aedf54 ("perf dwarf-aux: Add die_collect_vars()") Signed-off-by: Namhyung Kim Acked-by: Masami Hiramatsu Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240816235840.2754937-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dwarf-aux.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c index 5e080d7e22c2..beb632153a74 100644 --- a/tools/perf/util/dwarf-aux.c +++ b/tools/perf/util/dwarf-aux.c @@ -1598,6 +1598,9 @@ static int __die_collect_vars_cb(Dwarf_Die *die_mem, void *arg) if (dwarf_getlocations(&attr, 0, &base, &start, &end, &ops, &nops) <= 0) return DIE_FIND_CB_SIBLING; + if (!check_allowed_ops(ops, nops)) + return DIE_FIND_CB_SIBLING; + if (die_get_real_type(die_mem, &type_die) == NULL) return DIE_FIND_CB_SIBLING; -- cgit v1.2.3 From 3ab0b8b238b5130ae3fa37ddaa329fc0e93b6b9a Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 16 Aug 2024 16:58:32 -0700 Subject: perf annotate-data: Fix off-by-one in location range check The location list has entries with half-open addressing like [start, end), which means the end address is not included. So the code should skip entries ending at the target address and match the next entry instead. 
An example location list looks like this (from readelf -wo): 00237876 ffffffff8110d32b (base address) 0023787f v000000000000000 v000000000000002 views at 00237868 for: ffffffff8110d32b ffffffff8110d4eb (DW_OP_reg3 (rbx)) <<<--- 1 00237885 v000000000000002 v000000000000000 views at 0023786a for: ffffffff8110d4eb ffffffff8110d50b (DW_OP_reg14 (r14)) <<<--- 2 0023788c v000000000000000 v000000000000001 views at 0023786c for: ffffffff8110d50b ffffffff8110d7c4 (DW_OP_reg3 (rbx)) 00237893 v000000000000000 v000000000000000 views at 0023786e for: ffffffff8110d806 ffffffff8110d854 (DW_OP_reg3 (rbx)) 0023789a v000000000000000 v000000000000000 views at 00237870 for: ffffffff8110d876 ffffffff8110d88e (DW_OP_reg3 (rbx)) The first entry at 0023787f has [8110d32b, 8110d4eb) (omitting the ffffffff at the beginning), and the second one has [8110d4eb, 8110d50b). Fixes: 2bc3cf575a162a2c ("perf annotate-data: Improve debug message with location info") Reviewed-by: Masami Hiramatsu Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240816235840.2754937-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 2 +- tools/perf/util/dwarf-aux.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index ff85d190e3ac..fd8d3cdead5a 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -95,7 +95,7 @@ static void pr_debug_location(Dwarf_Die *die, u64 pc, int reg) return; while ((off = dwarf_getlocations(&attr, off, &base, &start, &end, &ops, &nops)) > 0) { - if (reg != DWARF_REG_PC && end < pc) + if (reg != DWARF_REG_PC && end <= pc) continue; if (reg != DWARF_REG_PC && start > pc) break; diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c index beb632153a74..0151a8d14350 100644 --- a/tools/perf/util/dwarf-aux.c +++ b/tools/perf/util/dwarf-aux.c @@ -1444,7 +1444,7 @@ static int __die_find_var_reg_cb(Dwarf_Die *die_mem, void *arg) while ((off = dwarf_getlocations(&attr, off, &base, &start, &end, &ops, &nops)) > 0) { /* Assuming the location list is sorted by address */ - if (end < data->pc) + if (end <= data->pc) continue; if (start > data->pc) break; -- cgit v1.2.3 From 976862f8abefafc42a27fff4c0e5d56cfc8d8ef4 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 16 Aug 2024 16:58:33 -0700 Subject: perf annotate-data: Add 'enum type_match_result' And let check_variable() return the enum value so that callers can know what the problem was. This will be used by a later patch to update the statistics correctly and print the error message in the right place. 
Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240816235840.2754937-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index fd8d3cdead5a..8e3b422eca22 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -345,9 +345,20 @@ static bool find_cu_die(struct debuginfo *di, u64 pc, Dwarf_Die *cu_die) return false; } +enum type_match_result { + PERF_TMR_UNKNOWN = 0, + PERF_TMR_OK, + PERF_TMR_NO_TYPE, + PERF_TMR_NO_POINTER, + PERF_TMR_NO_SIZE, + PERF_TMR_BAD_OFFSET, +}; + /* The type info will be saved in @type_die */ -static int check_variable(struct data_loc_info *dloc, Dwarf_Die *var_die, - Dwarf_Die *type_die, int reg, int offset, bool is_fbreg) +static enum type_match_result check_variable(struct data_loc_info *dloc, + Dwarf_Die *var_die, + Dwarf_Die *type_die, int reg, + int offset, bool is_fbreg) { Dwarf_Word size; bool is_pointer = true; @@ -364,7 +375,7 @@ static int check_variable(struct data_loc_info *dloc, Dwarf_Die *var_die, if (__die_get_real_type(var_die, type_die) == NULL) { pr_debug_dtp("variable has no type\n"); ann_data_stat.no_typeinfo++; - return -1; + return PERF_TMR_NO_TYPE; } /* @@ -378,7 +389,7 @@ static int check_variable(struct data_loc_info *dloc, Dwarf_Die *var_die, __die_get_real_type(type_die, type_die) == NULL) { pr_debug_dtp("no pointer or no type\n"); ann_data_stat.no_typeinfo++; - return -1; + return PERF_TMR_NO_POINTER; } } @@ -391,7 +402,7 @@ static int check_variable(struct data_loc_info *dloc, Dwarf_Die *var_die, if (dwarf_aggregate_size(&sized_type, &size) < 0) { pr_debug_dtp("type size is unknown\n"); ann_data_stat.invalid_size++; - return -1; + return PERF_TMR_NO_SIZE; } /* Minimal sanity check */ @@ -399,10 +410,10 @@ static int check_variable(struct data_loc_info *dloc, Dwarf_Die *var_die, pr_debug_dtp("offset: %d is bigger than size: %"PRIu64"\n", offset, size); ann_data_stat.bad_offset++; - return -1; + return PERF_TMR_BAD_OFFSET; } - return 0; + return PERF_TMR_OK; } struct type_state_stack *find_stack_state(struct type_state *state, @@ -652,7 +663,7 @@ bool get_global_var_type(Dwarf_Die *cu_die, struct data_loc_info *dloc, /* Try to get the variable by address first */ if (die_find_variable_by_addr(cu_die, var_addr, &var_die, &offset) && check_variable(dloc, &var_die, type_die, DWARF_REG_PC, offset, - /*is_fbreg=*/false) == 0) { + /*is_fbreg=*/false) == PERF_TMR_OK) { var_name = dwarf_diename(&var_die); *var_offset = offset; goto ok; @@ -666,7 +677,7 @@ bool get_global_var_type(Dwarf_Die *cu_die, struct data_loc_info *dloc, /* Try to get the name of global variable */ if (die_find_variable_at(cu_die, var_name, pc, &var_die) && check_variable(dloc, &var_die, type_die, DWARF_REG_PC, *var_offset, - /*is_fbreg=*/false) == 0) + /*is_fbreg=*/false) == PERF_TMR_OK) goto ok; return false; @@ -1205,6 +1216,7 @@ static int find_data_type_die(struct data_loc_info *dloc, Dwarf_Die *type_die) bool is_fbreg = false; u64 pc; char buf[64]; + enum type_match_result result; if (dloc->op->multi_regs) snprintf(buf, sizeof(buf), "reg%d, reg%d", dloc->op->reg1, dloc->op->reg2); @@ -1299,8 +1311,8 @@ retry: } /* Found a variable, see if it's correct */ - ret = 
check_variable(dloc, &var_die, type_die, reg, offset, is_fbreg); - if (ret == 0) { + result = check_variable(dloc, &var_die, type_die, reg, offset, is_fbreg); + if (result == PERF_TMR_OK) { pr_debug_dtp("found \"%s\" in scope=%d/%d (die: %#lx) ", dwarf_diename(&var_die), i+1, nr_scopes, (long)dwarf_dieoffset(&scopes[i])); @@ -1315,12 +1327,14 @@ retry: } pr_debug_location(&var_die, pc, reg); pr_debug_type_name(type_die, TSR_KIND_TYPE); + ret = 0; } else { pr_debug_dtp("check variable \"%s\" failed (die: %#lx)\n", dwarf_diename(&var_die), (long)dwarf_dieoffset(&var_die)); pr_debug_location(&var_die, pc, reg); pr_debug_type_name(type_die, TSR_KIND_TYPE); + ret = -1; } dloc->type_offset = offset; goto out; -- cgit v1.2.3 From 653185d808ea6f5e3e6247087b793fcd8510bb26 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 16 Aug 2024 16:58:34 -0700 Subject: perf annotate-data: Add variable_state_str() So that it can show a proper debug message in the right place. The check_variable() is used in other places which don't want to print the message. $ perf --debug type-profile annotate --data-type Before: ----------------------------------------------------------- find data type for 0x140(reg14) at update_blocked_averages+0x2db CU for kernel/sched/fair.c (die:0x12dd892) frame base: cfa=1 fbreg=7 no pointer or no type <<<--- removed check variable "__mptr" failed (die: 0x13022f1) variable location: base=reg14, offset=0x140 type='void*' size=0x8 (die:0x12dd8f9) After: ----------------------------------------------------------- find data type for 0x140(reg14) at update_blocked_averages+0x2db CU for kernel/sched/fair.c (die:0x12dd892) frame base: cfa=1 fbreg=7 found "__mptr" (die: 0x13022f1) in scope=4/4 (die: 0x13022e8) failed: no/void pointer <<<--- here variable location: base=reg14, offset=0x140 type='void*' size=0x8 (die:0x12dd8f9) Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240816235840.2754937-5-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 41 ++++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 8e3b422eca22..332254da49be 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -354,6 +354,25 @@ enum type_match_result { PERF_TMR_BAD_OFFSET, }; +static const char *match_result_str(enum type_match_result tmr) +{ + switch (tmr) { + case PERF_TMR_OK: + return "Good!"; + case PERF_TMR_NO_TYPE: + return "no type information"; + case PERF_TMR_NO_POINTER: + return "no/void pointer"; + case PERF_TMR_NO_SIZE: + return "type size is unknown"; + case PERF_TMR_BAD_OFFSET: + return "offset bigger than size"; + case PERF_TMR_UNKNOWN: + default: + return "invalid state"; + } +} + /* The type info will be saved in @type_die */ static enum type_match_result check_variable(struct data_loc_info *dloc, Dwarf_Die *var_die, @@ -373,7 +392,6 @@ static enum type_match_result check_variable(struct data_loc_info *dloc, /* Get the type of the variable */ if (__die_get_real_type(var_die, type_die) == NULL) { - pr_debug_dtp("variable has no type\n"); ann_data_stat.no_typeinfo++; return PERF_TMR_NO_TYPE; } @@ -387,7 +405,6 @@ static enum type_match_result check_variable(struct data_loc_info *dloc, if ((dwarf_tag(type_die) != DW_TAG_pointer_type && dwarf_tag(type_die) != 
DW_TAG_array_type) || __die_get_real_type(type_die, type_die) == NULL) { - pr_debug_dtp("no pointer or no type\n"); ann_data_stat.no_typeinfo++; return PERF_TMR_NO_POINTER; } @@ -400,15 +417,12 @@ static enum type_match_result check_variable(struct data_loc_info *dloc, /* Get the size of the actual type */ if (dwarf_aggregate_size(&sized_type, &size) < 0) { - pr_debug_dtp("type size is unknown\n"); ann_data_stat.invalid_size++; return PERF_TMR_NO_SIZE; } /* Minimal sanity check */ if ((unsigned)offset >= size) { - pr_debug_dtp("offset: %d is bigger than size: %"PRIu64"\n", - offset, size); ann_data_stat.bad_offset++; return PERF_TMR_BAD_OFFSET; } @@ -1310,12 +1324,13 @@ retry: continue; } + pr_debug_dtp("found \"%s\" (die: %#lx) in scope=%d/%d (die: %#lx) ", + dwarf_diename(&var_die), (long)dwarf_dieoffset(&var_die), + i+1, nr_scopes, (long)dwarf_dieoffset(&scopes[i])); + /* Found a variable, see if it's correct */ result = check_variable(dloc, &var_die, type_die, reg, offset, is_fbreg); if (result == PERF_TMR_OK) { - pr_debug_dtp("found \"%s\" in scope=%d/%d (die: %#lx) ", - dwarf_diename(&var_die), i+1, nr_scopes, - (long)dwarf_dieoffset(&scopes[i])); if (reg == DWARF_REG_PC) { pr_debug_dtp("addr=%#"PRIx64" type_offset=%#x\n", dloc->var_addr, offset); @@ -1325,17 +1340,13 @@ retry: } else { pr_debug_dtp("type_offset=%#x\n", offset); } - pr_debug_location(&var_die, pc, reg); - pr_debug_type_name(type_die, TSR_KIND_TYPE); ret = 0; } else { - pr_debug_dtp("check variable \"%s\" failed (die: %#lx)\n", - dwarf_diename(&var_die), - (long)dwarf_dieoffset(&var_die)); - pr_debug_location(&var_die, pc, reg); - pr_debug_type_name(type_die, TSR_KIND_TYPE); + pr_debug_dtp("failed: %s\n", match_result_str(result)); ret = -1; } + pr_debug_location(&var_die, pc, reg); + pr_debug_type_name(type_die, TSR_KIND_TYPE); dloc->type_offset = offset; goto out; } -- cgit v1.2.3 From 69e2c78425c92361f0ab01cbbb8f3a44fc94341f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 16 Aug 2024 16:58:35 -0700 Subject: perf annotate-data: Change return type of find_data_type_block() So that it can return enum variable_match_type to be propagated to the find_data_type_die(). Also update the debug message to show the result of the check_matching_type(). chk [dd] reg0 offset=0 ok=1 kind=1 : Good! or chk [177] reg4 offset=0x138 ok=0 kind=0 cfa : no type information Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240816235840.2754937-6-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 117 ++++++++++++++++++++-------------------- 1 file changed, 58 insertions(+), 59 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 332254da49be..5548cd8e84ba 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -352,6 +352,7 @@ enum type_match_result { PERF_TMR_NO_POINTER, PERF_TMR_NO_SIZE, PERF_TMR_BAD_OFFSET, + PERF_TMR_BAIL_OUT, }; static const char *match_result_str(enum type_match_result tmr) @@ -368,6 +369,7 @@ static const char *match_result_str(enum type_match_result tmr) case PERF_TMR_BAD_OFFSET: return "offset bigger than size"; case PERF_TMR_UNKNOWN: + case PERF_TMR_BAIL_OUT: default: return "invalid state"; } @@ -868,19 +870,19 @@ static void setup_stack_canary(struct data_loc_info *dloc) /* * It's at the target address, check if it has a matching type. 
- * It returns 1 if found, 0 if not or -1 if not found but no need to - * repeat the search. The last case is for per-cpu variables which + * It returns PERF_TMR_BAIL_OUT when it looks up per-cpu variables which * are similar to global variables and no additional info is needed. */ -static int check_matching_type(struct type_state *state, - struct data_loc_info *dloc, - Dwarf_Die *cu_die, Dwarf_Die *type_die) +static enum type_match_result check_matching_type(struct type_state *state, + struct data_loc_info *dloc, + Dwarf_Die *cu_die, + Dwarf_Die *type_die) { Dwarf_Word size; u32 insn_offset = dloc->ip - dloc->ms->sym->start; int reg = dloc->op->reg1; - pr_debug_dtp("chk [%x] reg%d offset=%#x ok=%d kind=%d", + pr_debug_dtp("chk [%x] reg%d offset=%#x ok=%d kind=%d ", insn_offset, reg, dloc->op->offset, state->regs[reg].ok, state->regs[reg].kind); @@ -896,15 +898,12 @@ static int check_matching_type(struct type_state *state, if (dloc->op->offset < 0 && reg != state->stack_reg) goto check_kernel; - pr_debug_dtp("\n"); - return -1; + return PERF_TMR_NO_POINTER; } - pr_debug_dtp("\n"); - /* Remove the pointer and get the target type */ if (__die_get_real_type(&state->regs[reg].type, type_die) == NULL) - return -1; + return PERF_TMR_NO_POINTER; dloc->type_offset = dloc->op->offset; @@ -916,33 +915,33 @@ static int check_matching_type(struct type_state *state, /* Get the size of the actual type */ if (dwarf_aggregate_size(&sized_type, &size) < 0 || (unsigned)dloc->type_offset >= size) - return -1; + return PERF_TMR_BAD_OFFSET; - return 1; + return PERF_TMR_OK; } if (reg == dloc->fbreg) { struct type_state_stack *stack; - pr_debug_dtp(" fbreg\n"); + pr_debug_dtp("fbreg"); stack = find_stack_state(state, dloc->type_offset); if (stack == NULL) - return 0; + return PERF_TMR_NO_TYPE; if (stack->kind == TSR_KIND_CANARY) { setup_stack_canary(dloc); - return -1; + return PERF_TMR_BAIL_OUT; } if (stack->kind != TSR_KIND_TYPE) - return 0; + return PERF_TMR_NO_TYPE; *type_die = stack->type; /* Update the type offset from the start of slot */ dloc->type_offset -= stack->offset; - return 1; + return PERF_TMR_OK; } if (dloc->fb_cfa) { @@ -950,38 +949,38 @@ static int check_matching_type(struct type_state *state, u64 pc = map__rip_2objdump(dloc->ms->map, dloc->ip); int fbreg, fboff; - pr_debug_dtp(" cfa\n"); + pr_debug_dtp("cfa"); if (die_get_cfa(dloc->di->dbg, pc, &fbreg, &fboff) < 0) fbreg = -1; if (reg != fbreg) - return 0; + return PERF_TMR_NO_TYPE; stack = find_stack_state(state, dloc->type_offset - fboff); if (stack == NULL) - return 0; + return PERF_TMR_NO_TYPE; if (stack->kind == TSR_KIND_CANARY) { setup_stack_canary(dloc); - return -1; + return PERF_TMR_BAIL_OUT; } if (stack->kind != TSR_KIND_TYPE) - return 0; + return PERF_TMR_NO_TYPE; *type_die = stack->type; /* Update the type offset from the start of slot */ dloc->type_offset -= fboff + stack->offset; - return 1; + return PERF_TMR_OK; } if (state->regs[reg].kind == TSR_KIND_PERCPU_BASE) { u64 var_addr = dloc->op->offset; int var_offset; - pr_debug_dtp(" percpu var\n"); + pr_debug_dtp("percpu var"); if (dloc->op->multi_regs) { int reg2 = dloc->op->reg2; @@ -997,14 +996,14 @@ static int check_matching_type(struct type_state *state, if (get_global_var_type(cu_die, dloc, dloc->ip, var_addr, &var_offset, type_die)) { dloc->type_offset = var_offset; - return 1; + return PERF_TMR_OK; } /* No need to retry per-cpu (global) variables */ - return -1; + return PERF_TMR_BAIL_OUT; } if (state->regs[reg].ok && state->regs[reg].kind == TSR_KIND_POINTER) { - 
pr_debug_dtp(" percpu ptr\n"); + pr_debug_dtp("percpu ptr"); /* * It's actaully pointer but the address was calculated using @@ -1017,13 +1016,13 @@ static int check_matching_type(struct type_state *state, /* Get the size of the actual type */ if (dwarf_aggregate_size(type_die, &size) < 0 || (unsigned)dloc->type_offset >= size) - return -1; + return PERF_TMR_BAIL_OUT; - return 1; + return PERF_TMR_OK; } if (state->regs[reg].ok && state->regs[reg].kind == TSR_KIND_CANARY) { - pr_debug_dtp(" stack canary\n"); + pr_debug_dtp("stack canary"); /* * This is a saved value of the stack canary which will be handled @@ -1032,7 +1031,7 @@ static int check_matching_type(struct type_state *state, */ setup_stack_canary(dloc); - return -1; + return PERF_TMR_BAIL_OUT; } check_kernel: @@ -1043,16 +1042,16 @@ check_kernel: /* Direct this-cpu access like "%gs:0x34740" */ if (dloc->op->segment == INSN_SEG_X86_GS && dloc->op->imm && arch__is(dloc->arch, "x86")) { - pr_debug_dtp(" this-cpu var\n"); + pr_debug_dtp("this-cpu var"); addr = dloc->op->offset; if (get_global_var_type(cu_die, dloc, dloc->ip, addr, &offset, type_die)) { dloc->type_offset = offset; - return 1; + return PERF_TMR_OK; } - return -1; + return PERF_TMR_BAIL_OUT; } /* Access to global variable like "-0x7dcf0500(,%rdx,8)" */ @@ -1061,31 +1060,30 @@ check_kernel: if (get_global_var_type(cu_die, dloc, dloc->ip, addr, &offset, type_die)) { - pr_debug_dtp(" global var\n"); + pr_debug_dtp("global var"); dloc->type_offset = offset; - return 1; + return PERF_TMR_OK; } - pr_debug_dtp(" negative offset\n"); - return -1; + return PERF_TMR_BAIL_OUT; } } - pr_debug_dtp("\n"); - return 0; + return PERF_TMR_UNKNOWN; } /* Iterate instructions in basic blocks and update type table */ -static int find_data_type_insn(struct data_loc_info *dloc, - struct list_head *basic_blocks, - struct die_var_type *var_types, - Dwarf_Die *cu_die, Dwarf_Die *type_die) +static enum type_match_result find_data_type_insn(struct data_loc_info *dloc, + struct list_head *basic_blocks, + struct die_var_type *var_types, + Dwarf_Die *cu_die, + Dwarf_Die *type_die) { struct type_state state; struct symbol *sym = dloc->ms->sym; struct annotation *notes = symbol__annotation(sym); struct annotated_basic_block *bb; - int ret = 0; + enum type_match_result ret = PERF_TMR_UNKNOWN; init_type_state(&state, dloc->arch); @@ -1111,6 +1109,7 @@ static int find_data_type_insn(struct data_loc_info *dloc, if (this_ip == dloc->ip) { ret = check_matching_type(&state, dloc, cu_die, type_die); + pr_debug_dtp(" : %s\n", match_result_str(ret)); goto out; } @@ -1137,24 +1136,25 @@ static int arch_supports_insn_tracking(struct data_loc_info *dloc) * Construct a list of basic blocks for each scope with variables and try to find * the data type by updating a type state table through instructions. 
*/ -static int find_data_type_block(struct data_loc_info *dloc, - Dwarf_Die *cu_die, Dwarf_Die *scopes, - int nr_scopes, Dwarf_Die *type_die) +static enum type_match_result find_data_type_block(struct data_loc_info *dloc, + Dwarf_Die *cu_die, + Dwarf_Die *scopes, + int nr_scopes, + Dwarf_Die *type_die) { LIST_HEAD(basic_blocks); struct die_var_type *var_types = NULL; u64 src_ip, dst_ip, prev_dst_ip; - int ret = -1; + enum type_match_result ret = PERF_TMR_UNKNOWN; /* TODO: other architecture support */ if (!arch_supports_insn_tracking(dloc)) - return -1; + return PERF_TMR_BAIL_OUT; prev_dst_ip = dst_ip = dloc->ip; for (int i = nr_scopes - 1; i >= 0; i--) { Dwarf_Addr base, start, end; LIST_HEAD(this_blocks); - int found; if (dwarf_ranges(&scopes[i], 0, &base, &start, &end) < 0) break; @@ -1185,9 +1185,9 @@ again: fixup_var_address(var_types, start); /* Find from start of this scope to the target instruction */ - found = find_data_type_insn(dloc, &basic_blocks, var_types, + ret = find_data_type_insn(dloc, &basic_blocks, var_types, cu_die, type_die); - if (found > 0) { + if (ret == PERF_TMR_OK) { char buf[64]; if (dloc->op->multi_regs) @@ -1199,11 +1199,10 @@ again: pr_debug_dtp("found by insn track: %#x(%s) type-offset=%#x\n", dloc->op->offset, buf, dloc->type_offset); pr_debug_type_name(type_die, TSR_KIND_TYPE); - ret = 0; break; } - if (found < 0) + if (ret == PERF_TMR_BAIL_OUT) break; /* Go up to the next scope and find blocks to the start */ @@ -1357,11 +1356,11 @@ retry: } if (reg != DWARF_REG_PC) { - ret = find_data_type_block(dloc, &cu_die, scopes, + result = find_data_type_block(dloc, &cu_die, scopes, nr_scopes, type_die); - if (ret == 0) { + if (result == PERF_TMR_OK) { ann_data_stat.insn_track++; - goto out; + ret = 0; } } -- cgit v1.2.3 From 98d1f1dc72fd6e0e85db4acc0b2dd591f4488216 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 16 Aug 2024 16:58:36 -0700 Subject: perf annotate-data: Add is_pointer_type() helper It treats pointers and arrays in the same way. Let's add the helper and use it when it checks if it needs a pointer. 
Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240816235840.2754937-7-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 5548cd8e84ba..b2414747cdeb 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -375,6 +375,13 @@ static const char *match_result_str(enum type_match_result tmr) } } +static bool is_pointer_type(Dwarf_Die *type_die) +{ + int tag = dwarf_tag(type_die); + + return tag == DW_TAG_pointer_type || tag == DW_TAG_array_type; +} + /* The type info will be saved in @type_die */ static enum type_match_result check_variable(struct data_loc_info *dloc, Dwarf_Die *var_die, @@ -382,15 +389,15 @@ static enum type_match_result check_variable(struct data_loc_info *dloc, int offset, bool is_fbreg) { Dwarf_Word size; - bool is_pointer = true; + bool needs_pointer = true; Dwarf_Die sized_type; if (reg == DWARF_REG_PC) - is_pointer = false; + needs_pointer = false; else if (reg == dloc->fbreg || is_fbreg) - is_pointer = false; + needs_pointer = false; else if (arch__is(dloc->arch, "x86") && reg == X86_REG_SP) - is_pointer = false; + needs_pointer = false; /* Get the type of the variable */ if (__die_get_real_type(var_die, type_die) == NULL) { @@ -403,9 +410,8 @@ static enum type_match_result check_variable(struct data_loc_info *dloc, * Convert to a real type it points to. But global variables * and local variables are accessed directly without a pointer. */ - if (is_pointer) { - if ((dwarf_tag(type_die) != DW_TAG_pointer_type && - dwarf_tag(type_die) != DW_TAG_array_type) || + if (needs_pointer) { + if (!is_pointer_type(type_die) || __die_get_real_type(type_die, type_die) == NULL) { ann_data_stat.no_typeinfo++; return PERF_TMR_NO_POINTER; @@ -887,14 +893,13 @@ static enum type_match_result check_matching_type(struct type_state *state, state->regs[reg].ok, state->regs[reg].kind); if (state->regs[reg].ok && state->regs[reg].kind == TSR_KIND_TYPE) { - int tag = dwarf_tag(&state->regs[reg].type); Dwarf_Die sized_type; /* * Normal registers should hold a pointer (or array) to * dereference a memory location. */ - if (tag != DW_TAG_pointer_type && tag != DW_TAG_array_type) { + if (!is_pointer_type(&state->regs[reg].type)) { if (dloc->op->offset < 0 && reg != state->stack_reg) goto check_kernel; -- cgit v1.2.3 From c663451f9239c89746b55f1a17ad62b014fa7905 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 16 Aug 2024 16:58:37 -0700 Subject: perf annotate-data: Add is_better_type() helper Sometimes more than one variable is located in the same register or stack slot. Or an existing entry can be overwritten by another variable's information. I found this is not helpful in some cases, so the type information from a variable should be updated only if it's better. But it's hard to know which one is better, so we need heuristics. :) As it deals with memory accesses, the location should have a pointer or something similar (like an array or reference). So if it had an integer type and the variable is a pointer, we can take the variable's type to resolve the target of the access. If it has a pointer type and a variable with the same location has a different pointer type, it'll take the one with the bigger target type. 
This can be useful when the target type embeds a smaller type (like a list header or an RB-tree node) at the beginning, so their locations are the same. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240816235840.2754937-8-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 61 ++++++++++++++++++++++++++++++++++------- 1 file changed, 51 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index b2414747cdeb..916d26bfb9eb 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -382,6 +382,38 @@ static bool is_pointer_type(Dwarf_Die *type_die) return tag == DW_TAG_pointer_type || tag == DW_TAG_array_type; } +/* returns if Type B has better information than Type A */ +static bool is_better_type(Dwarf_Die *type_a, Dwarf_Die *type_b) +{ + Dwarf_Word size_a, size_b; + Dwarf_Die die_a, die_b; + + /* pointer type is preferred */ + if (is_pointer_type(type_a) != is_pointer_type(type_b)) + return is_pointer_type(type_b); + + if (is_pointer_type(type_b)) { + /* + * We want to compare the target type, but 'void *' can fail to + * get the target type. + */ + if (die_get_real_type(type_a, &die_a) == NULL) + return true; + if (die_get_real_type(type_b, &die_b) == NULL) + return false; + + type_a = &die_a; + type_b = &die_b; + } + + /* bigger type is preferred */ + if (dwarf_aggregate_size(type_a, &size_a) < 0 || + dwarf_aggregate_size(type_b, &size_b) < 0) + return false; + + return size_a < size_b; +} + /* The type info will be saved in @type_die */ static enum type_match_result check_variable(struct data_loc_info *dloc, Dwarf_Die *var_die, @@ -741,24 +773,33 @@ static void update_var_state(struct type_state *state, struct data_loc_info *dlo if (!dwarf_offdie(dloc->di->dbg, var->die_off, &mem_die)) continue; - if (var->reg == DWARF_REG_FB) { - findnew_stack_state(state, var->offset, TSR_KIND_TYPE, - &mem_die); + if (var->reg == DWARF_REG_FB || var->reg == fbreg) { + int offset = var->offset; + struct type_state_stack *stack; - pr_debug_dtp("var [%"PRIx64"] -%#x(stack)", - insn_offset, -var->offset); - pr_debug_type_name(&mem_die, TSR_KIND_TYPE); - } else if (var->reg == fbreg) { - findnew_stack_state(state, var->offset - fb_offset, - TSR_KIND_TYPE, &mem_die); + if (var->reg != DWARF_REG_FB) + offset -= fb_offset; + + stack = find_stack_state(state, offset); + if (stack && stack->kind == TSR_KIND_TYPE && + !is_better_type(&stack->type, &mem_die)) + continue; + + findnew_stack_state(state, offset, TSR_KIND_TYPE, + &mem_die); pr_debug_dtp("var [%"PRIx64"] -%#x(stack)", - insn_offset, -var->offset + fb_offset); + insn_offset, -offset); pr_debug_type_name(&mem_die, TSR_KIND_TYPE); } else if (has_reg_type(state, var->reg) && var->offset == 0) { struct type_state_reg *reg; reg = &state->regs[var->reg]; + + if (reg->ok && reg->kind == TSR_KIND_TYPE && + !is_better_type(&reg->type, &mem_die)) + continue; + reg->type = mem_die; reg->kind = TSR_KIND_TYPE; reg->ok = true; -- cgit v1.2.3 From ba8833703b49451453d97bf6414a522293f7e31d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 16 Aug 2024 16:58:38 -0700 Subject: perf annotate-data: Check variables in every scope Sometimes it matches a variable in the inner scope, but the match fails because the actual access can be on a different type.
Let's try variables in every scope and choose the best one using is_better_type(). I have an example with update_blocked_averages(), at first it found a variable (__mptr) but it's a void pointer. So it moved on to the upper scope and found another variable (cfs_rq). $ perf --debug type-profile annotate --data-type --stdio ... ----------------------------------------------------------- find data type for 0x140(reg14) at update_blocked_averages+0x2db CU for kernel/sched/fair.c (die:0x12dd892) frame base: cfa=1 fbreg=7 found "__mptr" (die: 0x13022f1) in scope=4/4 (die: 0x13022e8) failed: no/void pointer variable location: base=reg14, offset=0x140 type='void*' size=0x8 (die:0x12dd8f9) found "cfs_rq" (die: 0x1301721) in scope=3/4 (die: 0x130171c) type_offset=0x140 variable location: reg14 type='struct cfs_rq' size=0x1c0 (die:0x12e37e5) final type: type='struct cfs_rq' size=0x1c0 (die:0x12e37e5) IIUC the scope is like below: 1: update_blocked_averages 2: __update_blocked_fair 3: for_each_leaf_cfs_rq_safe 4: list_entry -> (container_of) The container_of is implemented like: #define container_of(ptr, type, member) ({ \ void *__mptr = (void *)(ptr); \ static_assert(__same_type(*(ptr), ((type *)0)->member) || \ __same_type(*(ptr), void), \ "pointer type mismatch in container_of()"); \ ((type *)(__mptr - offsetof(type, member))); }) That's why we see the __mptr variable first but it failed since it has no type information. Then for_each_leaf_cfs_rq_safe() is defined as #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \ leaf_cfs_rq_list) Note that the access was 0x140(r14). And the cfs_rq has leaf_cfs_rq_list at the 0x140. So it converts the list_head pointer to a pointer to struct cfs_rq here. $ pahole --hex -C cfs_rq vmlinux | grep 140 struct cfs_rq struct list_head leaf_cfs_rq_list; /* 0x140 0x10 */ Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240816235840.2754937-9-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 44 +++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 17 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 916d26bfb9eb..e86f40fed323 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -1244,7 +1244,6 @@ again: pr_debug_dtp("found by insn track: %#x(%s) type-offset=%#x\n", dloc->op->offset, buf, dloc->type_offset); - pr_debug_type_name(type_die, TSR_KIND_TYPE); break; } @@ -1273,6 +1272,7 @@ static int find_data_type_die(struct data_loc_info *dloc, Dwarf_Die *type_die) int fbreg = -1; int fb_offset = 0; bool is_fbreg = false; + bool found = false; u64 pc; char buf[64]; enum type_match_result result; @@ -1358,14 +1358,17 @@ retry: /* Search from the inner-most scope to the outer */ for (i = nr_scopes - 1; i >= 0; i--) { + Dwarf_Die mem_die; + int type_offset = offset; + if (reg == DWARF_REG_PC) { if (!die_find_variable_by_addr(&scopes[i], dloc->var_addr, - &var_die, &offset)) + &var_die, &type_offset)) continue; } else { /* Look up variables/parameters in this scope */ if (!die_find_variable_by_reg(&scopes[i], pc, reg, - &offset, is_fbreg, &var_die)) + &type_offset, is_fbreg, &var_die)) continue; } @@ -1374,43 +1377,50 @@ retry: i+1, nr_scopes, (long)dwarf_dieoffset(&scopes[i])); /* Found a variable, see if it's correct */ 
- result = check_variable(dloc, &var_die, type_die, reg, offset, is_fbreg); + result = check_variable(dloc, &var_die, &mem_die, reg, type_offset, is_fbreg); if (result == PERF_TMR_OK) { if (reg == DWARF_REG_PC) { pr_debug_dtp("addr=%#"PRIx64" type_offset=%#x\n", - dloc->var_addr, offset); + dloc->var_addr, type_offset); } else if (reg == DWARF_REG_FB || is_fbreg) { pr_debug_dtp("stack_offset=%#x type_offset=%#x\n", - fb_offset, offset); + fb_offset, type_offset); } else { - pr_debug_dtp("type_offset=%#x\n", offset); + pr_debug_dtp("type_offset=%#x\n", type_offset); + } + + if (!found || is_better_type(type_die, &mem_die)) { + *type_die = mem_die; + dloc->type_offset = type_offset; + found = true; } - ret = 0; } else { pr_debug_dtp("failed: %s\n", match_result_str(result)); - ret = -1; } + pr_debug_location(&var_die, pc, reg); - pr_debug_type_name(type_die, TSR_KIND_TYPE); - dloc->type_offset = offset; - goto out; + pr_debug_type_name(&mem_die, TSR_KIND_TYPE); } - if (loc->multi_regs && reg == loc->reg1 && loc->reg1 != loc->reg2) { + if (!found && loc->multi_regs && reg == loc->reg1 && loc->reg1 != loc->reg2) { reg = loc->reg2; goto retry; } - if (reg != DWARF_REG_PC) { + if (!found && reg != DWARF_REG_PC) { result = find_data_type_block(dloc, &cu_die, scopes, - nr_scopes, type_die); + nr_scopes, type_die); if (result == PERF_TMR_OK) { ann_data_stat.insn_track++; - ret = 0; + found = true; } } - if (ret < 0) { + if (found) { + pr_debug_dtp("final type:"); + pr_debug_type_name(type_die, TSR_KIND_TYPE); + ret = 0; + } else { pr_debug_dtp("no variable found\n"); ann_data_stat.no_var++; } -- cgit v1.2.3 From 023aceecc74ae7dd9e91c8e35d641da74a371078 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 16 Aug 2024 16:58:39 -0700 Subject: perf annotate-data: Update type stat at the end of find_data_type_die() After trying all possibilities with DWARF and instruction tracking. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240816235840.2754937-10-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 47 ++++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 17 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index e86f40fed323..aa330c7d8edd 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -432,10 +432,8 @@ static enum type_match_result check_variable(struct data_loc_info *dloc, needs_pointer = false; /* Get the type of the variable */ - if (__die_get_real_type(var_die, type_die) == NULL) { - ann_data_stat.no_typeinfo++; + if (__die_get_real_type(var_die, type_die) == NULL) return PERF_TMR_NO_TYPE; - } /* * Usually it expects a pointer type for a memory access. 
@@ -444,10 +442,8 @@ static enum type_match_result check_variable(struct data_loc_info *dloc, */ if (needs_pointer) { if (!is_pointer_type(type_die) || - __die_get_real_type(type_die, type_die) == NULL) { - ann_data_stat.no_typeinfo++; + __die_get_real_type(type_die, type_die) == NULL) return PERF_TMR_NO_POINTER; - } } if (dwarf_tag(type_die) == DW_TAG_typedef) @@ -456,16 +452,12 @@ static enum type_match_result check_variable(struct data_loc_info *dloc, sized_type = *type_die; /* Get the size of the actual type */ - if (dwarf_aggregate_size(&sized_type, &size) < 0) { - ann_data_stat.invalid_size++; + if (dwarf_aggregate_size(&sized_type, &size) < 0) return PERF_TMR_NO_SIZE; - } /* Minimal sanity check */ - if ((unsigned)offset >= size) { - ann_data_stat.bad_offset++; + if ((unsigned)offset >= size) return PERF_TMR_BAD_OFFSET; - } return PERF_TMR_OK; } @@ -1275,7 +1267,7 @@ static int find_data_type_die(struct data_loc_info *dloc, Dwarf_Die *type_die) bool found = false; u64 pc; char buf[64]; - enum type_match_result result; + enum type_match_result result = PERF_TMR_UNKNOWN; if (dloc->op->multi_regs) snprintf(buf, sizeof(buf), "reg%d, reg%d", dloc->op->reg1, dloc->op->reg2); @@ -1317,7 +1309,7 @@ static int find_data_type_die(struct data_loc_info *dloc, Dwarf_Die *type_die) pr_debug_dtp("found by addr=%#"PRIx64" type_offset=%#x\n", dloc->var_addr, offset); pr_debug_type_name(type_die, TSR_KIND_TYPE); - ret = 0; + found = true; goto out; } } @@ -1416,16 +1408,37 @@ retry: } } +out: if (found) { pr_debug_dtp("final type:"); pr_debug_type_name(type_die, TSR_KIND_TYPE); ret = 0; } else { - pr_debug_dtp("no variable found\n"); - ann_data_stat.no_var++; + switch (result) { + case PERF_TMR_NO_TYPE: + case PERF_TMR_NO_POINTER: + pr_debug_dtp("%s\n", match_result_str(result)); + ann_data_stat.no_typeinfo++; + break; + case PERF_TMR_NO_SIZE: + pr_debug_dtp("%s\n", match_result_str(result)); + ann_data_stat.invalid_size++; + break; + case PERF_TMR_BAD_OFFSET: + pr_debug_dtp("%s\n", match_result_str(result)); + ann_data_stat.bad_offset++; + break; + case PERF_TMR_UNKNOWN: + case PERF_TMR_BAIL_OUT: + case PERF_TMR_OK: /* should not reach here */ + default: + pr_debug_dtp("no variable found\n"); + ann_data_stat.no_var++; + break; + } + ret = -1; } -out: free(scopes); return ret; } -- cgit v1.2.3 From 4e97d521c2be094718c4c5c7c4f785e8972b4af0 Mon Sep 17 00:00:00 2001 From: Antonio Ojea Date: Tue, 2 Jul 2024 13:15:36 +0200 Subject: selftests: netfilter: nft_queue.sh: sctp coverage Test that nfqueue with and without GSO process SCTP packets correctly. Joint work with Florian and Pablo. 
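For readers unfamiliar with NFQUEUE: the test relies on the in-tree nf_queue helper to read and reinject queued packets. A minimal user-space reader does roughly the following (a sketch using the classic libnetfilter_queue API, not the helper's actual source; error handling omitted):

#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/netfilter.h>		/* NF_ACCEPT */
#include <libnetfilter_queue/libnetfilter_queue.h>

/* Issue an ACCEPT verdict for every packet the kernel queues to us. */
static int cb(struct nfq_q_handle *qh, struct nfgenmsg *nfmsg,
	      struct nfq_data *nfa, void *data)
{
	struct nfqnl_msg_packet_hdr *ph = nfq_get_msg_packet_hdr(nfa);

	return nfq_set_verdict(qh, ph ? ntohl(ph->packet_id) : 0,
			       NF_ACCEPT, 0, NULL);
}

int main(void)
{
	struct nfq_handle *h = nfq_open();
	struct nfq_q_handle *qh = nfq_create_queue(h, 10, &cb, NULL);	/* queue 10, as in the test */
	char buf[0xffff];
	int rv;

	nfq_set_mode(qh, NFQNL_COPY_PACKET, 0xffff);
	while ((rv = recv(nfq_fd(h), buf, sizeof(buf), 0)) >= 0)
		nfq_handle_packet(h, buf, rv);
	return 0;
}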
Signed-off-by: Antonio Ojea Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/net/netfilter/config | 2 + tools/testing/selftests/net/netfilter/nft_queue.sh | 85 +++++++++++++++++++++- 2 files changed, 86 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/config b/tools/testing/selftests/net/netfilter/config index 63ef80ef47a4..b2dd4db45215 100644 --- a/tools/testing/selftests/net/netfilter/config +++ b/tools/testing/selftests/net/netfilter/config @@ -87,3 +87,5 @@ CONFIG_XFRM_USER=m CONFIG_XFRM_STATISTICS=y CONFIG_NET_PKTGEN=m CONFIG_TUN=m +CONFIG_INET_DIAG=m +CONFIG_SCTP_DIAG=m diff --git a/tools/testing/selftests/net/netfilter/nft_queue.sh b/tools/testing/selftests/net/netfilter/nft_queue.sh index c61d23a8c88d..f3bdeb1271eb 100755 --- a/tools/testing/selftests/net/netfilter/nft_queue.sh +++ b/tools/testing/selftests/net/netfilter/nft_queue.sh @@ -25,6 +25,9 @@ cleanup() } checktool "nft --version" "test without nft tool" +checktool "socat -h" "run test without socat" + +modprobe -q sctp trap cleanup EXIT @@ -265,7 +268,6 @@ test_tcp_forward() test_tcp_localhost() { - dd conv=sparse status=none if=/dev/zero bs=1M count=200 of="$TMPINPUT" timeout 5 ip netns exec "$nsrouter" socat -u TCP-LISTEN:12345 STDOUT >/dev/null & local rpid=$! @@ -375,6 +377,82 @@ EOF wait 2>/dev/null } +sctp_listener_ready() +{ + ss -S -N "$1" -ln -o "sport = :12345" | grep -q 12345 +} + +test_sctp_forward() +{ + ip netns exec "$nsrouter" nft -f /dev/stdin < "$TMPFILE1" & + local rpid=$! + + busywait "$BUSYWAIT_TIMEOUT" sctp_listener_ready "$ns2" + + ip netns exec "$nsrouter" ./nf_queue -q 10 -G -t "$timeout" & + local nfqpid=$! + + ip netns exec "$ns1" socat -u STDIN SCTP:10.0.2.99:12345 <"$TMPINPUT" >/dev/null + + if ! ip netns exec "$nsrouter" nft delete table inet sctpq; then + echo "FAIL: Could not delete sctpq table" + exit 1 + fi + + wait "$rpid" && echo "PASS: sctp and nfqueue in forward chain" + + if ! diff -u "$TMPINPUT" "$TMPFILE1" ; then + echo "FAIL: lost packets?!" 1>&2 + exit 1 + fi +} + +test_sctp_output() +{ + ip netns exec "$ns1" nft -f /dev/stdin < "$TMPFILE1" & + local rpid=$! + + busywait "$BUSYWAIT_TIMEOUT" sctp_listener_ready "$ns2" + + ip netns exec "$ns1" ./nf_queue -q 11 -t "$timeout" & + local nfqpid=$! + + ip netns exec "$ns1" socat -u STDIN SCTP:10.0.2.99:12345 <"$TMPINPUT" >/dev/null + + if ! ip netns exec "$ns1" nft delete table inet sctpq; then + echo "FAIL: Could not delete sctpq table" + exit 1 + fi + + # must wait before checking completeness of output file. + wait "$rpid" && echo "PASS: sctp and nfqueue in output chain with GSO" + + if ! diff -u "$TMPINPUT" "$TMPFILE1" ; then + echo "FAIL: lost packets?!" 1>&2 + exit 1 + fi +} + test_queue_removal() { read tainted_then < /proc/sys/kernel/tainted @@ -443,11 +521,16 @@ test_queue 10 # same. We queue to a second program as well. 
load_ruleset "filter2" 20 test_queue 20 +ip netns exec "$ns1" nft flush ruleset test_tcp_forward test_tcp_localhost test_tcp_localhost_connectclose test_tcp_localhost_requeue +test_sctp_forward +test_sctp_output + +# should be last, adds vrf device in ns1 and changes routes test_icmp_vrf test_queue_removal -- cgit v1.2.3 From 2518e13275ab9ea6b2540f828cf78b0280991f85 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Mon, 19 Aug 2024 10:34:03 +0800 Subject: perf python: Fix the build on 32-bit arm by including missing "util/sample.h" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 32-bit arm build system will complain: tools/perf/util/python.c:75:28: error: field ‘sample’ has incomplete type 75 | struct perf_sample sample; However, the arm64 build system doesn't complain about this. The root cause is that arm64 defines "HAVE_KVM_STAT_SUPPORT := 1" in tools/perf/arch/arm64/Makefile, but the arm arch doesn't. This leads kvm-stat.h to include other header files on the arm64 build system, especially "util/sample.h" for util/python.c. Directly include "util/sample.h" in "util/python.c" to avoid such a build issue on the arm platform. Signed-off-by: Xu Yang Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: imx@lists.linux.dev Link: https://lore.kernel.org/r/20240819023403.201324-1-xu.yang_2@nxp.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/python.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 3be882b2e845..31a223eaf8e6 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -20,6 +20,7 @@ #include "util/env.h" #include "util/kvm-stat.h" #include "util/kwork.h" +#include "util/sample.h" #include "util/lock-contention.h" #include #include "../builtin.h" -- cgit v1.2.3 From 2aebebb834e208344eeabcc4cacfc680026c8f6f Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Aug 2024 23:44:30 -0700 Subject: perf synthetic-events: Avoid unnecessary memset Make sure the memset of a synthesized event only zeros the necessary tracing data part of the event, as a full event can be over 4kb in size.
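The fix itself is a one-liner; its effect is easiest to see side by side (an illustrative fragment, assuming the usual union perf_event layout in which tracing_data is one small member of a large union):

union perf_event ev;

memset(&ev, 0, sizeof(ev));			/* before: zeroes the entire union */
memset(&ev, 0, sizeof(ev.tracing_data));	/* after: zeroes only the member used */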
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anne Macedo Cc: Arnd Bergmann Cc: Athira Rajeev Cc: Casey Chen Cc: Chaitanya S Prakash Cc: Colin Ian King Cc: Dominique Martinet Cc: Ingo Molnar Cc: James Clark Cc: Jann Horn Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Masahiro Yamada Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sun Haiyong Cc: Weilin Wang Cc: Yang Jihong Cc: Yunseong Kim Cc: Ze Gao Link: https://lore.kernel.org/r/20240817064442.2152089-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/synthetic-events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index 7f884d70de81..0a7f93ae76fb 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -2200,7 +2200,7 @@ int perf_event__synthesize_tracing_data(const struct perf_tool *tool, int fd, st if (!tdata) return -1; - memset(&ev, 0, sizeof(ev)); + memset(&ev, 0, sizeof(ev.tracing_data)); ev.tracing_data.header.type = PERF_RECORD_HEADER_TRACING_DATA; size = tdata->size; -- cgit v1.2.3 From a031073626d135b332c1004dd6ade1b1e8d7c9f8 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Aug 2024 23:44:31 -0700 Subject: perf map: API clean up map__init() is only used internally so make it static. Assume memory is zero initialized, which will better support adding fields to struct map in the future and was already the case for map__new2. To reduce complexity, change set_priv and set_erange_warned to not take a value to assign as they always assign true. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anne Macedo Cc: Arnd Bergmann Cc: Athira Rajeev Cc: Casey Chen Cc: Chaitanya S Prakash Cc: Colin Ian King Cc: Dominique Martinet Cc: Ingo Molnar Cc: James Clark Cc: Jann Horn Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Masahiro Yamada Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sun Haiyong Cc: Weilin Wang Cc: Yang Jihong Cc: Yunseong Kim Cc: Ze Gao Link: https://lore.kernel.org/r/20240817064442.2152089-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-top.c | 2 +- tools/perf/tests/vmlinux-kallsyms.c | 4 ++-- tools/perf/util/map.c | 24 ++++++++++++------------ tools/perf/util/map.h | 11 ++++------- 4 files changed, 19 insertions(+), 22 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 881b861c35ee..724a79386321 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -191,7 +191,7 @@ static void ui__warn_map_erange(struct map *map, struct symbol *sym, u64 ip) if (use_browser <= 0) sleep(5); - map__set_erange_warned(map, true); + map__set_erange_warned(map); } static void perf_top__record_precise_ip(struct perf_top *top, diff --git a/tools/perf/tests/vmlinux-kallsyms.c b/tools/perf/tests/vmlinux-kallsyms.c index cd3b480d20bd..74cdbd2ce9d0 100644 --- a/tools/perf/tests/vmlinux-kallsyms.c +++ b/tools/perf/tests/vmlinux-kallsyms.c @@ -131,7 +131,7 @@ static int test__vmlinux_matches_kallsyms_cb1(struct map *map, void *data) (dso__kernel(dso) ? 
dso__short_name(dso) : dso__name(dso))); if (pair) { - map__set_priv(pair, 1); + map__set_priv(pair); map__put(pair); } else { if (!args->header_printed) { @@ -166,7 +166,7 @@ static int test__vmlinux_matches_kallsyms_cb2(struct map *map, void *data) pr_info(":\nWARN: *%" PRIx64 "-%" PRIx64 " %" PRIx64, map__start(pair), map__end(pair), map__pgoff(pair)); pr_info(" %s\n", dso__name(dso)); - map__set_priv(pair, 1); + map__set_priv(pair); } map__put(pair); return 0; diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index e1d14936a60d..e781c8d56a9a 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -102,16 +102,20 @@ static inline bool replace_android_lib(const char *filename, char *newfilename) return false; } -void map__init(struct map *map, u64 start, u64 end, u64 pgoff, struct dso *dso) +static void map__init(struct map *map, u64 start, u64 end, u64 pgoff, + struct dso *dso, u32 prot, u32 flags) { map__set_start(map, start); map__set_end(map, end); map__set_pgoff(map, pgoff); - map__set_reloc(map, 0); + assert(map__reloc(map) == 0); map__set_dso(map, dso__get(dso)); - map__set_mapping_type(map, MAPPING_TYPE__DSO); - map__set_erange_warned(map, false); refcount_set(map__refcnt(map), 1); + RC_CHK_ACCESS(map)->prot = prot; + RC_CHK_ACCESS(map)->flags = flags; + map__set_mapping_type(map, MAPPING_TYPE__DSO); + assert(map__erange_warned(map) == false); + assert(map__priv(map) == false); } struct map *map__new(struct machine *machine, u64 start, u64 len, @@ -124,7 +128,7 @@ struct map *map__new(struct machine *machine, u64 start, u64 len, struct nsinfo *nsi = NULL; struct nsinfo *nnsi; - map = malloc(sizeof(*map)); + map = zalloc(sizeof(*map)); if (ADD_RC_CHK(result, map)) { char newfilename[PATH_MAX]; struct dso *dso, *header_bid_dso; @@ -134,8 +138,6 @@ struct map *map__new(struct machine *machine, u64 start, u64 len, anon = is_anon_memory(filename) || flags & MAP_HUGETLB; vdso = is_vdso_map(filename); no_dso = is_no_dso_memory(filename); - map->prot = prot; - map->flags = flags; nsi = nsinfo__get(thread__nsinfo(thread)); if ((anon || no_dso) && nsi && (prot & PROT_EXEC)) { @@ -169,7 +171,7 @@ struct map *map__new(struct machine *machine, u64 start, u64 len, goto out_delete; assert(!dso__kernel(dso)); - map__init(result, start, start + len, pgoff, dso); + map__init(result, start, start + len, pgoff, dso, prot, flags); if (anon || no_dso) { map->mapping_type = MAPPING_TYPE__IDENTITY; @@ -223,10 +225,8 @@ struct map *map__new2(u64 start, struct dso *dso) map = calloc(1, sizeof(*map) + (dso__kernel(dso) ? sizeof(struct kmap) : 0)); if (ADD_RC_CHK(result, map)) { - /* - * ->end will be filled after we load all the symbols - */ - map__init(result, start, 0, 0, dso); + /* ->end will be filled after we load all the symbols. 
*/ + map__init(result, start, /*end=*/0, /*pgoff=*/0, dso, /*prot=*/0, /*flags=*/0); } return result; diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h index 65e2609fa1b1..6c43f31a9fe0 100644 --- a/tools/perf/util/map.h +++ b/tools/perf/util/map.h @@ -166,9 +166,6 @@ struct thread; #define map__for_each_symbol_by_name(map, sym_name, pos, idx) \ __map__for_each_symbol_by_name(map, sym_name, (pos), idx) -void map__init(struct map *map, - u64 start, u64 end, u64 pgoff, struct dso *dso); - struct dso_id; struct build_id; @@ -285,14 +282,14 @@ static inline void map__set_reloc(struct map *map, u64 reloc) RC_CHK_ACCESS(map)->reloc = reloc; } -static inline void map__set_priv(struct map *map, int priv) +static inline void map__set_priv(struct map *map) { - RC_CHK_ACCESS(map)->priv = priv; + RC_CHK_ACCESS(map)->priv = true; } -static inline void map__set_erange_warned(struct map *map, bool erange_warned) +static inline void map__set_erange_warned(struct map *map) { - RC_CHK_ACCESS(map)->erange_warned = erange_warned; + RC_CHK_ACCESS(map)->erange_warned = true; } static inline void map__set_dso(struct map *map, struct dso *dso) -- cgit v1.2.3 From 0847c193c3d777b79af9d019d6509b5857591f33 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Aug 2024 23:44:32 -0700 Subject: perf jit: Constify filename argument Make it clearer the argument is just being used as a string. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anne Macedo Cc: Arnd Bergmann Cc: Athira Rajeev Cc: Casey Chen Cc: Chaitanya S Prakash Cc: Colin Ian King Cc: Dominique Martinet Cc: Ingo Molnar Cc: James Clark Cc: Jann Horn Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Masahiro Yamada Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sun Haiyong Cc: Weilin Wang Cc: Yang Jihong Cc: Yunseong Kim Cc: Ze Gao Link: https://lore.kernel.org/r/20240817064442.2152089-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/jit.h | 3 ++- tools/perf/util/jitdump.c | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/jit.h b/tools/perf/util/jit.h index fb810e1b2de7..f4037203e9ec 100644 --- a/tools/perf/util/jit.h +++ b/tools/perf/util/jit.h @@ -5,7 +5,8 @@ #include int jit_process(struct perf_session *session, struct perf_data *output, - struct machine *machine, char *filename, pid_t pid, pid_t tid, u64 *nbytes); + struct machine *machine, const char *filename, + pid_t pid, pid_t tid, u64 *nbytes); int jit_inject_record(const char *filename); diff --git a/tools/perf/util/jitdump.c b/tools/perf/util/jitdump.c index 5ce13653512b..346513e5e9b7 100644 --- a/tools/perf/util/jitdump.c +++ b/tools/perf/util/jitdump.c @@ -710,7 +710,7 @@ jit_process_dump(struct jit_buf_desc *jd) } static int -jit_inject(struct jit_buf_desc *jd, char *path) +jit_inject(struct jit_buf_desc *jd, const char *path) { int ret; @@ -737,7 +737,7 @@ jit_inject(struct jit_buf_desc *jd, char *path) * as captured in the RECORD_MMAP record */ static int -jit_detect(char *mmap_name, pid_t pid, struct nsinfo *nsi) +jit_detect(const char *mmap_name, pid_t pid, struct nsinfo *nsi) { char *p; char *end = NULL; @@ -821,7 +821,7 @@ int jit_process(struct perf_session *session, struct perf_data *output, struct machine *machine, - char *filename, + const char *filename, pid_t pid, pid_t tid, u64 *nbytes) -- cgit v1.2.3 From e4bb4caa54b24b839c21612d816b40673a75f6d4 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Aug 2024 23:44:33 -0700 Subject: perf dso: Constify dso_id The 
passed dso_id is copied and so is never an out argument. Remove its mutability. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anne Macedo Cc: Arnd Bergmann Cc: Athira Rajeev Cc: Casey Chen Cc: Chaitanya S Prakash Cc: Colin Ian King Cc: Dominique Martinet Cc: Ingo Molnar Cc: James Clark Cc: Jann Horn Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Masahiro Yamada Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sun Haiyong Cc: Weilin Wang Cc: Yang Jihong Cc: Yunseong Kim Cc: Ze Gao Link: https://lore.kernel.org/r/20240817064442.2152089-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-inject.c | 2 +- tools/perf/util/dso.c | 4 ++-- tools/perf/util/dso.h | 4 ++-- tools/perf/util/dsos.c | 12 ++++++------ tools/perf/util/dsos.h | 2 +- tools/perf/util/machine.c | 3 ++- tools/perf/util/machine.h | 3 ++- 7 files changed, 16 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index a35bde3f3c09..8440ddfbf4fe 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -411,7 +411,7 @@ static int perf_event__jit_repipe_mmap(const struct perf_tool *tool, #endif static struct dso *findnew_dso(int pid, int tid, const char *filename, - struct dso_id *id, struct machine *machine) + const struct dso_id *id, struct machine *machine) { struct thread *thread; struct nsinfo *nsi = NULL; diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index 67414944f245..5c6e85fdae0d 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -1327,7 +1327,7 @@ bool dso_id__empty(const struct dso_id *id) return !id->maj && !id->min && !id->ino && !id->ino_generation; } -void __dso__inject_id(struct dso *dso, struct dso_id *id) +void __dso__inject_id(struct dso *dso, const struct dso_id *id) { struct dsos *dsos = dso__dsos(dso); struct dso_id *dso_id = dso__id(dso); @@ -1417,7 +1417,7 @@ void dso__set_sorted_by_name(struct dso *dso) RC_CHK_ACCESS(dso)->sorted_by_name = true; } -struct dso *dso__new_id(const char *name, struct dso_id *id) +struct dso *dso__new_id(const char *name, const struct dso_id *id) { RC_STRUCT(dso) *dso = zalloc(sizeof(*dso) + strlen(name) + 1); struct dso *res; diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h index ed0068251c65..bb8e8f444054 100644 --- a/tools/perf/util/dso.h +++ b/tools/perf/util/dso.h @@ -640,14 +640,14 @@ static inline void dso__set_text_offset(struct dso *dso, u64 val) int dso_id__cmp(const struct dso_id *a, const struct dso_id *b); bool dso_id__empty(const struct dso_id *id); -struct dso *dso__new_id(const char *name, struct dso_id *id); +struct dso *dso__new_id(const char *name, const struct dso_id *id); struct dso *dso__new(const char *name); void dso__delete(struct dso *dso); int dso__cmp_id(struct dso *a, struct dso *b); void dso__set_short_name(struct dso *dso, const char *name, bool name_allocated); void dso__set_long_name(struct dso *dso, const char *name, bool name_allocated); -void __dso__inject_id(struct dso *dso, struct dso_id *id); +void __dso__inject_id(struct dso *dso, const struct dso_id *id); int dso__name_len(const struct dso *dso); diff --git a/tools/perf/util/dsos.c b/tools/perf/util/dsos.c index d4acdb37f046..e0998e2a7c4e 100644 --- a/tools/perf/util/dsos.c +++ b/tools/perf/util/dsos.c @@ -155,7 +155,7 @@ static int dsos__cmp_key_long_name_id(const void *vkey, const void *vdso) */ static struct dso *__dsos__find_by_longname_id(struct dsos *dsos, const char *name, - struct dso_id *id, + const struct 
dso_id *id, bool write_locked) { struct dsos__key key = { @@ -244,7 +244,7 @@ int dsos__add(struct dsos *dsos, struct dso *dso) struct dsos__find_id_cb_args { const char *name; - struct dso_id *id; + const struct dso_id *id; struct dso *res; }; @@ -260,7 +260,7 @@ static int dsos__find_id_cb(struct dso *dso, void *data) } -static struct dso *__dsos__find_id(struct dsos *dsos, const char *name, struct dso_id *id, +static struct dso *__dsos__find_id(struct dsos *dsos, const char *name, const struct dso_id *id, bool cmp_short, bool write_locked) { struct dso *res; @@ -321,7 +321,7 @@ static void dso__set_basename(struct dso *dso) dso__set_short_name(dso, base, true); } -static struct dso *__dsos__addnew_id(struct dsos *dsos, const char *name, struct dso_id *id) +static struct dso *__dsos__addnew_id(struct dsos *dsos, const char *name, const struct dso_id *id) { struct dso *dso = dso__new_id(name, id); @@ -337,7 +337,7 @@ static struct dso *__dsos__addnew_id(struct dsos *dsos, const char *name, struct return dso; } -static struct dso *__dsos__findnew_id(struct dsos *dsos, const char *name, struct dso_id *id) +static struct dso *__dsos__findnew_id(struct dsos *dsos, const char *name, const struct dso_id *id) { struct dso *dso = __dsos__find_id(dsos, name, id, false, /*write_locked=*/true); @@ -347,7 +347,7 @@ static struct dso *__dsos__findnew_id(struct dsos *dsos, const char *name, struc return dso ? dso : __dsos__addnew_id(dsos, name, id); } -struct dso *dsos__findnew_id(struct dsos *dsos, const char *name, struct dso_id *id) +struct dso *dsos__findnew_id(struct dsos *dsos, const char *name, const struct dso_id *id) { struct dso *dso; down_write(&dsos->lock); diff --git a/tools/perf/util/dsos.h b/tools/perf/util/dsos.h index 6c13b65648bc..a26774950866 100644 --- a/tools/perf/util/dsos.h +++ b/tools/perf/util/dsos.h @@ -32,7 +32,7 @@ int __dsos__add(struct dsos *dsos, struct dso *dso); int dsos__add(struct dsos *dsos, struct dso *dso); struct dso *dsos__find(struct dsos *dsos, const char *name, bool cmp_short); -struct dso *dsos__findnew_id(struct dsos *dsos, const char *name, struct dso_id *id); +struct dso *dsos__findnew_id(struct dsos *dsos, const char *name, const struct dso_id *id); bool dsos__read_build_ids(struct dsos *dsos, bool with_hits); diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index c08774d06d14..cd79a830abae 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -3128,7 +3128,8 @@ out: return addr_cpumode; } -struct dso *machine__findnew_dso_id(struct machine *machine, const char *filename, struct dso_id *id) +struct dso *machine__findnew_dso_id(struct machine *machine, const char *filename, + const struct dso_id *id) { return dsos__findnew_id(&machine->dsos, filename, id); } diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h index 82a47bac8023..a687876e3453 100644 --- a/tools/perf/util/machine.h +++ b/tools/perf/util/machine.h @@ -207,7 +207,8 @@ int machine__nr_cpus_avail(struct machine *machine); struct thread *machine__findnew_thread(struct machine *machine, pid_t pid, pid_t tid); -struct dso *machine__findnew_dso_id(struct machine *machine, const char *filename, struct dso_id *id); +struct dso *machine__findnew_dso_id(struct machine *machine, const char *filename, + const struct dso_id *id); struct dso *machine__findnew_dso(struct machine *machine, const char *filename); size_t machine__fprintf(struct machine *machine, FILE *fp); -- cgit v1.2.3 From 63c89dc5e129a73cd3f8528640d2496543b79706 Mon Sep 17 00:00:00 2001 
From: Ian Rogers Date: Fri, 16 Aug 2024 23:44:34 -0700 Subject: perf evsel: Constify evsel__id_hdr_size() argument Allows evsel__id_hdr_size() to be used when the evsel is const. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anne Macedo Cc: Arnd Bergmann Cc: Athira Rajeev Cc: Casey Chen Cc: Chaitanya S Prakash Cc: Colin Ian King Cc: Dominique Martinet Cc: Ingo Molnar Cc: James Clark Cc: Jann Horn Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Masahiro Yamada Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sun Haiyong Cc: Weilin Wang Cc: Yang Jihong Cc: Yunseong Kim Cc: Ze Gao Link: https://lore.kernel.org/r/20240817064442.2152089-6-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 2 +- tools/perf/util/evsel.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 38a74d6dde49..49cc71511c0c 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -3048,7 +3048,7 @@ int evsel__parse_sample_timestamp(struct evsel *evsel, union perf_event *event, return 0; } -u16 evsel__id_hdr_size(struct evsel *evsel) +u16 evsel__id_hdr_size(const struct evsel *evsel) { u64 sample_type = evsel->core.attr.sample_type; u16 size = 0; diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 4316992f6a69..15acf293e12a 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -442,7 +442,7 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event, int evsel__parse_sample_timestamp(struct evsel *evsel, union perf_event *event, u64 *timestamp); -u16 evsel__id_hdr_size(struct evsel *evsel); +u16 evsel__id_hdr_size(const struct evsel *evsel); static inline struct evsel *evsel__next(struct evsel *evsel) { -- cgit v1.2.3 From a8656614ebe017a0487ba9f85213910c09e89c59 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Aug 2024 23:44:35 -0700 Subject: perf test: Expand pipe/inject test Test recording of call-graphs and injecting --build-all. Add/expand trap handler. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anne Macedo Cc: Arnd Bergmann Cc: Athira Rajeev Cc: Casey Chen Cc: Chaitanya S Prakash Cc: Colin Ian King Cc: Dominique Martinet Cc: Ingo Molnar Cc: James Clark Cc: Jann Horn Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Masahiro Yamada Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sun Haiyong Cc: Weilin Wang Cc: Yang Jihong Cc: Yunseong Kim Cc: Ze Gao Link: https://lore.kernel.org/r/20240817064442.2152089-7-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/pipe_test.sh | 101 ++++++++++++++++++++++++++++-------- 1 file changed, 79 insertions(+), 22 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/pipe_test.sh b/tools/perf/tests/shell/pipe_test.sh index a78d35d2cff0..ad10012fdc29 100755 --- a/tools/perf/tests/shell/pipe_test.sh +++ b/tools/perf/tests/shell/pipe_test.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # perf pipe recording and injection test # SPDX-License-Identifier: GPL-2.0 @@ -12,30 +12,87 @@ skip_test_missing_symbol ${sym} data=$(mktemp /tmp/perf.data.XXXXXX) prog="perf test -w noploop" -task="perf" +err=0 -if ! perf record -e task-clock:u -o - ${prog} | perf report -i - --task | grep ${task}; then - echo "cannot find the test file in the perf report" - exit 1 -fi +set -e -if ! 
perf record -e task-clock:u -o - ${prog} | perf inject -b | perf report -i - | grep ${sym}; then - echo "cannot find noploop function in pipe #1" - exit 1 -fi +cleanup() { + rm -rf "${data}" + rm -rf "${data}".old -perf record -e task-clock:u -o - ${prog} | perf inject -b -o ${data} -if ! perf report -i ${data} | grep ${sym}; then - echo "cannot find noploop function in pipe #2" - exit 1 -fi + trap - EXIT TERM INT +} -perf record -e task-clock:u -o ${data} ${prog} -if ! perf inject -b -i ${data} | perf report -i - | grep ${sym}; then - echo "cannot find noploop function in pipe #3" - exit 1 -fi +trap_cleanup() { + echo "Unexpected signal in ${FUNCNAME[1]}" + cleanup + exit 1 +} +trap trap_cleanup EXIT TERM INT +test_record_report() { + echo + echo "Record+report pipe test" + + task="perf" + if ! perf record -e task-clock:u -o - ${prog} | perf report -i - --task | grep -q ${task} + then + echo "Record+report pipe test [Failed - cannot find the test file in the perf report #1]" + err=1 + return + fi + + if ! perf record -g -e task-clock:u -o - ${prog} | perf report -i - --task | grep -q ${task} + then + echo "Record+report pipe test [Failed - cannot find the test file in the perf report #2]" + err=1 + return + fi + + echo "Record+report pipe test [Success]" +} + +test_inject_bids() { + inject_opt=$1 + + echo + echo "Inject ${inject_opt} build-ids test" + + if ! perf record -e task-clock:u -o - ${prog} | perf inject ${inject_opt}| perf report -i - | grep -q ${sym} + then + echo "Inject build-ids test [Failed - cannot find noploop function in pipe #1]" + err=1 + return + fi + + if ! perf record -g -e task-clock:u -o - ${prog} | perf inject ${inject_opt} | perf report -i - | grep -q ${sym} + then + echo "Inject ${inject_opt} build-ids test [Failed - cannot find noploop function in pipe #2]" + err=1 + return + fi + + perf record -e task-clock:u -o - ${prog} | perf inject ${inject_opt} -o ${data} + if ! perf report -i ${data} | grep -q ${sym}; then + echo "Inject ${inject_opt} build-ids test [Failed - cannot find noploop function in pipe #3]" + err=1 + return + fi + + perf record -e task-clock:u -o ${data} ${prog} + if ! perf inject ${inject_opt} -i ${data} | perf report -i - | grep -q ${sym}; then + echo "Inject ${inject_opt} build-ids test [Failed - cannot find noploop function in pipe #4]" + err=1 + return + fi + + echo "Inject ${inject_opt} build-ids test [Success]" +} + +test_record_report +test_inject_bids -b +test_inject_bids --buildid-all + +cleanup +exit $err -rm -f ${data} ${data}.old -exit 0 -- cgit v1.2.3 From 0ed4c8c31139e3360c5db9f536e8ff37c1824f8a Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Aug 2024 23:44:36 -0700 Subject: perf inject: Combine build_ids and build_id_all into enum It is clearer to have a single enum that determines how build ids are injected, it also allows for future extension. Set the header build ID feature whether lazy or all are generated, previously only the lazy case would set it. Allow parsing of known build IDs for either the lazy or all cases. 
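Condensed from the patch below, the command-line flags now map onto the enum like this (note that --buildid-all takes precedence when both flags are given, since its assignment comes second):

enum build_id_rewrite_style style = BID_RWS__NONE;

if (build_ids)		/* -b/--build-ids */
	style = BID_RWS__INJECT_HEADER_LAZY;
if (build_id_all)	/* --buildid-all */
	style = BID_RWS__INJECT_HEADER_ALL;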
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anne Macedo Cc: Arnd Bergmann Cc: Athira Rajeev Cc: Casey Chen Cc: Chaitanya S Prakash Cc: Colin Ian King Cc: Dominique Martinet Cc: Ingo Molnar Cc: James Clark Cc: Jann Horn Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Masahiro Yamada Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sun Haiyong Cc: Weilin Wang Cc: Yang Jihong Cc: Yunseong Kim Cc: Ze Gao Link: https://lore.kernel.org/r/20240817064442.2152089-8-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-inject.c | 51 +++++++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 20 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 8440ddfbf4fe..865d16ceead2 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -103,11 +103,16 @@ struct guest_session { struct guest_event ev; }; +enum build_id_rewrite_style { + BID_RWS__NONE = 0, + BID_RWS__INJECT_HEADER_LAZY, + BID_RWS__INJECT_HEADER_ALL, +}; + struct perf_inject { struct perf_tool tool; struct perf_session *session; - bool build_ids; - bool build_id_all; + enum build_id_rewrite_style build_id_style; bool sched_stat; bool have_auxtrace; bool strip; @@ -2015,8 +2020,8 @@ static int __cmd_inject(struct perf_inject *inject) signal(SIGINT, sig_handler); - if (inject->build_ids || inject->sched_stat || - inject->itrace_synth_opts.set || inject->build_id_all) { + if (inject->build_id_style != BID_RWS__NONE || inject->sched_stat || + inject->itrace_synth_opts.set) { inject->tool.mmap = perf_event__repipe_mmap; inject->tool.mmap2 = perf_event__repipe_mmap2; inject->tool.fork = perf_event__repipe_fork; @@ -2027,10 +2032,10 @@ static int __cmd_inject(struct perf_inject *inject) output_data_offset = perf_session__data_offset(session->evlist); - if (inject->build_id_all) { + if (inject->build_id_style == BID_RWS__INJECT_HEADER_ALL) { inject->tool.mmap = perf_event__repipe_buildid_mmap; inject->tool.mmap2 = perf_event__repipe_buildid_mmap2; - } else if (inject->build_ids) { + } else if (inject->build_id_style == BID_RWS__INJECT_HEADER_LAZY) { inject->tool.sample = perf_event__inject_buildid; } else if (inject->sched_stat) { struct evsel *evsel; @@ -2148,9 +2153,9 @@ static int __cmd_inject(struct perf_inject *inject) .inject = inject, }; - if (inject->build_ids) - perf_header__set_feat(&session->header, - HEADER_BUILD_ID); + if (inject->build_id_style == BID_RWS__INJECT_HEADER_LAZY || + inject->build_id_style == BID_RWS__INJECT_HEADER_ALL) + perf_header__set_feat(&session->header, HEADER_BUILD_ID); /* * Keep all buildids when there is unprocessed AUX data because * it is not known which ones the AUX trace hits. 
@@ -2211,11 +2216,13 @@ int cmd_inject(int argc, const char **argv) int ret; bool repipe = true; const char *known_build_ids = NULL; + bool build_ids; + bool build_id_all; struct option options[] = { - OPT_BOOLEAN('b', "build-ids", &inject.build_ids, + OPT_BOOLEAN('b', "build-ids", &build_ids, "Inject build-ids into the output stream"), - OPT_BOOLEAN(0, "buildid-all", &inject.build_id_all, + OPT_BOOLEAN(0, "buildid-all", &build_id_all, "Inject build-ids of all DSOs into the output stream"), OPT_STRING(0, "known-build-ids", &known_build_ids, "buildid path [,buildid path...]", @@ -2313,6 +2320,10 @@ int cmd_inject(int argc, const char **argv) return -1; } } + if (build_ids) + inject.build_id_style = BID_RWS__INJECT_HEADER_LAZY; + if (build_id_all) + inject.build_id_style = BID_RWS__INJECT_HEADER_ALL; data.path = inject.input_name; if (!strcmp(inject.input_name, "-") || inject.output.is_pipe) { @@ -2326,7 +2337,7 @@ int cmd_inject(int argc, const char **argv) repipe = false; } ordered_events = inject.jit_mode || inject.sched_stat || - (inject.build_ids && !inject.build_id_all); + (inject.build_id_style == BID_RWS__INJECT_HEADER_LAZY); perf_tool__init(&inject.tool, ordered_events); inject.tool.sample = perf_event__repipe_sample; inject.tool.read = perf_event__repipe_sample; @@ -2398,7 +2409,7 @@ int cmd_inject(int argc, const char **argv) goto out_delete; } - if (inject.build_ids && !inject.build_id_all) { + if (inject.build_id_style == BID_RWS__INJECT_HEADER_LAZY) { /* * to make sure the mmap records are ordered correctly * and so that the correct especially due to jitted code @@ -2406,14 +2417,14 @@ int cmd_inject(int argc, const char **argv) * inject the jit mmaps at the same time for now. */ inject.tool.ordering_requires_timestamps = true; - if (known_build_ids != NULL) { - inject.known_build_ids = - perf_inject__parse_known_build_ids(known_build_ids); + } + if (inject.build_id_style != BID_RWS__NONE && known_build_ids != NULL) { + inject.known_build_ids = + perf_inject__parse_known_build_ids(known_build_ids); - if (inject.known_build_ids == NULL) { - pr_err("Couldn't parse known build ids.\n"); - goto out_delete; - } + if (inject.known_build_ids == NULL) { + pr_err("Couldn't parse known build ids.\n"); + goto out_delete; } } -- cgit v1.2.3 From 048a7a9363a276ffc15e137ae8544d2cd7c28b67 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Aug 2024 23:44:37 -0700 Subject: perf inject: Combine different mmap and mmap2 functions There are repipe, build ID and JIT dump variants of the mmap and mmap2 repipe functions. The organization doesn't allow JIT dump to work with build ID injection and the structure is less than clear. Combine the function and enable the different behaviors based on ifs. 
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anne Macedo Cc: Arnd Bergmann Cc: Athira Rajeev Cc: Casey Chen Cc: Chaitanya S Prakash Cc: Colin Ian King Cc: Dominique Martinet Cc: Ingo Molnar Cc: James Clark Cc: Jann Horn Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Masahiro Yamada Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sun Haiyong Cc: Weilin Wang Cc: Yang Jihong Cc: Yunseong Kim Cc: Ze Gao Link: https://lore.kernel.org/r/20240817064442.2152089-9-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-inject.c | 199 +++++++++++++++++--------------------------- 1 file changed, 76 insertions(+), 123 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 865d16ceead2..d99868953ff2 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -377,44 +377,6 @@ static int perf_event__repipe_sample(const struct perf_tool *tool, return perf_event__repipe_synth(tool, event); } -static int perf_event__repipe_mmap(const struct perf_tool *tool, - union perf_event *event, - struct perf_sample *sample, - struct machine *machine) -{ - int err; - - err = perf_event__process_mmap(tool, event, sample, machine); - perf_event__repipe(tool, event, sample, machine); - - return err; -} - -#ifdef HAVE_JITDUMP -static int perf_event__jit_repipe_mmap(const struct perf_tool *tool, - union perf_event *event, - struct perf_sample *sample, - struct machine *machine) -{ - struct perf_inject *inject = container_of(tool, struct perf_inject, tool); - u64 n = 0; - int ret; - - /* - * if jit marker, then inject jit mmaps and generate ELF images - */ - ret = jit_process(inject->session, &inject->output, machine, - event->mmap.filename, event->mmap.pid, event->mmap.tid, &n); - if (ret < 0) - return ret; - if (ret) { - inject->bytes_written += n; - return 0; - } - return perf_event__repipe_mmap(tool, event, sample, machine); -} -#endif - static struct dso *findnew_dso(int pid, int tid, const char *filename, const struct dso_id *id, struct machine *machine) { @@ -460,114 +422,108 @@ static struct dso *findnew_dso(int pid, int tid, const char *filename, return dso; } -static int perf_event__repipe_buildid_mmap(const struct perf_tool *tool, - union perf_event *event, - struct perf_sample *sample, - struct machine *machine) -{ - struct dso *dso; - - dso = findnew_dso(event->mmap.pid, event->mmap.tid, - event->mmap.filename, NULL, machine); - - if (dso && !dso__hit(dso)) { - dso__set_hit(dso); - dso__inject_build_id(dso, tool, machine, sample->cpumode, 0); - } - dso__put(dso); - - return perf_event__repipe(tool, event, sample, machine); -} - -static int perf_event__repipe_mmap2(const struct perf_tool *tool, +static int perf_event__repipe_mmap(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) { - int err; + struct perf_inject *inject = container_of(tool, struct perf_inject, tool); - err = perf_event__process_mmap2(tool, event, sample, machine); - perf_event__repipe(tool, event, sample, machine); +#ifdef HAVE_JITDUMP + if (inject->jit_mode) { + u64 n = 0; + int ret; - if (event->header.misc & PERF_RECORD_MISC_MMAP_BUILD_ID) { - struct dso *dso; + /* If jit marker, then inject jit mmaps and generate ELF images. 
*/ + ret = jit_process(inject->session, &inject->output, machine, + event->mmap.filename, event->mmap.pid, event->mmap.tid, &n); + if (ret < 0) + return ret; + if (ret) { + inject->bytes_written += n; + return 0; + } + } +#endif + if (inject->build_id_style == BID_RWS__INJECT_HEADER_ALL) { + struct dso *dso = findnew_dso(event->mmap.pid, event->mmap.tid, + event->mmap.filename, NULL, machine); - dso = findnew_dso(event->mmap2.pid, event->mmap2.tid, - event->mmap2.filename, NULL, machine); - if (dso) { - /* mark it not to inject build-id */ + if (dso && !dso__hit(dso)) { dso__set_hit(dso); + dso__inject_build_id(dso, tool, machine, sample->cpumode, 0); } dso__put(dso); - } + } else { + /* Create the thread, map, etc. Not done for the unordered inject all case. */ + int err = perf_event__process_mmap(tool, event, sample, machine); - return err; + if (err) + return err; + } + return perf_event__repipe(tool, event, sample, machine); } -#ifdef HAVE_JITDUMP -static int perf_event__jit_repipe_mmap2(const struct perf_tool *tool, - union perf_event *event, - struct perf_sample *sample, - struct machine *machine) +static int perf_event__repipe_mmap2(const struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample, + struct machine *machine) { struct perf_inject *inject = container_of(tool, struct perf_inject, tool); - u64 n = 0; - int ret; + struct dso *dso = NULL; - /* - * if jit marker, then inject jit mmaps and generate ELF images - */ - ret = jit_process(inject->session, &inject->output, machine, - event->mmap2.filename, event->mmap2.pid, event->mmap2.tid, &n); - if (ret < 0) - return ret; - if (ret) { - inject->bytes_written += n; - return 0; +#ifdef HAVE_JITDUMP + if (inject->jit_mode) { + u64 n = 0; + int ret; + + /* If jit marker, then inject jit mmaps and generate ELF images. */ + ret = jit_process(inject->session, &inject->output, machine, + event->mmap2.filename, event->mmap2.pid, event->mmap2.tid, &n); + if (ret < 0) + return ret; + if (ret) { + inject->bytes_written += n; + return 0; + } } - return perf_event__repipe_mmap2(tool, event, sample, machine); -} #endif - -static int perf_event__repipe_buildid_mmap2(const struct perf_tool *tool, - union perf_event *event, - struct perf_sample *sample, - struct machine *machine) -{ - struct dso_id dso_id = { - .maj = event->mmap2.maj, - .min = event->mmap2.min, - .ino = event->mmap2.ino, - .ino_generation = event->mmap2.ino_generation, - }; - struct dso *dso; - if (event->header.misc & PERF_RECORD_MISC_MMAP_BUILD_ID) { - /* cannot use dso_id since it'd have invalid info */ dso = findnew_dso(event->mmap2.pid, event->mmap2.tid, event->mmap2.filename, NULL, machine); if (dso) { /* mark it not to inject build-id */ dso__set_hit(dso); } - dso__put(dso); - perf_event__repipe(tool, event, sample, machine); - return 0; } + if (inject->build_id_style == BID_RWS__INJECT_HEADER_ALL) { + if (!(event->header.misc & PERF_RECORD_MISC_MMAP_BUILD_ID)) { + struct dso_id dso_id = { + .maj = event->mmap2.maj, + .min = event->mmap2.min, + .ino = event->mmap2.ino, + .ino_generation = event->mmap2.ino_generation, + }; + + dso = findnew_dso(event->mmap2.pid, event->mmap2.tid, + event->mmap2.filename, &dso_id, machine); + } + if (dso && !dso__hit(dso)) { + dso__set_hit(dso); + dso__inject_build_id(dso, tool, machine, sample->cpumode, + event->mmap2.flags); + } + } else { + /* Create the thread, map, etc. Not done for the unordered inject all case. 
*/ + int err = perf_event__process_mmap(tool, event, sample, machine); - dso = findnew_dso(event->mmap2.pid, event->mmap2.tid, - event->mmap2.filename, &dso_id, machine); - - if (dso && !dso__hit(dso)) { - dso__set_hit(dso); - dso__inject_build_id(dso, tool, machine, sample->cpumode, - event->mmap2.flags); + if (err) { + dso__put(dso); + return err; + } } dso__put(dso); - - perf_event__repipe(tool, event, sample, machine); - - return 0; + return perf_event__repipe(tool, event, sample, machine); } static int perf_event__repipe_fork(const struct perf_tool *tool, @@ -2032,10 +1988,7 @@ static int __cmd_inject(struct perf_inject *inject) output_data_offset = perf_session__data_offset(session->evlist); - if (inject->build_id_style == BID_RWS__INJECT_HEADER_ALL) { - inject->tool.mmap = perf_event__repipe_buildid_mmap; - inject->tool.mmap2 = perf_event__repipe_buildid_mmap2; - } else if (inject->build_id_style == BID_RWS__INJECT_HEADER_LAZY) { + if (inject->build_id_style == BID_RWS__INJECT_HEADER_LAZY) { inject->tool.sample = perf_event__inject_buildid; } else if (inject->sched_stat) { struct evsel *evsel; @@ -2430,8 +2383,8 @@ int cmd_inject(int argc, const char **argv) #ifdef HAVE_JITDUMP if (inject.jit_mode) { - inject.tool.mmap2 = perf_event__jit_repipe_mmap2; - inject.tool.mmap = perf_event__jit_repipe_mmap; + inject.tool.mmap2 = perf_event__repipe_mmap2; + inject.tool.mmap = perf_event__repipe_mmap; inject.tool.ordering_requires_timestamps = true; /* * JIT MMAP injection injects all MMAP events in one go, so it -- cgit v1.2.3 From 05c4cfeba097b986532c7a5609dcc5435c0a4dee Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Aug 2024 23:44:38 -0700 Subject: perf inject: Combine mmap and mmap2 handling The handling of mmap and mmap2 events is near identical. Add a common helper function and call that by the two event handling functions. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Anne Macedo Cc: Arnd Bergmann Cc: Athira Rajeev Cc: Casey Chen Cc: Chaitanya S Prakash Cc: Colin Ian King Cc: Dominique Martinet Cc: Ingo Molnar Cc: James Clark Cc: Jann Horn Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Masahiro Yamada Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sun Haiyong Cc: Weilin Wang Cc: Yang Jihong Cc: Yunseong Kim Cc: Ze Gao Link: https://lore.kernel.org/r/20240817064442.2152089-10-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-inject.c | 118 +++++++++++++++++++++----------------------- 1 file changed, 56 insertions(+), 62 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index d99868953ff2..a7c859db2e15 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -422,55 +422,21 @@ static struct dso *findnew_dso(int pid, int tid, const char *filename, return dso; } -static int perf_event__repipe_mmap(const struct perf_tool *tool, - union perf_event *event, - struct perf_sample *sample, - struct machine *machine) -{ - struct perf_inject *inject = container_of(tool, struct perf_inject, tool); - -#ifdef HAVE_JITDUMP - if (inject->jit_mode) { - u64 n = 0; - int ret; - - /* If jit marker, then inject jit mmaps and generate ELF images. 
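The refactor follows the usual "parameterize the differences" shape: thin per-record wrappers pull out the fields that differ and hand them, together with a fallback callback, to one shared helper. A generic sketch of the pattern (hypothetical names, not the perf code):

/* Shared path: the variant-specific fallback is passed as a callback,
 * so the helper never touches the concrete event layout. */
static int handle_common(int pid, const char *filename, unsigned int flags,
			 int (*fallback)(const char *filename))
{
	/* common work: JIT handling, build-id injection, repipe, ... */
	return fallback(filename);
}

static int process_v1(const char *filename) { return 0; }	/* MMAP-like */
static int process_v2(const char *filename) { return 0; }	/* MMAP2-like */

static int handle_v1(int pid, const char *filename)
{
	return handle_common(pid, filename, /*flags=*/0, process_v1);
}

static int handle_v2(int pid, const char *filename, unsigned int flags)
{
	return handle_common(pid, filename, flags, process_v2);
}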
*/ - ret = jit_process(inject->session, &inject->output, machine, - event->mmap.filename, event->mmap.pid, event->mmap.tid, &n); - if (ret < 0) - return ret; - if (ret) { - inject->bytes_written += n; - return 0; - } - } -#endif - if (inject->build_id_style == BID_RWS__INJECT_HEADER_ALL) { - struct dso *dso = findnew_dso(event->mmap.pid, event->mmap.tid, - event->mmap.filename, NULL, machine); - - if (dso && !dso__hit(dso)) { - dso__set_hit(dso); - dso__inject_build_id(dso, tool, machine, sample->cpumode, 0); - } - dso__put(dso); - } else { - /* Create the thread, map, etc. Not done for the unordered inject all case. */ - int err = perf_event__process_mmap(tool, event, sample, machine); - - if (err) - return err; - } - return perf_event__repipe(tool, event, sample, machine); -} - -static int perf_event__repipe_mmap2(const struct perf_tool *tool, - union perf_event *event, - struct perf_sample *sample, - struct machine *machine) +static int perf_event__repipe_common_mmap(const struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample, + struct machine *machine, + __u32 pid, __u32 tid, __u32 flags, + const char *filename, + const struct dso_id *dso_id, + int (*perf_event_process)(const struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample, + struct machine *machine)) { struct perf_inject *inject = container_of(tool, struct perf_inject, tool); struct dso *dso = NULL; + bool dso_sought = false; #ifdef HAVE_JITDUMP if (inject->jit_mode) { @@ -479,7 +445,7 @@ static int perf_event__repipe_mmap2(const struct perf_tool *tool, /* If jit marker, then inject jit mmaps and generate ELF images. */ ret = jit_process(inject->session, &inject->output, machine, - event->mmap2.filename, event->mmap2.pid, event->mmap2.tid, &n); + filename, pid, tid, &n); if (ret < 0) return ret; if (ret) { @@ -489,33 +455,26 @@ static int perf_event__repipe_mmap2(const struct perf_tool *tool, } #endif if (event->header.misc & PERF_RECORD_MISC_MMAP_BUILD_ID) { - dso = findnew_dso(event->mmap2.pid, event->mmap2.tid, - event->mmap2.filename, NULL, machine); + dso = findnew_dso(pid, tid, filename, dso_id, machine); + dso_sought = true; if (dso) { /* mark it not to inject build-id */ dso__set_hit(dso); } } if (inject->build_id_style == BID_RWS__INJECT_HEADER_ALL) { - if (!(event->header.misc & PERF_RECORD_MISC_MMAP_BUILD_ID)) { - struct dso_id dso_id = { - .maj = event->mmap2.maj, - .min = event->mmap2.min, - .ino = event->mmap2.ino, - .ino_generation = event->mmap2.ino_generation, - }; - - dso = findnew_dso(event->mmap2.pid, event->mmap2.tid, - event->mmap2.filename, &dso_id, machine); + if (!dso_sought) { + dso = findnew_dso(pid, tid, filename, dso_id, machine); + dso_sought = true; } + if (dso && !dso__hit(dso)) { dso__set_hit(dso); - dso__inject_build_id(dso, tool, machine, sample->cpumode, - event->mmap2.flags); + dso__inject_build_id(dso, tool, machine, sample->cpumode, flags); } } else { /* Create the thread, map, etc. Not done for the unordered inject all case. 
*/ - int err = perf_event__process_mmap(tool, event, sample, machine); + int err = perf_event_process(tool, event, sample, machine); if (err) { dso__put(dso); @@ -526,6 +485,41 @@ static int perf_event__repipe_mmap2(const struct perf_tool *tool, return perf_event__repipe(tool, event, sample, machine); } +static int perf_event__repipe_mmap(const struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample, + struct machine *machine) +{ + return perf_event__repipe_common_mmap( + tool, event, sample, machine, + event->mmap.pid, event->mmap.tid, /*flags=*/0, + event->mmap.filename, /*dso_id=*/NULL, + perf_event__process_mmap); +} + +static int perf_event__repipe_mmap2(const struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample, + struct machine *machine) +{ + struct dso_id id; + struct dso_id *dso_id = NULL; + + if (!(event->header.misc & PERF_RECORD_MISC_MMAP_BUILD_ID)) { + id.maj = event->mmap2.maj; + id.min = event->mmap2.min; + id.ino = event->mmap2.ino; + id.ino_generation = event->mmap2.ino_generation; + dso_id = &id; + } + + return perf_event__repipe_common_mmap( + tool, event, sample, machine, + event->mmap2.pid, event->mmap2.tid, event->mmap2.flags, + event->mmap2.filename, dso_id, + perf_event__process_mmap2); +} + static int perf_event__repipe_fork(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, -- cgit v1.2.3 From 3432bae89e044819f0b30f4c09260d2740896797 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sun, 18 Aug 2024 14:29:47 -0700 Subject: perf record: Fix sample cgroup & namespace tracking The recent change in 'struct perf_tool' constification broke the cgroup and/or namespace tracking by resetting tool fields. It should set the values after perf_tool__init(). Fixes: cecb1cf154b301c6 ("perf record: Use perf_tool__init()") Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240818212948.2873156-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 39367709fd99..adbaf80b398c 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -2374,13 +2374,8 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) signal(SIGTERM, sig_handler); signal(SIGSEGV, sigsegv_handler); - if (rec->opts.record_namespaces) - tool->namespace_events = true; - if (rec->opts.record_cgroup) { -#ifdef HAVE_FILE_HANDLE - tool->cgroup_events = true; -#else +#ifndef HAVE_FILE_HANDLE pr_err("cgroup tracking is not supported\n"); return -1; #endif @@ -2406,6 +2401,8 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) tool->mmap2 = build_id__process_mmap2; tool->itrace_start = process_timestamp_boundary; tool->aux = process_timestamp_boundary; + tool->namespace_events = rec->opts.record_namespaces; + tool->cgroup_events = rec->opts.record_cgroup; session = perf_session__new(data, tool); if (IS_ERR(session)) { pr_err("Perf session creation failed.\n"); -- cgit v1.2.3 From 5cc698bad72667cf80097dece6b91efbb4a1a0a7 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sun, 18 Aug 2024 14:29:48 -0700 Subject: perf test: Add cgroup sampling test Add it to the record.sh shell test to verify if it tracks cgroup information correctly. 
It records with the --all-cgroups option and checks that the resulting data contains PERF_RECORD_CGROUP events and that the cgroup names are not "unknown". $ sudo ./perf test -vv 95 95: perf record tests: --- start --- test child forked, pid 2871922 169c90-169cd0 g test_loop perf does have symbol 'test_loop' Basic --per-thread mode test Basic --per-thread mode test [Success] Register capture test Register capture test [Success] Basic --system-wide mode test Basic --system-wide mode test [Success] Basic target workload test Basic target workload test [Success] Branch counter test branch counter feature not supported on all core PMUs (/sys/bus/event_source/devices/cpu) [Skipped] Cgroup sampling test Cgroup sampling test [Success] ---- end(0) ---- 95: perf record tests : Ok Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240818212948.2873156-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/record.sh | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'tools') diff --git a/tools/perf/tests/shell/record.sh b/tools/perf/tests/shell/record.sh index 36883b03169f..048078ee2eca 100755 --- a/tools/perf/tests/shell/record.sh +++ b/tools/perf/tests/shell/record.sh @@ -206,6 +206,28 @@ test_branch_counter() { echo "Branch counter test [Success]" } +test_cgroup() { + echo "Cgroup sampling test" + if ! perf record -aB --synth=cgroup --all-cgroups -o "${perfdata}" ${testprog} 2> /dev/null + then + echo "Cgroup sampling [Skipped not supported]" + return + fi + if ! perf report -i "${perfdata}" -D | grep -q "CGROUP" + then + echo "Cgroup sampling [Failed missing output]" + err=1 + return + fi + if ! perf script -i "${perfdata}" -F cgroup | grep -q -v "unknown" + then + echo "Cgroup sampling [Failed cannot resolve cgroup names]" + err=1 + return + fi + echo "Cgroup sampling test [Success]" +} + # raise the limit of file descriptors to minimum if [[ $default_fd_limit -lt $min_fd_limit ]]; then ulimit -Sn $min_fd_limit @@ -216,6 +238,7 @@ test_register_capture test_system_wide test_workload test_branch_counter +test_cgroup # restore the default value ulimit -Sn $default_fd_limit -- cgit v1.2.3 From 2aa93695081d4bd62350b41d58684d802be4563f Mon Sep 17 00:00:00 2001 From: Cupertino Miranda Date: Mon, 19 Aug 2024 16:11:27 +0100 Subject: selftests/bpf: Disable strict aliasing for verifier_nocsr.c verifier_nocsr.c fails to compile with GCC. The reason behind it was initially explained in commit 27a90b14b93d3b2e1efd10764e456af7e2a42991. "A few BPF selftests perform type punning and they may break strict aliasing rules, which are exploited by both GCC and clang by default while optimizing. This can lead to broken compiled programs."
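For illustration, here is a minimal sketch of the kind of type punning that falls foul of strict aliasing (a generic example, not necessarily the exact construct used in verifier_nocsr.c):

	/* Reading a float's bytes through an int lvalue breaks C strict
	 * aliasing rules; at -O2, GCC and clang may assume an int access
	 * never aliases a float object and reorder or drop accesses. */
	static int punned_bits(float f)
	{
		return *(int *)&f;
	}

Compiling such code with -fno-strict-aliasing, as the per-file Makefile entry in the diff below does, tells the compiler not to exploit that assumption.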
Signed-off-by: Cupertino Miranda Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240819151129.1366484-2-cupertino.miranda@oracle.com --- tools/testing/selftests/bpf/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 4eceb491a8ae..7bca9a2b8bc0 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -58,6 +58,7 @@ progs/test_pkt_md_access.c-CFLAGS := -fno-strict-aliasing progs/test_sk_lookup.c-CFLAGS := -fno-strict-aliasing progs/timer_crash.c-CFLAGS := -fno-strict-aliasing progs/test_global_func9.c-CFLAGS := -fno-strict-aliasing +progs/verifier_nocsr.c-CFLAGS := -fno-strict-aliasing ifneq ($(LLVM),) # Silence some warnings when compiled with clang -- cgit v1.2.3 From d9075ac631ce0c5c2c17aa6221282a7c4ba8fa70 Mon Sep 17 00:00:00 2001 From: Cupertino Miranda Date: Mon, 19 Aug 2024 16:11:29 +0100 Subject: selftest/bpf: Adapt inline asm operand constraint for GCC support GCC errors out when compiling tailcall_bpf2bpf_hierarchy2.c and tailcall_bpf2bpf_hierarchy3.c with the following error: progs/tailcall_bpf2bpf_hierarchy2.c: In function 'tailcall_bpf2bpf_hierarchy_2': progs/tailcall_bpf2bpf_hierarchy2.c:66:9: error: input operand constraint contains '+' 66 | asm volatile (""::"r+"(ret)); | ^~~ Changed the implementation to make use of the __sink macro, which abstracts the desired behaviour. The proposed change is valid for both GCC and Clang. Signed-off-by: Cupertino Miranda Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240819151129.1366484-4-cupertino.miranda@oracle.com --- tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c | 4 ++-- tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c index 37604b0b97af..72fd0d577506 100644 --- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c @@ -58,12 +58,12 @@ __retval(33) SEC("tc") int tailcall_bpf2bpf_hierarchy_2(struct __sk_buff *skb) { - volatile int ret = 0; + int ret = 0; subprog_tail0(skb); subprog_tail1(skb); - asm volatile (""::"r+"(ret)); + __sink(ret); return (count1 << 16) | count0; } diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c index 0cdbb781fcbc..a7fb91cb05b7 100644 --- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c @@ -51,11 +51,11 @@ __retval(33) SEC("tc") int tailcall_bpf2bpf_hierarchy_3(struct __sk_buff *skb) { - volatile int ret = 0; + int ret = 0; bpf_tail_call_static(skb, &jmp_table0, 0); - asm volatile (""::"r+"(ret)); + __sink(ret); return ret; } -- cgit v1.2.3 From 190de5449973056f416c855b11fdfee2fc18f550 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Wed, 31 Jul 2024 12:01:26 +0200 Subject: selftests/bpf: Support more socket types in create_pair() Extend the function to allow creating socket pairs of SOCK_STREAM, SOCK_DGRAM and SOCK_SEQPACKET. Adapt the direct callers and leave further cleanups for the following patch.
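For illustration, a caller can now build a connected datagram pair with the same helper; a minimal sketch based on the new signature (error handling trimmed, variable names arbitrary):

	int c = -1, p = -1;

	/* create_pair() creates the loopback listener internally and
	 * handles SOCK_STREAM, SOCK_DGRAM and SOCK_SEQPACKET. */
	if (!create_pair(AF_INET, SOCK_DGRAM, &c, &p)) {
		/* c and p are now connected to each other over loopback */
		close(c);
		close(p);
	}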
Reviewed-by: Jakub Sitnicki Tested-by: Jakub Sitnicki Suggested-by: Jakub Sitnicki Signed-off-by: Michal Luczaj Link: https://lore.kernel.org/r/20240731-selftest-sockmap-fixes-v2-1-08a0c73abed2@rbox.co Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/prog_tests/sockmap_basic.c | 19 +-- .../selftests/bpf/prog_tests/sockmap_helpers.h | 138 ++++++++++++++------- 2 files changed, 96 insertions(+), 61 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 1337153eb0ad..5b17d69c9ee6 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -451,11 +451,11 @@ out: #define MAX_EVENTS 10 static void test_sockmap_skb_verdict_shutdown(void) { + int n, err, map, verdict, c1 = -1, p1 = -1; struct epoll_event ev, events[MAX_EVENTS]; - int n, err, map, verdict, s, c1 = -1, p1 = -1; struct test_sockmap_pass_prog *skel; - int epollfd; int zero = 0; + int epollfd; char b; skel = test_sockmap_pass_prog__open_and_load(); @@ -469,10 +469,7 @@ static void test_sockmap_skb_verdict_shutdown(void) if (!ASSERT_OK(err, "bpf_prog_attach")) goto out; - s = socket_loopback(AF_INET, SOCK_STREAM); - if (s < 0) - goto out; - err = create_pair(s, AF_INET, SOCK_STREAM, &c1, &p1); + err = create_pair(AF_INET, SOCK_STREAM, &c1, &p1); if (err < 0) goto out; @@ -570,16 +567,12 @@ out: static void test_sockmap_skb_verdict_peek_helper(int map) { - int err, s, c1, p1, zero = 0, sent, recvd, avail; + int err, c1, p1, zero = 0, sent, recvd, avail; char snd[256] = "0123456789"; char rcv[256] = "0"; - s = socket_loopback(AF_INET, SOCK_STREAM); - if (!ASSERT_GT(s, -1, "socket_loopback(s)")) - return; - - err = create_pair(s, AF_INET, SOCK_STREAM, &c1, &p1); - if (!ASSERT_OK(err, "create_pairs(s)")) + err = create_pair(AF_INET, SOCK_STREAM, &c1, &p1); + if (!ASSERT_OK(err, "create_pair()")) return; err = bpf_map_update_elem(map, &zero, &c1, BPF_NOEXIST); diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h b/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h index e880f97bc44d..77b73333f091 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h @@ -3,6 +3,9 @@ #include +/* include/linux/net.h */ +#define SOCK_TYPE_MASK 0xf + #define IO_TIMEOUT_SEC 30 #define MAX_STRERR_LEN 256 #define MAX_TEST_NAME 80 @@ -312,54 +315,6 @@ static inline int add_to_sockmap(int sock_mapfd, int fd1, int fd2) return xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); } -static inline int create_pair(int s, int family, int sotype, int *c, int *p) -{ - struct sockaddr_storage addr; - socklen_t len; - int err = 0; - - len = sizeof(addr); - err = xgetsockname(s, sockaddr(&addr), &len); - if (err) - return err; - - *c = xsocket(family, sotype, 0); - if (*c < 0) - return errno; - err = xconnect(*c, sockaddr(&addr), len); - if (err) { - err = errno; - goto close_cli0; - } - - *p = xaccept_nonblock(s, NULL, NULL); - if (*p < 0) { - err = errno; - goto close_cli0; - } - return err; -close_cli0: - close(*c); - return err; -} - -static inline int create_socket_pairs(int s, int family, int sotype, - int *c0, int *c1, int *p0, int *p1) -{ - int err; - - err = create_pair(s, family, sotype, c0, p0); - if (err) - return err; - - err = create_pair(s, family, sotype, c1, p1); - if (err) { - close(*c0); - close(*p0); - } - return err; -} - static inline int enable_reuseport(int s, 
int progfd) { int err, one = 1; @@ -412,5 +367,92 @@ static inline int socket_loopback(int family, int sotype) return socket_loopback_reuseport(family, sotype, -1); } +static inline int create_pair(int family, int sotype, int *p0, int *p1) +{ + struct sockaddr_storage addr; + socklen_t len = sizeof(addr); + int s, c, p, err; + + s = socket_loopback(family, sotype); + if (s < 0) + return s; + + err = xgetsockname(s, sockaddr(&addr), &len); + if (err) + goto close_s; + + c = xsocket(family, sotype, 0); + if (c < 0) { + err = c; + goto close_s; + } + + err = connect(c, sockaddr(&addr), len); + if (err) { + if (errno != EINPROGRESS) { + FAIL_ERRNO("connect"); + goto close_c; + } + + err = poll_connect(c, IO_TIMEOUT_SEC); + if (err) { + FAIL_ERRNO("poll_connect"); + goto close_c; + } + } + + switch (sotype & SOCK_TYPE_MASK) { + case SOCK_DGRAM: + err = xgetsockname(c, sockaddr(&addr), &len); + if (err) + goto close_c; + + err = xconnect(s, sockaddr(&addr), len); + if (!err) { + *p0 = s; + *p1 = c; + return err; + } + break; + case SOCK_STREAM: + case SOCK_SEQPACKET: + p = xaccept_nonblock(s, NULL, NULL); + if (p >= 0) { + *p0 = p; + *p1 = c; + goto close_s; + } + + err = p; + break; + default: + FAIL("Unsupported socket type %#x", sotype); + err = -EOPNOTSUPP; + } + +close_c: + close(c); +close_s: + close(s); + return err; +} + +static inline int create_socket_pairs(int s, int family, int sotype, + int *c0, int *c1, int *p0, int *p1) +{ + int err; + + err = create_pair(family, sotype, c0, p0); + if (err) + return err; + + err = create_pair(family, sotype, c1, p1); + if (err) { + close(*c0); + close(*p0); + } + + return err; +} #endif // __SOCKMAP_HELPERS__ -- cgit v1.2.3 From b08f205e5b9a074ae89ad7f71002ca417a4a2700 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Wed, 31 Jul 2024 12:01:27 +0200 Subject: selftests/bpf: Socket pair creation, cleanups Following create_pair() changes, remove unused function argument in create_socket_pairs() and adapt its callers, i.e. drop the open-coded loopback socket creation. 
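For illustration, call sites shrink roughly as follows (a sketch of the pattern, not an exact hunk from the patch):

	/* before: callers created and cleaned up the listener */
	s = socket_loopback(family, sotype);
	err = create_socket_pairs(s, family, sotype, &c0, &c1, &p0, &p1);
	/* ... */
	close(s);

	/* after: the helpers own the listener internally */
	err = create_socket_pairs(family, sotype, &c0, &c1, &p0, &p1);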
Reviewed-by: Jakub Sitnicki Tested-by: Jakub Sitnicki Suggested-by: Jakub Sitnicki Signed-off-by: Michal Luczaj Link: https://lore.kernel.org/r/20240731-selftest-sockmap-fixes-v2-2-08a0c73abed2@rbox.co Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/prog_tests/sockmap_basic.c | 9 +++----- .../selftests/bpf/prog_tests/sockmap_helpers.h | 4 ++-- .../selftests/bpf/prog_tests/sockmap_listen.c | 26 +++++++--------------- 3 files changed, 13 insertions(+), 26 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 5b17d69c9ee6..82bfb266741c 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -503,8 +503,8 @@ out: static void test_sockmap_skb_verdict_fionread(bool pass_prog) { + int err, map, verdict, c0 = -1, c1 = -1, p0 = -1, p1 = -1; int expected, zero = 0, sent, recvd, avail; - int err, map, verdict, s, c0 = -1, c1 = -1, p0 = -1, p1 = -1; struct test_sockmap_pass_prog *pass = NULL; struct test_sockmap_drop_prog *drop = NULL; char buf[256] = "0123456789"; @@ -531,11 +531,8 @@ static void test_sockmap_skb_verdict_fionread(bool pass_prog) if (!ASSERT_OK(err, "bpf_prog_attach")) goto out; - s = socket_loopback(AF_INET, SOCK_STREAM); - if (!ASSERT_GT(s, -1, "socket_loopback(s)")) - goto out; - err = create_socket_pairs(s, AF_INET, SOCK_STREAM, &c0, &c1, &p0, &p1); - if (!ASSERT_OK(err, "create_socket_pairs(s)")) + err = create_socket_pairs(AF_INET, SOCK_STREAM, &c0, &c1, &p0, &p1); + if (!ASSERT_OK(err, "create_socket_pairs()")) goto out; err = bpf_map_update_elem(map, &zero, &c1, BPF_NOEXIST); diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h b/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h index 77b73333f091..ead8ea4fd0da 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h @@ -437,8 +437,8 @@ close_s: return err; } -static inline int create_socket_pairs(int s, int family, int sotype, - int *c0, int *c1, int *p0, int *p1) +static inline int create_socket_pairs(int family, int sotype, int *c0, int *c1, + int *p0, int *p1) { int err; diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index 9ce0e0e0b7da..bfbc217637d1 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -677,7 +677,7 @@ static void redir_to_connected(int family, int sotype, int sock_mapfd, int verd_mapfd, enum redir_mode mode) { const char *log_prefix = redir_mode_str(mode); - int s, c0, c1, p0, p1; + int c0, c1, p0, p1; unsigned int pass; int err, n; u32 key; @@ -685,13 +685,10 @@ static void redir_to_connected(int family, int sotype, int sock_mapfd, zero_verdict_count(verd_mapfd); - s = socket_loopback(family, sotype | SOCK_NONBLOCK); - if (s < 0) - return; - - err = create_socket_pairs(s, family, sotype, &c0, &c1, &p0, &p1); + err = create_socket_pairs(family, sotype | SOCK_NONBLOCK, &c0, &c1, + &p0, &p1); if (err) - goto close_srv; + return; err = add_to_sockmap(sock_mapfd, p0, p1); if (err) @@ -722,8 +719,6 @@ close: xclose(c1); xclose(p0); xclose(c0); -close_srv: - xclose(s); } static void test_skb_redir_to_connected(struct test_sockmap_listen *skel, @@ -909,7 +904,7 @@ static void test_msg_redir_to_listening_with_link(struct test_sockmap_listen *sk static void 
redir_partial(int family, int sotype, int sock_map, int parser_map) { - int s, c0 = -1, c1 = -1, p0 = -1, p1 = -1; + int c0 = -1, c1 = -1, p0 = -1, p1 = -1; int err, n, key, value; char buf[] = "abc"; @@ -919,13 +914,10 @@ static void redir_partial(int family, int sotype, int sock_map, int parser_map) if (err) return; - s = socket_loopback(family, sotype | SOCK_NONBLOCK); - if (s < 0) - goto clean_parser_map; - - err = create_socket_pairs(s, family, sotype, &c0, &c1, &p0, &p1); + err = create_socket_pairs(family, sotype | SOCK_NONBLOCK, &c0, &c1, + &p0, &p1); if (err) - goto close_srv; + goto clean_parser_map; err = add_to_sockmap(sock_map, p0, p1); if (err) @@ -944,8 +936,6 @@ close: xclose(p0); xclose(c1); xclose(p1); -close_srv: - xclose(s); clean_parser_map: key = 0; -- cgit v1.2.3 From 4e3dec2295b1fdf5ea5c40a8195bc13de7cffdeb Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Wed, 31 Jul 2024 12:01:28 +0200 Subject: selftests/bpf: Simplify inet_socketpair() and vsock_socketpair_connectible() Replace implementation with a call to a generic function. Reviewed-by: Jakub Sitnicki Tested-by: Jakub Sitnicki Signed-off-by: Michal Luczaj Link: https://lore.kernel.org/r/20240731-selftest-sockmap-fixes-v2-3-08a0c73abed2@rbox.co Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/prog_tests/sockmap_listen.c | 83 +--------------------- 1 file changed, 2 insertions(+), 81 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index bfbc217637d1..ea2faacd146d 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -1490,49 +1490,7 @@ static void test_unix_redir(struct test_sockmap_listen *skel, struct bpf_map *ma /* Returns two connected loopback vsock sockets */ static int vsock_socketpair_connectible(int sotype, int *v0, int *v1) { - struct sockaddr_storage addr; - socklen_t len = sizeof(addr); - int s, p, c; - - s = socket_loopback(AF_VSOCK, sotype); - if (s < 0) - return -1; - - c = xsocket(AF_VSOCK, sotype | SOCK_NONBLOCK, 0); - if (c == -1) - goto close_srv; - - if (getsockname(s, sockaddr(&addr), &len) < 0) - goto close_cli; - - if (connect(c, sockaddr(&addr), len) < 0 && errno != EINPROGRESS) { - FAIL_ERRNO("connect"); - goto close_cli; - } - - len = sizeof(addr); - p = accept_timeout(s, sockaddr(&addr), &len, IO_TIMEOUT_SEC); - if (p < 0) - goto close_cli; - - if (poll_connect(c, IO_TIMEOUT_SEC) < 0) { - FAIL_ERRNO("poll_connect"); - goto close_acc; - } - - *v0 = p; - *v1 = c; - - return 0; - -close_acc: - close(p); -close_cli: - close(c); -close_srv: - close(s); - - return -1; + return create_pair(AF_VSOCK, sotype | SOCK_NONBLOCK, v0, v1); } static void vsock_unix_redir_connectible(int sock_mapfd, int verd_mapfd, @@ -1681,44 +1639,7 @@ static void test_reuseport(struct test_sockmap_listen *skel, static int inet_socketpair(int family, int type, int *s, int *c) { - struct sockaddr_storage addr; - socklen_t len; - int p0, c0; - int err; - - p0 = socket_loopback(family, type | SOCK_NONBLOCK); - if (p0 < 0) - return p0; - - len = sizeof(addr); - err = xgetsockname(p0, sockaddr(&addr), &len); - if (err) - goto close_peer0; - - c0 = xsocket(family, type | SOCK_NONBLOCK, 0); - if (c0 < 0) { - err = c0; - goto close_peer0; - } - err = xconnect(c0, sockaddr(&addr), len); - if (err) - goto close_cli0; - err = xgetsockname(c0, sockaddr(&addr), &len); - if (err) - goto close_cli0; - err = xconnect(p0, sockaddr(&addr), 
len); - if (err) - goto close_cli0; - - *s = p0; - *c = c0; - return 0; - -close_cli0: - xclose(c0); -close_peer0: - xclose(p0); - return err; + return create_pair(family, type | SOCK_NONBLOCK, s, c); } static void udp_redir_to_connected(int family, int sock_mapfd, int verd_mapfd, -- cgit v1.2.3 From b3b15b7a1e8de773c82f81c97904b51d4e919faa Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Wed, 31 Jul 2024 12:01:29 +0200 Subject: selftests/bpf: Honour the sotype of af_unix redir tests Do actually test the sotype as specified by the caller. This picks up after commit 75e0e27db6cf ("selftest/bpf: Change udp to inet in some function names"). Reviewed-by: Jakub Sitnicki Tested-by: Jakub Sitnicki Suggested-by: Jakub Sitnicki Signed-off-by: Michal Luczaj Link: https://lore.kernel.org/r/20240731-selftest-sockmap-fixes-v2-4-08a0c73abed2@rbox.co Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/prog_tests/sockmap_listen.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index ea2faacd146d..7ed223df5f12 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -1706,11 +1706,11 @@ static void inet_unix_redir_to_connected(int family, int type, int sock_mapfd, int sfd[2]; int err; - if (socketpair(AF_UNIX, SOCK_DGRAM | SOCK_NONBLOCK, 0, sfd)) + if (socketpair(AF_UNIX, type | SOCK_NONBLOCK, 0, sfd)) return; c0 = sfd[0], p0 = sfd[1]; - err = inet_socketpair(family, SOCK_DGRAM, &p1, &c1); + err = inet_socketpair(family, type, &p1, &c1); if (err) goto close; @@ -1758,7 +1758,7 @@ static void unix_inet_redir_to_connected(int family, int type, int sock_mapfd, int sfd[2]; int err; - err = inet_socketpair(family, SOCK_DGRAM, &p0, &c0); + err = inet_socketpair(family, type, &p0, &c0); if (err) return; -- cgit v1.2.3 From c9c70b28face7b156960f53649bec9ace5601b85 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Wed, 31 Jul 2024 12:01:30 +0200 Subject: selftests/bpf: Exercise SOCK_STREAM unix_inet_redir_to_connected() Constants got switched reducing the test's coverage. Replace SOCK_DGRAM with SOCK_STREAM in one of unix_inet_skb_redir_to_connected() tests. 
Fixes: 51354f700d40 ("bpf, sockmap: Add af_unix test with both sockets in map") Reviewed-by: Jakub Sitnicki Tested-by: Jakub Sitnicki Suggested-by: Jakub Sitnicki Signed-off-by: Michal Luczaj Link: https://lore.kernel.org/r/20240731-selftest-sockmap-fixes-v2-5-08a0c73abed2@rbox.co Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/prog_tests/sockmap_listen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index 7ed223df5f12..da5a6fb03b69 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -1793,7 +1793,7 @@ static void unix_inet_skb_redir_to_connected(struct test_sockmap_listen *skel, unix_inet_redir_to_connected(family, SOCK_DGRAM, sock_map, -1, verdict_map, REDIR_EGRESS, NO_FLAGS); - unix_inet_redir_to_connected(family, SOCK_DGRAM, + unix_inet_redir_to_connected(family, SOCK_STREAM, sock_map, -1, verdict_map, REDIR_EGRESS, NO_FLAGS); -- cgit v1.2.3 From 86149b4f5a2d535279096946b6ef8d41911a9b5a Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Wed, 31 Jul 2024 12:01:31 +0200 Subject: selftests/bpf: Introduce __attribute__((cleanup)) in create_pair() Rewrite function to have (unneeded) socket descriptors automatically close()d when leaving the scope. Make sure the "ownership" of fds is correctly passed via take_fd(); i.e. descriptor returned to caller will remain valid. Reviewed-by: Jakub Sitnicki Tested-by: Jakub Sitnicki Suggested-by: Jakub Sitnicki Signed-off-by: Michal Luczaj Link: https://lore.kernel.org/r/20240731-selftest-sockmap-fixes-v2-6-08a0c73abed2@rbox.co Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/prog_tests/sockmap_helpers.h | 61 +++++++++++++--------- 1 file changed, 36 insertions(+), 25 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h b/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h index ead8ea4fd0da..38e35c72bdaa 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h @@ -17,6 +17,17 @@ #define __always_unused __attribute__((__unused__)) +/* include/linux/cleanup.h */ +#define __get_and_null(p, nullvalue) \ + ({ \ + __auto_type __ptr = &(p); \ + __auto_type __val = *__ptr; \ + *__ptr = nullvalue; \ + __val; \ + }) + +#define take_fd(fd) __get_and_null(fd, -EBADF) + #define _FAIL(errnum, fmt...) 
\ ({ \ error_at_line(0, (errnum), __func__, __LINE__, fmt); \ @@ -182,6 +193,14 @@ __ret; \ }) +static inline void close_fd(int *fd) +{ + if (*fd >= 0) + xclose(*fd); +} + +#define __close_fd __attribute__((cleanup(close_fd))) + static inline int poll_connect(int fd, unsigned int timeout_sec) { struct timeval timeout = { .tv_sec = timeout_sec }; @@ -369,9 +388,10 @@ static inline int socket_loopback(int family, int sotype) static inline int create_pair(int family, int sotype, int *p0, int *p1) { + __close_fd int s, c = -1, p = -1; struct sockaddr_storage addr; socklen_t len = sizeof(addr); - int s, c, p, err; + int err; s = socket_loopback(family, sotype); if (s < 0) @@ -379,25 +399,23 @@ static inline int create_pair(int family, int sotype, int *p0, int *p1) err = xgetsockname(s, sockaddr(&addr), &len); if (err) - goto close_s; + return err; c = xsocket(family, sotype, 0); - if (c < 0) { - err = c; - goto close_s; - } + if (c < 0) + return c; err = connect(c, sockaddr(&addr), len); if (err) { if (errno != EINPROGRESS) { FAIL_ERRNO("connect"); - goto close_c; + return err; } err = poll_connect(c, IO_TIMEOUT_SEC); if (err) { FAIL_ERRNO("poll_connect"); - goto close_c; + return err; } } @@ -405,36 +423,29 @@ static inline int create_pair(int family, int sotype, int *p0, int *p1) case SOCK_DGRAM: err = xgetsockname(c, sockaddr(&addr), &len); if (err) - goto close_c; + return err; err = xconnect(s, sockaddr(&addr), len); - if (!err) { - *p0 = s; - *p1 = c; + if (err) return err; - } + + *p0 = take_fd(s); break; case SOCK_STREAM: case SOCK_SEQPACKET: p = xaccept_nonblock(s, NULL, NULL); - if (p >= 0) { - *p0 = p; - *p1 = c; - goto close_s; - } + if (p < 0) + return p; - err = p; + *p0 = take_fd(p); break; default: FAIL("Unsupported socket type %#x", sotype); - err = -EOPNOTSUPP; + return -EOPNOTSUPP; } -close_c: - close(c); -close_s: - close(s); - return err; + *p1 = take_fd(c); + return 0; } static inline int create_socket_pairs(int family, int sotype, int *c0, int *c1, -- cgit v1.2.3 From 6236ebe07131a7746d870f1d8eb3637a8df13e70 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 19 Aug 2024 16:46:29 -0300 Subject: perf daemon: Fix the build on more 32-bit architectures The previous attempt fixed the build on debian:experimental-x-mipsel, but when building on a larger set of containers I noticed it broke the build on some other 32-bit architectures such as: 42 7.87 ubuntu:18.04-x-arm : FAIL gcc version 7.5.0 (Ubuntu/Linaro 7.5.0-3ubuntu1~18.04) builtin-daemon.c: In function 'cmd_session_list': builtin-daemon.c:692:16: error: format '%llu' expects argument of type 'long long unsigned int', but argument 4 has type 'long int' [-Werror=format=] fprintf(out, "%c%" PRIu64, ^~~~~ builtin-daemon.c:694:13: csv_sep, (curr - daemon->start) / 60); ~~~~~~~~~~~~~~~~~~~~~~~~~~~ In file included from builtin-daemon.c:3:0: /usr/arm-linux-gnueabihf/include/inttypes.h:105:34: note: format string is defined here # define PRIu64 __PRI64_PREFIX "u" So lets cast that time_t (32-bit/64-bit) to uint64_t to make sure it builds everywhere. 
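For illustration, the portable idiom is to pair the PRIu64 format with an explicit cast of the time_t arithmetic; a minimal standalone sketch (print_uptime() and start are made-up names):

	#include <inttypes.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <time.h>

	/* time_t is 32-bit on some ABIs and 64-bit on others; the cast
	 * makes the argument always match the "%" PRIu64 format. */
	static void print_uptime(FILE *out, time_t start)
	{
		time_t curr = time(NULL);

		fprintf(out, "up: %" PRIu64 " minutes\n",
			(uint64_t)((curr - start) / 60));
	}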
Fixes: 4bbe6002931954bb ("perf daemon: Fix the build on 32-bit architectures") Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/ZsPmldtJ0D9Cua9_@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-daemon.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-daemon.c b/tools/perf/builtin-daemon.c index 7dca39c4314b..f0568431fbd5 100644 --- a/tools/perf/builtin-daemon.c +++ b/tools/perf/builtin-daemon.c @@ -691,7 +691,7 @@ static int cmd_session_list(struct daemon *daemon, union cmd *cmd, FILE *out) fprintf(out, "%c%" PRIu64, /* session up time */ - csv_sep, (curr - daemon->start) / 60); + csv_sep, (uint64_t)((curr - daemon->start) / 60)); fprintf(out, "\n"); } else { @@ -702,7 +702,7 @@ static int cmd_session_list(struct daemon *daemon, union cmd *cmd, FILE *out) fprintf(out, " lock: %s/lock\n", daemon->base); fprintf(out, " up: %" PRIu64 " minutes\n", - (curr - daemon->start) / 60); + (uint64_t)((curr - daemon->start) / 60)); } } @@ -730,7 +730,7 @@ static int cmd_session_list(struct daemon *daemon, union cmd *cmd, FILE *out) fprintf(out, "%c%" PRIu64, /* session up time */ - csv_sep, (curr - session->start) / 60); + csv_sep, (uint64_t)((curr - session->start) / 60)); fprintf(out, "\n"); } else { @@ -747,7 +747,7 @@ static int cmd_session_list(struct daemon *daemon, union cmd *cmd, FILE *out) fprintf(out, " ack: %s/%s\n", session->base, SESSION_ACK); fprintf(out, " up: %" PRIu64 " minutes\n", - (curr - session->start) / 60); + (uint64_t)((curr - session->start) / 60)); } } -- cgit v1.2.3 From 0311507792b54069ac72e0a6c6b35c5d40aadad8 Mon Sep 17 00:00:00 2001 From: Deven Bowers Date: Fri, 2 Aug 2024 23:08:15 -0700 Subject: lsm: add IPE lsm Integrity Policy Enforcement (IPE) is an LSM that provides a complementary approach to Mandatory Access Control compared to existing LSMs. Existing LSMs have centered around the concept that access to a resource should be controlled by the current user's credentials. IPE's approach is that access to a resource should be controlled by the system's trust of the resource itself. The basis of this approach is defining a global policy that specifies which resources can be trusted.
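For illustration only, in hypothetical pseudocode (subject_allowed() and resource_trusted() are made-up names, not IPE's actual hooks or policy grammar), the shift in the access decision can be sketched as:

	/* classic MAC: is the calling subject allowed to touch the object? */
	if (!subject_allowed(current_cred(), inode))
		return -EACCES;

	/* IPE's model: is the object itself trusted by the system policy? */
	if (!resource_trusted(policy, inode))
		return -EACCES;

The two made-up helpers exist purely to contrast the two models.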
Signed-off-by: Deven Bowers Signed-off-by: Fan Wu [PM: subject line tweak] Signed-off-by: Paul Moore --- include/uapi/linux/lsm.h | 1 + security/Kconfig | 11 +++--- security/Makefile | 1 + security/ipe/Kconfig | 17 +++++++++ security/ipe/Makefile | 9 +++++ security/ipe/ipe.c | 42 ++++++++++++++++++++++ security/ipe/ipe.h | 16 +++++++++ security/security.c | 3 +- .../testing/selftests/lsm/lsm_list_modules_test.c | 3 ++ 9 files changed, 97 insertions(+), 6 deletions(-) create mode 100644 security/ipe/Kconfig create mode 100644 security/ipe/Makefile create mode 100644 security/ipe/ipe.c create mode 100644 security/ipe/ipe.h (limited to 'tools') diff --git a/include/uapi/linux/lsm.h b/include/uapi/linux/lsm.h index 33d8c9f4aa6b..938593dfd5da 100644 --- a/include/uapi/linux/lsm.h +++ b/include/uapi/linux/lsm.h @@ -64,6 +64,7 @@ struct lsm_ctx { #define LSM_ID_LANDLOCK 110 #define LSM_ID_IMA 111 #define LSM_ID_EVM 112 +#define LSM_ID_IPE 113 /* * LSM_ATTR_XXX definitions identify different LSM attributes diff --git a/security/Kconfig b/security/Kconfig index 412e76f1575d..9fb8f9b14972 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -192,6 +192,7 @@ source "security/yama/Kconfig" source "security/safesetid/Kconfig" source "security/lockdown/Kconfig" source "security/landlock/Kconfig" +source "security/ipe/Kconfig" source "security/integrity/Kconfig" @@ -231,11 +232,11 @@ endchoice config LSM string "Ordered list of enabled LSMs" - default "landlock,lockdown,yama,loadpin,safesetid,smack,selinux,tomoyo,apparmor,bpf" if DEFAULT_SECURITY_SMACK - default "landlock,lockdown,yama,loadpin,safesetid,apparmor,selinux,smack,tomoyo,bpf" if DEFAULT_SECURITY_APPARMOR - default "landlock,lockdown,yama,loadpin,safesetid,tomoyo,bpf" if DEFAULT_SECURITY_TOMOYO - default "landlock,lockdown,yama,loadpin,safesetid,bpf" if DEFAULT_SECURITY_DAC - default "landlock,lockdown,yama,loadpin,safesetid,selinux,smack,tomoyo,apparmor,bpf" + default "landlock,lockdown,yama,loadpin,safesetid,smack,selinux,tomoyo,apparmor,ipe,bpf" if DEFAULT_SECURITY_SMACK + default "landlock,lockdown,yama,loadpin,safesetid,apparmor,selinux,smack,tomoyo,ipe,bpf" if DEFAULT_SECURITY_APPARMOR + default "landlock,lockdown,yama,loadpin,safesetid,tomoyo,ipe,bpf" if DEFAULT_SECURITY_TOMOYO + default "landlock,lockdown,yama,loadpin,safesetid,ipe,bpf" if DEFAULT_SECURITY_DAC + default "landlock,lockdown,yama,loadpin,safesetid,selinux,smack,tomoyo,apparmor,ipe,bpf" help A comma-separated list of LSMs, in initialization order. 
Any LSMs left off this list, except for those with order diff --git a/security/Makefile b/security/Makefile index 59f238490665..cc0982214b84 100644 --- a/security/Makefile +++ b/security/Makefile @@ -25,6 +25,7 @@ obj-$(CONFIG_SECURITY_LOCKDOWN_LSM) += lockdown/ obj-$(CONFIG_CGROUPS) += device_cgroup.o obj-$(CONFIG_BPF_LSM) += bpf/ obj-$(CONFIG_SECURITY_LANDLOCK) += landlock/ +obj-$(CONFIG_SECURITY_IPE) += ipe/ # Object integrity file lists obj-$(CONFIG_INTEGRITY) += integrity/ diff --git a/security/ipe/Kconfig b/security/ipe/Kconfig new file mode 100644 index 000000000000..e4875fb04883 --- /dev/null +++ b/security/ipe/Kconfig @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Integrity Policy Enforcement (IPE) configuration +# + +menuconfig SECURITY_IPE + bool "Integrity Policy Enforcement (IPE)" + depends on SECURITY && SECURITYFS + select PKCS7_MESSAGE_PARSER + select SYSTEM_DATA_VERIFICATION + help + This option enables the Integrity Policy Enforcement LSM + allowing users to define a policy to enforce a trust-based access + control. A key feature of IPE is a customizable policy to allow + admins to reconfigure trust requirements on the fly. + + If unsure, answer N. diff --git a/security/ipe/Makefile b/security/ipe/Makefile new file mode 100644 index 000000000000..5486398a69e9 --- /dev/null +++ b/security/ipe/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (C) 2020-2024 Microsoft Corporation. All rights reserved. +# +# Makefile for building the IPE module as part of the kernel tree. +# + +obj-$(CONFIG_SECURITY_IPE) += \ + ipe.o \ diff --git a/security/ipe/ipe.c b/security/ipe/ipe.c new file mode 100644 index 000000000000..8d4ea372873e --- /dev/null +++ b/security/ipe/ipe.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020-2024 Microsoft Corporation. All rights reserved. + */ +#include + +#include "ipe.h" + +static struct lsm_blob_sizes ipe_blobs __ro_after_init = { +}; + +static const struct lsm_id ipe_lsmid = { + .name = "ipe", + .id = LSM_ID_IPE, +}; + +static struct security_hook_list ipe_hooks[] __ro_after_init = { +}; + +/** + * ipe_init() - Entry point of IPE. + * + * This is called at LSM init, which happens occurs early during kernel + * start up. During this phase, IPE registers its hooks and loads the + * builtin boot policy. + * + * Return: + * * %0 - OK + * * %-ENOMEM - Out of memory (OOM) + */ +static int __init ipe_init(void) +{ + security_add_hooks(ipe_hooks, ARRAY_SIZE(ipe_hooks), &ipe_lsmid); + + return 0; +} + +DEFINE_LSM(ipe) = { + .name = "ipe", + .init = ipe_init, + .blobs = &ipe_blobs, +}; diff --git a/security/ipe/ipe.h b/security/ipe/ipe.h new file mode 100644 index 000000000000..adc3c45e9f53 --- /dev/null +++ b/security/ipe/ipe.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020-2024 Microsoft Corporation. All rights reserved. + */ + +#ifndef _IPE_H +#define _IPE_H + +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) "ipe: " fmt + +#include + +#endif /* _IPE_H */ diff --git a/security/security.c b/security/security.c index 611d3c124ba6..645a660320cb 100644 --- a/security/security.c +++ b/security/security.c @@ -53,7 +53,8 @@ (IS_ENABLED(CONFIG_BPF_LSM) ? 1 : 0) + \ (IS_ENABLED(CONFIG_SECURITY_LANDLOCK) ? 1 : 0) + \ (IS_ENABLED(CONFIG_IMA) ? 1 : 0) + \ - (IS_ENABLED(CONFIG_EVM) ? 1 : 0)) + (IS_ENABLED(CONFIG_EVM) ? 1 : 0) + \ + (IS_ENABLED(CONFIG_SECURITY_IPE) ? 
1 : 0)) /* * These are descriptions of the reasons that can be passed to the diff --git a/tools/testing/selftests/lsm/lsm_list_modules_test.c b/tools/testing/selftests/lsm/lsm_list_modules_test.c index 06d24d4679a6..1cc8a977c711 100644 --- a/tools/testing/selftests/lsm/lsm_list_modules_test.c +++ b/tools/testing/selftests/lsm/lsm_list_modules_test.c @@ -128,6 +128,9 @@ TEST(correct_lsm_list_modules) case LSM_ID_EVM: name = "evm"; break; + case LSM_ID_IPE: + name = "ipe"; + break; default: name = "INVALID"; break; -- cgit v1.2.3 From 8b1042c425f6a5a9fb57c5e1d5ecf5247cf899a6 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 15 Aug 2024 15:38:23 -0700 Subject: perf annotate-data: Set bitfield member offset and size properly The bitfield members might not have DW_AT_data_member_location. Let's use DW_AT_data_bit_offset to set the member offset correct. Also use DW_AT_bit_size for the name like in a C program. Before: Annotate type: 'struct sk_buff' (1 samples) Percent Offset Size Field - 100.00 0 232 struct sk_buff { + 0.00 0 24 union ; + 0.00 24 8 union ; + 0.00 32 8 union ; 0.00 40 48 char[] cb; + 0.00 88 16 union ; 0.00 104 8 long unsigned int _nfct; 100.00 112 4 unsigned int len; 0.00 116 4 unsigned int data_len; 0.00 120 2 __u16 mac_len; 0.00 122 2 __u16 hdr_len; 0.00 124 2 __u16 queue_mapping; 0.00 126 0 __u8[] __cloned_offset; 0.00 0 1 __u8 cloned; 0.00 0 1 __u8 nohdr; 0.00 0 1 __u8 fclone; 0.00 0 1 __u8 peeked; 0.00 0 1 __u8 head_frag; 0.00 0 1 __u8 pfmemalloc; 0.00 0 1 __u8 pp_recycle; 0.00 127 1 __u8 active_extensions; + 0.00 128 60 union ; 0.00 188 4 sk_buff_data_t tail; 0.00 192 4 sk_buff_data_t end; 0.00 200 8 unsigned char* head; After: Annotate type: 'struct sk_buff' (1 samples) Percent Offset Size Field - 100.00 0 232 struct sk_buff { + 0.00 0 24 union ; + 0.00 24 8 union ; + 0.00 32 8 union ; 0.00 40 48 char[] cb + 0.00 88 16 union ; 0.00 104 8 long unsigned int _nfct; 100.00 112 4 unsigned int len; 0.00 116 4 unsigned int data_len; 0.00 120 2 __u16 mac_len; 0.00 122 2 __u16 hdr_len; 0.00 124 2 __u16 queue_mapping; 0.00 126 0 __u8[] __cloned_offset; 0.00 126 1 __u8 cloned:1; 0.00 126 1 __u8 nohdr:1; 0.00 126 1 __u8 fclone:2; 0.00 126 1 __u8 peeked:1; 0.00 126 1 __u8 head_frag:1; 0.00 126 1 __u8 pfmemalloc:1; 0.00 126 1 __u8 pp_recycle:1; 0.00 127 1 __u8 active_extensions; + 0.00 128 60 union ; 0.00 188 4 sk_buff_data_t tail; 0.00 192 4 sk_buff_data_t end; 0.00 200 8 unsigned char* head; Commiter notes: Collect some data: root@number:~# perf mem record -a --ldlat 5 -- ping -s 8193 -f 192.168.86.1 Memory events are enabled on a subset of CPUs: 16-27 PING 192.168.86.1 (192.168.86.1) 8193(8221) bytes of data. 
.^C --- 192.168.86.1 ping statistics --- 13881 packets transmitted, 13880 received, 0.00720409% packet loss, time 8664ms rtt min/avg/max/mdev = 0.510/0.599/7.768/0.115 ms, ipg/ewma 0.624/0.593 ms [ perf record: Woken up 8 times to write data ] [ perf record: Captured and wrote 14.877 MB perf.data (46785 samples) ] root@number:~# root@number:~# perf evlist cpu_atom/mem-loads,ldlat=5/P cpu_atom/mem-stores/P dummy:u root@number:~# perf evlist -v cpu_atom/mem-loads,ldlat=5/P: type: 10 (cpu_atom), size: 136, config: 0x5d0 (mem-loads), { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|ADDR|CPU|PERIOD|IDENTIFIER|DATA_SRC|WEIGHT_STRUCT, read_format: ID|LOST, disabled: 1, inherit: 1, freq: 1, precise_ip: 3, sample_id_all: 1, { bp_addr, config1 }: 0x7 cpu_atom/mem-stores/P: type: 10 (cpu_atom), size: 136, config: 0x6d0 (mem-stores), { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|ADDR|CPU|PERIOD|IDENTIFIER|DATA_SRC|WEIGHT_STRUCT, read_format: ID|LOST, disabled: 1, inherit: 1, freq: 1, precise_ip: 3, sample_id_all: 1 dummy:u: type: 1 (software), size: 136, config: 0x9 (PERF_COUNT_SW_DUMMY), { sample_period, sample_freq }: 1, sample_type: IP|TID|TIME|ADDR|CPU|IDENTIFIER|DATA_SRC|WEIGHT_STRUCT, read_format: ID|LOST, inherit: 1, exclude_kernel: 1, exclude_hv: 1, mmap: 1, comm: 1, task: 1, mmap_data: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1 root@number:~# Ok, now lets see what changes from before this patch to after it: root@number:~# perf annotate --data-type > /tmp/before Apply the patch, build: root@number:~# perf annotate --data-type > /tmp/after The first hunk of the diff, for a glib data structure, in userspace, look at those bitfields: root@number:~# diff -u10 /tmp/before /tmp/after | head -20 --- /tmp/before 2024-08-20 17:29:58.306765780 -0300 +++ /tmp/after 2024-08-20 17:33:13.210582596 -0300 @@ -163,22 +163,22 @@ Annotate type: 'GHashTable' in /usr/lib64/libglib-2.0.so.0.8000.3 (1 samples): ============================================================================ Percent offset size field 100.00 0 96 GHashTable { 0.00 0 8 gsize size; 0.00 8 4 gint mod; 100.00 12 4 guint mask; 0.00 16 4 guint nnodes; 0.00 20 4 guint noccupied; - 0.00 0 4 guint have_big_keys; - 0.00 0 4 guint have_big_values; + 0.00 24 1 guint have_big_keys:1; + 0.00 24 1 guint have_big_values:1; 0.00 32 8 gpointer keys; 0.00 40 8 guint* hashes; 0.00 48 8 gpointer values; root@number:~# As advertised :-) Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240815223823.2402285-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index aa330c7d8edd..e5589268cb42 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -202,7 +202,7 @@ static int __add_member_cb(Dwarf_Die *die, void *arg) struct annotated_member *parent = arg; struct annotated_member *member; Dwarf_Die member_type, die_mem; - Dwarf_Word size, loc; + Dwarf_Word size, loc, bit_size = 0; Dwarf_Attribute attr; struct strbuf sb; int tag; @@ -226,15 +226,37 @@ static int __add_member_cb(Dwarf_Die *die, void *arg) if (dwarf_aggregate_size(&die_mem, &size) < 0) size = 0; - if 
(!dwarf_attr_integrate(die, DW_AT_data_member_location, &attr)) - loc = 0; - else + if (dwarf_attr_integrate(die, DW_AT_data_member_location, &attr)) dwarf_formudata(&attr, &loc); + else { + /* bitfield member */ + if (dwarf_attr_integrate(die, DW_AT_data_bit_offset, &attr) && + dwarf_formudata(&attr, &loc) == 0) + loc /= 8; + else + loc = 0; + + if (dwarf_attr_integrate(die, DW_AT_bit_size, &attr) && + dwarf_formudata(&attr, &bit_size) == 0) + size = (bit_size + 7) / 8; + } member->type_name = strbuf_detach(&sb, NULL); /* member->var_name can be NULL */ - if (dwarf_diename(die)) - member->var_name = strdup(dwarf_diename(die)); + if (dwarf_diename(die)) { + if (bit_size) { + if (asprintf(&member->var_name, "%s:%ld", + dwarf_diename(die), (long)bit_size) < 0) + member->var_name = NULL; + } else { + member->var_name = strdup(dwarf_diename(die)); + } + + if (member->var_name == NULL) { + free(member); + return DIE_FIND_CB_END; + } + } member->size = size; member->offset = loc + parent->offset; INIT_LIST_HEAD(&member->children); -- cgit v1.2.3 From e25ebda78e230283bf707ae3e9655270ff40a7f9 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 6 Aug 2024 15:06:14 -0700 Subject: perf cap: Tidy up and improve capability testing Remove the dependence on libcap. libcap is only used to query whether a capability is supported, which is just one capget system call. If the capget system call fails, fall back on root permission checking. Previously, if libcap failed, the permission was assumed not to be present, which may be pessimistic/wrong. Add a used_root out argument to perf_cap__capable to say whether the fallback root check was used. This allows the correct error message, "root" vs "users with the CAP_PERFMON or CAP_SYS_ADMIN capability", to be selected. Tidy the uses of perf_cap__capable so that tests aren't repeated if capget isn't supported.
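For illustration, a minimal standalone sketch of the capget(2)-based check (simplified from the cap.c change below; version negotiation and error reporting trimmed; assumes cap < 64):

	#include <linux/capability.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int has_effective_cap(int cap)
	{
		struct __user_cap_header_struct hdr = {
			.version = _LINUX_CAPABILITY_VERSION_3,
			.pid = getpid(),
		};
		struct __user_cap_data_struct data[_LINUX_CAPABILITY_U32S_3];

		if (syscall(SYS_capget, &hdr, data) == -1)
			return geteuid() == 0; /* fall back on a root check */

		/* each __user_cap_data_struct holds 32 capability bits */
		return !!(data[cap / 32].effective & (1U << (cap % 32)));
	}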
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Changbin Du Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Oliver Upton Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240806220614.831914-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.config | 11 -------- tools/perf/builtin-ftrace.c | 28 ++++++++++---------- tools/perf/util/Build | 2 +- tools/perf/util/cap.c | 63 +++++++++++++++++++++++++++++++-------------- tools/perf/util/cap.h | 23 +++-------------- tools/perf/util/symbol.c | 8 +++--- tools/perf/util/util.c | 12 ++++++--- 7 files changed, 75 insertions(+), 72 deletions(-) (limited to 'tools') diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index fa679db61f62..4eb1fc897baf 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -1031,17 +1031,6 @@ ifndef NO_LIBZSTD endif endif -ifndef NO_LIBCAP - ifeq ($(feature-libcap), 1) - CFLAGS += -DHAVE_LIBCAP_SUPPORT - EXTLIBS += -lcap - $(call detected,CONFIG_LIBCAP) - else - $(warning No libcap found, disables capability support, please install libcap-devel/libcap-dev) - NO_LIBCAP := 1 - endif -endif - ifndef NO_BACKTRACE ifeq ($(feature-backtrace), 1) CFLAGS += -DHAVE_BACKTRACE_SUPPORT diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c index a615c405d98f..88a87bf387d2 100644 --- a/tools/perf/builtin-ftrace.c +++ b/tools/perf/builtin-ftrace.c @@ -63,20 +63,21 @@ static void ftrace__workload_exec_failed_signal(int signo __maybe_unused, done = true; } -static int check_ftrace_capable(void) +static bool check_ftrace_capable(void) { - if (!(perf_cap__capable(CAP_PERFMON) || - perf_cap__capable(CAP_SYS_ADMIN))) { - pr_err("ftrace only works for %s!\n", -#ifdef HAVE_LIBCAP_SUPPORT - "users with the CAP_PERFMON or CAP_SYS_ADMIN capability" -#else - "root" -#endif + bool used_root; + + if (perf_cap__capable(CAP_PERFMON, &used_root)) + return true; + + if (!used_root && perf_cap__capable(CAP_SYS_ADMIN, &used_root)) + return true; + + pr_err("ftrace only works for %s!\n", + used_root ? 
"root" + : "users with the CAP_PERFMON or CAP_SYS_ADMIN capability" ); - return -1; - } - return 0; + return false; } static int __write_tracing_file(const char *name, const char *val, bool append) @@ -1579,8 +1580,7 @@ int cmd_ftrace(int argc, const char **argv) signal(SIGCHLD, sig_handler); signal(SIGPIPE, sig_handler); - ret = check_ftrace_capable(); - if (ret < 0) + if (!check_ftrace_capable()) return -1; ret = perf_config(perf_ftrace_config, &ftrace); diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 7ea261416c14..b87f918bdfe7 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -223,7 +223,7 @@ perf-util-$(CONFIG_ZLIB) += zlib.o perf-util-$(CONFIG_LZMA) += lzma.o perf-util-$(CONFIG_ZSTD) += zstd.o -perf-util-$(CONFIG_LIBCAP) += cap.o +perf-util-y += cap.o perf-util-$(CONFIG_CXX_DEMANGLE) += demangle-cxx.o perf-util-y += demangle-ocaml.o diff --git a/tools/perf/util/cap.c b/tools/perf/util/cap.c index c3ba841bbf37..7574a67651bc 100644 --- a/tools/perf/util/cap.c +++ b/tools/perf/util/cap.c @@ -3,27 +3,52 @@ * Capability utilities */ -#ifdef HAVE_LIBCAP_SUPPORT - #include "cap.h" -#include -#include - -bool perf_cap__capable(cap_value_t cap) -{ - cap_flag_value_t val; - cap_t caps = cap_get_proc(); +#include "debug.h" +#include +#include +#include +#include +#include - if (!caps) - return false; +#ifndef SYS_capget +#define SYS_capget 90 +#endif - if (cap_get_flag(caps, cap, CAP_EFFECTIVE, &val) != 0) - val = CAP_CLEAR; +#define MAX_LINUX_CAPABILITY_U32S _LINUX_CAPABILITY_U32S_3 - if (cap_free(caps) != 0) - return false; - - return val == CAP_SET; +bool perf_cap__capable(int cap, bool *used_root) +{ + struct __user_cap_header_struct header = { + .version = _LINUX_CAPABILITY_VERSION_3, + .pid = getpid(), + }; + struct __user_cap_data_struct data[MAX_LINUX_CAPABILITY_U32S]; + __u32 cap_val; + + *used_root = false; + while (syscall(SYS_capget, &header, &data[0]) == -1) { + /* Retry, first attempt has set the header.version correctly. */ + if (errno == EINVAL && header.version != _LINUX_CAPABILITY_VERSION_3 && + header.version == _LINUX_CAPABILITY_VERSION_1) + continue; + + pr_debug2("capget syscall failed (%s - %d) fall back on root check\n", + strerror(errno), errno); + *used_root = true; + return geteuid() == 0; + } + + /* Extract the relevant capability bit. */ + if (cap >= 32) { + if (header.version == _LINUX_CAPABILITY_VERSION_3) { + cap_val = data[1].effective; + } else { + /* Capability beyond 32 is requested but only 32 are supported. */ + return false; + } + } else { + cap_val = data[0].effective; + } + return (cap_val & (1 << (cap & 0x1f))) != 0; } - -#endif /* HAVE_LIBCAP_SUPPORT */ diff --git a/tools/perf/util/cap.h b/tools/perf/util/cap.h index ae52878c0b2e..0c6a1ff55f07 100644 --- a/tools/perf/util/cap.h +++ b/tools/perf/util/cap.h @@ -3,26 +3,6 @@ #define __PERF_CAP_H #include -#include -#include - -#ifdef HAVE_LIBCAP_SUPPORT - -#include - -bool perf_cap__capable(cap_value_t cap); - -#else - -#include -#include - -static inline bool perf_cap__capable(int cap __maybe_unused) -{ - return geteuid() == 0; -} - -#endif /* HAVE_LIBCAP_SUPPORT */ /* For older systems */ #ifndef CAP_SYSLOG @@ -33,4 +13,7 @@ static inline bool perf_cap__capable(int cap __maybe_unused) #define CAP_PERFMON 38 #endif +/* Query if a capability is supported, used_root is set if the fallback root check was used. 
*/ +bool perf_cap__capable(int cap, bool *used_root); + #endif /* __PERF_CAP_H */ diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 19eb623e0826..a18927d792af 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -2425,14 +2425,14 @@ static bool symbol__read_kptr_restrict(void) { bool value = false; FILE *fp = fopen("/proc/sys/kernel/kptr_restrict", "r"); + bool used_root; + bool cap_syslog = perf_cap__capable(CAP_SYSLOG, &used_root); if (fp != NULL) { char line[8]; if (fgets(line, sizeof(line), fp) != NULL) - value = perf_cap__capable(CAP_SYSLOG) ? - (atoi(line) >= 2) : - (atoi(line) != 0); + value = cap_syslog ? (atoi(line) >= 2) : (atoi(line) != 0); fclose(fp); } @@ -2440,7 +2440,7 @@ static bool symbol__read_kptr_restrict(void) /* Per kernel/kallsyms.c: * we also restrict when perf_event_paranoid > 1 w/o CAP_SYSLOG */ - if (perf_event_paranoid() > 1 && !perf_cap__capable(CAP_SYSLOG)) + if (perf_event_paranoid() > 1 && !cap_syslog) value = true; return value; diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index 4f561e5e4162..9d55a13787ce 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -325,9 +325,15 @@ int perf_event_paranoid(void) bool perf_event_paranoid_check(int max_level) { - return perf_cap__capable(CAP_SYS_ADMIN) || - perf_cap__capable(CAP_PERFMON) || - perf_event_paranoid() <= max_level; + bool used_root; + + if (perf_cap__capable(CAP_SYS_ADMIN, &used_root)) + return true; + + if (!used_root && perf_cap__capable(CAP_PERFMON, &used_root)) + return true; + + return perf_event_paranoid() <= max_level; } static int -- cgit v1.2.3 From 555e5531635a7c5dba663d06cea240362624dfde Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 17 Aug 2024 13:36:59 -0700 Subject: selftests: net/forwarding: spawn sh inside vrf to speed up ping loop Looking at timestamped output of netdev CI reveals that most of the time in forwarding tests for custom route hashing is spent on a single case, namely the test which uses ping (mausezahn does not support flow labels). On a non-debug kernel we spend 714 of 730 total test runtime (97%) on this test case. While having flow label support in a traffic gen tool / mausezahn would be best, we can significantly speed up the loop by putting ip vrf exec outside of the iteration. In a test of 1000 pings using a normal loop takes 50 seconds to finish. While using: ip vrf exec $vrf sh -c "$loop-body" takes 12 seconds (1/4 of the time). Some of the slowness is likely due to our inefficient virtualization setup, but even on my laptop running "ip link help" 16k times takes 25-30 seconds, so I think it's worth optimizing even for fastest setups. 
Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Reviewed-by: Hangbin Liu Link: https://patch.msgid.link/20240817203659.712085-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/custom_multipath_hash.sh | 8 ++++---- .../testing/selftests/net/forwarding/gre_custom_multipath_hash.sh | 8 ++++---- .../selftests/net/forwarding/ip6gre_custom_multipath_hash.sh | 8 ++++---- 3 files changed, 12 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/custom_multipath_hash.sh b/tools/testing/selftests/net/forwarding/custom_multipath_hash.sh index 1783c10215e5..7d531f7091e6 100755 --- a/tools/testing/selftests/net/forwarding/custom_multipath_hash.sh +++ b/tools/testing/selftests/net/forwarding/custom_multipath_hash.sh @@ -224,10 +224,10 @@ send_dst_ipv6() send_flowlabel() { # Generate 16384 echo requests, each with a random flow label. - for _ in $(seq 1 16384); do - ip vrf exec v$h1 \ - $PING6 2001:db8:4::2 -F 0 -c 1 -q >/dev/null 2>&1 - done + ip vrf exec v$h1 sh -c \ + "for _ in {1..16384}; do \ + $PING6 2001:db8:4::2 -F 0 -c 1 -q >/dev/null 2>&1; \ + done" } send_src_udp6() diff --git a/tools/testing/selftests/net/forwarding/gre_custom_multipath_hash.sh b/tools/testing/selftests/net/forwarding/gre_custom_multipath_hash.sh index 9788bd0f6e8b..dda11a4a9450 100755 --- a/tools/testing/selftests/net/forwarding/gre_custom_multipath_hash.sh +++ b/tools/testing/selftests/net/forwarding/gre_custom_multipath_hash.sh @@ -319,10 +319,10 @@ send_dst_ipv6() send_flowlabel() { # Generate 16384 echo requests, each with a random flow label. - for _ in $(seq 1 16384); do - ip vrf exec v$h1 \ - $PING6 2001:db8:2::2 -F 0 -c 1 -q >/dev/null 2>&1 - done + ip vrf exec v$h1 sh -c \ + "for _ in {1..16384}; do \ + $PING6 2001:db8:2::2 -F 0 -c 1 -q >/dev/null 2>&1; \ + done" } send_src_udp6() diff --git a/tools/testing/selftests/net/forwarding/ip6gre_custom_multipath_hash.sh b/tools/testing/selftests/net/forwarding/ip6gre_custom_multipath_hash.sh index 2ab9eaaa5532..e28b4a079e52 100755 --- a/tools/testing/selftests/net/forwarding/ip6gre_custom_multipath_hash.sh +++ b/tools/testing/selftests/net/forwarding/ip6gre_custom_multipath_hash.sh @@ -321,10 +321,10 @@ send_dst_ipv6() send_flowlabel() { # Generate 16384 echo requests, each with a random flow label. - for _ in $(seq 1 16384); do - ip vrf exec v$h1 \ - $PING6 2001:db8:2::2 -F 0 -c 1 -q >/dev/null 2>&1 - done + ip vrf exec v$h1 sh -c \ + "for _ in {1..16384}; do \ + $PING6 2001:db8:2::2 -F 0 -c 1 -q >/dev/null 2>&1; \ + done" } send_src_udp6() -- cgit v1.2.3 From a13d5aad4dd9a309eecdc33cfd75045bd5f376a3 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 19 Aug 2024 21:45:20 +0200 Subject: selftests: mptcp: join: check re-using ID of unused ADD_ADDR This test extends "delete re-add signal" to validate the previous commit. An extra address is announced by the server, but this address cannot be used by the client. The result is that no subflow will be established to this address. Later, the server will delete this extra endpoint, and set a new one, with a valid address, but re-using the same ID. Before the previous commit, the server would not have been able to announce this new address. While at it, extra checks have been added to validate the expected numbers of MPJ, ADD_ADDR and RM_ADDR. 
The 'Fixes' tag here below is the same as the one from the previous commit: this patch here is not fixing anything wrong in the selftests, but it validates the previous fix for an issue introduced by this commit ID. Fixes: b6c08380860b ("mptcp: remove addr and subflow in PM netlink") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240819-net-mptcp-pm-reusing-id-v1-2-38035d40de5b@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 9ea6d698e9d3..25077ccf31d2 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3601,9 +3601,11 @@ endpoint_tests() # remove and re-add if reset "delete re-add signal" && mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then - pm_nl_set_limits $ns1 1 1 - pm_nl_set_limits $ns2 1 1 + pm_nl_set_limits $ns1 0 2 + pm_nl_set_limits $ns2 2 2 pm_nl_add_endpoint $ns1 10.0.2.1 id 1 flags signal + # broadcast IP: no packet for this address will be received on ns1 + pm_nl_add_endpoint $ns1 224.0.0.1 id 2 flags signal test_linkfail=4 speed=20 \ run_tests $ns1 $ns2 10.0.1.1 & local tests_pid=$! @@ -3615,15 +3617,21 @@ endpoint_tests() chk_mptcp_info subflows 1 subflows 1 pm_nl_del_endpoint $ns1 1 10.0.2.1 + pm_nl_del_endpoint $ns1 2 224.0.0.1 sleep 0.5 chk_subflow_nr "after delete" 1 chk_mptcp_info subflows 0 subflows 0 - pm_nl_add_endpoint $ns1 10.0.2.1 flags signal + pm_nl_add_endpoint $ns1 10.0.2.1 id 1 flags signal + pm_nl_add_endpoint $ns1 10.0.3.1 id 2 flags signal wait_mpj $ns2 - chk_subflow_nr "after re-add" 2 - chk_mptcp_info subflows 1 subflows 1 + chk_subflow_nr "after re-add" 3 + chk_mptcp_info subflows 2 subflows 2 mptcp_lib_kill_wait $tests_pid + + chk_join_nr 3 3 3 + chk_add_nr 4 4 + chk_rm_nr 2 1 invert fi } -- cgit v1.2.3 From 65fb58afa341ad68e71e5c4d816b407e6a683a66 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 19 Aug 2024 21:45:22 +0200 Subject: selftests: mptcp: join: check re-using ID of closed subflow This test extends "delete and re-add" to validate the previous commit. A new 'subflow' endpoint is added, but the subflow request will be rejected. The result is that no subflow will be established from this address. Later, the endpoint is removed and re-added after having cleared the firewall rule. Before the previous commit, the client would not have been able to create this new subflow. While at it, extra checks have been added to validate the expected numbers of MPJ and RM_ADDR. The 'Fixes' tag here below is the same as the one from the previous commit: this patch here is not fixing anything wrong in the selftests, but it validates the previous fix for an issue introduced by this commit ID. 
Fixes: b6c08380860b ("mptcp: remove addr and subflow in PM netlink") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240819-net-mptcp-pm-reusing-id-v1-4-38035d40de5b@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 27 ++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 25077ccf31d2..fbb0174145ad 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -436,9 +436,10 @@ reset_with_tcp_filter() local ns="${!1}" local src="${2}" local target="${3}" + local chain="${4:-INPUT}" if ! ip netns exec "${ns}" ${iptables} \ - -A INPUT \ + -A "${chain}" \ -s "${src}" \ -p tcp \ -j "${target}"; then @@ -3571,10 +3572,10 @@ endpoint_tests() mptcp_lib_kill_wait $tests_pid fi - if reset "delete and re-add" && + if reset_with_tcp_filter "delete and re-add" ns2 10.0.3.2 REJECT OUTPUT && mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then - pm_nl_set_limits $ns1 1 1 - pm_nl_set_limits $ns2 1 1 + pm_nl_set_limits $ns1 0 2 + pm_nl_set_limits $ns2 0 2 pm_nl_add_endpoint $ns2 10.0.2.2 id 2 dev ns2eth2 flags subflow test_linkfail=4 speed=20 \ run_tests $ns1 $ns2 10.0.1.1 & @@ -3591,11 +3592,27 @@ endpoint_tests() chk_subflow_nr "after delete" 1 chk_mptcp_info subflows 0 subflows 0 - pm_nl_add_endpoint $ns2 10.0.2.2 dev ns2eth2 flags subflow + pm_nl_add_endpoint $ns2 10.0.2.2 id 2 dev ns2eth2 flags subflow wait_mpj $ns2 chk_subflow_nr "after re-add" 2 chk_mptcp_info subflows 1 subflows 1 + + pm_nl_add_endpoint $ns2 10.0.3.2 id 3 flags subflow + wait_attempt_fail $ns2 + chk_subflow_nr "after new reject" 2 + chk_mptcp_info subflows 1 subflows 1 + + ip netns exec "${ns2}" ${iptables} -D OUTPUT -s "10.0.3.2" -p tcp -j REJECT + pm_nl_del_endpoint $ns2 3 10.0.3.2 + pm_nl_add_endpoint $ns2 10.0.3.2 id 3 flags subflow + wait_mpj $ns2 + chk_subflow_nr "after no reject" 3 + chk_mptcp_info subflows 2 subflows 2 + mptcp_lib_kill_wait $tests_pid + + chk_join_nr 3 3 3 + chk_rm_nr 1 1 fi # remove and re-add -- cgit v1.2.3 From e06959e9eebdfea4654390f53b65cff57691872e Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 19 Aug 2024 21:45:24 +0200 Subject: selftests: mptcp: join: test for flush/re-add endpoints After having flushed endpoints that didn't cause the creation of new subflows, it is important to check endpoints can be re-created, re-using previously used IDs. Before the previous commit, the client would not have been able to re-create the subflow that was previously rejected. The 'Fixes' tag here below is the same as the one from the previous commit: this patch here is not fixing anything wrong in the selftests, but it validates the previous fix for an issue introduced by this commit ID. 
Fixes: 06faa2271034 ("mptcp: remove multi addresses and subflows in PM") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240819-net-mptcp-pm-reusing-id-v1-6-38035d40de5b@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 30 +++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index fbb0174145ad..f609c02c6123 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3651,6 +3651,36 @@ endpoint_tests() chk_rm_nr 2 1 invert fi + # flush and re-add + if reset_with_tcp_filter "flush re-add" ns2 10.0.3.2 REJECT OUTPUT && + mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then + pm_nl_set_limits $ns1 0 2 + pm_nl_set_limits $ns2 1 2 + # broadcast IP: no packet for this address will be received on ns1 + pm_nl_add_endpoint $ns1 224.0.0.1 id 2 flags signal + pm_nl_add_endpoint $ns2 10.0.3.2 id 3 flags subflow + test_linkfail=4 speed=20 \ + run_tests $ns1 $ns2 10.0.1.1 & + local tests_pid=$! + + wait_attempt_fail $ns2 + chk_subflow_nr "before flush" 1 + chk_mptcp_info subflows 0 subflows 0 + + pm_nl_flush_endpoint $ns2 + pm_nl_flush_endpoint $ns1 + wait_rm_addr $ns2 0 + ip netns exec "${ns2}" ${iptables} -D OUTPUT -s "10.0.3.2" -p tcp -j REJECT + pm_nl_add_endpoint $ns2 10.0.3.2 id 3 flags subflow + wait_mpj $ns2 + pm_nl_add_endpoint $ns1 10.0.3.1 id 2 flags signal + wait_mpj $ns2 + mptcp_lib_kill_wait $tests_pid + + chk_join_nr 2 2 2 + chk_add_nr 2 2 + chk_rm_nr 1 0 invert + fi } # [$1: error message] -- cgit v1.2.3 From 4878f9f8421f4587bee7b232c1c8a9d3a7d4d782 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 19 Aug 2024 21:45:31 +0200 Subject: selftests: mptcp: join: validate fullmesh endp on 1st sf This case was not covered, and the wrong ID was set before the previous commit. The rest is not modified, it is just that it will increase the code coverage. The right address ID can be verified by looking at the packet traces. We could automate that using Netfilter with some cBPF code for example, but that's always a bit cryptic. Packetdrill seems better fitted for that. Fixes: 4f49d63352da ("selftests: mptcp: add fullmesh testcases") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240819-net-mptcp-pm-reusing-id-v1-13-38035d40de5b@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index f609c02c6123..89e553e0e0c2 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3059,6 +3059,7 @@ fullmesh_tests() pm_nl_set_limits $ns1 1 3 pm_nl_set_limits $ns2 1 3 pm_nl_add_endpoint $ns1 10.0.2.1 flags signal + pm_nl_add_endpoint $ns2 10.0.1.2 flags subflow,fullmesh fullmesh=1 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 3 3 3 -- cgit v1.2.3 From 99338cc1e47187c3efdd39cda2a31500d0f2305d Mon Sep 17 00:00:00 2001 From: Piotr Zalewski Date: Mon, 19 Aug 2024 17:58:34 +0000 Subject: kselftest: timers: Fix const correctness Make timespec pointers, pointers to const in checklist function. 
As a consequence, make the list parameter of the checklist function a pointer to const as well. Const-correctness increases readability. The improvement was found by running the cppcheck tool on the patched file as follows: ``` cppcheck --enable=all \ tools/testing/selftests/timers/threadtest.c \ --suppress=missingIncludeSystem \ --suppress=unusedFunction ``` Reviewed-by: Shuah Khan Signed-off-by: Piotr Zalewski Acked-by: John Stultz Signed-off-by: Shuah Khan --- tools/testing/selftests/timers/threadtest.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/timers/threadtest.c b/tools/testing/selftests/timers/threadtest.c index 76b38e41d9c7..d5564bbf0e50 100644 --- a/tools/testing/selftests/timers/threadtest.c +++ b/tools/testing/selftests/timers/threadtest.c @@ -38,10 +38,10 @@ struct timespec global_list[LISTSIZE]; int listcount = 0; -void checklist(struct timespec *list, int size) +void checklist(const struct timespec *list, int size) { int i, j; - struct timespec *a, *b; + const struct timespec *a, *b; /* scan the list */ for (i = 0; i < size-1; i++) { -- cgit v1.2.3 From c049acee3c71cfc26c739f82617a84e13e471a45 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 15 May 2024 01:36:20 -0400 Subject: selftests/ftrace: Fix test to handle both old and new kernels The function "scheduler_tick" was renamed to "sched_tick", and a selftest that tests function trace filtering uses that function as part of the test. The rename causes the test to fail when run on older kernels. As tests should not fail on older kernels, add a check to see which name is available before testing. Fixes: 86dd6c04ef9f ("sched/balancing: Rename scheduler_tick() => sched_tick()") Signed-off-by: Steven Rostedt (Google) Signed-off-by: Shuah Khan --- .../selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc index 073a748b9380..263f6b798c85 100644 --- a/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc +++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc @@ -19,7 +19,14 @@ fail() { # mesg FILTER=set_ftrace_filter FUNC1="schedule" -FUNC2="sched_tick" +if grep '^sched_tick\b' available_filter_functions; then + FUNC2="sched_tick" +elif grep '^scheduler_tick\b' available_filter_functions; then + FUNC2="scheduler_tick" +else + exit_unresolved +fi + ALL_FUNCS="#### all functions enabled ####" -- cgit v1.2.3 From 922ec313f061cf75157d8e93fe16bc6397f6ea25 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 20 Aug 2024 23:54:06 -0700 Subject: perf annotate-data: Fix missing constant copy I found that the code failed to copy the immediate constant when moving a register value. This could result in wrong type inference since the address for the per-cpu variable would always be 0.
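As a rough sketch of the failure mode (illustrative structure and function names only, not perf's actual code; the real one-line fix is in the diff below), the register-to-register move propagated the type and kind but left the immediate behind:

```
/* Illustrative sketch: a simplified register state loosely modeled on
 * perf's type tracking. Names here are made up for the example. */
struct reg_state {
	int  type;      /* resolved type id */
	int  kind;      /* e.g. constant, per-cpu base, pointer */
	long imm_value; /* immediate operand, e.g. a per-cpu address */
};

/* Buggy "mov src, dst": kind claims the value is a known constant,
 * but imm_value stays 0, so a later per-cpu lookup uses address 0. */
static void mov_reg_buggy(struct reg_state *dst, const struct reg_state *src)
{
	dst->type = src->type;
	dst->kind = src->kind;
}

/* Fixed move: the immediate travels with the rest of the state. */
static void mov_reg_fixed(struct reg_state *dst, const struct reg_state *src)
{
	dst->type = src->type;
	dst->kind = src->kind;
	dst->imm_value = src->imm_value;
}

int main(void)
{
	struct reg_state src = { .type = 1, .kind = 2, .imm_value = 0x24500 };
	struct reg_state a = { 0 }, b = { 0 };

	mov_reg_buggy(&a, &src);  /* a.imm_value == 0: wrong address */
	mov_reg_fixed(&b, &src);  /* b.imm_value == 0x24500 */
	return !(a.imm_value == 0 && b.imm_value == 0x24500);
}
```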
Fixes: eb9190afaed6afd5 ("perf annotate-data: Handle ADD instructions") Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240821065408.285548-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/annotate/instructions.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/perf/arch/x86/annotate/instructions.c b/tools/perf/arch/x86/annotate/instructions.c index 7b7d462c6c6b..88b5bcf2116f 100644 --- a/tools/perf/arch/x86/annotate/instructions.c +++ b/tools/perf/arch/x86/annotate/instructions.c @@ -382,6 +382,7 @@ static void update_insn_state_x86(struct type_state *state, tsr->type = state->regs[src->reg1].type; tsr->kind = state->regs[src->reg1].kind; + tsr->imm_value = state->regs[src->reg1].imm_value; tsr->ok = true; pr_debug_dtp("mov [%x] reg%d -> reg%d", -- cgit v1.2.3 From 4a32a97268d304d5708f1212c4a4d0e1dd3246dd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 20 Aug 2024 23:54:07 -0700 Subject: perf annotate-data: Prefer struct/union over base type Sometimes a compound type can have a single field and the size is the same as the base type. But it's still preferred as struct or union could carry more information than the base type. Also put a slight priority on the typedef for the same reason. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240821065408.285548-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index e5589268cb42..bc65264084d8 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -404,6 +404,13 @@ static bool is_pointer_type(Dwarf_Die *type_die) return tag == DW_TAG_pointer_type || tag == DW_TAG_array_type; } +static bool is_compound_type(Dwarf_Die *type_die) +{ + int tag = dwarf_tag(type_die); + + return tag == DW_TAG_structure_type || tag == DW_TAG_union_type; +} + /* returns if Type B has better information than Type A */ static bool is_better_type(Dwarf_Die *type_a, Dwarf_Die *type_b) { @@ -433,7 +440,18 @@ static bool is_better_type(Dwarf_Die *type_a, Dwarf_Die *type_b) dwarf_aggregate_size(type_b, &size_b) < 0) return false; - return size_a < size_b; + if (size_a != size_b) + return size_a < size_b; + + /* struct or union is preferred */ + if (is_compound_type(type_a) != is_compound_type(type_b)) + return is_compound_type(type_b); + + /* typedef is preferred */ + if (dwarf_tag(type_b) == DW_TAG_typedef) + return true; + + return false; } /* The type info will be saved in @type_die */ -- cgit v1.2.3 From 4d6d6e0f61e2267103e9b013d2a82d04ff278127 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 20 Aug 2024 23:54:08 -0700 Subject: perf annotate-data: Fix percpu pointer check In check_matching_type(), it checks the type state of the register in a wrong order. When it's the percpu pointer, it should check the type for the pointer, but it checks the CFA bit first and thought it has no type in the stack slot. This resulted in no type info. 
----------------------------------------------------------- find data type for 0x28(reg1) at hrtimer_reprogram+0x88 CU for kernel/time/hrtimer.c (die:0x18f219f) frame base: cfa=1 fbreg=7 ... add [72] percpu 0x24500 -> reg1 pointer type='struct hrtimer_cpu_base' size=0x240 (die:0x18f6d46) bb: [7a - 7e] bb: [80 - 86] (here) bb: [88 - 88] vvv chk [88] reg1 offset=0x28 ok=1 kind=4 cfa : no type information no type information Here, instruction at 0x72 found reg1 has a (percpu) pointer and got the correct type. But when it checks the final result, it wrongly thought it was stack variable because it checks the cfa bit first. After changing the order of state check: ----------------------------------------------------------- find data type for 0x28(reg1) at hrtimer_reprogram+0x88 CU for kernel/time/hrtimer.c (die:0x18f219f) frame base: cfa=1 fbreg=7 ... (here) vvvvvvvvvv chk [88] reg1 offset=0x28 ok=1 kind=4 percpu ptr : Good! found by insn track: 0x28(reg1) type-offset=0x28 final type: type='struct hrtimer_cpu_base' size=0x240 (die:0x18f6d46) Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240821065408.285548-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/annotate/instructions.c | 3 + tools/perf/util/annotate-data.c | 122 ++++++++++++++-------------- 2 files changed, 66 insertions(+), 59 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/x86/annotate/instructions.c b/tools/perf/arch/x86/annotate/instructions.c index 88b5bcf2116f..15dfc2988e24 100644 --- a/tools/perf/arch/x86/annotate/instructions.c +++ b/tools/perf/arch/x86/annotate/instructions.c @@ -282,6 +282,7 @@ static void update_insn_state_x86(struct type_state *state, !strcmp(var_name, "this_cpu_off") && tsr->kind == TSR_KIND_CONST) { tsr->kind = TSR_KIND_PERCPU_BASE; + tsr->ok = true; imm_value = tsr->imm_value; } } @@ -533,9 +534,11 @@ retry: &var_name, &offset) && !strcmp(var_name, "__per_cpu_offset")) { tsr->kind = TSR_KIND_PERCPU_BASE; + tsr->ok = true; pr_debug_dtp("mov [%x] percpu base reg%d\n", insn_offset, dst->reg1); + return; } } diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index bc65264084d8..f5eefcb71c4f 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -965,7 +965,10 @@ static enum type_match_result check_matching_type(struct type_state *state, insn_offset, reg, dloc->op->offset, state->regs[reg].ok, state->regs[reg].kind); - if (state->regs[reg].ok && state->regs[reg].kind == TSR_KIND_TYPE) { + if (!state->regs[reg].ok) + goto check_non_register; + + if (state->regs[reg].kind == TSR_KIND_TYPE) { Dwarf_Die sized_type; /* @@ -998,6 +1001,65 @@ static enum type_match_result check_matching_type(struct type_state *state, return PERF_TMR_OK; } + if (state->regs[reg].kind == TSR_KIND_POINTER) { + pr_debug_dtp("percpu ptr"); + + /* + * It's actaully pointer but the address was calculated using + * some arithmetic. So it points to the actual type already. 
+ */ + *type_die = state->regs[reg].type; + + dloc->type_offset = dloc->op->offset; + + /* Get the size of the actual type */ + if (dwarf_aggregate_size(type_die, &size) < 0 || + (unsigned)dloc->type_offset >= size) + return PERF_TMR_BAIL_OUT; + + return PERF_TMR_OK; + } + + if (state->regs[reg].kind == TSR_KIND_CANARY) { + pr_debug_dtp("stack canary"); + + /* + * This is a saved value of the stack canary which will be handled + * in the outer logic when it returns failure here. Pretend it's + * from the stack canary directly. + */ + setup_stack_canary(dloc); + + return PERF_TMR_BAIL_OUT; + } + + if (state->regs[reg].kind == TSR_KIND_PERCPU_BASE) { + u64 var_addr = dloc->op->offset; + int var_offset; + + pr_debug_dtp("percpu var"); + + if (dloc->op->multi_regs) { + int reg2 = dloc->op->reg2; + + if (dloc->op->reg2 == reg) + reg2 = dloc->op->reg1; + + if (has_reg_type(state, reg2) && state->regs[reg2].ok && + state->regs[reg2].kind == TSR_KIND_CONST) + var_addr += state->regs[reg2].imm_value; + } + + if (get_global_var_type(cu_die, dloc, dloc->ip, var_addr, + &var_offset, type_die)) { + dloc->type_offset = var_offset; + return PERF_TMR_OK; + } + /* No need to retry per-cpu (global) variables */ + return PERF_TMR_BAIL_OUT; + } + +check_non_register: if (reg == dloc->fbreg) { struct type_state_stack *stack; @@ -1054,64 +1116,6 @@ static enum type_match_result check_matching_type(struct type_state *state, return PERF_TMR_OK; } - if (state->regs[reg].kind == TSR_KIND_PERCPU_BASE) { - u64 var_addr = dloc->op->offset; - int var_offset; - - pr_debug_dtp("percpu var"); - - if (dloc->op->multi_regs) { - int reg2 = dloc->op->reg2; - - if (dloc->op->reg2 == reg) - reg2 = dloc->op->reg1; - - if (has_reg_type(state, reg2) && state->regs[reg2].ok && - state->regs[reg2].kind == TSR_KIND_CONST) - var_addr += state->regs[reg2].imm_value; - } - - if (get_global_var_type(cu_die, dloc, dloc->ip, var_addr, - &var_offset, type_die)) { - dloc->type_offset = var_offset; - return PERF_TMR_OK; - } - /* No need to retry per-cpu (global) variables */ - return PERF_TMR_BAIL_OUT; - } - - if (state->regs[reg].ok && state->regs[reg].kind == TSR_KIND_POINTER) { - pr_debug_dtp("percpu ptr"); - - /* - * It's actaully pointer but the address was calculated using - * some arithmetic. So it points to the actual type already. - */ - *type_die = state->regs[reg].type; - - dloc->type_offset = dloc->op->offset; - - /* Get the size of the actual type */ - if (dwarf_aggregate_size(type_die, &size) < 0 || - (unsigned)dloc->type_offset >= size) - return PERF_TMR_BAIL_OUT; - - return PERF_TMR_OK; - } - - if (state->regs[reg].ok && state->regs[reg].kind == TSR_KIND_CANARY) { - pr_debug_dtp("stack canary"); - - /* - * This is a saved value of the stack canary which will be handled - * in the outer logic when it returns failure here. Pretend it's - * from the stack canary directly. - */ - setup_stack_canary(dloc); - - return PERF_TMR_BAIL_OUT; - } - check_kernel: if (dso__kernel(map__dso(dloc->ms->map))) { u64 addr; -- cgit v1.2.3 From ce66d7c703d32851ceb50511e634712049f0c1c1 Mon Sep 17 00:00:00 2001 From: Yang Ruibin <11162571@vivo.com> Date: Wed, 21 Aug 2024 06:14:56 -0400 Subject: perf bpf: Remove redundant check that map is NULL The NULL check on map is already done inside bpf_map__fd(map), which returns a negative errno, so the later check is never reached. In addition, even if the check on map were run, it would return a pointer, which is not consistent with the error number returned by bpf_map__fd(map).
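Put differently, here is a hedged sketch of the resulting control flow (not the exact perf code; it assumes libbpf's documented behavior that bpf_map__fd() returns a negative errno when the map is NULL):

```
#include <stdio.h>
#include <bpf/libbpf.h>

/* Sketch: the fd check doubles as the NULL guard, assuming
 * bpf_map__fd(NULL) yields a negative errno (-EINVAL) in libbpf. */
static int dump_map_fd(struct bpf_map *map, FILE *fp)
{
	int fd = bpf_map__fd(map); /* negative errno when map == NULL */

	if (fd < 0)
		return fd; /* a NULL map never gets past here, so a
			    * later "if (!map)" check is dead code */

	fprintf(fp, "map fd: %d\n", fd);
	return 0;
}

int main(void)
{
	/* with a NULL map, the function fails at the fd check */
	return dump_map_fd(NULL, stderr) < 0 ? 0 : 1;
}
```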
Signed-off-by: Yang Ruibin <11162571@vivo.com> Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephen Rothwell Cc: opensource.kernel@vivo.com Link: https://lore.kernel.org/r/20240821101500.4568-1-11162571@vivo.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf_map.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf_map.c b/tools/perf/util/bpf_map.c index c863ae0c5cb5..578f27d2d6b4 100644 --- a/tools/perf/util/bpf_map.c +++ b/tools/perf/util/bpf_map.c @@ -35,9 +35,6 @@ int bpf_map__fprintf(struct bpf_map *map, FILE *fp) if (fd < 0) return fd; - if (!map) - return PTR_ERR(map); - err = -ENOMEM; key = malloc(bpf_map__key_size(map)); if (key == NULL) -- cgit v1.2.3 From 7a5c2170244b4f55588353e68ebbafc249597ee6 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 19 Aug 2024 16:36:02 -0700 Subject: perf annotate-data: Show offset and size in hex It'd be better to have them in hex to check cacheline alignment. Percent offset size field 100.00 0 0x1c0 struct cfs_rq { 0.00 0 0x10 struct load_weight load { 0.00 0 0x8 long unsigned int weight; 0.00 0x8 0x4 u32 inv_weight; }; 0.00 0x10 0x4 unsigned int nr_running; 14.56 0x14 0x4 unsigned int h_nr_running; 0.00 0x18 0x4 unsigned int idle_nr_running; 0.00 0x1c 0x4 unsigned int idle_h_nr_running; ... Committer notes: Justification from Namhyung when asked about why it would be "better": Cache line sizes are power of 2 so it'd be natural to use hex and check whether an offset is in the same boundary. Also 'perf annotate' shows instruction offsets in hex. > > Maybe this should be selectable? I can add an option and/or a config if you want. Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240819233603.54941-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/annotate-data.c | 4 ++-- tools/perf/util/annotate-data.c | 2 +- tools/perf/util/sort.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/ui/browsers/annotate-data.c b/tools/perf/ui/browsers/annotate-data.c index f563a3bb072c..cd562a8822b7 100644 --- a/tools/perf/ui/browsers/annotate-data.c +++ b/tools/perf/ui/browsers/annotate-data.c @@ -427,12 +427,12 @@ static void browser__write(struct ui_browser *uib, void *entry, int row) /* print type info */ if (be->indent == 0 && !member->var_name) { - ui_browser__printf(uib, " %10d %10d %s%s", + ui_browser__printf(uib, " %#10x %#10x %s%s", member->offset, member->size, member->type_name, list_empty(&member->children) || be->folded? 
";" : " {"); } else { - ui_browser__printf(uib, " %10d %10d %*s%s\t%s%s", + ui_browser__printf(uib, " %#10x %#10x %*s%s\t%s%s", member->offset, member->size, be->indent * 4, "", member->type_name, member->var_name ?: "", diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index f5eefcb71c4f..1e8328dde720 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -1704,7 +1704,7 @@ static void print_annotated_data_type(struct annotated_data_type *mem_type, nr_events++; } - printf(" %10d %10d %*s%s\t%s", + printf(" %#10x %#10x %*s%s\t%s", member->offset, member->size, indent, "", member->type_name, member->var_name ?: ""); diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index c4046d5d1749..d315308f9170 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -2312,7 +2312,7 @@ static int hist_entry__typeoff_snprintf(struct hist_entry *he, char *bf, he->mem_type_off, true); buf[4095] = '\0'; - return repsep_snprintf(bf, size, "%s %+d (%s)", he_type->self.type_name, + return repsep_snprintf(bf, size, "%s +%#x (%s)", he_type->self.type_name, he->mem_type_off, buf); } -- cgit v1.2.3 From fd45d52eae5c42fdb68802127fa98d3718c80043 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 19 Aug 2024 16:36:03 -0700 Subject: perf annotate-data: Add 'typecln' sort key Sometimes it's useful to organize member fields in cache-line boundary. The 'typecln' sort key is short for type-cacheline and to show samples in each cacheline. The cacheline size is fixed to 64 for now, but it can read the actual size once it saves the value from sysfs. For example, you maybe want to which cacheline in a target is hot or cold. The following shows members in the cfs_rq's first cache line. $ perf report -s type,typecln,typeoff -H ... - 2.67% struct cfs_rq + 1.23% struct cfs_rq: cache-line 2 + 0.57% struct cfs_rq: cache-line 4 + 0.46% struct cfs_rq: cache-line 6 - 0.41% struct cfs_rq: cache-line 0 0.39% struct cfs_rq +0x14 (h_nr_running) 0.02% struct cfs_rq +0x38 (tasks_timeline.rb_leftmost) ... Committer testing: # root@number:~# perf report -s type,typecln,typeoff -H --stdio # Total Lost Samples: 0 # # Samples: 5K of event 'cpu_atom/mem-loads,ldlat=5/P' # Event count (approx.): 312251 # # Overhead Data Type / Data Type Cacheline / Data Type Offset # .............. .................................................. 
# 0.07% struct sigaction 0.05% struct sigaction: cache-line 1 0.02% struct sigaction +0x58 (sa_mask) 0.02% struct sigaction +0x78 (sa_mask) 0.03% struct sigaction: cache-line 0 0.02% struct sigaction +0x38 (sa_mask) 0.01% struct sigaction +0x8 (sa_mask) Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240819233603.54941-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/hist.h | 1 + tools/perf/util/sort.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++ tools/perf/util/sort.h | 1 + 3 files changed, 54 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 30c13fc8cbe4..deb1087c5948 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -86,6 +86,7 @@ enum hist_column { HISTC_TYPE, HISTC_TYPE_OFFSET, HISTC_SYMBOL_OFFSET, + HISTC_TYPE_CACHELINE, HISTC_NR_COLS, /* Last entry */ }; diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index d315308f9170..013020f33ece 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -2326,6 +2326,57 @@ struct sort_entry sort_type_offset = { .se_width_idx = HISTC_TYPE_OFFSET, }; +/* --sort typecln */ + +/* TODO: use actual value in the system */ +#define TYPE_CACHELINE_SIZE 64 + +static int64_t +sort__typecln_sort(struct hist_entry *left, struct hist_entry *right) +{ + struct annotated_data_type *left_type = left->mem_type; + struct annotated_data_type *right_type = right->mem_type; + int64_t left_cln, right_cln; + int64_t ret; + + if (!left_type) { + sort__type_init(left); + left_type = left->mem_type; + } + + if (!right_type) { + sort__type_init(right); + right_type = right->mem_type; + } + + ret = strcmp(left_type->self.type_name, right_type->self.type_name); + if (ret) + return ret; + + left_cln = left->mem_type_off / TYPE_CACHELINE_SIZE; + right_cln = right->mem_type_off / TYPE_CACHELINE_SIZE; + return left_cln - right_cln; +} + +static int hist_entry__typecln_snprintf(struct hist_entry *he, char *bf, + size_t size, unsigned int width __maybe_unused) +{ + struct annotated_data_type *he_type = he->mem_type; + + return repsep_snprintf(bf, size, "%s: cache-line %d", he_type->self.type_name, + he->mem_type_off / TYPE_CACHELINE_SIZE); +} + +struct sort_entry sort_type_cacheline = { + .se_header = "Data Type Cacheline", + .se_cmp = sort__type_cmp, + .se_collapse = sort__typecln_sort, + .se_sort = sort__typecln_sort, + .se_init = sort__type_init, + .se_snprintf = hist_entry__typecln_snprintf, + .se_width_idx = HISTC_TYPE_CACHELINE, +}; + struct sort_dimension { const char *name; @@ -2384,6 +2435,7 @@ static struct sort_dimension common_sort_dimensions[] = { DIM(SORT_ANNOTATE_DATA_TYPE, "type", sort_type), DIM(SORT_ANNOTATE_DATA_TYPE_OFFSET, "typeoff", sort_type_offset), DIM(SORT_SYM_OFFSET, "symoff", sort_sym_offset), + DIM(SORT_ANNOTATE_DATA_TYPE_CACHELINE, "typecln", sort_type_cacheline), }; #undef DIM diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index 6357bc32c5ca..9ff68c6786e7 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -71,6 +71,7 @@ enum sort_type { SORT_ANNOTATE_DATA_TYPE, SORT_ANNOTATE_DATA_TYPE_OFFSET, SORT_SYM_OFFSET, + SORT_ANNOTATE_DATA_TYPE_CACHELINE, /* branch stack specific sort keys */ __SORT_BRANCH_STACK, -- cgit v1.2.3 From b0cd726f9a8279b33faa5b4b0011df14f331fb33 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 8 
Aug 2024 16:22:30 -0700 Subject: selftests/bpf: test passing iterator to a kfunc Define BPF iterator "getter" kfunc, which accepts iterator pointer as one of the arguments. Make sure that argument passed doesn't have to be the very first argument (unlike new-next-destroy combo). Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240808232230.2848712-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/bpf_testmod/bpf_testmod.c | 16 +++++-- .../selftests/bpf/progs/iters_testmod_seq.c | 50 ++++++++++++++++++++++ 2 files changed, 62 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index 3687a40b61c6..c04b7dec2ab9 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -141,13 +141,12 @@ bpf_testmod_test_mod_kfunc(int i) __bpf_kfunc int bpf_iter_testmod_seq_new(struct bpf_iter_testmod_seq *it, s64 value, int cnt) { - if (cnt < 0) { - it->cnt = 0; + it->cnt = cnt; + + if (cnt < 0) return -EINVAL; - } it->value = value; - it->cnt = cnt; return 0; } @@ -162,6 +161,14 @@ __bpf_kfunc s64 *bpf_iter_testmod_seq_next(struct bpf_iter_testmod_seq* it) return &it->value; } +__bpf_kfunc s64 bpf_iter_testmod_seq_value(int val, struct bpf_iter_testmod_seq* it__iter) +{ + if (it__iter->cnt < 0) + return 0; + + return val + it__iter->value; +} + __bpf_kfunc void bpf_iter_testmod_seq_destroy(struct bpf_iter_testmod_seq *it) { it->cnt = 0; @@ -531,6 +538,7 @@ BTF_KFUNCS_START(bpf_testmod_common_kfunc_ids) BTF_ID_FLAGS(func, bpf_iter_testmod_seq_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_testmod_seq_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_testmod_seq_destroy, KF_ITER_DESTROY) +BTF_ID_FLAGS(func, bpf_iter_testmod_seq_value) BTF_ID_FLAGS(func, bpf_kfunc_common_test) BTF_ID_FLAGS(func, bpf_kfunc_dynptr_test) BTF_ID_FLAGS(func, bpf_testmod_ctx_create, KF_ACQUIRE | KF_RET_NULL) diff --git a/tools/testing/selftests/bpf/progs/iters_testmod_seq.c b/tools/testing/selftests/bpf/progs/iters_testmod_seq.c index 3873fb6c292a..4a176e6aede8 100644 --- a/tools/testing/selftests/bpf/progs/iters_testmod_seq.c +++ b/tools/testing/selftests/bpf/progs/iters_testmod_seq.c @@ -12,6 +12,7 @@ struct bpf_iter_testmod_seq { extern int bpf_iter_testmod_seq_new(struct bpf_iter_testmod_seq *it, s64 value, int cnt) __ksym; extern s64 *bpf_iter_testmod_seq_next(struct bpf_iter_testmod_seq *it) __ksym; +extern s64 bpf_iter_testmod_seq_value(int blah, struct bpf_iter_testmod_seq *it) __ksym; extern void bpf_iter_testmod_seq_destroy(struct bpf_iter_testmod_seq *it) __ksym; const volatile __s64 exp_empty = 0 + 1; @@ -76,4 +77,53 @@ int testmod_seq_truncated(const void *ctx) return 0; } +SEC("?raw_tp") +__failure +__msg("expected an initialized iter_testmod_seq as arg #2") +int testmod_seq_getter_before_bad(const void *ctx) +{ + struct bpf_iter_testmod_seq it; + + return bpf_iter_testmod_seq_value(0, &it); +} + +SEC("?raw_tp") +__failure +__msg("expected an initialized iter_testmod_seq as arg #2") +int testmod_seq_getter_after_bad(const void *ctx) +{ + struct bpf_iter_testmod_seq it; + s64 sum = 0, *v; + + bpf_iter_testmod_seq_new(&it, 100, 100); + + while ((v = bpf_iter_testmod_seq_next(&it))) { + sum += *v; + } + + bpf_iter_testmod_seq_destroy(&it); + + return sum + bpf_iter_testmod_seq_value(0, &it); +} + +SEC("?socket") +__success __retval(1000000) +int testmod_seq_getter_good(const void 
*ctx) +{ + struct bpf_iter_testmod_seq it; + s64 sum = 0, *v; + + bpf_iter_testmod_seq_new(&it, 100, 100); + + while ((v = bpf_iter_testmod_seq_next(&it))) { + sum += *v; + } + + sum *= bpf_iter_testmod_seq_value(0, &it); + + bpf_iter_testmod_seq_destroy(&it); + + return sum; +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 7d41dad105b6667745f0984e386caf3289cef6db Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 20 Aug 2024 03:23:49 -0700 Subject: selftests/bpf: less spam in the log for message matching When running test_loader based tests in the verbose mode each matched message leaves a trace in the stderr, e.g.: ./test_progs -vvv -t ... validate_msgs:PASS:expect_msg 0 nsec validate_msgs:PASS:expect_msg 0 nsec validate_msgs:PASS:expect_msg 0 nsec validate_msgs:PASS:expect_msg 0 nsec validate_msgs:PASS:expect_msg 0 nsec This is not very helpful when debugging such tests and clobbers the log a lot. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240820102357.3372779-2-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_loader.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index 12b0c41e8d64..1b1290e090e7 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -531,7 +531,8 @@ static void validate_msgs(char *log_buf, struct expected_msgs *msgs, } } - if (!ASSERT_OK_PTR(match, "expect_msg")) { + if (!match) { + PRINT_FAIL("expect_msg\n"); if (env.verbosity == VERBOSE_NONE) emit_fn(log_buf, true /*force*/); for (j = 0; j <= i; j++) { -- cgit v1.2.3 From d0a29cdb6ef95d8a175e09ab2d1334271f047e60 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 20 Aug 2024 03:23:50 -0700 Subject: selftests/bpf: correctly move 'log' upon successful match Suppose log="foo bar buz" and msg->substr="bar". In such case current match processing logic would update 'log' as follows: log += strlen(msg->substr); -> log += 3 -> log=" bar". However, the intent behind the 'log' update is to make it point after the successful match, e.g. to make log=" buz" in the example above. Fixes: 4ef5d6af4935 ("selftests/bpf: no need to track next_match_pos in struct test_loader") Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240820102357.3372779-3-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_loader.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index 1b1290e090e7..f5f5d16ac550 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -522,7 +522,7 @@ static void validate_msgs(char *log_buf, struct expected_msgs *msgs, if (msg->substr) { match = strstr(log, msg->substr); if (match) - log += strlen(msg->substr); + log = match + strlen(msg->substr); } else { err = regexec(&msg->regex, log, 1, reg_match, 0); if (err == 0) { -- cgit v1.2.3 From f00bb757ed630affc951691ddaff206039cbb7ee Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 20 Aug 2024 03:23:51 -0700 Subject: selftests/bpf: fix to avoid __msg tag de-duplication by clang __msg, __regex and __xlated tags are based on __attribute__((btf_decl_tag("..."))) annotations. Clang de-duplicates such annotations, e.g. 
the following two sequences of tags are identical in final BTF: /* seq A */ /* seq B */ __tag("foo") __tag("foo") __tag("bar") __tag("bar") __tag("foo") Fix this by adding a unique suffix for each tag using __COUNTER__ pre-processor macro. E.g. here is a new definition for __msg: #define __msg(msg) \ __attribute__((btf_decl_tag("comment:test_expect_msg=" XSTR(__COUNTER__) "=" msg))) Using this definition the "seq A" from example above is translated to BTF as follows: [..] DECL_TAG 'comment:test_expect_msg=0=foo' type_id=X component_idx=-1 [..] DECL_TAG 'comment:test_expect_msg=1=bar' type_id=X component_idx=-1 [..] DECL_TAG 'comment:test_expect_msg=2=foo' type_id=X component_idx=-1 Surprisingly, this bug affects a single existing test: verifier_spill_fill/old_stack_misc_vs_cur_ctx_ptr, where sequence of identical messages was expected in the log. Fixes: 537c3f66eac1 ("selftests/bpf: add generic BPF program tester-loader") Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240820102357.3372779-4-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/bpf_misc.h | 15 ++++--- .../selftests/bpf/progs/verifier_spill_fill.c | 8 ++-- tools/testing/selftests/bpf/test_loader.c | 47 ++++++++++++++++------ 3 files changed, 48 insertions(+), 22 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h index a225cd87897c..4f1029743734 100644 --- a/tools/testing/selftests/bpf/progs/bpf_misc.h +++ b/tools/testing/selftests/bpf/progs/bpf_misc.h @@ -2,6 +2,9 @@ #ifndef __BPF_MISC_H__ #define __BPF_MISC_H__ +#define XSTR(s) STR(s) +#define STR(s) #s + /* This set of attributes controls behavior of the * test_loader.c:test_loader__run_subtests(). * @@ -68,15 +71,15 @@ * Several __arch_* annotations could be specified at once. * When test case is not run on current arch it is marked as skipped. 
*/ -#define __msg(msg) __attribute__((btf_decl_tag("comment:test_expect_msg=" msg))) -#define __regex(regex) __attribute__((btf_decl_tag("comment:test_expect_regex=" regex))) -#define __xlated(msg) __attribute__((btf_decl_tag("comment:test_expect_xlated=" msg))) +#define __msg(msg) __attribute__((btf_decl_tag("comment:test_expect_msg=" XSTR(__COUNTER__) "=" msg))) +#define __regex(regex) __attribute__((btf_decl_tag("comment:test_expect_regex=" XSTR(__COUNTER__) "=" regex))) +#define __xlated(msg) __attribute__((btf_decl_tag("comment:test_expect_xlated=" XSTR(__COUNTER__) "=" msg))) #define __failure __attribute__((btf_decl_tag("comment:test_expect_failure"))) #define __success __attribute__((btf_decl_tag("comment:test_expect_success"))) #define __description(desc) __attribute__((btf_decl_tag("comment:test_description=" desc))) -#define __msg_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_msg_unpriv=" msg))) -#define __regex_unpriv(regex) __attribute__((btf_decl_tag("comment:test_expect_regex_unpriv=" regex))) -#define __xlated_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_xlated_unpriv=" msg))) +#define __msg_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_msg_unpriv=" XSTR(__COUNTER__) "=" msg))) +#define __regex_unpriv(regex) __attribute__((btf_decl_tag("comment:test_expect_regex_unpriv=" XSTR(__COUNTER__) "=" regex))) +#define __xlated_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_xlated_unpriv=" XSTR(__COUNTER__) "=" msg))) #define __failure_unpriv __attribute__((btf_decl_tag("comment:test_expect_failure_unpriv"))) #define __success_unpriv __attribute__((btf_decl_tag("comment:test_expect_success_unpriv"))) #define __log_level(lvl) __attribute__((btf_decl_tag("comment:test_log_level="#lvl))) diff --git a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c index 9d288ec7a168..671d9f415dbf 100644 --- a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c +++ b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c @@ -1213,10 +1213,10 @@ __success __log_level(2) * - once for path entry - label 2; * - once for path entry - label 1 - label 2. */ -__msg("r1 = *(u64 *)(r10 -8)") -__msg("exit") -__msg("r1 = *(u64 *)(r10 -8)") -__msg("exit") +__msg("8: (79) r1 = *(u64 *)(r10 -8)") +__msg("9: (95) exit") +__msg("from 2 to 7") +__msg("8: safe") __msg("processed 11 insns") __flag(BPF_F_TEST_STATE_FREQ) __naked void old_stack_misc_vs_cur_ctx_ptr(void) diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index f5f5d16ac550..c604a82e03c4 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -215,6 +215,35 @@ static void update_flags(int *flags, int flag, bool clear) *flags |= flag; } +/* Matches a string of form '[^=]=.*' and returns it's suffix. + * Used to parse btf_decl_tag values. + * Such values require unique prefix because compiler does not add + * same __attribute__((btf_decl_tag(...))) twice. + * Test suite uses two-component tags for such cases: + * + * __COUNTER__ '=' + * + * For example, two consecutive __msg tags '__msg("foo") __msg("foo")' + * would be encoded as: + * + * [18] DECL_TAG 'comment:test_expect_msg=0=foo' type_id=15 component_idx=-1 + * [19] DECL_TAG 'comment:test_expect_msg=1=foo' type_id=15 component_idx=-1 + * + * And the purpose of this function is to extract 'foo' from the above. 
+ */ +static const char *skip_dynamic_pfx(const char *s, const char *pfx) +{ + const char *msg; + + if (strncmp(s, pfx, strlen(pfx)) != 0) + return NULL; + msg = s + strlen(pfx); + msg = strchr(msg, '='); + if (!msg) + return NULL; + return msg + 1; +} + enum arch { ARCH_X86_64 = 0x1, ARCH_ARM64 = 0x2, @@ -290,38 +319,32 @@ static int parse_test_spec(struct test_loader *tester, } else if (strcmp(s, TEST_TAG_AUXILIARY_UNPRIV) == 0) { spec->auxiliary = true; spec->mode_mask |= UNPRIV; - } else if (str_has_pfx(s, TEST_TAG_EXPECT_MSG_PFX)) { - msg = s + sizeof(TEST_TAG_EXPECT_MSG_PFX) - 1; + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_MSG_PFX))) { err = push_msg(msg, NULL, &spec->priv.expect_msgs); if (err) goto cleanup; spec->mode_mask |= PRIV; - } else if (str_has_pfx(s, TEST_TAG_EXPECT_MSG_PFX_UNPRIV)) { - msg = s + sizeof(TEST_TAG_EXPECT_MSG_PFX_UNPRIV) - 1; + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_MSG_PFX_UNPRIV))) { err = push_msg(msg, NULL, &spec->unpriv.expect_msgs); if (err) goto cleanup; spec->mode_mask |= UNPRIV; - } else if (str_has_pfx(s, TEST_TAG_EXPECT_REGEX_PFX)) { - msg = s + sizeof(TEST_TAG_EXPECT_REGEX_PFX) - 1; + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_REGEX_PFX))) { err = push_msg(NULL, msg, &spec->priv.expect_msgs); if (err) goto cleanup; spec->mode_mask |= PRIV; - } else if (str_has_pfx(s, TEST_TAG_EXPECT_REGEX_PFX_UNPRIV)) { - msg = s + sizeof(TEST_TAG_EXPECT_REGEX_PFX_UNPRIV) - 1; + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_REGEX_PFX_UNPRIV))) { err = push_msg(NULL, msg, &spec->unpriv.expect_msgs); if (err) goto cleanup; spec->mode_mask |= UNPRIV; - } else if (str_has_pfx(s, TEST_TAG_EXPECT_XLATED_PFX)) { - msg = s + sizeof(TEST_TAG_EXPECT_XLATED_PFX) - 1; + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_XLATED_PFX))) { err = push_msg(msg, NULL, &spec->priv.expect_xlated); if (err) goto cleanup; spec->mode_mask |= PRIV; - } else if (str_has_pfx(s, TEST_TAG_EXPECT_XLATED_PFX_UNPRIV)) { - msg = s + sizeof(TEST_TAG_EXPECT_XLATED_PFX_UNPRIV) - 1; + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_XLATED_PFX_UNPRIV))) { err = push_msg(msg, NULL, &spec->unpriv.expect_xlated); if (err) goto cleanup; -- cgit v1.2.3 From f8d161756d422598e10a112171a73cf621e67fae Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 20 Aug 2024 03:23:52 -0700 Subject: selftests/bpf: replace __regex macro with "{{...}}" patterns Upcoming changes require a notation to specify regular expression matches for regular verifier log messages, disassembly of BPF instructions, disassembly of jited instructions. Neither basic nor extended POSIX regular expressions w/o additional escaping are good for this role because of wide use of special characters in disassembly, for example: movq -0x10(%rbp), %rax ;; () are special characters cmpq $0x21, %rax ;; $ is a special character *(u64 *)(r10 -16) = r1 ;; * and () are special characters This commit borrows syntax from LLVM's FileCheck utility. It replaces __regex macro with ability to embed regular expressions in __msg patters using "{{" "}}" pairs for escaping. Syntax for __msg patterns: pattern := ( | regex)* regex := "{{" "}}" For example, pattern "foo{{[0-9]+}}" matches strings like "foo0", "foo007", etc. 
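A hedged, self-contained sketch of that escaping idea (simplified from the test_loader.c changes in the diff below; the function name here is made up):

```
#include <regex.h>
#include <stdio.h>
#include <string.h>

/* Translate a "{{...}}" pattern into a POSIX extended regex: verbatim
 * text gets its metacharacters escaped, text between {{ and }} is
 * passed through untouched. Simplified from test_loader.c. */
static int pattern_to_regex(const char *pat, char *buf, size_t sz)
{
	char *p = buf, *end = buf + sz;
	int in_regex = 0;

	while (*pat && p < end - 2) {
		if (!in_regex && strncmp(pat, "{{", 2) == 0) {
			in_regex = 1;
			pat += 2;
		} else if (in_regex && strncmp(pat, "}}", 2) == 0) {
			in_regex = 0;
			pat += 2;
		} else {
			if (!in_regex && strchr(".[]\\()*+?{}|^$", *pat))
				*p++ = '\\'; /* escape metacharacter */
			*p++ = *pat++;
		}
	}
	*p = '\0';
	return *pat ? -1 : 0; /* -1: pattern did not fit */
}

int main(void)
{
	char buf[256];
	regex_t re;

	if (pattern_to_regex("foo{{[0-9]+}}", buf, sizeof(buf)))
		return 1;
	/* buf is now "foo[0-9]+" */
	if (regcomp(&re, buf, REG_EXTENDED | REG_NEWLINE))
		return 1;
	printf("matches foo007: %d\n", regexec(&re, "foo007", 0, NULL, 0) == 0);
	regfree(&re);
	return 0;
}
```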
Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240820102357.3372779-5-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/bpf_misc.h | 9 +- tools/testing/selftests/bpf/progs/dynptr_fail.c | 6 +- tools/testing/selftests/bpf/progs/rbtree_fail.c | 2 +- .../selftests/bpf/progs/refcounted_kptr_fail.c | 4 +- tools/testing/selftests/bpf/test_loader.c | 164 +++++++++++++-------- 5 files changed, 115 insertions(+), 70 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h index 4f1029743734..cc3ef20a6490 100644 --- a/tools/testing/selftests/bpf/progs/bpf_misc.h +++ b/tools/testing/selftests/bpf/progs/bpf_misc.h @@ -25,12 +25,15 @@ * * __msg Message expected to be found in the verifier log. * Multiple __msg attributes could be specified. + * To match a regular expression use "{{" "}}" brackets, + * e.g. "foo{{[0-9]+}}" matches strings like "foo007". + * Extended POSIX regular expression syntax is allowed + * inside the brackets. * __msg_unpriv Same as __msg but for unprivileged mode. * - * __regex Same as __msg, but using a regular expression. - * __regex_unpriv Same as __msg_unpriv but using a regular expression. * __xlated Expect a line in a disassembly log after verifier applies rewrites. * Multiple __xlated attributes could be specified. + * Regular expressions could be specified same way as in __msg. * __xlated_unpriv Same as __xlated but for unprivileged mode. * * __success Expect program load success in privileged mode. @@ -72,13 +75,11 @@ * When test case is not run on current arch it is marked as skipped. */ #define __msg(msg) __attribute__((btf_decl_tag("comment:test_expect_msg=" XSTR(__COUNTER__) "=" msg))) -#define __regex(regex) __attribute__((btf_decl_tag("comment:test_expect_regex=" XSTR(__COUNTER__) "=" regex))) #define __xlated(msg) __attribute__((btf_decl_tag("comment:test_expect_xlated=" XSTR(__COUNTER__) "=" msg))) #define __failure __attribute__((btf_decl_tag("comment:test_expect_failure"))) #define __success __attribute__((btf_decl_tag("comment:test_expect_success"))) #define __description(desc) __attribute__((btf_decl_tag("comment:test_description=" desc))) #define __msg_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_msg_unpriv=" XSTR(__COUNTER__) "=" msg))) -#define __regex_unpriv(regex) __attribute__((btf_decl_tag("comment:test_expect_regex_unpriv=" XSTR(__COUNTER__) "=" regex))) #define __xlated_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_xlated_unpriv=" XSTR(__COUNTER__) "=" msg))) #define __failure_unpriv __attribute__((btf_decl_tag("comment:test_expect_failure_unpriv"))) #define __success_unpriv __attribute__((btf_decl_tag("comment:test_expect_success_unpriv"))) diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c index e35bc1eac52a..68b8c6eca508 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c @@ -964,7 +964,7 @@ int dynptr_invalidate_slice_reinit(void *ctx) * mem_or_null pointers. 
*/ SEC("?raw_tp") -__failure __regex("R[0-9]+ type=scalar expected=percpu_ptr_") +__failure __msg("R{{[0-9]+}} type=scalar expected=percpu_ptr_") int dynptr_invalidate_slice_or_null(void *ctx) { struct bpf_dynptr ptr; @@ -982,7 +982,7 @@ int dynptr_invalidate_slice_or_null(void *ctx) /* Destruction of dynptr should also any slices obtained from it */ SEC("?raw_tp") -__failure __regex("R[0-9]+ invalid mem access 'scalar'") +__failure __msg("R{{[0-9]+}} invalid mem access 'scalar'") int dynptr_invalidate_slice_failure(void *ctx) { struct bpf_dynptr ptr1; @@ -1069,7 +1069,7 @@ int dynptr_read_into_slot(void *ctx) /* bpf_dynptr_slice()s are read-only and cannot be written to */ SEC("?tc") -__failure __regex("R[0-9]+ cannot write into rdonly_mem") +__failure __msg("R{{[0-9]+}} cannot write into rdonly_mem") int skb_invalid_slice_write(struct __sk_buff *skb) { struct bpf_dynptr ptr; diff --git a/tools/testing/selftests/bpf/progs/rbtree_fail.c b/tools/testing/selftests/bpf/progs/rbtree_fail.c index b722a1e1ddef..dbd5eee8e25e 100644 --- a/tools/testing/selftests/bpf/progs/rbtree_fail.c +++ b/tools/testing/selftests/bpf/progs/rbtree_fail.c @@ -105,7 +105,7 @@ long rbtree_api_remove_unadded_node(void *ctx) } SEC("?tc") -__failure __regex("Unreleased reference id=3 alloc_insn=[0-9]+") +__failure __msg("Unreleased reference id=3 alloc_insn={{[0-9]+}}") long rbtree_api_remove_no_drop(void *ctx) { struct bpf_rb_node *res; diff --git a/tools/testing/selftests/bpf/progs/refcounted_kptr_fail.c b/tools/testing/selftests/bpf/progs/refcounted_kptr_fail.c index f8d4b7cfcd68..836c8ab7b908 100644 --- a/tools/testing/selftests/bpf/progs/refcounted_kptr_fail.c +++ b/tools/testing/selftests/bpf/progs/refcounted_kptr_fail.c @@ -32,7 +32,7 @@ static bool less(struct bpf_rb_node *a, const struct bpf_rb_node *b) } SEC("?tc") -__failure __regex("Unreleased reference id=4 alloc_insn=[0-9]+") +__failure __msg("Unreleased reference id=4 alloc_insn={{[0-9]+}}") long rbtree_refcounted_node_ref_escapes(void *ctx) { struct node_acquire *n, *m; @@ -73,7 +73,7 @@ long refcount_acquire_maybe_null(void *ctx) } SEC("?tc") -__failure __regex("Unreleased reference id=3 alloc_insn=[0-9]+") +__failure __msg("Unreleased reference id=3 alloc_insn={{[0-9]+}}") long rbtree_refcounted_node_ref_escapes_owning_input(void *ctx) { struct node_acquire *n, *m; diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index c604a82e03c4..b0d7158e00c1 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -19,12 +19,10 @@ #define TEST_TAG_EXPECT_FAILURE "comment:test_expect_failure" #define TEST_TAG_EXPECT_SUCCESS "comment:test_expect_success" #define TEST_TAG_EXPECT_MSG_PFX "comment:test_expect_msg=" -#define TEST_TAG_EXPECT_REGEX_PFX "comment:test_expect_regex=" #define TEST_TAG_EXPECT_XLATED_PFX "comment:test_expect_xlated=" #define TEST_TAG_EXPECT_FAILURE_UNPRIV "comment:test_expect_failure_unpriv" #define TEST_TAG_EXPECT_SUCCESS_UNPRIV "comment:test_expect_success_unpriv" #define TEST_TAG_EXPECT_MSG_PFX_UNPRIV "comment:test_expect_msg_unpriv=" -#define TEST_TAG_EXPECT_REGEX_PFX_UNPRIV "comment:test_expect_regex_unpriv=" #define TEST_TAG_EXPECT_XLATED_PFX_UNPRIV "comment:test_expect_xlated_unpriv=" #define TEST_TAG_LOG_LEVEL_PFX "comment:test_log_level=" #define TEST_TAG_PROG_FLAGS_PFX "comment:test_prog_flags=" @@ -55,8 +53,9 @@ enum mode { struct expect_msg { const char *substr; /* substring match */ - const char *regex_str; /* regex-based match */ 
regex_t regex; + bool is_regex; + bool on_next_line; }; struct expected_msgs { @@ -111,7 +110,7 @@ static void free_msgs(struct expected_msgs *msgs) int i; for (i = 0; i < msgs->cnt; i++) - if (msgs->patterns[i].regex_str) + if (msgs->patterns[i].is_regex) regfree(&msgs->patterns[i].regex); free(msgs->patterns); msgs->patterns = NULL; @@ -132,12 +131,71 @@ static void free_test_spec(struct test_spec *spec) spec->unpriv.name = NULL; } -static int push_msg(const char *substr, const char *regex_str, struct expected_msgs *msgs) +/* Compiles regular expression matching pattern. + * Pattern has a special syntax: + * + * pattern := ( | regex)* + * regex := "{{" "}}" + * + * In other words, pattern is a verbatim text with inclusion + * of regular expressions enclosed in "{{" "}}" pairs. + * For example, pattern "foo{{[0-9]+}}" matches strings like + * "foo0", "foo007", etc. + */ +static int compile_regex(const char *pattern, regex_t *regex) +{ + char err_buf[256], buf[256] = {}, *ptr, *buf_end; + const char *original_pattern = pattern; + bool in_regex = false; + int err; + + buf_end = buf + sizeof(buf); + ptr = buf; + while (*pattern && ptr < buf_end - 2) { + if (!in_regex && str_has_pfx(pattern, "{{")) { + in_regex = true; + pattern += 2; + continue; + } + if (in_regex && str_has_pfx(pattern, "}}")) { + in_regex = false; + pattern += 2; + continue; + } + if (in_regex) { + *ptr++ = *pattern++; + continue; + } + /* list of characters that need escaping for extended posix regex */ + if (strchr(".[]\\()*+?{}|^$", *pattern)) { + *ptr++ = '\\'; + *ptr++ = *pattern++; + continue; + } + *ptr++ = *pattern++; + } + if (*pattern) { + PRINT_FAIL("Regexp too long: '%s'\n", original_pattern); + return -EINVAL; + } + if (in_regex) { + PRINT_FAIL("Regexp has open '{{' but no closing '}}': '%s'\n", original_pattern); + return -EINVAL; + } + err = regcomp(regex, buf, REG_EXTENDED | REG_NEWLINE); + if (err != 0) { + regerror(err, regex, err_buf, sizeof(err_buf)); + PRINT_FAIL("Regexp compilation error in '%s': '%s'\n", buf, err_buf); + return -EINVAL; + } + return 0; +} + +static int __push_msg(const char *pattern, bool on_next_line, struct expected_msgs *msgs) { - void *tmp; - int regcomp_res; - char error_msg[100]; struct expect_msg *msg; + void *tmp; + int err; tmp = realloc(msgs->patterns, (1 + msgs->cnt) * sizeof(struct expect_msg)); @@ -147,26 +205,38 @@ static int push_msg(const char *substr, const char *regex_str, struct expected_m } msgs->patterns = tmp; msg = &msgs->patterns[msgs->cnt]; - - if (substr) { - msg->substr = substr; - msg->regex_str = NULL; - } else { - msg->regex_str = regex_str; - msg->substr = NULL; - regcomp_res = regcomp(&msg->regex, regex_str, REG_EXTENDED|REG_NEWLINE); - if (regcomp_res != 0) { - regerror(regcomp_res, &msg->regex, error_msg, sizeof(error_msg)); - PRINT_FAIL("Regexp compilation error in '%s': '%s'\n", - regex_str, error_msg); - return -EINVAL; - } + msg->on_next_line = on_next_line; + msg->substr = pattern; + msg->is_regex = false; + if (strstr(pattern, "{{")) { + err = compile_regex(pattern, &msg->regex); + if (err) + return err; + msg->is_regex = true; } - msgs->cnt += 1; return 0; } +static int clone_msgs(struct expected_msgs *from, struct expected_msgs *to) +{ + struct expect_msg *msg; + int i, err; + + for (i = 0; i < from->cnt; i++) { + msg = &from->patterns[i]; + err = __push_msg(msg->substr, msg->on_next_line, to); + if (err) + return err; + } + return 0; +} + +static int push_msg(const char *substr, struct expected_msgs *msgs) +{ + return __push_msg(substr, 
false, msgs); +} + static int parse_int(const char *str, int *val, const char *name) { char *end; @@ -320,32 +390,22 @@ static int parse_test_spec(struct test_loader *tester, spec->auxiliary = true; spec->mode_mask |= UNPRIV; } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_MSG_PFX))) { - err = push_msg(msg, NULL, &spec->priv.expect_msgs); + err = push_msg(msg, &spec->priv.expect_msgs); if (err) goto cleanup; spec->mode_mask |= PRIV; } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_MSG_PFX_UNPRIV))) { - err = push_msg(msg, NULL, &spec->unpriv.expect_msgs); - if (err) - goto cleanup; - spec->mode_mask |= UNPRIV; - } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_REGEX_PFX))) { - err = push_msg(NULL, msg, &spec->priv.expect_msgs); - if (err) - goto cleanup; - spec->mode_mask |= PRIV; - } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_REGEX_PFX_UNPRIV))) { - err = push_msg(NULL, msg, &spec->unpriv.expect_msgs); + err = push_msg(msg, &spec->unpriv.expect_msgs); if (err) goto cleanup; spec->mode_mask |= UNPRIV; } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_XLATED_PFX))) { - err = push_msg(msg, NULL, &spec->priv.expect_xlated); + err = push_msg(msg, &spec->priv.expect_xlated); if (err) goto cleanup; spec->mode_mask |= PRIV; } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_XLATED_PFX_UNPRIV))) { - err = push_msg(msg, NULL, &spec->unpriv.expect_xlated); + err = push_msg(msg, &spec->unpriv.expect_xlated); if (err) goto cleanup; spec->mode_mask |= UNPRIV; @@ -457,26 +517,10 @@ static int parse_test_spec(struct test_loader *tester, spec->unpriv.execute = spec->priv.execute; } - if (spec->unpriv.expect_msgs.cnt == 0) { - for (i = 0; i < spec->priv.expect_msgs.cnt; i++) { - struct expect_msg *msg = &spec->priv.expect_msgs.patterns[i]; - - err = push_msg(msg->substr, msg->regex_str, - &spec->unpriv.expect_msgs); - if (err) - goto cleanup; - } - } - if (spec->unpriv.expect_xlated.cnt == 0) { - for (i = 0; i < spec->priv.expect_xlated.cnt; i++) { - struct expect_msg *msg = &spec->priv.expect_xlated.patterns[i]; - - err = push_msg(msg->substr, msg->regex_str, - &spec->unpriv.expect_xlated); - if (err) - goto cleanup; - } - } + if (spec->unpriv.expect_msgs.cnt == 0) + clone_msgs(&spec->priv.expect_msgs, &spec->unpriv.expect_msgs); + if (spec->unpriv.expect_xlated.cnt == 0) + clone_msgs(&spec->priv.expect_xlated, &spec->unpriv.expect_xlated); } spec->valid = true; @@ -542,7 +586,7 @@ static void validate_msgs(char *log_buf, struct expected_msgs *msgs, struct expect_msg *msg = &msgs->patterns[i]; const char *match = NULL; - if (msg->substr) { + if (!msg->is_regex) { match = strstr(log, msg->substr); if (match) log = match + strlen(msg->substr); @@ -562,8 +606,8 @@ static void validate_msgs(char *log_buf, struct expected_msgs *msgs, msg = &msgs->patterns[j]; fprintf(stderr, "%s %s: '%s'\n", j < i ? "MATCHED " : "EXPECTED", - msg->substr ? "SUBSTR" : " REGEX", - msg->substr ?: msg->regex_str); + msg->is_regex ? " REGEX" : "SUBSTR", + msg->substr); } return; } -- cgit v1.2.3 From b991fc52070042468f31b70fb8ccab96ba351f8a Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 20 Aug 2024 03:23:53 -0700 Subject: selftests/bpf: utility function to get program disassembly after jit This commit adds a utility function to get disassembled text for jited representation of a BPF program designated by file descriptor. 
Function prototype looks as follows: int get_jited_program_text(int fd, char *text, size_t text_sz) Where 'fd' is a file descriptor for the program, 'text' and 'text_sz' refer to a destination buffer for disassembled text. Output format looks as follows: 18: 77 06 ja L0 1a: 50 pushq %rax 1b: 48 89 e0 movq %rsp, %rax 1e: eb 01 jmp L1 20: 50 L0: pushq %rax 21: 50 L1: pushq %rax ^ ^^^^^^^^ ^ ^^^^^^^^^^^^^^^^^^ | binary insn | textual insn | representation | representation | | instruction offset inferred local label name The code and makefile changes are inspired by jit_disasm.c from bpftool. Use llvm libraries to disassemble BPF program instead of libbfd to avoid issues with disassembly output stability pointed out in [1]. Selftests makefile uses Makefile.feature to detect if LLVM libraries are available. If that is not the case selftests build proceeds but the function returns -EOPNOTSUPP at runtime. [1] commit eb9d1acf634b ("bpftool: Add LLVM as default library for disassembling JIT-ed programs") Acked-by: Yonghong Song Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240820102357.3372779-6-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/.gitignore | 1 + tools/testing/selftests/bpf/Makefile | 48 ++++- tools/testing/selftests/bpf/jit_disasm_helpers.c | 234 +++++++++++++++++++++++ tools/testing/selftests/bpf/jit_disasm_helpers.h | 10 + 4 files changed, 291 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/bpf/jit_disasm_helpers.c create mode 100644 tools/testing/selftests/bpf/jit_disasm_helpers.h (limited to 'tools') diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 99ffea1fa5c6..e6533b3400de 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -8,6 +8,7 @@ test_lru_map test_lpm_map test_tag FEATURE-DUMP.libbpf +FEATURE-DUMP.selftests fixdep /test_progs /test_progs-no_alu32 diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 7bca9a2b8bc0..6d6d0c29db47 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -33,6 +33,13 @@ OPT_FLAGS ?= $(if $(RELEASE),-O2,-O0) LIBELF_CFLAGS := $(shell $(PKG_CONFIG) libelf --cflags 2>/dev/null) LIBELF_LIBS := $(shell $(PKG_CONFIG) libelf --libs 2>/dev/null || echo -lelf) +ifeq ($(srctree),) +srctree := $(patsubst %/,%,$(dir $(CURDIR))) +srctree := $(patsubst %/,%,$(dir $(srctree))) +srctree := $(patsubst %/,%,$(dir $(srctree))) +srctree := $(patsubst %/,%,$(dir $(srctree))) +endif + CFLAGS += -g $(OPT_FLAGS) -rdynamic \ -Wall -Werror -fno-omit-frame-pointer \ $(GENFLAGS) $(SAN_CFLAGS) $(LIBELF_CFLAGS) \ @@ -60,6 +67,9 @@ progs/timer_crash.c-CFLAGS := -fno-strict-aliasing progs/test_global_func9.c-CFLAGS := -fno-strict-aliasing progs/verifier_nocsr.c-CFLAGS := -fno-strict-aliasing +# Some utility functions use LLVM libraries +jit_disasm_helpers.c-CFLAGS = $(LLVM_CFLAGS) + ifneq ($(LLVM),) # Silence some warnings when compiled with clang CFLAGS += -Wno-unused-command-line-argument @@ -168,6 +178,31 @@ endef include ../lib.mk +NON_CHECK_FEAT_TARGETS := clean docs-clean +CHECK_FEAT := $(filter-out $(NON_CHECK_FEAT_TARGETS),$(or $(MAKECMDGOALS), "none")) +ifneq ($(CHECK_FEAT),) +FEATURE_USER := .selftests +FEATURE_TESTS := llvm +FEATURE_DISPLAY := $(FEATURE_TESTS) + +# Makefile.feature expects OUTPUT to end with a slash +$(let OUTPUT,$(OUTPUT)/,\ + $(eval include ../../../build/Makefile.feature)) +endif + +ifeq 
($(feature-llvm),1) + LLVM_CFLAGS += -DHAVE_LLVM_SUPPORT + LLVM_CONFIG_LIB_COMPONENTS := mcdisassembler all-targets + # both llvm-config and lib.mk add -D_GNU_SOURCE, which ends up as conflict + LLVM_CFLAGS += $(filter-out -D_GNU_SOURCE,$(shell $(LLVM_CONFIG) --cflags)) + LLVM_LDLIBS += $(shell $(LLVM_CONFIG) --libs $(LLVM_CONFIG_LIB_COMPONENTS)) + ifeq ($(shell $(LLVM_CONFIG) --shared-mode),static) + LLVM_LDLIBS += $(shell $(LLVM_CONFIG) --system-libs $(LLVM_CONFIG_LIB_COMPONENTS)) + LLVM_LDLIBS += -lstdc++ + endif + LLVM_LDFLAGS += $(shell $(LLVM_CONFIG) --ldflags) +endif + SCRATCH_DIR := $(OUTPUT)/tools BUILD_DIR := $(SCRATCH_DIR)/build INCLUDE_DIR := $(SCRATCH_DIR)/include @@ -612,6 +647,10 @@ ifeq ($(filter clean docs-clean,$(MAKECMDGOALS)),) include $(wildcard $(TRUNNER_TEST_OBJS:.o=.d)) endif +# add per extra obj CFLAGS definitions +$(foreach N,$(patsubst $(TRUNNER_OUTPUT)/%.o,%,$(TRUNNER_EXTRA_OBJS)), \ + $(eval $(TRUNNER_OUTPUT)/$(N).o: CFLAGS += $($(N).c-CFLAGS))) + $(TRUNNER_EXTRA_OBJS): $(TRUNNER_OUTPUT)/%.o: \ %.c \ $(TRUNNER_EXTRA_HDRS) \ @@ -628,6 +667,9 @@ ifneq ($2:$(OUTPUT),:$(shell pwd)) $(Q)rsync -aq $$^ $(TRUNNER_OUTPUT)/ endif +$(OUTPUT)/$(TRUNNER_BINARY): LDLIBS += $$(LLVM_LDLIBS) +$(OUTPUT)/$(TRUNNER_BINARY): LDFLAGS += $$(LLVM_LDFLAGS) + # some X.test.o files have runtime dependencies on Y.bpf.o files $(OUTPUT)/$(TRUNNER_BINARY): | $(TRUNNER_BPF_OBJS) @@ -637,7 +679,7 @@ $(OUTPUT)/$(TRUNNER_BINARY): $(TRUNNER_TEST_OBJS) \ $(TRUNNER_BPFTOOL) \ | $(TRUNNER_BINARY)-extras $$(call msg,BINARY,,$$@) - $(Q)$$(CC) $$(CFLAGS) $$(filter %.a %.o,$$^) $$(LDLIBS) -o $$@ + $(Q)$$(CC) $$(CFLAGS) $$(filter %.a %.o,$$^) $$(LDLIBS) $$(LDFLAGS) -o $$@ $(Q)$(RESOLVE_BTFIDS) --btf $(TRUNNER_OUTPUT)/btf_data.bpf.o $$@ $(Q)ln -sf $(if $2,..,.)/tools/build/bpftool/$(USE_BOOTSTRAP)bpftool \ $(OUTPUT)/$(if $2,$2/)bpftool @@ -656,6 +698,7 @@ TRUNNER_EXTRA_SOURCES := test_progs.c \ cap_helpers.c \ unpriv_helpers.c \ netlink_helpers.c \ + jit_disasm_helpers.c \ test_loader.c \ xsk.c \ disasm.c \ @@ -798,7 +841,8 @@ EXTRA_CLEAN := $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \ $(addprefix $(OUTPUT)/,*.o *.d *.skel.h *.lskel.h *.subskel.h \ no_alu32 cpuv4 bpf_gcc bpf_testmod.ko \ bpf_test_no_cfi.ko \ - liburandom_read.so) + liburandom_read.so) \ + $(OUTPUT)/FEATURE-DUMP.selftests .PHONY: docs docs-clean diff --git a/tools/testing/selftests/bpf/jit_disasm_helpers.c b/tools/testing/selftests/bpf/jit_disasm_helpers.c new file mode 100644 index 000000000000..1b0f1fd267c0 --- /dev/null +++ b/tools/testing/selftests/bpf/jit_disasm_helpers.c @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +#include <bpf/bpf.h> +#include <bpf/libbpf.h> +#include <test_progs.h> + +#ifdef HAVE_LLVM_SUPPORT + +#include <llvm-c/Core.h> +#include <llvm-c/Disassembler.h> +#include <llvm-c/Target.h> +#include <llvm-c/TargetMachine.h> + +/* The intent is to use get_jited_program_text() for small test + * programs written in BPF assembly, thus assume that 32 local labels + * would be sufficient.
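+ * Label names are generated as "L%d" with i % 100 (see disasm_one_func()
+ * below), so "L0".."L99" always fits the 4-byte names[] entries.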
+ */ +#define MAX_LOCAL_LABELS 32 + +static bool llvm_initialized; + +struct local_labels { + bool print_phase; + __u32 prog_len; + __u32 cnt; + __u32 pcs[MAX_LOCAL_LABELS]; + char names[MAX_LOCAL_LABELS][4]; +}; + +static const char *lookup_symbol(void *data, uint64_t ref_value, uint64_t *ref_type, + uint64_t ref_pc, const char **ref_name) +{ + struct local_labels *labels = data; + uint64_t type = *ref_type; + int i; + + *ref_type = LLVMDisassembler_ReferenceType_InOut_None; + *ref_name = NULL; + if (type != LLVMDisassembler_ReferenceType_In_Branch) + return NULL; + /* Depending on labels->print_phase either discover local labels or + * return a name assigned with local jump target: + * - if print_phase is true and ref_value is in labels->pcs, + * return corresponding labels->name. + * - if print_phase is false, save program-local jump targets + * in labels->pcs; + */ + if (labels->print_phase) { + for (i = 0; i < labels->cnt; ++i) + if (labels->pcs[i] == ref_value) + return labels->names[i]; + } else { + if (labels->cnt < MAX_LOCAL_LABELS && ref_value < labels->prog_len) + labels->pcs[labels->cnt++] = ref_value; + } + return NULL; +} + +static int disasm_insn(LLVMDisasmContextRef ctx, uint8_t *image, __u32 len, __u32 pc, + char *buf, __u32 buf_sz) +{ + int i, cnt; + + cnt = LLVMDisasmInstruction(ctx, image + pc, len - pc, pc, + buf, buf_sz); + if (cnt > 0) + return cnt; + PRINT_FAIL("Can't disasm instruction at offset %d:", pc); + for (i = 0; i < 16 && pc + i < len; ++i) + printf(" %02x", image[pc + i]); + printf("\n"); + return -EINVAL; +} + +static int cmp_u32(const void *_a, const void *_b) +{ + __u32 a = *(__u32 *)_a; + __u32 b = *(__u32 *)_b; + + if (a < b) + return -1; + if (a > b) + return 1; + return 0; +} + +static int disasm_one_func(FILE *text_out, uint8_t *image, __u32 len) +{ + char *label, *colon, *triple = NULL; + LLVMDisasmContextRef ctx = NULL; + struct local_labels labels = {}; + __u32 *label_pc, pc; + int i, cnt, err = 0; + char buf[64]; + + triple = LLVMGetDefaultTargetTriple(); + ctx = LLVMCreateDisasm(triple, &labels, 0, NULL, lookup_symbol); + if (!ASSERT_OK_PTR(ctx, "LLVMCreateDisasm")) { + err = -EINVAL; + goto out; + } + + cnt = LLVMSetDisasmOptions(ctx, LLVMDisassembler_Option_PrintImmHex); + if (!ASSERT_EQ(cnt, 1, "LLVMSetDisasmOptions")) { + err = -EINVAL; + goto out; + } + + /* discover labels */ + labels.prog_len = len; + pc = 0; + while (pc < len) { + cnt = disasm_insn(ctx, image, len, pc, buf, 1); + if (cnt < 0) { + err = cnt; + goto out; + } + pc += cnt; + } + qsort(labels.pcs, labels.cnt, sizeof(*labels.pcs), cmp_u32); + for (i = 0; i < labels.cnt; ++i) + /* use (i % 100) to avoid format truncation warning */ + snprintf(labels.names[i], sizeof(labels.names[i]), "L%d", i % 100); + + /* now print with labels */ + labels.print_phase = true; + pc = 0; + while (pc < len) { + cnt = disasm_insn(ctx, image, len, pc, buf, sizeof(buf)); + if (cnt < 0) { + err = cnt; + goto out; + } + label_pc = bsearch(&pc, labels.pcs, labels.cnt, sizeof(*labels.pcs), cmp_u32); + label = ""; + colon = ""; + if (label_pc) { + label = labels.names[label_pc - labels.pcs]; + colon = ":"; + } + fprintf(text_out, "%x:\t", pc); + for (i = 0; i < cnt; ++i) + fprintf(text_out, "%02x ", image[pc + i]); + for (i = cnt * 3; i < 12 * 3; ++i) + fputc(' ', text_out); + fprintf(text_out, "%s%s%s\n", label, colon, buf); + pc += cnt; + } + +out: + if (triple) + LLVMDisposeMessage(triple); + if (ctx) + LLVMDisasmDispose(ctx); + return err; +} + +int get_jited_program_text(int fd, char *text, size_t 
text_sz) +{ + struct bpf_prog_info info = {}; + __u32 info_len = sizeof(info); + __u32 jited_funcs, len, pc; + __u32 *func_lens = NULL; + FILE *text_out = NULL; + uint8_t *image = NULL; + int i, err = 0; + + if (!llvm_initialized) { + LLVMInitializeAllTargetInfos(); + LLVMInitializeAllTargetMCs(); + LLVMInitializeAllDisassemblers(); + llvm_initialized = 1; + } + + text_out = fmemopen(text, text_sz, "w"); + if (!ASSERT_OK_PTR(text_out, "open_memstream")) { + err = -errno; + goto out; + } + + /* first call is to find out jited program len */ + err = bpf_prog_get_info_by_fd(fd, &info, &info_len); + if (!ASSERT_OK(err, "bpf_prog_get_info_by_fd #1")) + goto out; + + len = info.jited_prog_len; + image = malloc(len); + if (!ASSERT_OK_PTR(image, "malloc(info.jited_prog_len)")) { + err = -ENOMEM; + goto out; + } + + jited_funcs = info.nr_jited_func_lens; + func_lens = malloc(jited_funcs * sizeof(__u32)); + if (!ASSERT_OK_PTR(func_lens, "malloc(info.nr_jited_func_lens)")) { + err = -ENOMEM; + goto out; + } + + memset(&info, 0, sizeof(info)); + info.jited_prog_insns = (__u64)image; + info.jited_prog_len = len; + info.jited_func_lens = (__u64)func_lens; + info.nr_jited_func_lens = jited_funcs; + err = bpf_prog_get_info_by_fd(fd, &info, &info_len); + if (!ASSERT_OK(err, "bpf_prog_get_info_by_fd #2")) + goto out; + + for (pc = 0, i = 0; i < jited_funcs; ++i) { + fprintf(text_out, "func #%d:\n", i); + disasm_one_func(text_out, image + pc, func_lens[i]); + fprintf(text_out, "\n"); + pc += func_lens[i]; + } + +out: + if (text_out) + fclose(text_out); + if (image) + free(image); + if (func_lens) + free(func_lens); + return err; +} + +#else /* HAVE_LLVM_SUPPORT */ + +int get_jited_program_text(int fd, char *text, size_t text_sz) +{ + if (env.verbosity >= VERBOSE_VERY) + printf("compiled w/o llvm development libraries, can't disassemble binary code"); + return -EOPNOTSUPP; +} + +#endif /* HAVE_LLVM_SUPPORT */ diff --git a/tools/testing/selftests/bpf/jit_disasm_helpers.h b/tools/testing/selftests/bpf/jit_disasm_helpers.h new file mode 100644 index 000000000000..e6924fd65ecf --- /dev/null +++ b/tools/testing/selftests/bpf/jit_disasm_helpers.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +#ifndef __JIT_DISASM_HELPERS_H +#define __JIT_DISASM_HELPERS_H + +#include <stddef.h> + +int get_jited_program_text(int fd, char *text, size_t text_sz); + +#endif /* __JIT_DISASM_HELPERS_H */ -- cgit v1.2.3 From 7d743e4c759c6bff0131d638158c3472358eda2b Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 20 Aug 2024 03:23:54 -0700 Subject: selftests/bpf: __jited test tag to check disassembly after jit Allow verifying jit behaviour by writing tests as below: SEC("tp") __arch_x86_64 __jited(" endbr64") __jited(" nopl (%rax,%rax)") __jited(" xorq %rax, %rax") ... __naked void some_test(void) { asm volatile (... ::: __clobber_all); } Allow regular expressions in patterns, the same way as in __msg. By default assume that each __jited pattern has to be matched on the next consecutive line of the disassembly, e.g.: __jited(" endbr64") # matched on line N __jited(" nopl (%rax,%rax)") # matched on line N+1 If match occurs on a wrong line an error is reported.
To override this behaviour use __jited("..."), e.g.: __jited(" endbr64") # matched on line N __jited("...") # not matched __jited(" nopl (%rax,%rax)") # matched on any line >= N Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240820102357.3372779-7-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/bpf_misc.h | 35 +++++++ tools/testing/selftests/bpf/test_loader.c | 149 ++++++++++++++++++++++----- 2 files changed, 161 insertions(+), 23 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h index cc3ef20a6490..eccaf955e394 100644 --- a/tools/testing/selftests/bpf/progs/bpf_misc.h +++ b/tools/testing/selftests/bpf/progs/bpf_misc.h @@ -36,6 +36,39 @@ * Regular expressions could be specified same way as in __msg. * __xlated_unpriv Same as __xlated but for unprivileged mode. * + * __jited Match a line in a disassembly of the jited BPF program. + * Has to be used after __arch_* macro. + * For example: + * + * __arch_x86_64 + * __jited(" endbr64") + * __jited(" nopl (%rax,%rax)") + * __jited(" xorq %rax, %rax") + * ... + * __naked void some_test(void) + * { + * asm volatile (... ::: __clobber_all); + * } + * + * Regular expressions could be included in patterns same way + * as in __msg. + * + * By default assume that each pattern has to be matched on the + * next consecutive line of disassembly, e.g.: + * + * __jited(" endbr64") # matched on line N + * __jited(" nopl (%rax,%rax)") # matched on line N+1 + * + * If match occurs on a wrong line an error is reported. + * To override this behaviour use literal "...", e.g.: + * + * __jited(" endbr64") # matched on line N + * __jited("...") # not matched + * __jited(" nopl (%rax,%rax)") # matched on any line >= N + * + * __jited_unpriv Same as __jited but for unprivileged mode. + * + * * __success Expect program load success in privileged mode. * __success_unpriv Expect program load success in unprivileged mode. 
* @@ -76,11 +109,13 @@ */ #define __msg(msg) __attribute__((btf_decl_tag("comment:test_expect_msg=" XSTR(__COUNTER__) "=" msg))) #define __xlated(msg) __attribute__((btf_decl_tag("comment:test_expect_xlated=" XSTR(__COUNTER__) "=" msg))) +#define __jited(msg) __attribute__((btf_decl_tag("comment:test_jited=" XSTR(__COUNTER__) "=" msg))) #define __failure __attribute__((btf_decl_tag("comment:test_expect_failure"))) #define __success __attribute__((btf_decl_tag("comment:test_expect_success"))) #define __description(desc) __attribute__((btf_decl_tag("comment:test_description=" desc))) #define __msg_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_msg_unpriv=" XSTR(__COUNTER__) "=" msg))) #define __xlated_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_xlated_unpriv=" XSTR(__COUNTER__) "=" msg))) +#define __jited_unpriv(msg) __attribute__((btf_decl_tag("comment:test_jited_unpriv=" XSTR(__COUNTER__) "=" msg))) #define __failure_unpriv __attribute__((btf_decl_tag("comment:test_expect_failure_unpriv"))) #define __success_unpriv __attribute__((btf_decl_tag("comment:test_expect_success_unpriv"))) #define __log_level(lvl) __attribute__((btf_decl_tag("comment:test_log_level="#lvl))) diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index b0d7158e00c1..d588c612ac03 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -10,6 +10,7 @@ #include "disasm_helpers.h" #include "unpriv_helpers.h" #include "cap_helpers.h" +#include "jit_disasm_helpers.h" #define str_has_pfx(str, pfx) \ (strncmp(str, pfx, __builtin_constant_p(pfx) ? sizeof(pfx) - 1 : strlen(pfx)) == 0) @@ -33,6 +34,8 @@ #define TEST_TAG_AUXILIARY_UNPRIV "comment:test_auxiliary_unpriv" #define TEST_BTF_PATH "comment:test_btf_path=" #define TEST_TAG_ARCH "comment:test_arch=" +#define TEST_TAG_JITED_PFX "comment:test_jited=" +#define TEST_TAG_JITED_PFX_UNPRIV "comment:test_jited_unpriv=" /* Warning: duplicated in bpf_misc.h */ #define POINTER_VALUE 0xcafe4all @@ -68,6 +71,7 @@ struct test_subspec { bool expect_failure; struct expected_msgs expect_msgs; struct expected_msgs expect_xlated; + struct expected_msgs jited; int retval; bool execute; }; @@ -124,6 +128,8 @@ static void free_test_spec(struct test_spec *spec) free_msgs(&spec->unpriv.expect_msgs); free_msgs(&spec->priv.expect_xlated); free_msgs(&spec->unpriv.expect_xlated); + free_msgs(&spec->priv.jited); + free_msgs(&spec->unpriv.jited); free(spec->priv.name); free(spec->unpriv.name); @@ -237,6 +243,21 @@ static int push_msg(const char *substr, struct expected_msgs *msgs) return __push_msg(substr, false, msgs); } +static int push_disasm_msg(const char *regex_str, bool *on_next_line, struct expected_msgs *msgs) +{ + int err; + + if (strcmp(regex_str, "...") == 0) { + *on_next_line = false; + return 0; + } + err = __push_msg(regex_str, *on_next_line, msgs); + if (err) + return err; + *on_next_line = true; + return 0; +} + static int parse_int(const char *str, int *val, const char *name) { char *end; @@ -320,6 +341,18 @@ enum arch { ARCH_RISCV64 = 0x4, }; +static int get_current_arch(void) +{ +#if defined(__x86_64__) + return ARCH_X86_64; +#elif defined(__aarch64__) + return ARCH_ARM64; +#elif defined(__riscv) && __riscv_xlen == 64 + return ARCH_RISCV64; +#endif + return 0; +} + /* Uses btf_decl_tag attributes to describe the expected test * behavior, see bpf_misc.h for detailed description of each attribute * and attribute combinations.
@@ -332,9 +365,13 @@ static int parse_test_spec(struct test_loader *tester, const char *description = NULL; bool has_unpriv_result = false; bool has_unpriv_retval = false; + bool unpriv_jit_on_next_line; + bool jit_on_next_line; + bool collect_jit = false; int func_id, i, err = 0; u32 arch_mask = 0; struct btf *btf; + enum arch arch; memset(spec, 0, sizeof(*spec)); @@ -399,6 +436,30 @@ static int parse_test_spec(struct test_loader *tester, if (err) goto cleanup; spec->mode_mask |= UNPRIV; + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_JITED_PFX))) { + if (arch_mask == 0) { + PRINT_FAIL("__jited used before __arch_*"); + goto cleanup; + } + if (collect_jit) { + err = push_disasm_msg(msg, &jit_on_next_line, + &spec->priv.jited); + if (err) + goto cleanup; + spec->mode_mask |= PRIV; + } + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_JITED_PFX_UNPRIV))) { + if (arch_mask == 0) { + PRINT_FAIL("__unpriv_jited used before __arch_*"); + goto cleanup; + } + if (collect_jit) { + err = push_disasm_msg(msg, &unpriv_jit_on_next_line, + &spec->unpriv.jited); + if (err) + goto cleanup; + spec->mode_mask |= UNPRIV; + } } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_XLATED_PFX))) { err = push_msg(msg, &spec->priv.expect_xlated); if (err) @@ -459,16 +520,20 @@ static int parse_test_spec(struct test_loader *tester, } else if (str_has_pfx(s, TEST_TAG_ARCH)) { val = s + sizeof(TEST_TAG_ARCH) - 1; if (strcmp(val, "X86_64") == 0) { - arch_mask |= ARCH_X86_64; + arch = ARCH_X86_64; } else if (strcmp(val, "ARM64") == 0) { - arch_mask |= ARCH_ARM64; + arch = ARCH_ARM64; } else if (strcmp(val, "RISCV64") == 0) { - arch_mask |= ARCH_RISCV64; + arch = ARCH_RISCV64; } else { PRINT_FAIL("bad arch spec: '%s'", val); err = -EINVAL; goto cleanup; } + arch_mask |= arch; + collect_jit = get_current_arch() == arch; + unpriv_jit_on_next_line = true; + jit_on_next_line = true; } else if (str_has_pfx(s, TEST_BTF_PATH)) { spec->btf_custom_path = s + sizeof(TEST_BTF_PATH) - 1; } @@ -521,6 +586,8 @@ static int parse_test_spec(struct test_loader *tester, clone_msgs(&spec->priv.expect_msgs, &spec->unpriv.expect_msgs); if (spec->unpriv.expect_xlated.cnt == 0) clone_msgs(&spec->priv.expect_xlated, &spec->unpriv.expect_xlated); + if (spec->unpriv.jited.cnt == 0) + clone_msgs(&spec->priv.jited, &spec->unpriv.jited); } spec->valid = true; @@ -575,16 +642,29 @@ static void emit_xlated(const char *xlated, bool force) fprintf(stdout, "XLATED:\n=============\n%s=============\n", xlated); } +static void emit_jited(const char *jited, bool force) +{ + if (!force && env.verbosity == VERBOSE_NONE) + return; + fprintf(stdout, "JITED:\n=============\n%s=============\n", jited); +} + static void validate_msgs(char *log_buf, struct expected_msgs *msgs, void (*emit_fn)(const char *buf, bool force)) { + const char *log = log_buf, *prev_match; regmatch_t reg_match[1]; - const char *log = log_buf; + int prev_match_line; + int match_line; int i, j, err; + prev_match_line = -1; + match_line = 0; + prev_match = log; for (i = 0; i < msgs->cnt; i++) { struct expect_msg *msg = &msgs->patterns[i]; - const char *match = NULL; + const char *match = NULL, *pat_status; + bool wrong_line = false; if (!msg->is_regex) { match = strstr(log, msg->substr); @@ -598,19 +678,41 @@ static void validate_msgs(char *log_buf, struct expected_msgs *msgs, } } - if (!match) { + if (match) { + for (; prev_match < match; ++prev_match) + if (*prev_match == '\n') + ++match_line; + wrong_line = msg->on_next_line && prev_match_line >= 0 && + prev_match_line + 1 != match_line; + 
} + + if (!match || wrong_line) { PRINT_FAIL("expect_msg\n"); if (env.verbosity == VERBOSE_NONE) emit_fn(log_buf, true /*force*/); for (j = 0; j <= i; j++) { msg = &msgs->patterns[j]; + if (j < i) + pat_status = "MATCHED "; + else if (wrong_line) + pat_status = "WRONG LINE"; + else + pat_status = "EXPECTED "; + msg = &msgs->patterns[j]; fprintf(stderr, "%s %s: '%s'\n", - j < i ? "MATCHED " : "EXPECTED", + pat_status, msg->is_regex ? " REGEX" : "SUBSTR", msg->substr); } - return; + if (wrong_line) { + fprintf(stderr, + "expecting match at line %d, actual match is at line %d\n", + prev_match_line + 1, match_line); + } + break; } + + prev_match_line = match_line; } } @@ -769,20 +871,6 @@ out: return err; } -static bool run_on_current_arch(int arch_mask) -{ - if (arch_mask == 0) - return true; -#if defined(__x86_64__) - return arch_mask & ARCH_X86_64; -#elif defined(__aarch64__) - return arch_mask & ARCH_ARM64; -#elif defined(__riscv) && __riscv_xlen == 64 - return arch_mask & ARCH_RISCV64; -#endif - return false; -} - /* this function is forced noinline and has short generic name to look better * in test_progs output (in case of a failure) */ @@ -807,7 +895,7 @@ void run_subtest(struct test_loader *tester, if (!test__start_subtest(subspec->name)) return; - if (!run_on_current_arch(spec->arch_mask)) { + if ((get_current_arch() & spec->arch_mask) == 0) { test__skip(); return; } @@ -884,6 +972,21 @@ void run_subtest(struct test_loader *tester, validate_msgs(tester->log_buf, &subspec->expect_xlated, emit_xlated); } + if (subspec->jited.cnt) { + err = get_jited_program_text(bpf_program__fd(tprog), + tester->log_buf, tester->log_buf_sz); + if (err == -EOPNOTSUPP) { + printf("%s:SKIP: jited programs disassembly is not supported,\n", __func__); + printf("%s:SKIP: tests are built w/o LLVM development libs\n", __func__); + test__skip(); + goto tobj_cleanup; + } + if (!ASSERT_EQ(err, 0, "get_jited_program_text")) + goto tobj_cleanup; + emit_jited(tester->log_buf, false /*force*/); + validate_msgs(tester->log_buf, &subspec->jited, emit_jited); + } + if (should_do_test_run(spec, subspec)) { /* For some reason test_verifier executes programs * with all capabilities restored. Do the same here. -- cgit v1.2.3 From e5bdd6a8be783eb0c960723d9f37df7ee931d6d6 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 20 Aug 2024 03:23:56 -0700 Subject: selftests/bpf: validate jit behaviour for tail calls Add a program calling a sub-program which does a tail call. The idea is to verify instructions generated by jit for tail calls: - in program and sub-program prologues; - for subprogram call instruction; - for tail call itself.
Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240820102357.3372779-9-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/verifier.c | 2 + .../selftests/bpf/progs/verifier_tailcall_jit.c | 105 +++++++++++++++++++++ 2 files changed, 107 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index f8f546eba488..cf3662dbd24f 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -75,6 +75,7 @@ #include "verifier_stack_ptr.skel.h" #include "verifier_subprog_precision.skel.h" #include "verifier_subreg.skel.h" +#include "verifier_tailcall_jit.skel.h" #include "verifier_typedef.skel.h" #include "verifier_uninit.skel.h" #include "verifier_unpriv.skel.h" @@ -198,6 +199,7 @@ void test_verifier_spin_lock(void) { RUN(verifier_spin_lock); } void test_verifier_stack_ptr(void) { RUN(verifier_stack_ptr); } void test_verifier_subprog_precision(void) { RUN(verifier_subprog_precision); } void test_verifier_subreg(void) { RUN(verifier_subreg); } +void test_verifier_tailcall_jit(void) { RUN(verifier_tailcall_jit); } void test_verifier_typedef(void) { RUN(verifier_typedef); } void test_verifier_uninit(void) { RUN(verifier_uninit); } void test_verifier_unpriv(void) { RUN(verifier_unpriv); } diff --git a/tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c b/tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c new file mode 100644 index 000000000000..06d327cf1e1f --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "bpf_misc.h" + +int main(void); + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __array(values, void (void)); +} jmp_table SEC(".maps") = { + .values = { + [0] = (void *) &main, + }, +}; + +__noinline __auxiliary +static __naked int sub(void) +{ + asm volatile ( + "r2 = %[jmp_table] ll;" + "r3 = 0;" + "call 12;" + "exit;" + : + : __imm_addr(jmp_table) + : __clobber_all); +} + +__success +__arch_x86_64 +/* program entry for main(), regular function prologue */ +__jited(" endbr64") +__jited(" nopl (%rax,%rax)") +__jited(" xorq %rax, %rax") +__jited(" pushq %rbp") +__jited(" movq %rsp, %rbp") +/* tail call prologue for program: + * - establish memory location for tail call counter at &rbp[-8]; + * - spill tail_call_cnt_ptr at &rbp[-16]; + * - expect tail call counter to be passed in rax; + * - for entry program rax is a raw counter, value < 33; + * - for tail called program rax is tail_call_cnt_ptr (value > 33). 
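+ * - the 33 threshold is the kernel's MAX_TAIL_CALL_CNT limit, which the
+ * jited code below compares against as 0x21.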
+ */ +__jited(" endbr64") +__jited(" cmpq $0x21, %rax") +__jited(" ja L0") +__jited(" pushq %rax") +__jited(" movq %rsp, %rax") +__jited(" jmp L1") +__jited("L0: pushq %rax") /* rbp[-8] = rax */ +__jited("L1: pushq %rax") /* rbp[-16] = rax */ +/* on subprogram call restore rax to be tail_call_cnt_ptr from rbp[-16] + * (cause original rax might be clobbered by this point) + */ +__jited(" movq -0x10(%rbp), %rax") +__jited(" callq 0x{{.*}}") /* call to sub() */ +__jited(" xorl %eax, %eax") +__jited(" leave") +__jited(" retq") +__jited("...") +/* subprogram entry for sub(), regular function prologue */ +__jited(" endbr64") +__jited(" nopl (%rax,%rax)") +__jited(" nopl (%rax)") +__jited(" pushq %rbp") +__jited(" movq %rsp, %rbp") +/* tail call prologue for subprogram address of tail call counter + * stored at rbp[-16]. + */ +__jited(" endbr64") +__jited(" pushq %rax") /* rbp[-8] = rax */ +__jited(" pushq %rax") /* rbp[-16] = rax */ +__jited(" movabsq ${{.*}}, %rsi") /* r2 = &jmp_table */ +__jited(" xorl %edx, %edx") /* r3 = 0 */ +/* bpf_tail_call implementation: + * - load tail_call_cnt_ptr from rbp[-16]; + * - if *tail_call_cnt_ptr < 33, increment it and jump to target; + * - otherwise do nothing. + */ +__jited(" movq -0x10(%rbp), %rax") +__jited(" cmpq $0x21, (%rax)") +__jited(" jae L0") +__jited(" nopl (%rax,%rax)") +__jited(" addq $0x1, (%rax)") /* *tail_call_cnt_ptr += 1 */ +__jited(" popq %rax") +__jited(" popq %rax") +__jited(" jmp {{.*}}") /* jump to tail call tgt */ +__jited("L0: leave") +__jited(" retq") +SEC("tc") +__naked int main(void) +{ + asm volatile ( + "call %[sub];" + "r0 = 0;" + "exit;" + : + : __imm(sub) + : __clobber_all); +} + +char __license[] SEC("license") = "GPL"; -- cgit v1.2.3 From a038eacdbf59e6024c9e8da5df7f293e64cf20de Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 20 Aug 2024 03:23:57 -0700 Subject: selftests/bpf: validate __xlated same way as __jited Both __xlated and __jited work with disassembly. It is logical to have both work in a similar manner. This commit updates __xlated macro handling in test_loader.c by making it expect matches on sequential lines, same way as __jited operates. 
For example: __xlated("1: *(u64 *)(r10 -16) = r1") ;; matched on line N __xlated("3: r0 = &(void __percpu *)(r0)") ;; matched on line N+1 Also: __xlated("1: *(u64 *)(r10 -16) = r1") ;; matched on line N __xlated("...") ;; not matched __xlated("3: r0 = &(void __percpu *)(r0)") ;; matched on any ;; line >= N Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240820102357.3372779-10-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/verifier_nocsr.c | 53 +++++++++++++++++++++- tools/testing/selftests/bpf/test_loader.c | 8 +++- 2 files changed, 57 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/verifier_nocsr.c b/tools/testing/selftests/bpf/progs/verifier_nocsr.c index a7fe277e5167..666c736d196f 100644 --- a/tools/testing/selftests/bpf/progs/verifier_nocsr.c +++ b/tools/testing/selftests/bpf/progs/verifier_nocsr.c @@ -78,6 +78,7 @@ __naked void canary_arm64_riscv64(void) SEC("raw_tp") __arch_x86_64 __xlated("1: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("3: exit") __success __naked void canary_zero_spills(void) @@ -94,7 +95,9 @@ SEC("raw_tp") __arch_x86_64 __log_level(4) __msg("stack depth 16") __xlated("1: *(u64 *)(r10 -16) = r1") +__xlated("...") __xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("5: r2 = *(u64 *)(r10 -16)") __success __naked void wrong_reg_in_pattern1(void) @@ -113,7 +116,9 @@ __naked void wrong_reg_in_pattern1(void) SEC("raw_tp") __arch_x86_64 __xlated("1: *(u64 *)(r10 -16) = r6") +__xlated("...") __xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("5: r6 = *(u64 *)(r10 -16)") __success __naked void wrong_reg_in_pattern2(void) @@ -132,7 +137,9 @@ __naked void wrong_reg_in_pattern2(void) SEC("raw_tp") __arch_x86_64 __xlated("1: *(u64 *)(r10 -16) = r0") +__xlated("...") __xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("5: r0 = *(u64 *)(r10 -16)") __success __naked void wrong_reg_in_pattern3(void) @@ -151,7 +158,9 @@ __naked void wrong_reg_in_pattern3(void) SEC("raw_tp") __arch_x86_64 __xlated("2: *(u64 *)(r2 -16) = r1") +__xlated("...") __xlated("4: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("6: r1 = *(u64 *)(r10 -16)") __success __naked void wrong_base_in_pattern(void) @@ -171,7 +180,9 @@ __naked void wrong_base_in_pattern(void) SEC("raw_tp") __arch_x86_64 __xlated("1: *(u64 *)(r10 -16) = r1") +__xlated("...") __xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("5: r2 = 1") __success __naked void wrong_insn_in_pattern(void) @@ -191,7 +202,9 @@ __naked void wrong_insn_in_pattern(void) SEC("raw_tp") __arch_x86_64 __xlated("2: *(u64 *)(r10 -16) = r1") +__xlated("...") __xlated("4: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("6: r1 = *(u64 *)(r10 -8)") __success __naked void wrong_off_in_pattern1(void) @@ -211,7 +224,9 @@ __naked void wrong_off_in_pattern1(void) SEC("raw_tp") __arch_x86_64 __xlated("1: *(u32 *)(r10 -4) = r1") +__xlated("...") __xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("5: r1 = *(u32 *)(r10 -4)") __success __naked void wrong_off_in_pattern2(void) @@ -230,7 +245,9 @@ __naked void wrong_off_in_pattern2(void) SEC("raw_tp") __arch_x86_64 __xlated("1: *(u32 *)(r10 -16) = r1") +__xlated("...") __xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("5: r1 = *(u32 *)(r10 -16)") __success __naked void wrong_size_in_pattern(void) @@ -249,7 +266,9 @@ __naked void wrong_size_in_pattern(void) SEC("raw_tp") __arch_x86_64
__xlated("2: *(u32 *)(r10 -8) = r1") +__xlated("...") __xlated("4: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("6: r1 = *(u32 *)(r10 -8)") __success __naked void partial_pattern(void) @@ -275,11 +294,15 @@ __xlated("1: r2 = 2") /* not patched, spills for -8, -16 not removed */ __xlated("2: *(u64 *)(r10 -8) = r1") __xlated("3: *(u64 *)(r10 -16) = r2") +__xlated("...") __xlated("5: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("7: r2 = *(u64 *)(r10 -16)") __xlated("8: r1 = *(u64 *)(r10 -8)") /* patched, spills for -24, -32 removed */ +__xlated("...") __xlated("10: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("12: exit") __success __naked void min_stack_offset(void) @@ -308,7 +331,9 @@ __naked void min_stack_offset(void) SEC("raw_tp") __arch_x86_64 __xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("...") __xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("5: r1 = *(u64 *)(r10 -8)") __success __naked void bad_fixed_read(void) @@ -330,7 +355,9 @@ __naked void bad_fixed_read(void) SEC("raw_tp") __arch_x86_64 __xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("...") __xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("5: r1 = *(u64 *)(r10 -8)") __success __naked void bad_fixed_write(void) @@ -352,7 +379,9 @@ __naked void bad_fixed_write(void) SEC("raw_tp") __arch_x86_64 __xlated("6: *(u64 *)(r10 -16) = r1") +__xlated("...") __xlated("8: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("10: r1 = *(u64 *)(r10 -16)") __success __naked void bad_varying_read(void) @@ -379,7 +408,9 @@ __naked void bad_varying_read(void) SEC("raw_tp") __arch_x86_64 __xlated("6: *(u64 *)(r10 -16) = r1") +__xlated("...") __xlated("8: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("10: r1 = *(u64 *)(r10 -16)") __success __naked void bad_varying_write(void) @@ -406,7 +437,9 @@ __naked void bad_varying_write(void) SEC("raw_tp") __arch_x86_64 __xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("...") __xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("5: r1 = *(u64 *)(r10 -8)") __success __naked void bad_write_in_subprog(void) @@ -438,7 +471,9 @@ __naked static void bad_write_in_subprog_aux(void) SEC("raw_tp") __arch_x86_64 __xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("...") __xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("5: r1 = *(u64 *)(r10 -8)") __success __naked void bad_helper_write(void) @@ -466,13 +501,19 @@ SEC("raw_tp") __arch_x86_64 /* main, not patched */ __xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("...") __xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("5: r1 = *(u64 *)(r10 -8)") +__xlated("...") __xlated("9: call pc+1") +__xlated("...") __xlated("10: exit") /* subprogram, patched */ __xlated("11: r1 = 1") +__xlated("...") __xlated("13: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("15: exit") __success __naked void invalidate_one_subprog(void) @@ -510,12 +551,16 @@ SEC("raw_tp") __arch_x86_64 /* main */ __xlated("0: r1 = 1") +__xlated("...") __xlated("2: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("4: call pc+1") __xlated("5: exit") /* subprogram */ __xlated("6: r1 = 1") +__xlated("...") __xlated("8: r0 = &(void __percpu *)(r0)") +__xlated("...") __xlated("10: *(u64 *)(r10 -16) = r1") __xlated("11: exit") __success @@ -576,7 +621,9 @@ __log_level(4) __msg("stack depth 16") /* may_goto counter at -16 */ __xlated("0: *(u64 *)(r10 -16) =") __xlated("1: r1 = 1") +__xlated("...") __xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") /* may_goto expansion 
starts */ __xlated("5: r11 = *(u64 *)(r10 -16)") __xlated("6: if r11 == 0x0 goto pc+3") @@ -623,13 +670,15 @@ __xlated("5: r0 = *(u32 *)(r0 +0)") __xlated("6: r2 =") __xlated("7: r3 = 0") __xlated("8: r4 = 0") +__xlated("...") /* ... part of the inlined bpf_loop */ __xlated("12: *(u64 *)(r10 -32) = r6") __xlated("13: *(u64 *)(r10 -24) = r7") __xlated("14: *(u64 *)(r10 -16) = r8") -/* ... */ +__xlated("...") __xlated("21: call pc+8") /* dummy_loop_callback */ /* ... last insns of the bpf_loop_interaction1 */ +__xlated("...") __xlated("28: r0 = 0") __xlated("29: exit") /* dummy_loop_callback */ @@ -670,7 +719,7 @@ __xlated("5: r0 = *(u32 *)(r0 +0)") __xlated("6: *(u64 *)(r10 -16) = r1") __xlated("7: call") __xlated("8: r1 = *(u64 *)(r10 -16)") -/* ... */ +__xlated("...") /* ... part of the inlined bpf_loop */ __xlated("15: *(u64 *)(r10 -40) = r6") __xlated("16: *(u64 *)(r10 -32) = r7") diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index d588c612ac03..b229dd013355 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -365,6 +365,8 @@ static int parse_test_spec(struct test_loader *tester, const char *description = NULL; bool has_unpriv_result = false; bool has_unpriv_retval = false; + bool unpriv_xlated_on_next_line = true; + bool xlated_on_next_line = true; bool unpriv_jit_on_next_line; bool jit_on_next_line; bool collect_jit = false; @@ -461,12 +463,14 @@ static int parse_test_spec(struct test_loader *tester, spec->mode_mask |= UNPRIV; } } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_XLATED_PFX))) { - err = push_msg(msg, &spec->priv.expect_xlated); + err = push_disasm_msg(msg, &xlated_on_next_line, + &spec->priv.expect_xlated); if (err) goto cleanup; spec->mode_mask |= PRIV; } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_XLATED_PFX_UNPRIV))) { - err = push_msg(msg, &spec->unpriv.expect_xlated); + err = push_disasm_msg(msg, &unpriv_xlated_on_next_line, + &spec->unpriv.expect_xlated); if (err) goto cleanup; spec->mode_mask |= UNPRIV; -- cgit v1.2.3 From f8669d7b5f5d2d88959456ae9123d8bb6fdc1ebe Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 20 Aug 2024 12:53:47 +0200 Subject: selftests: mlxsw: ethtool_lanes: Source ethtool lib from correct path Source the ethtool library from the correct path and avoid the following error: ./ethtool_lanes.sh: line 14: ./../../../net/forwarding/ethtool_lib.sh: No such file or directory Fixes: 40d269c000bd ("selftests: forwarding: Move several selftests") Signed-off-by: Ido Schimmel Signed-off-by: Petr Machata Reviewed-by: Simon Horman Link: https://patch.msgid.link/2112faff02e536e1ac14beb4c2be09c9574b90ae.1724150067.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh b/tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh index 877cd6df94a1..fe905a7f34b3 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh @@ -2,6 +2,7 @@ # SPDX-License-Identifier: GPL-2.0 lib_dir=$(dirname $0)/../../../net/forwarding +ethtool_lib_dir=$(dirname $0)/../hw ALL_TESTS=" autoneg @@ -11,7 +12,7 @@ ALL_TESTS=" NUM_NETIFS=2 : ${TIMEOUT:=30000} # ms source $lib_dir/lib.sh -source $lib_dir/ethtool_lib.sh +source $ethtool_lib_dir/ethtool_lib.sh setup_prepare() { -- cgit 
v1.2.3 From f32c821ae0198cf43181711efb18376b2eb6a1cb Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 20 Aug 2024 17:12:22 +0200 Subject: tools: ynl: lift an assumption about spec file name Currently the parsing code generator assumes that the yaml specification file name and the main 'name' attribute carried inside correspond, that is, that the field is the c-name representation of the file basename. The above assumption held true within the current tree, but will hopefully be broken soon by the upcoming net shaper specification. Additionally, it makes the field 'name' itself useless. Lift the assumption, always computing the generated include file name from the generated c file name. Signed-off-by: Paolo Abeni Link: https://patch.msgid.link/24da5a3596d814beeb12bd7139a6b4f89756cc19.1724165948.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- tools/net/ynl/ynl-gen-c.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py index 51529fabd517..717530bc9c52 100755 --- a/tools/net/ynl/ynl-gen-c.py +++ b/tools/net/ynl/ynl-gen-c.py @@ -2668,13 +2668,15 @@ def main(): cw.p('#define ' + hdr_prot) cw.nl() + hdr_file=os.path.basename(args.out_file[:-2]) + ".h" + if args.mode == 'kernel': cw.p('#include ') cw.p('#include ') cw.nl() if not args.header: if args.out_file: - cw.p(f'#include "{os.path.basename(args.out_file[:-2])}.h"') + cw.p(f'#include "{hdr_file}"') cw.nl() headers = ['uapi/' + parsed.uapi_header] headers += parsed.kernel_family.get('headers', []) @@ -2686,7 +2688,7 @@ def main(): if family_contains_bitfield32(parsed): cw.p('#include ') else: - cw.p(f'#include "{parsed.name}-user.h"') + cw.p(f'#include "{hdr_file}"') cw.p('#include "ynl.h"') headers = [parsed.uapi_header] for definition in parsed['definitions']: -- cgit v1.2.3 From af8a066f1c473261881a6d8e2b55cca8eda9ce80 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 20 Aug 2024 18:34:25 -0700 Subject: selftest: bpf: Remove mssind boundary check in test_tcp_custom_syncookie.c. Smatch reported a possible off-by-one in tcp_validate_cookie(). However, it's a false positive because the possible range of mssind is limited from 0 to 3 by the preceding calculation. mssind = (cookie & (3 << 6)) >> 6; Now, the verifier does not complain without the boundary check. Let's remove the checks.
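To spell the range out: 3 << 6 is 0xc0, so cookie & 0xc0 can only be 0x00, 0x40, 0x80 or 0xc0, and shifting right by 6 leaves 0, 1, 2 or 3 - always a valid index into the four-entry msstab4/msstab6 arrays.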
Reported-by: Dan Carpenter Closes: https://lore.kernel.org/bpf/6ae12487-d3f1-488b-9514-af0dac96608f@stanley.mountain/ Signed-off-by: Kuniyuki Iwashima Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240821013425.49316-1-kuniyu@amazon.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/progs/test_tcp_custom_syncookie.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/test_tcp_custom_syncookie.c b/tools/testing/selftests/bpf/progs/test_tcp_custom_syncookie.c index 44ee0d037f95..eb5cca1fce16 100644 --- a/tools/testing/selftests/bpf/progs/test_tcp_custom_syncookie.c +++ b/tools/testing/selftests/bpf/progs/test_tcp_custom_syncookie.c @@ -486,17 +486,10 @@ static int tcp_validate_cookie(struct tcp_syncookie *ctx) goto err; mssind = (cookie & (3 << 6)) >> 6; - if (ctx->ipv4) { - if (mssind > ARRAY_SIZE(msstab4)) - goto err; - + if (ctx->ipv4) ctx->attrs.mss = msstab4[mssind]; - } else { - if (mssind > ARRAY_SIZE(msstab6)) - goto err; - + else ctx->attrs.mss = msstab6[mssind]; - } ctx->attrs.snd_wscale = cookie & BPF_SYNCOOKIE_WSCALE_MASK; ctx->attrs.rcv_wscale = ctx->attrs.snd_wscale; -- cgit v1.2.3 From 611fbeb44a777e5ab54ab3127ec85f72147911d8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 22 Aug 2024 05:39:32 +0100 Subject: selftests:core: test coverage for dup_fd() failure handling in unshare_fd() At some point there'd been a dumb braino during the dup_fd() calling conventions change; caught by smatch and immediately fixed. The trouble is, there had been no test coverage for the dup_fd() failure handling - neither in kselftests nor in LTP. Fortunately, it can be triggered on stock kernel - ENOMEM would require fault injection, but EMFILE can be had with sysctl alone (fs.nr_open). Add a test for dup_fd() failure. 
Fixed up commit log and short log - Shuah Khan Signed-off-by: Al Viro Signed-off-by: Shuah Khan --- tools/testing/selftests/core/Makefile | 2 +- tools/testing/selftests/core/unshare_test.c | 94 +++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/core/unshare_test.c (limited to 'tools') diff --git a/tools/testing/selftests/core/Makefile b/tools/testing/selftests/core/Makefile index ce262d097269..8e99f87f5d7c 100644 --- a/tools/testing/selftests/core/Makefile +++ b/tools/testing/selftests/core/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only CFLAGS += -g $(KHDR_INCLUDES) -TEST_GEN_PROGS := close_range_test +TEST_GEN_PROGS := close_range_test unshare_test include ../lib.mk diff --git a/tools/testing/selftests/core/unshare_test.c b/tools/testing/selftests/core/unshare_test.c new file mode 100644 index 000000000000..7fec9dfb1b0e --- /dev/null +++ b/tools/testing/selftests/core/unshare_test.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest_harness.h" +#include "../clone3/clone3_selftests.h" + +TEST(unshare_EMFILE) +{ + pid_t pid; + int status; + struct __clone_args args = { + .flags = CLONE_FILES, + .exit_signal = SIGCHLD, + }; + int fd; + ssize_t n, n2; + static char buf[512], buf2[512]; + struct rlimit rlimit; + int nr_open; + + fd = open("/proc/sys/fs/nr_open", O_RDWR); + ASSERT_GE(fd, 0); + + n = read(fd, buf, sizeof(buf)); + ASSERT_GT(n, 0); + ASSERT_EQ(buf[n - 1], '\n'); + + ASSERT_EQ(sscanf(buf, "%d", &nr_open), 1); + + ASSERT_EQ(0, getrlimit(RLIMIT_NOFILE, &rlimit)); + + /* bump fs.nr_open */ + n2 = sprintf(buf2, "%d\n", nr_open + 1024); + lseek(fd, 0, SEEK_SET); + write(fd, buf2, n2); + + /* bump ulimit -n */ + rlimit.rlim_cur = nr_open + 1024; + rlimit.rlim_max = nr_open + 1024; + EXPECT_EQ(0, setrlimit(RLIMIT_NOFILE, &rlimit)) { + lseek(fd, 0, SEEK_SET); + write(fd, buf, n); + exit(EXIT_FAILURE); + } + + /* get a descriptor past the old fs.nr_open */ + EXPECT_GE(dup2(2, nr_open + 64), 0) { + lseek(fd, 0, SEEK_SET); + write(fd, buf, n); + exit(EXIT_FAILURE); + } + + /* get descriptor table shared */ + pid = sys_clone3(&args, sizeof(args)); + EXPECT_GE(pid, 0) { + lseek(fd, 0, SEEK_SET); + write(fd, buf, n); + exit(EXIT_FAILURE); + } + + if (pid == 0) { + int err; + + /* restore fs.nr_open */ + lseek(fd, 0, SEEK_SET); + write(fd, buf, n); + /* ... and now unshare(CLONE_FILES) must fail with EMFILE */ + err = unshare(CLONE_FILES); + EXPECT_EQ(err, -1) + exit(EXIT_FAILURE); + EXPECT_EQ(errno, EMFILE) + exit(EXIT_FAILURE); + exit(EXIT_SUCCESS); + } + + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); +} + +TEST_HARNESS_MAIN -- cgit v1.2.3 From bcc3773c49af2426093645473fe02a5a8e765587 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 20 Aug 2024 18:22:27 -0700 Subject: selftests: net: add helper for checking if nettest is available A few tests check if nettest exists in the $PATH before adding $PWD to $PATH and re-checking. They don't discard stderr on the first check (and nettest is built as part of selftests, so it's pretty normal for it to not be available in system $PATH). 
This leads to output noise: which: no nettest in (/home/virtme/tools/fs/bin:/home/virtme/tools/fs/sbin:/home/virtme/tools/fs/usr/bin:/home/virtme/tools/fs/usr/sbin:/bin:/sbin:/usr/bin:/usr/sbin:/usr/local/bin) Add a common helper for the check which does silence stderr. There is another small functional change hiding here, because pmtu.sh and fib_rule_tests.sh used to return from the test case rather than completely exit. Building nettest is not hard, there should be no need to maintain the ability to selectively skip cases in its absence. Reviewed-by: Ido Schimmel Signed-off-by: Jakub Kicinski Reviewed-by: Hangbin Liu Link: https://patch.msgid.link/20240821012227.1398769-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/fcnal-test.sh | 9 +----- tools/testing/selftests/net/fib_rule_tests.sh | 37 ++--------------------- tools/testing/selftests/net/lib.sh | 15 +++++++++ tools/testing/selftests/net/pmtu.sh | 8 +---- tools/testing/selftests/net/unicast_extensions.sh | 9 +----- tools/testing/selftests/net/vrf_route_leaking.sh | 3 +- 6 files changed, 22 insertions(+), 59 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh index 386ebd829df5..899dbad0104b 100755 --- a/tools/testing/selftests/net/fcnal-test.sh +++ b/tools/testing/selftests/net/fcnal-test.sh @@ -4304,14 +4304,7 @@ elif [ "$TESTS" = "ipv6" ]; then TESTS="$TESTS_IPV6" fi -# nettest can be run from PATH or from same directory as this selftest -if ! which nettest >/dev/null; then - PATH=$PWD:$PATH - if ! which nettest >/dev/null; then - echo "'nettest' command not found; skipping tests" - exit $ksft_skip - fi -fi +check_gen_prog "nettest" declare -i nfail=0 declare -i nsuccess=0 diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh index 89034c5b69dc..53c5c1ad437e 100755 --- a/tools/testing/selftests/net/fib_rule_tests.sh +++ b/tools/testing/selftests/net/fib_rule_tests.sh @@ -51,31 +51,6 @@ log_test() fi } -check_nettest() -{ - if which nettest > /dev/null 2>&1; then - return 0 - fi - - # Add the selftest directory to PATH if not already done - if [ "${SELFTEST_PATH}" = "" ]; then - SELFTEST_PATH="$(dirname $0)" - PATH="${PATH}:${SELFTEST_PATH}" - - # Now retry with the new path - if which nettest > /dev/null 2>&1; then - return 0 - fi - - if [ "${ret}" -eq 0 ]; then - ret="${ksft_skip}" - fi - echo "nettest not found (try 'make -C ${SELFTEST_PATH} nettest')" - fi - - return 1 -} - setup() { set -e @@ -317,11 +292,6 @@ fib_rule6_connect_test() echo echo "IPv6 FIB rule connect tests" - if ! check_nettest; then - echo "SKIP: Could not run test without nettest tool" - return - fi - setup_peer $IP -6 rule add dsfield 0x04 table $RTABLE_PEER @@ -516,11 +486,6 @@ fib_rule4_connect_test() echo echo "IPv4 FIB rule connect tests" - if ! check_nettest; then - echo "SKIP: Could not run test without nettest tool" - return - fi - setup_peer $IP -4 rule add dsfield 0x04 table $RTABLE_PEER @@ -584,6 +549,8 @@ if [ ! 
-x "$(command -v ip)" ]; then exit $ksft_skip fi +check_gen_prog "nettest" + # start clean cleanup &> /dev/null setup diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index 8ee4489238ca..be8707bfb46e 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -125,6 +125,21 @@ slowwait_for_counter() slowwait "$timeout" until_counter_is ">= $((base + delta))" "$@" } +# Check for existence of tools which are built as part of selftests +# but may also already exist in $PATH +check_gen_prog() +{ + local prog_name=$1; shift + + if ! which $prog_name >/dev/null 2>/dev/null; then + PATH=$PWD:$PATH + if ! which $prog_name >/dev/null; then + echo "'$prog_name' command not found; skipping tests" + exit $ksft_skip + fi + fi +} + remove_ns_list() { local item=$1 diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh index 24a50622406c..569bce8b6383 100755 --- a/tools/testing/selftests/net/pmtu.sh +++ b/tools/testing/selftests/net/pmtu.sh @@ -681,13 +681,7 @@ setup_xfrm() { } setup_nettest_xfrm() { - if ! which nettest >/dev/null; then - PATH=$PWD:$PATH - if ! which nettest >/dev/null; then - echo "'nettest' command not found; skipping tests" - return 1 - fi - fi + check_gen_prog "nettest" [ ${1} -eq 6 ] && proto="-6" || proto="" port=${2} diff --git a/tools/testing/selftests/net/unicast_extensions.sh b/tools/testing/selftests/net/unicast_extensions.sh index f52aa5f7da52..3e751234ccfe 100755 --- a/tools/testing/selftests/net/unicast_extensions.sh +++ b/tools/testing/selftests/net/unicast_extensions.sh @@ -30,14 +30,7 @@ source lib.sh -# nettest can be run from PATH or from same directory as this selftest -if ! which nettest >/dev/null; then - PATH=$PWD:$PATH - if ! which nettest >/dev/null; then - echo "'nettest' command not found; skipping tests" - exit $ksft_skip - fi -fi +check_gen_prog "nettest" result=0 diff --git a/tools/testing/selftests/net/vrf_route_leaking.sh b/tools/testing/selftests/net/vrf_route_leaking.sh index 152171fb1fc8..e9c2f71da207 100755 --- a/tools/testing/selftests/net/vrf_route_leaking.sh +++ b/tools/testing/selftests/net/vrf_route_leaking.sh @@ -59,7 +59,6 @@ # while it is forwarded between different vrfs. source lib.sh -PATH=$PWD:$PWD/tools/testing/selftests/net:$PATH VERBOSE=0 PAUSE_ON_FAIL=no DEFAULT_TTYPE=sym @@ -636,6 +635,8 @@ EOF # Some systems don't have a ping6 binary anymore command -v ping6 > /dev/null 2>&1 && ping6=$(command -v ping6) || ping6=$(command -v ping) +check_gen_prog "nettest" + TESTS_IPV4="ipv4_ping_ttl ipv4_traceroute ipv4_ping_frag ipv4_ping_local ipv4_tcp_local ipv4_udp_local ipv4_ping_ttl_asym ipv4_traceroute_asym" TESTS_IPV6="ipv6_ping_ttl ipv6_traceroute ipv6_ping_local ipv6_tcp_local ipv6_udp_local -- cgit v1.2.3 From 110bbd3a2ed71f3cf4d9438d62f22f591952fc3b Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 22 Aug 2024 01:01:24 -0700 Subject: selftests/bpf: test for malformed BPF_CORE_TYPE_ID_LOCAL relocation Check that verifier rejects BPF program containing relocation pointing to non-existent BTF type. To force relocation resolution on kernel side test case uses bpf_attr->core_relos field. This field is not exposed by libbpf, so directly do BPF system call in the test. 
Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240822080124.2995724-3-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/core_reloc_raw.c | 125 +++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/core_reloc_raw.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc_raw.c b/tools/testing/selftests/bpf/prog_tests/core_reloc_raw.c new file mode 100644 index 000000000000..a18d3680fb16 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc_raw.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* Test cases that can't load programs using libbpf and need direct + * BPF syscall access + */ + +#include +#include +#include + +#include "test_progs.h" +#include "test_btf.h" +#include "bpf/libbpf_internal.h" + +static char log[16 * 1024]; + +/* Check that verifier rejects BPF program containing relocation + * pointing to non-existent BTF type. + */ +static void test_bad_local_id(void) +{ + struct test_btf { + struct btf_header hdr; + __u32 types[15]; + char strings[128]; + } raw_btf = { + .hdr = { + .magic = BTF_MAGIC, + .version = BTF_VERSION, + .hdr_len = sizeof(struct btf_header), + .type_off = 0, + .type_len = sizeof(raw_btf.types), + .str_off = offsetof(struct test_btf, strings) - + offsetof(struct test_btf, types), + .str_len = sizeof(raw_btf.strings), + }, + .types = { + BTF_PTR_ENC(0), /* [1] void* */ + BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), /* [2] int */ + BTF_FUNC_PROTO_ENC(2, 1), /* [3] int (*)(void*) */ + BTF_FUNC_PROTO_ARG_ENC(8, 1), + BTF_FUNC_ENC(8, 3) /* [4] FUNC 'foo' type_id=2 */ + }, + .strings = "\0int\0 0\0foo\0" + }; + __u32 log_level = 1 | 2 | 4; + LIBBPF_OPTS(bpf_btf_load_opts, opts, + .log_buf = log, + .log_size = sizeof(log), + .log_level = log_level, + ); + struct bpf_insn insns[] = { + BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + struct bpf_func_info funcs[] = { + { + .insn_off = 0, + .type_id = 4, + } + }; + struct bpf_core_relo relos[] = { + { + .insn_off = 0, /* patch first instruction (r0 = 0) */ + .type_id = 100500, /* !!! 
this type id does not exist */ + .access_str_off = 6, /* offset of "0" */ + .kind = BPF_CORE_TYPE_ID_LOCAL, + } + }; + union bpf_attr attr; + int saved_errno; + int prog_fd = -1; + int btf_fd = -1; + + btf_fd = bpf_btf_load(&raw_btf, sizeof(raw_btf), &opts); + saved_errno = errno; + if (btf_fd < 0 || env.verbosity > VERBOSE_NORMAL) { + printf("-------- BTF load log start --------\n"); + printf("%s", log); + printf("-------- BTF load log end ----------\n"); + } + if (btf_fd < 0) { + PRINT_FAIL("bpf_btf_load() failed, errno=%d\n", saved_errno); + return; + } + + log[0] = 0; + memset(&attr, 0, sizeof(attr)); + attr.prog_btf_fd = btf_fd; + attr.prog_type = BPF_TRACE_RAW_TP; + attr.license = (__u64)"GPL"; + attr.insns = (__u64)&insns; + attr.insn_cnt = sizeof(insns) / sizeof(*insns); + attr.log_buf = (__u64)log; + attr.log_size = sizeof(log); + attr.log_level = log_level; + attr.func_info = (__u64)funcs; + attr.func_info_cnt = sizeof(funcs) / sizeof(*funcs); + attr.func_info_rec_size = sizeof(*funcs); + attr.core_relos = (__u64)relos; + attr.core_relo_cnt = sizeof(relos) / sizeof(*relos); + attr.core_relo_rec_size = sizeof(*relos); + prog_fd = sys_bpf_prog_load(&attr, sizeof(attr), 1); + saved_errno = errno; + if (prog_fd < 0 || env.verbosity > VERBOSE_NORMAL) { + printf("-------- program load log start --------\n"); + printf("%s", log); + printf("-------- program load log end ----------\n"); + } + if (prog_fd >= 0) { + PRINT_FAIL("sys_bpf_prog_load() expected to fail\n"); + goto out; + } + ASSERT_HAS_SUBSTR(log, "relo #0: bad type id 100500", "program load log"); + +out: + close(prog_fd); + close(btf_fd); +} + +void test_core_reloc_raw(void) +{ + if (test__start_subtest("bad_local_id")) + test_bad_local_id(); +} -- cgit v1.2.3 From a11b4222bb579dcf9646f3c4ecd2212ae762a2c8 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 21 Aug 2024 16:26:25 -0700 Subject: perf dwarf-aux: Handle bitfield members from pointer access The __die_find_member_offset_cb() missed to handle bitfield members which don't have DW_AT_data_member_location. Like in adding member types in __add_member_cb() it should fallback to check the bit offset when it resolves the member type for an offset. 
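For reference, the combined lookup amounts to roughly the following sketch (same logic as the change below, written as a standalone helper; it assumes the elfutils libdw headers and the die_get_data_member_location() helper already in dwarf-aux.c):

  /* Sketch: byte offset of a struct member. Bitfield members lack
   * DW_AT_data_member_location, so fall back to the DWARF bit offset.
   */
  static void member_byte_offset(Dwarf_Die *die_mem, Dwarf_Word *loc)
  {
          Dwarf_Attribute attr;

          if (die_get_data_member_location(die_mem, loc) >= 0)
                  return;

          if (dwarf_attr_integrate(die_mem, DW_AT_data_bit_offset, &attr) &&
              dwarf_formudata(&attr, loc) == 0)
                  *loc /= 8;      /* bits -> bytes */
          else
                  *loc = 0;       /* e.g. union members with no location */
  }
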
Fixes: 437683a9941891c1 ("perf dwarf-aux: Handle type transfer for memory access") Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240821232628.353177-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dwarf-aux.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c index 0151a8d14350..92eb9c8dc3e5 100644 --- a/tools/perf/util/dwarf-aux.c +++ b/tools/perf/util/dwarf-aux.c @@ -1977,8 +1977,15 @@ static int __die_find_member_offset_cb(Dwarf_Die *die_mem, void *arg) return DIE_FIND_CB_SIBLING; /* Unions might not have location */ - if (die_get_data_member_location(die_mem, &loc) < 0) - loc = 0; + if (die_get_data_member_location(die_mem, &loc) < 0) { + Dwarf_Attribute attr; + + if (dwarf_attr_integrate(die_mem, DW_AT_data_bit_offset, &attr) && + dwarf_formudata(&attr, &loc) == 0) + loc /= 8; + else + loc = 0; + } if (offset == loc) return DIE_FIND_CB_END; -- cgit v1.2.3 From adec67d372fecd97585d76dbebb610d62ec9d2cc Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 22 Aug 2024 01:41:08 -0700 Subject: selftests/bpf: rename nocsr -> bpf_fastcall in selftests Attribute used by LLVM implementation of the feature had been changed from no_caller_saved_registers to bpf_fastcall (see [1]). This commit replaces references to nocsr by references to bpf_fastcall to keep LLVM and selftests parts in sync. [1] https://github.com/llvm/llvm-project/pull/105417 Acked-by: Yonghong Song Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240822084112.3257995-3-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/verifier.c | 4 +- .../selftests/bpf/progs/verifier_bpf_fastcall.c | 845 +++++++++++++++++++++ tools/testing/selftests/bpf/progs/verifier_nocsr.c | 845 --------------------- 3 files changed, 847 insertions(+), 847 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c delete mode 100644 tools/testing/selftests/bpf/progs/verifier_nocsr.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index cf3662dbd24f..80a90c627182 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -53,7 +53,7 @@ #include "verifier_movsx.skel.h" #include "verifier_netfilter_ctx.skel.h" #include "verifier_netfilter_retcode.skel.h" -#include "verifier_nocsr.skel.h" +#include "verifier_bpf_fastcall.skel.h" #include "verifier_or_jmp32_k.skel.h" #include "verifier_precision.skel.h" #include "verifier_prevent_map_lookup.skel.h" @@ -177,7 +177,7 @@ void test_verifier_meta_access(void) { RUN(verifier_meta_access); } void test_verifier_movsx(void) { RUN(verifier_movsx); } void test_verifier_netfilter_ctx(void) { RUN(verifier_netfilter_ctx); } void test_verifier_netfilter_retcode(void) { RUN(verifier_netfilter_retcode); } -void test_verifier_nocsr(void) { RUN(verifier_nocsr); } +void test_verifier_bpf_fastcall(void) { RUN(verifier_bpf_fastcall); } void test_verifier_or_jmp32_k(void) { RUN(verifier_or_jmp32_k); } void test_verifier_precision(void) { RUN(verifier_precision); } void test_verifier_prevent_map_lookup(void) { RUN(verifier_prevent_map_lookup); } diff --git 
a/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c b/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c new file mode 100644 index 000000000000..e30ab9fe5096 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c @@ -0,0 +1,845 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "../../../include/linux/filter.h" +#include "bpf_misc.h" + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) __msg("stack depth 8") +__xlated("4: r5 = 5") +__xlated("5: w0 = ") +__xlated("6: r0 = &(void __percpu *)(r0)") +__xlated("7: r0 = *(u32 *)(r0 +0)") +__xlated("8: exit") +__success +__naked void simple(void) +{ + asm volatile ( + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "*(u64 *)(r10 - 16) = r1;" + "*(u64 *)(r10 - 24) = r2;" + "*(u64 *)(r10 - 32) = r3;" + "*(u64 *)(r10 - 40) = r4;" + "*(u64 *)(r10 - 48) = r5;" + "call %[bpf_get_smp_processor_id];" + "r5 = *(u64 *)(r10 - 48);" + "r4 = *(u64 *)(r10 - 40);" + "r3 = *(u64 *)(r10 - 32);" + "r2 = *(u64 *)(r10 - 24);" + "r1 = *(u64 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +/* The logic for detecting and verifying bpf_fastcall pattern is the same for + * any arch, however x86 differs from arm64 or riscv64 in a way + * bpf_get_smp_processor_id is rewritten: + * - on x86 it is done by verifier + * - on arm64 and riscv64 it is done by jit + * + * Which leads to different xlated patterns for different archs: + * - on x86 the call is expanded as 3 instructions + * - on arm64 and riscv64 the call remains as is + * (but spills/fills are still removed) + * + * It is really desirable to check instruction indexes in the xlated + * patterns, so add this canary test to check that function rewrite by + * jit is correctly processed by bpf_fastcall logic, keep the rest of the + * tests as x86. 
+ */ +SEC("raw_tp") +__arch_arm64 +__arch_riscv64 +__xlated("0: r1 = 1") +__xlated("1: call bpf_get_smp_processor_id") +__xlated("2: exit") +__success +__naked void canary_arm64_riscv64(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("3: exit") +__success +__naked void canary_zero_spills(void) +{ + asm volatile ( + "call %[bpf_get_smp_processor_id];" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) __msg("stack depth 16") +__xlated("1: *(u64 *)(r10 -16) = r1") +__xlated("...") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("5: r2 = *(u64 *)(r10 -16)") +__success +__naked void wrong_reg_in_pattern1(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r2 = *(u64 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u64 *)(r10 -16) = r6") +__xlated("...") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("5: r6 = *(u64 *)(r10 -16)") +__success +__naked void wrong_reg_in_pattern2(void) +{ + asm volatile ( + "r6 = 1;" + "*(u64 *)(r10 - 16) = r6;" + "call %[bpf_get_smp_processor_id];" + "r6 = *(u64 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u64 *)(r10 -16) = r0") +__xlated("...") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("5: r0 = *(u64 *)(r10 -16)") +__success +__naked void wrong_reg_in_pattern3(void) +{ + asm volatile ( + "r0 = 1;" + "*(u64 *)(r10 - 16) = r0;" + "call %[bpf_get_smp_processor_id];" + "r0 = *(u64 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("2: *(u64 *)(r2 -16) = r1") +__xlated("...") +__xlated("4: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("6: r1 = *(u64 *)(r10 -16)") +__success +__naked void wrong_base_in_pattern(void) +{ + asm volatile ( + "r1 = 1;" + "r2 = r10;" + "*(u64 *)(r2 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u64 *)(r10 -16) = r1") +__xlated("...") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("5: r2 = 1") +__success +__naked void wrong_insn_in_pattern(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r2 = 1;" + "r1 = *(u64 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("2: *(u64 *)(r10 -16) = r1") +__xlated("...") +__xlated("4: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("6: r1 = *(u64 *)(r10 -8)") +__success +__naked void wrong_off_in_pattern1(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 8) = r1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u32 *)(r10 -4) = r1") +__xlated("...") +__xlated("3: r0 = &(void __percpu *)(r0)") 
+__xlated("...") +__xlated("5: r1 = *(u32 *)(r10 -4)") +__success +__naked void wrong_off_in_pattern2(void) +{ + asm volatile ( + "r1 = 1;" + "*(u32 *)(r10 - 4) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u32 *)(r10 - 4);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u32 *)(r10 -16) = r1") +__xlated("...") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("5: r1 = *(u32 *)(r10 -16)") +__success +__naked void wrong_size_in_pattern(void) +{ + asm volatile ( + "r1 = 1;" + "*(u32 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u32 *)(r10 - 16);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("2: *(u32 *)(r10 -8) = r1") +__xlated("...") +__xlated("4: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("6: r1 = *(u32 *)(r10 -8)") +__success +__naked void partial_pattern(void) +{ + asm volatile ( + "r1 = 1;" + "r2 = 2;" + "*(u32 *)(r10 - 8) = r1;" + "*(u64 *)(r10 - 16) = r2;" + "call %[bpf_get_smp_processor_id];" + "r2 = *(u64 *)(r10 - 16);" + "r1 = *(u32 *)(r10 - 8);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("0: r1 = 1") +__xlated("1: r2 = 2") +/* not patched, spills for -8, -16 not removed */ +__xlated("2: *(u64 *)(r10 -8) = r1") +__xlated("3: *(u64 *)(r10 -16) = r2") +__xlated("...") +__xlated("5: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("7: r2 = *(u64 *)(r10 -16)") +__xlated("8: r1 = *(u64 *)(r10 -8)") +/* patched, spills for -24, -32 removed */ +__xlated("...") +__xlated("10: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("12: exit") +__success +__naked void min_stack_offset(void) +{ + asm volatile ( + "r1 = 1;" + "r2 = 2;" + /* this call won't be patched */ + "*(u64 *)(r10 - 8) = r1;" + "*(u64 *)(r10 - 16) = r2;" + "call %[bpf_get_smp_processor_id];" + "r2 = *(u64 *)(r10 - 16);" + "r1 = *(u64 *)(r10 - 8);" + /* this call would be patched */ + "*(u64 *)(r10 - 24) = r1;" + "*(u64 *)(r10 - 32) = r2;" + "call %[bpf_get_smp_processor_id];" + "r2 = *(u64 *)(r10 - 32);" + "r1 = *(u64 *)(r10 - 24);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("...") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("5: r1 = *(u64 *)(r10 -8)") +__success +__naked void bad_fixed_read(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "r1 = r10;" + "r1 += -8;" + "r1 = *(u64 *)(r1 - 0);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("...") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("5: r1 = *(u64 *)(r10 -8)") +__success +__naked void bad_fixed_write(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "r1 = r10;" + "r1 += -8;" + "*(u64 *)(r1 - 0) = r1;" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("6: *(u64 *)(r10 -16) = r1") +__xlated("...") +__xlated("8: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("10: r1 = *(u64 *)(r10 -16)") +__success +__naked void bad_varying_read(void) +{ + asm volatile ( + "r6 = *(u64 *)(r1 + 0);" 
/* random scalar value */ + "r6 &= 0x7;" /* r6 range [0..7] */ + "r6 += 0x2;" /* r6 range [2..9] */ + "r7 = 0;" + "r7 -= r6;" /* r7 range [-9..-2] */ + "r1 = 1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + "r1 = r10;" + "r1 += r7;" + "r1 = *(u8 *)(r1 - 0);" /* touches slot [-16..-9] where spills are stored */ + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("6: *(u64 *)(r10 -16) = r1") +__xlated("...") +__xlated("8: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("10: r1 = *(u64 *)(r10 -16)") +__success +__naked void bad_varying_write(void) +{ + asm volatile ( + "r6 = *(u64 *)(r1 + 0);" /* random scalar value */ + "r6 &= 0x7;" /* r6 range [0..7] */ + "r6 += 0x2;" /* r6 range [2..9] */ + "r7 = 0;" + "r7 -= r6;" /* r7 range [-9..-2] */ + "r1 = 1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + "r1 = r10;" + "r1 += r7;" + "*(u8 *)(r1 - 0) = r7;" /* touches slot [-16..-9] where spills are stored */ + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("...") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("5: r1 = *(u64 *)(r10 -8)") +__success +__naked void bad_write_in_subprog(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "r1 = r10;" + "r1 += -8;" + "call bad_write_in_subprog_aux;" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +__used +__naked static void bad_write_in_subprog_aux(void) +{ + asm volatile ( + "r0 = 1;" + "*(u64 *)(r1 - 0) = r0;" /* invalidates bpf_fastcall contract for caller: */ + "exit;" /* caller stack at -8 used outside of the pattern */ + ::: __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("...") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("5: r1 = *(u64 *)(r10 -8)") +__success +__naked void bad_helper_write(void) +{ + asm volatile ( + "r1 = 1;" + /* bpf_fastcall pattern with stack offset -8 */ + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "r1 = r10;" + "r1 += -8;" + "r2 = 1;" + "r3 = 42;" + /* read dst is fp[-8], thus bpf_fastcall rewrite not applied */ + "call %[bpf_probe_read_kernel];" + "exit;" + : + : __imm(bpf_get_smp_processor_id), + __imm(bpf_probe_read_kernel) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +/* main, not patched */ +__xlated("1: *(u64 *)(r10 -8) = r1") +__xlated("...") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("5: r1 = *(u64 *)(r10 -8)") +__xlated("...") +__xlated("9: call pc+1") +__xlated("...") +__xlated("10: exit") +/* subprogram, patched */ +__xlated("11: r1 = 1") +__xlated("...") +__xlated("13: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("15: exit") +__success +__naked void invalidate_one_subprog(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "r1 = r10;" + "r1 += -8;" + "r1 = *(u64 *)(r1 - 0);" + "call invalidate_one_subprog_aux;" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +__used +__naked static void invalidate_one_subprog_aux(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = 
*(u64 *)(r10 - 8);" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +/* main */ +__xlated("0: r1 = 1") +__xlated("...") +__xlated("2: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("4: call pc+1") +__xlated("5: exit") +/* subprogram */ +__xlated("6: r1 = 1") +__xlated("...") +__xlated("8: r0 = &(void __percpu *)(r0)") +__xlated("...") +__xlated("10: *(u64 *)(r10 -16) = r1") +__xlated("11: exit") +__success +__naked void subprogs_use_independent_offsets(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + "call subprogs_use_independent_offsets_aux;" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +__used +__naked static void subprogs_use_independent_offsets_aux(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 24) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 24);" + "*(u64 *)(r10 - 16) = r1;" + "exit;" + : + : __imm(bpf_get_smp_processor_id) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) __msg("stack depth 8") +__xlated("2: r0 = &(void __percpu *)(r0)") +__success +__naked void helper_call_does_not_prevent_bpf_fastcall(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_prandom_u32];" + "r1 = *(u64 *)(r10 - 8);" + "exit;" + : + : __imm(bpf_get_smp_processor_id), + __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) __msg("stack depth 16") +/* may_goto counter at -16 */ +__xlated("0: *(u64 *)(r10 -16) =") +__xlated("1: r1 = 1") +__xlated("...") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("...") +/* may_goto expansion starts */ +__xlated("5: r11 = *(u64 *)(r10 -16)") +__xlated("6: if r11 == 0x0 goto pc+3") +__xlated("7: r11 -= 1") +__xlated("8: *(u64 *)(r10 -16) = r11") +/* may_goto expansion ends */ +__xlated("9: *(u64 *)(r10 -8) = r1") +__xlated("10: exit") +__success +__naked void may_goto_interaction(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + ".8byte %[may_goto];" + /* just touch some stack at -8 */ + "*(u64 *)(r10 - 8) = r1;" + "exit;" + : + : __imm(bpf_get_smp_processor_id), + __imm_insn(may_goto, BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, +1 /* offset */, 0)) + : __clobber_all); +} + +__used +__naked static void dummy_loop_callback(void) +{ + asm volatile ( + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) __msg("stack depth 32+0") +__xlated("2: r1 = 1") +__xlated("3: w0 =") +__xlated("4: r0 = &(void __percpu *)(r0)") +__xlated("5: r0 = *(u32 *)(r0 +0)") +/* bpf_loop params setup */ +__xlated("6: r2 =") +__xlated("7: r3 = 0") +__xlated("8: r4 = 0") +__xlated("...") +/* ... part of the inlined bpf_loop */ +__xlated("12: *(u64 *)(r10 -32) = r6") +__xlated("13: *(u64 *)(r10 -24) = r7") +__xlated("14: *(u64 *)(r10 -16) = r8") +__xlated("...") +__xlated("21: call pc+8") /* dummy_loop_callback */ +/* ... 
last insns of the bpf_loop_interaction1 */ +__xlated("...") +__xlated("28: r0 = 0") +__xlated("29: exit") +/* dummy_loop_callback */ +__xlated("30: r0 = 0") +__xlated("31: exit") +__success +__naked int bpf_loop_interaction1(void) +{ + asm volatile ( + "r1 = 1;" + /* bpf_fastcall stack region at -16, but could be removed */ + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + "r2 = %[dummy_loop_callback];" + "r3 = 0;" + "r4 = 0;" + "call %[bpf_loop];" + "r0 = 0;" + "exit;" + : + : __imm_ptr(dummy_loop_callback), + __imm(bpf_get_smp_processor_id), + __imm(bpf_loop) + : __clobber_common + ); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) __msg("stack depth 40+0") +/* call bpf_get_smp_processor_id */ +__xlated("2: r1 = 42") +__xlated("3: w0 =") +__xlated("4: r0 = &(void __percpu *)(r0)") +__xlated("5: r0 = *(u32 *)(r0 +0)") +/* call bpf_get_prandom_u32 */ +__xlated("6: *(u64 *)(r10 -16) = r1") +__xlated("7: call") +__xlated("8: r1 = *(u64 *)(r10 -16)") +__xlated("...") +/* ... part of the inlined bpf_loop */ +__xlated("15: *(u64 *)(r10 -40) = r6") +__xlated("16: *(u64 *)(r10 -32) = r7") +__xlated("17: *(u64 *)(r10 -24) = r8") +__success +__naked int bpf_loop_interaction2(void) +{ + asm volatile ( + "r1 = 42;" + /* bpf_fastcall stack region at -16, cannot be removed */ + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_prandom_u32];" + "r1 = *(u64 *)(r10 - 16);" + "r2 = %[dummy_loop_callback];" + "r3 = 0;" + "r4 = 0;" + "call %[bpf_loop];" + "r0 = 0;" + "exit;" + : + : __imm_ptr(dummy_loop_callback), + __imm(bpf_get_smp_processor_id), + __imm(bpf_get_prandom_u32), + __imm(bpf_loop) + : __clobber_common + ); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) +__msg("stack depth 512+0") +/* just to print xlated version when debugging */ +__xlated("r0 = &(void __percpu *)(r0)") +__success +/* cumulative_stack_depth() stack usage is MAX_BPF_STACK, + * called subprogram uses an additional slot for bpf_fastcall spill/fill, + * since bpf_fastcall spill/fill could be removed the program still fits + * in MAX_BPF_STACK and should be accepted. 
+ */ +__naked int cumulative_stack_depth(void) +{ + asm volatile( + "r1 = 42;" + "*(u64 *)(r10 - %[max_bpf_stack]) = r1;" + "call cumulative_stack_depth_subprog;" + "exit;" + : + : __imm_const(max_bpf_stack, MAX_BPF_STACK) + : __clobber_all + ); +} + +__used +__naked static void cumulative_stack_depth_subprog(void) +{ + asm volatile ( + "*(u64 *)(r10 - 8) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 8);" + "exit;" + :: __imm(bpf_get_smp_processor_id) : __clobber_all); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) +__msg("stack depth 512") +__xlated("0: r1 = 42") +__xlated("1: *(u64 *)(r10 -512) = r1") +__xlated("2: w0 = ") +__xlated("3: r0 = &(void __percpu *)(r0)") +__xlated("4: r0 = *(u32 *)(r0 +0)") +__xlated("5: exit") +__success +__naked int bpf_fastcall_max_stack_ok(void) +{ + asm volatile( + "r1 = 42;" + "*(u64 *)(r10 - %[max_bpf_stack]) = r1;" + "*(u64 *)(r10 - %[max_bpf_stack_8]) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - %[max_bpf_stack_8]);" + "exit;" + : + : __imm_const(max_bpf_stack, MAX_BPF_STACK), + __imm_const(max_bpf_stack_8, MAX_BPF_STACK + 8), + __imm(bpf_get_smp_processor_id) + : __clobber_all + ); +} + +SEC("raw_tp") +__arch_x86_64 +__log_level(4) +__msg("stack depth 520") +__failure +__naked int bpf_fastcall_max_stack_fail(void) +{ + asm volatile( + "r1 = 42;" + "*(u64 *)(r10 - %[max_bpf_stack]) = r1;" + "*(u64 *)(r10 - %[max_bpf_stack_8]) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - %[max_bpf_stack_8]);" + /* call to prandom blocks bpf_fastcall rewrite */ + "*(u64 *)(r10 - %[max_bpf_stack_8]) = r1;" + "call %[bpf_get_prandom_u32];" + "r1 = *(u64 *)(r10 - %[max_bpf_stack_8]);" + "exit;" + : + : __imm_const(max_bpf_stack, MAX_BPF_STACK), + __imm_const(max_bpf_stack_8, MAX_BPF_STACK + 8), + __imm(bpf_get_smp_processor_id), + __imm(bpf_get_prandom_u32) + : __clobber_all + ); +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_nocsr.c b/tools/testing/selftests/bpf/progs/verifier_nocsr.c deleted file mode 100644 index 666c736d196f..000000000000 --- a/tools/testing/selftests/bpf/progs/verifier_nocsr.c +++ /dev/null @@ -1,845 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include -#include -#include "../../../include/linux/filter.h" -#include "bpf_misc.h" - -SEC("raw_tp") -__arch_x86_64 -__log_level(4) __msg("stack depth 8") -__xlated("4: r5 = 5") -__xlated("5: w0 = ") -__xlated("6: r0 = &(void __percpu *)(r0)") -__xlated("7: r0 = *(u32 *)(r0 +0)") -__xlated("8: exit") -__success -__naked void simple(void) -{ - asm volatile ( - "r1 = 1;" - "r2 = 2;" - "r3 = 3;" - "r4 = 4;" - "r5 = 5;" - "*(u64 *)(r10 - 16) = r1;" - "*(u64 *)(r10 - 24) = r2;" - "*(u64 *)(r10 - 32) = r3;" - "*(u64 *)(r10 - 40) = r4;" - "*(u64 *)(r10 - 48) = r5;" - "call %[bpf_get_smp_processor_id];" - "r5 = *(u64 *)(r10 - 48);" - "r4 = *(u64 *)(r10 - 40);" - "r3 = *(u64 *)(r10 - 32);" - "r2 = *(u64 *)(r10 - 24);" - "r1 = *(u64 *)(r10 - 16);" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -/* The logic for detecting and verifying nocsr pattern is the same for - * any arch, however x86 differs from arm64 or riscv64 in a way - * bpf_get_smp_processor_id is rewritten: - * - on x86 it is done by verifier - * - on arm64 and riscv64 it is done by jit - * - * Which leads to different xlated patterns for different archs: - * - on x86 the call is expanded as 3 instructions - * - on arm64 and riscv64 the call remains as is - * (but spills/fills are still removed) - * 
- * It is really desirable to check instruction indexes in the xlated - * patterns, so add this canary test to check that function rewrite by - * jit is correctly processed by nocsr logic, keep the rest of the - * tests as x86. - */ -SEC("raw_tp") -__arch_arm64 -__arch_riscv64 -__xlated("0: r1 = 1") -__xlated("1: call bpf_get_smp_processor_id") -__xlated("2: exit") -__success -__naked void canary_arm64_riscv64(void) -{ - asm volatile ( - "r1 = 1;" - "*(u64 *)(r10 - 16) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - 16);" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__xlated("1: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("3: exit") -__success -__naked void canary_zero_spills(void) -{ - asm volatile ( - "call %[bpf_get_smp_processor_id];" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__log_level(4) __msg("stack depth 16") -__xlated("1: *(u64 *)(r10 -16) = r1") -__xlated("...") -__xlated("3: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("5: r2 = *(u64 *)(r10 -16)") -__success -__naked void wrong_reg_in_pattern1(void) -{ - asm volatile ( - "r1 = 1;" - "*(u64 *)(r10 - 16) = r1;" - "call %[bpf_get_smp_processor_id];" - "r2 = *(u64 *)(r10 - 16);" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__xlated("1: *(u64 *)(r10 -16) = r6") -__xlated("...") -__xlated("3: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("5: r6 = *(u64 *)(r10 -16)") -__success -__naked void wrong_reg_in_pattern2(void) -{ - asm volatile ( - "r6 = 1;" - "*(u64 *)(r10 - 16) = r6;" - "call %[bpf_get_smp_processor_id];" - "r6 = *(u64 *)(r10 - 16);" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__xlated("1: *(u64 *)(r10 -16) = r0") -__xlated("...") -__xlated("3: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("5: r0 = *(u64 *)(r10 -16)") -__success -__naked void wrong_reg_in_pattern3(void) -{ - asm volatile ( - "r0 = 1;" - "*(u64 *)(r10 - 16) = r0;" - "call %[bpf_get_smp_processor_id];" - "r0 = *(u64 *)(r10 - 16);" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__xlated("2: *(u64 *)(r2 -16) = r1") -__xlated("...") -__xlated("4: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("6: r1 = *(u64 *)(r10 -16)") -__success -__naked void wrong_base_in_pattern(void) -{ - asm volatile ( - "r1 = 1;" - "r2 = r10;" - "*(u64 *)(r2 - 16) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - 16);" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__xlated("1: *(u64 *)(r10 -16) = r1") -__xlated("...") -__xlated("3: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("5: r2 = 1") -__success -__naked void wrong_insn_in_pattern(void) -{ - asm volatile ( - "r1 = 1;" - "*(u64 *)(r10 - 16) = r1;" - "call %[bpf_get_smp_processor_id];" - "r2 = 1;" - "r1 = *(u64 *)(r10 - 16);" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__xlated("2: *(u64 *)(r10 -16) = r1") -__xlated("...") -__xlated("4: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("6: r1 = *(u64 *)(r10 -8)") -__success -__naked void wrong_off_in_pattern1(void) -{ - asm volatile ( - "r1 = 1;" - "*(u64 *)(r10 - 8) = r1;" - "*(u64 *)(r10 - 16) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 
*)(r10 - 8);" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__xlated("1: *(u32 *)(r10 -4) = r1") -__xlated("...") -__xlated("3: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("5: r1 = *(u32 *)(r10 -4)") -__success -__naked void wrong_off_in_pattern2(void) -{ - asm volatile ( - "r1 = 1;" - "*(u32 *)(r10 - 4) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u32 *)(r10 - 4);" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__xlated("1: *(u32 *)(r10 -16) = r1") -__xlated("...") -__xlated("3: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("5: r1 = *(u32 *)(r10 -16)") -__success -__naked void wrong_size_in_pattern(void) -{ - asm volatile ( - "r1 = 1;" - "*(u32 *)(r10 - 16) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u32 *)(r10 - 16);" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__xlated("2: *(u32 *)(r10 -8) = r1") -__xlated("...") -__xlated("4: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("6: r1 = *(u32 *)(r10 -8)") -__success -__naked void partial_pattern(void) -{ - asm volatile ( - "r1 = 1;" - "r2 = 2;" - "*(u32 *)(r10 - 8) = r1;" - "*(u64 *)(r10 - 16) = r2;" - "call %[bpf_get_smp_processor_id];" - "r2 = *(u64 *)(r10 - 16);" - "r1 = *(u32 *)(r10 - 8);" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__xlated("0: r1 = 1") -__xlated("1: r2 = 2") -/* not patched, spills for -8, -16 not removed */ -__xlated("2: *(u64 *)(r10 -8) = r1") -__xlated("3: *(u64 *)(r10 -16) = r2") -__xlated("...") -__xlated("5: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("7: r2 = *(u64 *)(r10 -16)") -__xlated("8: r1 = *(u64 *)(r10 -8)") -/* patched, spills for -24, -32 removed */ -__xlated("...") -__xlated("10: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("12: exit") -__success -__naked void min_stack_offset(void) -{ - asm volatile ( - "r1 = 1;" - "r2 = 2;" - /* this call won't be patched */ - "*(u64 *)(r10 - 8) = r1;" - "*(u64 *)(r10 - 16) = r2;" - "call %[bpf_get_smp_processor_id];" - "r2 = *(u64 *)(r10 - 16);" - "r1 = *(u64 *)(r10 - 8);" - /* this call would be patched */ - "*(u64 *)(r10 - 24) = r1;" - "*(u64 *)(r10 - 32) = r2;" - "call %[bpf_get_smp_processor_id];" - "r2 = *(u64 *)(r10 - 32);" - "r1 = *(u64 *)(r10 - 24);" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__xlated("1: *(u64 *)(r10 -8) = r1") -__xlated("...") -__xlated("3: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("5: r1 = *(u64 *)(r10 -8)") -__success -__naked void bad_fixed_read(void) -{ - asm volatile ( - "r1 = 1;" - "*(u64 *)(r10 - 8) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - 8);" - "r1 = r10;" - "r1 += -8;" - "r1 = *(u64 *)(r1 - 0);" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__xlated("1: *(u64 *)(r10 -8) = r1") -__xlated("...") -__xlated("3: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("5: r1 = *(u64 *)(r10 -8)") -__success -__naked void bad_fixed_write(void) -{ - asm volatile ( - "r1 = 1;" - "*(u64 *)(r10 - 8) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - 8);" - "r1 = r10;" - "r1 += -8;" - "*(u64 *)(r1 - 0) = r1;" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__xlated("6: *(u64 *)(r10 -16) = 
r1") -__xlated("...") -__xlated("8: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("10: r1 = *(u64 *)(r10 -16)") -__success -__naked void bad_varying_read(void) -{ - asm volatile ( - "r6 = *(u64 *)(r1 + 0);" /* random scalar value */ - "r6 &= 0x7;" /* r6 range [0..7] */ - "r6 += 0x2;" /* r6 range [2..9] */ - "r7 = 0;" - "r7 -= r6;" /* r7 range [-9..-2] */ - "r1 = 1;" - "*(u64 *)(r10 - 16) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - 16);" - "r1 = r10;" - "r1 += r7;" - "r1 = *(u8 *)(r1 - 0);" /* touches slot [-16..-9] where spills are stored */ - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__xlated("6: *(u64 *)(r10 -16) = r1") -__xlated("...") -__xlated("8: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("10: r1 = *(u64 *)(r10 -16)") -__success -__naked void bad_varying_write(void) -{ - asm volatile ( - "r6 = *(u64 *)(r1 + 0);" /* random scalar value */ - "r6 &= 0x7;" /* r6 range [0..7] */ - "r6 += 0x2;" /* r6 range [2..9] */ - "r7 = 0;" - "r7 -= r6;" /* r7 range [-9..-2] */ - "r1 = 1;" - "*(u64 *)(r10 - 16) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - 16);" - "r1 = r10;" - "r1 += r7;" - "*(u8 *)(r1 - 0) = r7;" /* touches slot [-16..-9] where spills are stored */ - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__xlated("1: *(u64 *)(r10 -8) = r1") -__xlated("...") -__xlated("3: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("5: r1 = *(u64 *)(r10 -8)") -__success -__naked void bad_write_in_subprog(void) -{ - asm volatile ( - "r1 = 1;" - "*(u64 *)(r10 - 8) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - 8);" - "r1 = r10;" - "r1 += -8;" - "call bad_write_in_subprog_aux;" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -__used -__naked static void bad_write_in_subprog_aux(void) -{ - asm volatile ( - "r0 = 1;" - "*(u64 *)(r1 - 0) = r0;" /* invalidates nocsr contract for caller: */ - "exit;" /* caller stack at -8 used outside of the pattern */ - ::: __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__xlated("1: *(u64 *)(r10 -8) = r1") -__xlated("...") -__xlated("3: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("5: r1 = *(u64 *)(r10 -8)") -__success -__naked void bad_helper_write(void) -{ - asm volatile ( - "r1 = 1;" - /* nocsr pattern with stack offset -8 */ - "*(u64 *)(r10 - 8) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - 8);" - "r1 = r10;" - "r1 += -8;" - "r2 = 1;" - "r3 = 42;" - /* read dst is fp[-8], thus nocsr rewrite not applied */ - "call %[bpf_probe_read_kernel];" - "exit;" - : - : __imm(bpf_get_smp_processor_id), - __imm(bpf_probe_read_kernel) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -/* main, not patched */ -__xlated("1: *(u64 *)(r10 -8) = r1") -__xlated("...") -__xlated("3: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("5: r1 = *(u64 *)(r10 -8)") -__xlated("...") -__xlated("9: call pc+1") -__xlated("...") -__xlated("10: exit") -/* subprogram, patched */ -__xlated("11: r1 = 1") -__xlated("...") -__xlated("13: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("15: exit") -__success -__naked void invalidate_one_subprog(void) -{ - asm volatile ( - "r1 = 1;" - "*(u64 *)(r10 - 8) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - 8);" - "r1 = r10;" - "r1 += -8;" - "r1 = *(u64 *)(r1 - 0);" - "call invalidate_one_subprog_aux;" - "exit;" - : - : __imm(bpf_get_smp_processor_id) 
- : __clobber_all); -} - -__used -__naked static void invalidate_one_subprog_aux(void) -{ - asm volatile ( - "r1 = 1;" - "*(u64 *)(r10 - 8) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - 8);" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -/* main */ -__xlated("0: r1 = 1") -__xlated("...") -__xlated("2: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("4: call pc+1") -__xlated("5: exit") -/* subprogram */ -__xlated("6: r1 = 1") -__xlated("...") -__xlated("8: r0 = &(void __percpu *)(r0)") -__xlated("...") -__xlated("10: *(u64 *)(r10 -16) = r1") -__xlated("11: exit") -__success -__naked void subprogs_use_independent_offsets(void) -{ - asm volatile ( - "r1 = 1;" - "*(u64 *)(r10 - 16) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - 16);" - "call subprogs_use_independent_offsets_aux;" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -__used -__naked static void subprogs_use_independent_offsets_aux(void) -{ - asm volatile ( - "r1 = 1;" - "*(u64 *)(r10 - 24) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - 24);" - "*(u64 *)(r10 - 16) = r1;" - "exit;" - : - : __imm(bpf_get_smp_processor_id) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__log_level(4) __msg("stack depth 8") -__xlated("2: r0 = &(void __percpu *)(r0)") -__success -__naked void helper_call_does_not_prevent_nocsr(void) -{ - asm volatile ( - "r1 = 1;" - "*(u64 *)(r10 - 8) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - 8);" - "*(u64 *)(r10 - 8) = r1;" - "call %[bpf_get_prandom_u32];" - "r1 = *(u64 *)(r10 - 8);" - "exit;" - : - : __imm(bpf_get_smp_processor_id), - __imm(bpf_get_prandom_u32) - : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__log_level(4) __msg("stack depth 16") -/* may_goto counter at -16 */ -__xlated("0: *(u64 *)(r10 -16) =") -__xlated("1: r1 = 1") -__xlated("...") -__xlated("3: r0 = &(void __percpu *)(r0)") -__xlated("...") -/* may_goto expansion starts */ -__xlated("5: r11 = *(u64 *)(r10 -16)") -__xlated("6: if r11 == 0x0 goto pc+3") -__xlated("7: r11 -= 1") -__xlated("8: *(u64 *)(r10 -16) = r11") -/* may_goto expansion ends */ -__xlated("9: *(u64 *)(r10 -8) = r1") -__xlated("10: exit") -__success -__naked void may_goto_interaction(void) -{ - asm volatile ( - "r1 = 1;" - "*(u64 *)(r10 - 16) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - 16);" - ".8byte %[may_goto];" - /* just touch some stack at -8 */ - "*(u64 *)(r10 - 8) = r1;" - "exit;" - : - : __imm(bpf_get_smp_processor_id), - __imm_insn(may_goto, BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, +1 /* offset */, 0)) - : __clobber_all); -} - -__used -__naked static void dummy_loop_callback(void) -{ - asm volatile ( - "r0 = 0;" - "exit;" - ::: __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__log_level(4) __msg("stack depth 32+0") -__xlated("2: r1 = 1") -__xlated("3: w0 =") -__xlated("4: r0 = &(void __percpu *)(r0)") -__xlated("5: r0 = *(u32 *)(r0 +0)") -/* bpf_loop params setup */ -__xlated("6: r2 =") -__xlated("7: r3 = 0") -__xlated("8: r4 = 0") -__xlated("...") -/* ... part of the inlined bpf_loop */ -__xlated("12: *(u64 *)(r10 -32) = r6") -__xlated("13: *(u64 *)(r10 -24) = r7") -__xlated("14: *(u64 *)(r10 -16) = r8") -__xlated("...") -__xlated("21: call pc+8") /* dummy_loop_callback */ -/* ... 
last insns of the bpf_loop_interaction1 */ -__xlated("...") -__xlated("28: r0 = 0") -__xlated("29: exit") -/* dummy_loop_callback */ -__xlated("30: r0 = 0") -__xlated("31: exit") -__success -__naked int bpf_loop_interaction1(void) -{ - asm volatile ( - "r1 = 1;" - /* nocsr stack region at -16, but could be removed */ - "*(u64 *)(r10 - 16) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - 16);" - "r2 = %[dummy_loop_callback];" - "r3 = 0;" - "r4 = 0;" - "call %[bpf_loop];" - "r0 = 0;" - "exit;" - : - : __imm_ptr(dummy_loop_callback), - __imm(bpf_get_smp_processor_id), - __imm(bpf_loop) - : __clobber_common - ); -} - -SEC("raw_tp") -__arch_x86_64 -__log_level(4) __msg("stack depth 40+0") -/* call bpf_get_smp_processor_id */ -__xlated("2: r1 = 42") -__xlated("3: w0 =") -__xlated("4: r0 = &(void __percpu *)(r0)") -__xlated("5: r0 = *(u32 *)(r0 +0)") -/* call bpf_get_prandom_u32 */ -__xlated("6: *(u64 *)(r10 -16) = r1") -__xlated("7: call") -__xlated("8: r1 = *(u64 *)(r10 -16)") -__xlated("...") -/* ... part of the inlined bpf_loop */ -__xlated("15: *(u64 *)(r10 -40) = r6") -__xlated("16: *(u64 *)(r10 -32) = r7") -__xlated("17: *(u64 *)(r10 -24) = r8") -__success -__naked int bpf_loop_interaction2(void) -{ - asm volatile ( - "r1 = 42;" - /* nocsr stack region at -16, cannot be removed */ - "*(u64 *)(r10 - 16) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - 16);" - "*(u64 *)(r10 - 16) = r1;" - "call %[bpf_get_prandom_u32];" - "r1 = *(u64 *)(r10 - 16);" - "r2 = %[dummy_loop_callback];" - "r3 = 0;" - "r4 = 0;" - "call %[bpf_loop];" - "r0 = 0;" - "exit;" - : - : __imm_ptr(dummy_loop_callback), - __imm(bpf_get_smp_processor_id), - __imm(bpf_get_prandom_u32), - __imm(bpf_loop) - : __clobber_common - ); -} - -SEC("raw_tp") -__arch_x86_64 -__log_level(4) -__msg("stack depth 512+0") -/* just to print xlated version when debugging */ -__xlated("r0 = &(void __percpu *)(r0)") -__success -/* cumulative_stack_depth() stack usage is MAX_BPF_STACK, - * called subprogram uses an additional slot for nocsr spill/fill, - * since nocsr spill/fill could be removed the program still fits - * in MAX_BPF_STACK and should be accepted. 
- */ -__naked int cumulative_stack_depth(void) -{ - asm volatile( - "r1 = 42;" - "*(u64 *)(r10 - %[max_bpf_stack]) = r1;" - "call cumulative_stack_depth_subprog;" - "exit;" - : - : __imm_const(max_bpf_stack, MAX_BPF_STACK) - : __clobber_all - ); -} - -__used -__naked static void cumulative_stack_depth_subprog(void) -{ - asm volatile ( - "*(u64 *)(r10 - 8) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - 8);" - "exit;" - :: __imm(bpf_get_smp_processor_id) : __clobber_all); -} - -SEC("raw_tp") -__arch_x86_64 -__log_level(4) -__msg("stack depth 512") -__xlated("0: r1 = 42") -__xlated("1: *(u64 *)(r10 -512) = r1") -__xlated("2: w0 = ") -__xlated("3: r0 = &(void __percpu *)(r0)") -__xlated("4: r0 = *(u32 *)(r0 +0)") -__xlated("5: exit") -__success -__naked int nocsr_max_stack_ok(void) -{ - asm volatile( - "r1 = 42;" - "*(u64 *)(r10 - %[max_bpf_stack]) = r1;" - "*(u64 *)(r10 - %[max_bpf_stack_8]) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - %[max_bpf_stack_8]);" - "exit;" - : - : __imm_const(max_bpf_stack, MAX_BPF_STACK), - __imm_const(max_bpf_stack_8, MAX_BPF_STACK + 8), - __imm(bpf_get_smp_processor_id) - : __clobber_all - ); -} - -SEC("raw_tp") -__arch_x86_64 -__log_level(4) -__msg("stack depth 520") -__failure -__naked int nocsr_max_stack_fail(void) -{ - asm volatile( - "r1 = 42;" - "*(u64 *)(r10 - %[max_bpf_stack]) = r1;" - "*(u64 *)(r10 - %[max_bpf_stack_8]) = r1;" - "call %[bpf_get_smp_processor_id];" - "r1 = *(u64 *)(r10 - %[max_bpf_stack_8]);" - /* call to prandom blocks nocsr rewrite */ - "*(u64 *)(r10 - %[max_bpf_stack_8]) = r1;" - "call %[bpf_get_prandom_u32];" - "r1 = *(u64 *)(r10 - %[max_bpf_stack_8]);" - "exit;" - : - : __imm_const(max_bpf_stack, MAX_BPF_STACK), - __imm_const(max_bpf_stack_8, MAX_BPF_STACK + 8), - __imm(bpf_get_smp_processor_id), - __imm(bpf_get_prandom_u32) - : __clobber_all - ); -} - -char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From f406026fefa745ff9a3153507cc3b9a87371d954 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 22 Aug 2024 01:41:11 -0700 Subject: selftests/bpf: by default use arch mask allowing all archs If test case does not specify architecture via __arch_* macro consider that it should be run for all architectures. Fixes: 7d743e4c759c ("selftests/bpf: __jited test tag to check disassembly after jit") Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240822084112.3257995-6-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_loader.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index b229dd013355..2ca9b73e5a6b 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -543,7 +543,7 @@ static int parse_test_spec(struct test_loader *tester, } } - spec->arch_mask = arch_mask; + spec->arch_mask = arch_mask ?: -1; if (spec->mode_mask == 0) spec->mode_mask = PRIV; -- cgit v1.2.3 From 8c2e043daadad021fc501ac64cce131f48c3ca46 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 22 Aug 2024 01:41:12 -0700 Subject: selftests/bpf: check if bpf_fastcall is recognized for kfuncs Use kfunc_bpf_cast_to_kern_ctx() and kfunc_bpf_rdonly_cast() to verify that bpf_fastcall pattern is recognized for kfunc calls. 
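The pattern under test is the usual bpf_fastcall spill/fill bracket, just wrapped around a kfunc call instead of a helper. Schematically (illustrative only — "some_kfunc" is a placeholder; the actual tests below use bpf_cast_to_kern_ctx() and bpf_rdonly_cast() and pin exact xlated instruction indexes):

  /* Illustrative shape: r2 is spilled right before the kfunc call
   * and filled right after it from the same stack slot. If the
   * verifier recognizes the callee as bpf_fastcall, both the spill
   * and the fill can be dropped from the xlated program.
   */
  asm volatile (
          "r2 = 1;"
          "*(u64 *)(r10 - 32) = r2;"      /* spill */
          "call %[some_kfunc];"           /* kfunc call */
          "r2 = *(u64 *)(r10 - 32);"      /* fill */
          "r0 = r2;"
          "exit;"
          :
          : __imm(some_kfunc)
          : __clobber_all);
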
Acked-by: Yonghong Song Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240822084112.3257995-7-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/verifier_bpf_fastcall.c | 55 ++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c b/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c index e30ab9fe5096..9da97d2efcd9 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c +++ b/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c @@ -2,8 +2,11 @@ #include #include +#include #include "../../../include/linux/filter.h" #include "bpf_misc.h" +#include +#include "bpf_kfuncs.h" SEC("raw_tp") __arch_x86_64 @@ -842,4 +845,56 @@ __naked int bpf_fastcall_max_stack_fail(void) ); } +SEC("cgroup/getsockname_unix") +__xlated("0: r2 = 1") +/* bpf_cast_to_kern_ctx is replaced by a single assignment */ +__xlated("1: r0 = r1") +__xlated("2: r0 = r2") +__xlated("3: exit") +__success +__naked void kfunc_bpf_cast_to_kern_ctx(void) +{ + asm volatile ( + "r2 = 1;" + "*(u64 *)(r10 - 32) = r2;" + "call %[bpf_cast_to_kern_ctx];" + "r2 = *(u64 *)(r10 - 32);" + "r0 = r2;" + "exit;" + : + : __imm(bpf_cast_to_kern_ctx) + : __clobber_all); +} + +SEC("raw_tp") +__xlated("3: r3 = 1") +/* bpf_rdonly_cast is replaced by a single assignment */ +__xlated("4: r0 = r1") +__xlated("5: r0 = r3") +void kfunc_bpf_rdonly_cast(void) +{ + asm volatile ( + "r2 = %[btf_id];" + "r3 = 1;" + "*(u64 *)(r10 - 32) = r3;" + "call %[bpf_rdonly_cast];" + "r3 = *(u64 *)(r10 - 32);" + "r0 = r3;" + : + : __imm(bpf_rdonly_cast), + [btf_id]"r"(bpf_core_type_id_kernel(union bpf_attr)) + : __clobber_common); +} + +/* BTF FUNC records are not generated for kfuncs referenced + * from inline assembly. These records are necessary for + * libbpf to link the program. The function below is a hack + * to ensure that BTF FUNC records are generated. + */ +void kfunc_root(void) +{ + bpf_cast_to_kern_ctx(0); + bpf_rdonly_cast(0, 0); +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From a0d57c606188ebf106f26951558c3801627dd0a1 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 21 Aug 2024 16:26:26 -0700 Subject: perf annotate-data: Update debug messages In check_matching_type(), it'd be easier to display the typename in question if it's available. For example, check out the line starts with 'chk'. ----------------------------------------------------------- find data type for 0x10(reg0) at cpuacct_charge+0x13 CU for kernel/sched/build_utility.c (die:0x137ee0b) frame base: cfa=1 fbreg=7 scope: [3/3] (die:13d9632) bb: [c - 13] var [c] reg5 type='struct task_struct*' size=0x8 (die:0x1381230) mov [c] 0xdf8(reg5) -> reg0 type='struct css_set*' size=0x8 (die:0x1385c56) chk [13] reg0 offset=0x10 ok=1 kind=1 (struct css_set*) : Good! <<<--- here found by insn track: 0x10(reg0) type-offset=0x10 final result: type='struct css_set' size=0x250 (die:0x1385b0e) Another example: ----------------------------------------------------------- find data type for 0x8(reg0) at menu_select+0x279 CU for drivers/cpuidle/governors/menu.c (die:0x7b0fe79) frame base: cfa=1 fbreg=7 scope: [2/2] (die:7b11010) bb: [273 - 277] bb: [279 - 279] chk [279] reg0 offset=0x8 ok=0 kind=0 cfa : no type information scope: [1/2] (die:7b10cbc) bb: [0 - 64] ... 
mov [26a] imm=0xffffffff -> reg15 bb: [273 - 277] bb: [279 - 279] chk [279] reg0 offset=0x8 ok=1 kind=1 (long long unsigned int) : no/void pointer <<<--- here final result: no/void pointer Also change some places to print negative offsets properly. Before: ----------------------------------------------------------- find data type for 0xffffff40(reg6) at __tcp_transmit_skb+0x58 After: ----------------------------------------------------------- find data type for -0xc0(reg6) at __tcp_transmit_skb+0x58 Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240821232628.353177-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 45 ++++++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 1e8328dde720..7d4b5aaa3b6a 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -960,9 +960,16 @@ static enum type_match_result check_matching_type(struct type_state *state, Dwarf_Word size; u32 insn_offset = dloc->ip - dloc->ms->sym->start; int reg = dloc->op->reg1; + int offset = dloc->op->offset; + const char *offset_sign = ""; - pr_debug_dtp("chk [%x] reg%d offset=%#x ok=%d kind=%d ", - insn_offset, reg, dloc->op->offset, + if (offset < 0) { + offset = -offset; + offset_sign = "-"; + } + + pr_debug_dtp("chk [%x] reg%d offset=%s%#x ok=%d kind=%d ", + insn_offset, reg, offset_sign, offset, state->regs[reg].ok, state->regs[reg].kind); if (!state->regs[reg].ok) @@ -970,6 +977,12 @@ static enum type_match_result check_matching_type(struct type_state *state, if (state->regs[reg].kind == TSR_KIND_TYPE) { Dwarf_Die sized_type; + struct strbuf sb; + + strbuf_init(&sb, 32); + die_get_typename_from_type(&state->regs[reg].type, &sb); + pr_debug_dtp("(%s)", sb.buf); + strbuf_release(&sb); /* * Normal registers should hold a pointer (or array) to @@ -1119,7 +1132,6 @@ check_non_register: check_kernel: if (dso__kernel(map__dso(dloc->ms->map))) { u64 addr; - int offset; /* Direct this-cpu access like "%gs:0x34740" */ if (dloc->op->segment == INSN_SEG_X86_GS && dloc->op->imm && @@ -1271,6 +1283,13 @@ again: cu_die, type_die); if (ret == PERF_TMR_OK) { char buf[64]; + int offset = dloc->op->offset; + const char *offset_sign = ""; + + if (offset < 0) { + offset = -offset; + offset_sign = "-"; + } if (dloc->op->multi_regs) snprintf(buf, sizeof(buf), "reg%d, reg%d", @@ -1278,8 +1297,8 @@ again: else snprintf(buf, sizeof(buf), "reg%d", dloc->op->reg1); - pr_debug_dtp("found by insn track: %#x(%s) type-offset=%#x\n", - dloc->op->offset, buf, dloc->type_offset); + pr_debug_dtp("found by insn track: %s%#x(%s) type-offset=%#x\n", + offset_sign, offset, buf, dloc->type_offset); break; } @@ -1302,7 +1321,7 @@ static int find_data_type_die(struct data_loc_info *dloc, Dwarf_Die *type_die) struct annotated_op_loc *loc = dloc->op; Dwarf_Die cu_die, var_die; Dwarf_Die *scopes = NULL; - int reg, offset; + int reg, offset = loc->offset; int ret = -1; int i, nr_scopes; int fbreg = -1; @@ -1312,6 +1331,7 @@ static int find_data_type_die(struct data_loc_info *dloc, Dwarf_Die *type_die) u64 pc; char buf[64]; enum type_match_result result = PERF_TMR_UNKNOWN; + const char *offset_sign = ""; if (dloc->op->multi_regs) snprintf(buf, sizeof(buf), "reg%d, reg%d", dloc->op->reg1, dloc->op->reg2); @@ -1320,10 +1340,15 
@@ static int find_data_type_die(struct data_loc_info *dloc, Dwarf_Die *type_die) else snprintf(buf, sizeof(buf), "reg%d", dloc->op->reg1); + if (offset < 0) { + offset = -offset; + offset_sign = "-"; + } + pr_debug_dtp("-----------------------------------------------------------\n"); - pr_debug_dtp("find data type for %#x(%s) at %s+%#"PRIx64"\n", - dloc->op->offset, buf, dloc->ms->sym->name, - dloc->ip - dloc->ms->sym->start); + pr_debug_dtp("find data type for %s%#x(%s) at %s+%#"PRIx64"\n", + offset_sign, offset, buf, + dloc->ms->sym->name, dloc->ip - dloc->ms->sym->start); /* * IP is a relative instruction address from the start of the map, as @@ -1453,8 +1478,8 @@ retry: } out: + pr_debug_dtp("final result: "); if (found) { - pr_debug_dtp("final type:"); pr_debug_type_name(type_die, TSR_KIND_TYPE); ret = 0; } else { -- cgit v1.2.3 From 895891dad7353d6058a8e5c499cf814fe7de9388 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 21 Aug 2024 16:26:27 -0700 Subject: perf annotate-data: Update stack slot for the store When checking the match variable at the target instruction, it might not have any information if it's a first write to a stack slot. In this case it could spill a register value into the stack so the type info is in the source operand. But currently it's hard to get the operand from the checking function. Let's process the instruction and retry to get the type info from the stack if there's no information already. This is an example of __tcp_transmit_skb(). The instructions are <__tcp_transmit_skb>: 0: nopl 0x0(%rax, %rax, 1) 5: push %rbp 6: mov %rsp, %rbp 9: push %r15 b: push %r14 d: push %r13 f: push %r12 11: push %rbx 12: sub $0x98, %rsp 19: mov %r8d, -0xa8(%rbp) ... It cannot find any variable at -0xa8(%rbp) at this point. ----------------------------------------------------------- find data type for -0xa8(reg6) at __tcp_transmit_skb+0x19 CU for net/ipv4/tcp_output.c (die:0x817f543) frame base: cfa=0 fbreg=6 scope: [1/1] (die:81aac3e) bb: [0 - 19] var [0] -0x98(stack) type='struct tcp_out_options' size=0x28 (die:0x81af3df) var [5] reg8 type='unsigned int' size=0x4 (die:0x8180ed6) var [5] reg2 type='unsigned int' size=0x4 (die:0x8180ed6) var [5] reg1 type='int' size=0x4 (die:0x818059e) var [5] reg4 type='struct sk_buff*' size=0x8 (die:0x8181360) var [5] reg5 type='struct sock*' size=0x8 (die:0x8181a0c) chk [19] reg6 offset=-0xa8 ok=0 kind=0 fbreg : no type information no type information And it was able to find the type after processing the 'mov' instruction. ----------------------------------------------------------- find data type for -0xa8(reg6) at __tcp_transmit_skb+0x19 CU for net/ipv4/tcp_output.c (die:0x817f543) frame base: cfa=0 fbreg=6 scope: [1/1] (die:81aac3e) bb: [0 - 19] var [0] -0x98(stack) type='struct tcp_out_options' size=0x28 (die:0x81af3df) var [5] reg8 type='unsigned int' size=0x4 (die:0x8180ed6) var [5] reg2 type='unsigned int' size=0x4 (die:0x8180ed6) var [5] reg1 type='int' size=0x4 (die:0x818059e) var [5] reg4 type='struct sk_buff*' size=0x8 (die:0x8181360) var [5] reg5 type='struct sock*' size=0x8 (die:0x8181a0c) chk [19] reg6 offset=-0xa8 ok=0 kind=0 fbreg : retry <<<--- here mov [19] reg8 -> -0xa8(stack) type='unsigned int' size=0x4 (die:0x8180ed6) chk [19] reg6 offset=-0xa8 ok=0 kind=0 fbreg : Good! 
found by insn track: -0xa8(reg6) type-offset=0 final result: type='unsigned int' size=0x4 (die:0x8180ed6) Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240821232628.353177-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 7d4b5aaa3b6a..5d23c30b3a7b 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -955,19 +955,22 @@ static void setup_stack_canary(struct data_loc_info *dloc) static enum type_match_result check_matching_type(struct type_state *state, struct data_loc_info *dloc, Dwarf_Die *cu_die, + struct disasm_line *dl, Dwarf_Die *type_die) { Dwarf_Word size; - u32 insn_offset = dloc->ip - dloc->ms->sym->start; + u32 insn_offset = dl->al.offset; int reg = dloc->op->reg1; int offset = dloc->op->offset; const char *offset_sign = ""; + bool retry = true; if (offset < 0) { offset = -offset; offset_sign = "-"; } +again: pr_debug_dtp("chk [%x] reg%d offset=%s%#x ok=%d kind=%d ", insn_offset, reg, offset_sign, offset, state->regs[reg].ok, state->regs[reg].kind); @@ -1079,8 +1082,17 @@ check_non_register: pr_debug_dtp("fbreg"); stack = find_stack_state(state, dloc->type_offset); - if (stack == NULL) + if (stack == NULL) { + if (retry) { + pr_debug_dtp(" : retry\n"); + retry = false; + + /* update type info it's the first store to the stack */ + update_insn_state(state, dloc, cu_die, dl); + goto again; + } return PERF_TMR_NO_TYPE; + } if (stack->kind == TSR_KIND_CANARY) { setup_stack_canary(dloc); @@ -1111,8 +1123,17 @@ check_non_register: return PERF_TMR_NO_TYPE; stack = find_stack_state(state, dloc->type_offset - fboff); - if (stack == NULL) + if (stack == NULL) { + if (retry) { + pr_debug_dtp(" : retry\n"); + retry = false; + + /* update type info it's the first store to the stack */ + update_insn_state(state, dloc, cu_die, dl); + goto again; + } return PERF_TMR_NO_TYPE; + } if (stack->kind == TSR_KIND_CANARY) { setup_stack_canary(dloc); @@ -1202,7 +1223,7 @@ static enum type_match_result find_data_type_insn(struct data_loc_info *dloc, if (this_ip == dloc->ip) { ret = check_matching_type(&state, dloc, - cu_die, type_die); + cu_die, dl, type_die); pr_debug_dtp(" : %s\n", match_result_str(ret)); goto out; } -- cgit v1.2.3 From 1cfd01eb602d73b92df2ffc24196cd0a3dc3efb2 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 21 Aug 2024 16:26:28 -0700 Subject: perf annotate-data: Copy back variable types after move In some cases, compilers don't set the location expression in DWARF precisely. For instance, it may assign a variable to a register after copying it from a different register. Then it should use the register for the new type but still uses the old register. This makes hard to track the type information properly. This is an example I found in __tcp_transmit_skb(). The first argument (sk) of this function is a pointer to sock and there's a variable (tp) for tcp_sock. static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask, u32 rcv_nxt) { ... struct tcp_sock *tp; BUG_ON(!skb || !tcp_skb_pcount(skb)); tp = tcp_sk(sk); prior_wstamp = tp->tcp_wstamp_ns; tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache); ... 
So it basically calls tcp_sk(sk) to get the tcp_sock pointer from sk. But it turned out to be the same value because tcp_sock embeds sock as the first member. The sk is located in reg5 (RDI) and tp is in reg3 (RBX). The offset of tcp_wstamp_ns is 0x748 and tcp_clock_cache is 0x750. So you need to use RBX (reg3) to access the fields in the tcp_sock. But the code used RDI (reg5) as it has the same value. $ pahole --hex -C tcp_sock vmlinux | grep -e 748 -e 750 u64 tcp_wstamp_ns; /* 0x748 0x8 */ u64 tcp_clock_cache; /* 0x750 0x8 */ And this is the disassembly of the part of the function. <__tcp_transmit_skb>: ... 44: mov %rdi, %rbx 47: mov 0x748(%rdi), %rsi 4e: mov 0x750(%rdi), %rax 55: cmp %rax, %rsi Because compiler put the debug info to RBX, it only knows RDI is a pointer to sock and accessing those two fields resulted in error due to offset being beyond the type size. ----------------------------------------------------------- find data type for 0x748(reg5) at __tcp_transmit_skb+0x63 CU for net/ipv4/tcp_output.c (die:0x817f543) frame base: cfa=0 fbreg=6 scope: [1/1] (die:81aac3e) bb: [0 - 30] var [0] -0x98(stack) type='struct tcp_out_options' size=0x28 (die:0x81af3df) var [5] reg8 type='unsigned int' size=0x4 (die:0x8180ed6) var [5] reg2 type='unsigned int' size=0x4 (die:0x8180ed6) var [5] reg1 type='int' size=0x4 (die:0x818059e) var [5] reg4 type='struct sk_buff*' size=0x8 (die:0x8181360) var [5] reg5 type='struct sock*' size=0x8 (die:0x8181a0c) <<<--- the first argument ('sk' at %RDI) mov [19] reg8 -> -0xa8(stack) type='unsigned int' size=0x4 (die:0x8180ed6) mov [20] stack canary -> reg0 mov [29] reg0 -> -0x30(stack) stack canary bb: [36 - 3e] mov [36] reg4 -> reg15 type='struct sk_buff*' size=0x8 (die:0x8181360) bb: [44 - 63] mov [44] reg5 -> reg3 type='struct sock*' size=0x8 (die:0x8181a0c) <<<--- calling tcp_sk() var [47] reg3 type='struct tcp_sock*' size=0x8 (die:0x819eead) <<<--- new variable ('tp' at %RBX) var [4e] reg4 type='unsigned long long' size=0x8 (die:0x8180edd) mov [58] reg4 -> -0xc0(stack) type='unsigned long long' size=0x8 (die:0x8180edd) chk [63] reg5 offset=0x748 ok=1 kind=1 (struct sock*) : offset bigger than size <<<--- access with old variable final result: offset bigger than size While it's a fault in the compiler, we could work around this issue by using the type of new variable when it's copied directly. So I've added copied_from field in the register state to track those direct register to register copies. After that new register gets a new type and the old register still has the same type, it'll update (copy it back) the type of the old register. For example, if we can update type of reg5 at __tcp_transmit_skb+0x47, we can find the target type of the instruction at 0x63 like below: ----------------------------------------------------------- find data type for 0x748(reg5) at __tcp_transmit_skb+0x63 ... bb: [44 - 63] mov [44] reg5 -> reg3 type='struct sock*' size=0x8 (die:0x8181a0c) var [47] reg3 type='struct tcp_sock*' size=0x8 (die:0x819eead) var [47] copyback reg5 type='struct tcp_sock*' size=0x8 (die:0x819eead) <<<--- here mov [47] 0x748(reg5) -> reg4 type='unsigned long long' size=0x8 (die:0x8180edd) mov [4e] 0x750(reg5) -> reg0 type='unsigned long long' size=0x8 (die:0x8180edd) mov [58] reg4 -> -0xc0(stack) type='unsigned long long' size=0x8 (die:0x8180edd) chk [63] reg5 offset=0x748 ok=1 kind=1 (struct tcp_sock*) : Good! 
<<<--- new type found by insn track: 0x748(reg5) type-offset=0x748 final result: type='struct tcp_sock' size=0xa98 (die:0x819eeb2) Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Athira Rajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240821232628.353177-5-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/annotate/instructions.c | 8 ++++++++ tools/perf/util/annotate-data.c | 31 +++++++++++++++++++++++++++++ tools/perf/util/annotate-data.h | 1 + 3 files changed, 40 insertions(+) (limited to 'tools') diff --git a/tools/perf/arch/x86/annotate/instructions.c b/tools/perf/arch/x86/annotate/instructions.c index 15dfc2988e24..5caf5a17f03d 100644 --- a/tools/perf/arch/x86/annotate/instructions.c +++ b/tools/perf/arch/x86/annotate/instructions.c @@ -267,6 +267,7 @@ static void update_insn_state_x86(struct type_state *state, return; tsr = &state->regs[dst->reg1]; + tsr->copied_from = -1; if (src->imm) imm_value = src->offset; @@ -326,6 +327,8 @@ static void update_insn_state_x86(struct type_state *state, return; tsr = &state->regs[dst->reg1]; + tsr->copied_from = -1; + if (dso__kernel(map__dso(dloc->ms->map)) && src->segment == INSN_SEG_X86_GS && src->imm) { u64 ip = dloc->ms->sym->start + dl->al.offset; @@ -386,6 +389,10 @@ static void update_insn_state_x86(struct type_state *state, tsr->imm_value = state->regs[src->reg1].imm_value; tsr->ok = true; + /* To copy back the variable type later (hopefully) */ + if (tsr->kind == TSR_KIND_TYPE) + tsr->copied_from = src->reg1; + pr_debug_dtp("mov [%x] reg%d -> reg%d", insn_offset, src->reg1, dst->reg1); pr_debug_type_name(&tsr->type, tsr->kind); @@ -398,6 +405,7 @@ static void update_insn_state_x86(struct type_state *state, return; tsr = &state->regs[dst->reg1]; + tsr->copied_from = -1; retry: /* Check stack variables with offset */ diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 5d23c30b3a7b..a0ea4e07e570 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -774,6 +774,11 @@ ok: return true; } +static bool die_is_same(Dwarf_Die *die_a, Dwarf_Die *die_b) +{ + return (die_a->cu == die_b->cu) && (die_a->addr == die_b->addr); +} + /** * update_var_state - Update type state using given variables * @state: type state table @@ -825,6 +830,7 @@ static void update_var_state(struct type_state *state, struct data_loc_info *dlo pr_debug_type_name(&mem_die, TSR_KIND_TYPE); } else if (has_reg_type(state, var->reg) && var->offset == 0) { struct type_state_reg *reg; + Dwarf_Die orig_type; reg = &state->regs[var->reg]; @@ -832,6 +838,8 @@ static void update_var_state(struct type_state *state, struct data_loc_info *dlo !is_better_type(®->type, &mem_die)) continue; + orig_type = reg->type; + reg->type = mem_die; reg->kind = TSR_KIND_TYPE; reg->ok = true; @@ -839,6 +847,29 @@ static void update_var_state(struct type_state *state, struct data_loc_info *dlo pr_debug_dtp("var [%"PRIx64"] reg%d", insn_offset, var->reg); pr_debug_type_name(&mem_die, TSR_KIND_TYPE); + + /* + * If this register is directly copied from another and it gets a + * better type, also update the type of the source register. This + * is usually the case of container_of() macro with offset of 0. 
+ */ + if (has_reg_type(state, reg->copied_from)) { + struct type_state_reg *copy_reg; + + copy_reg = &state->regs[reg->copied_from]; + + /* TODO: check if type is compatible or embedded */ + if (!copy_reg->ok || (copy_reg->kind != TSR_KIND_TYPE) || + !die_is_same(©_reg->type, &orig_type) || + !is_better_type(©_reg->type, &mem_die)) + continue; + + copy_reg->type = mem_die; + + pr_debug_dtp("var [%"PRIx64"] copyback reg%d", + insn_offset, reg->copied_from); + pr_debug_type_name(&mem_die, TSR_KIND_TYPE); + } } } } diff --git a/tools/perf/util/annotate-data.h b/tools/perf/util/annotate-data.h index 37a1a3b68e0b..8ac0fd94a0ba 100644 --- a/tools/perf/util/annotate-data.h +++ b/tools/perf/util/annotate-data.h @@ -176,6 +176,7 @@ struct type_state_reg { bool ok; bool caller_saved; u8 kind; + u8 copied_from; }; /* Type information in a stack location, dynamically allocated */ -- cgit v1.2.3 From b81162302001f41157f6e93654aaccc30e817e2a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 22 Aug 2024 14:13:49 -0300 Subject: perf python: Allow checking for the existence of warning options in clang We'll need to check if an warning option introduced in clang 19 is available on the clang version being used, so cover the error message emitted when testing for a -W option. Tested-by: Sedat Dilek Cc: Ian Rogers Cc: Ingo Molnar Cc: Namhyung Kim Cc: Nathan Chancellor Cc: Peter Zijlstra Link: https://lore.kernel.org/lkml/CA+icZUVtHn8X1Tb_Y__c-WswsO0K8U9uy3r2MzKXwTA5THtL7w@mail.gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/setup.py b/tools/perf/util/setup.py index 142e9d447ce7..26c0f2614fe9 100644 --- a/tools/perf/util/setup.py +++ b/tools/perf/util/setup.py @@ -17,7 +17,7 @@ src_feature_tests = getenv('srctree') + '/tools/build/feature' def clang_has_option(option): cc_output = Popen([cc, cc_options + option, path.join(src_feature_tests, "test-hello.c") ], stderr=PIPE).stderr.readlines() - return [o for o in cc_output if ((b"unknown argument" in o) or (b"is not supported" in o))] == [ ] + return [o for o in cc_output if ((b"unknown argument" in o) or (b"is not supported" in o) or (b"unknown warning option" in o))] == [ ] if cc_is_clang: from sysconfig import get_config_vars -- cgit v1.2.3 From 24a7e944966cc8a285e6581dcc98edebeee76c97 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 16 Aug 2024 15:01:38 +0200 Subject: KVM: selftests: Move Hyper-V specific functions out of processor.c Since there is 'hyperv.c' for Hyper-V specific functions already, move Hyper-V specific functions out of processor.c there. No functional change intended. 
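Only the include changes for callers: the helpers keep their names and signatures but are now declared in hyperv.h (which is why xen_vmcall_test.c below gains that include). A minimal, hypothetical caller as an illustration (guest_code and the test body are placeholders, not from this patch):

    #include "kvm_util.h"
    #include "processor.h"
    #include "hyperv.h"  /* now declares kvm_get_supported_hv_cpuid() et al. */

    static void guest_code(void)
    {
        /* placeholder guest, never run in this sketch */
    }

    int main(int argc, char *argv[])
    {
        struct kvm_vcpu *vcpu;
        struct kvm_vm *vm = vm_create_with_one_vcpu(&vcpu, guest_code);

        vcpu_set_hv_cpuid(vcpu);  /* lives in lib/x86_64/hyperv.c after this move */
        kvm_vm_free(vm);
        return 0;
    }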
Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20240816130139.286246-2-vkuznets@redhat.com Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/include/x86_64/hyperv.h | 4 ++ .../selftests/kvm/include/x86_64/processor.h | 7 +-- tools/testing/selftests/kvm/lib/x86_64/hyperv.c | 59 +++++++++++++++++++++ tools/testing/selftests/kvm/lib/x86_64/processor.c | 61 ---------------------- .../testing/selftests/kvm/x86_64/xen_vmcall_test.c | 1 + 5 files changed, 68 insertions(+), 64 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/hyperv.h b/tools/testing/selftests/kvm/include/x86_64/hyperv.h index fa65b908b13e..a2e7cf7ee0ad 100644 --- a/tools/testing/selftests/kvm/include/x86_64/hyperv.h +++ b/tools/testing/selftests/kvm/include/x86_64/hyperv.h @@ -343,4 +343,8 @@ struct hyperv_test_pages *vcpu_alloc_hyperv_test_pages(struct kvm_vm *vm, /* HV_X64_MSR_TSC_INVARIANT_CONTROL bits */ #define HV_INVARIANT_TSC_EXPOSED BIT_ULL(0) +const struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void); +const struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu); +void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu); + #endif /* !SELFTEST_KVM_HYPERV_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index a0c1440017bb..e247f99e0473 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -25,6 +25,10 @@ extern bool host_cpu_is_intel; extern bool host_cpu_is_amd; extern uint64_t guest_tsc_khz; +#ifndef MAX_NR_CPUID_ENTRIES +#define MAX_NR_CPUID_ENTRIES 100 +#endif + /* Forced emulation prefix, used to invoke the emulator unconditionally. */ #define KVM_FEP "ud2; .byte 'k', 'v', 'm';" @@ -908,8 +912,6 @@ static inline void vcpu_xcrs_set(struct kvm_vcpu *vcpu, struct kvm_xcrs *xcrs) const struct kvm_cpuid_entry2 *get_cpuid_entry(const struct kvm_cpuid2 *cpuid, uint32_t function, uint32_t index); const struct kvm_cpuid2 *kvm_get_supported_cpuid(void); -const struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void); -const struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu); static inline uint32_t kvm_cpu_fms(void) { @@ -1009,7 +1011,6 @@ static inline struct kvm_cpuid2 *allocate_kvm_cpuid2(int nr_entries) } void vcpu_init_cpuid(struct kvm_vcpu *vcpu, const struct kvm_cpuid2 *cpuid); -void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu); static inline struct kvm_cpuid_entry2 *__vcpu_get_cpuid_entry(struct kvm_vcpu *vcpu, uint32_t function, diff --git a/tools/testing/selftests/kvm/lib/x86_64/hyperv.c b/tools/testing/selftests/kvm/lib/x86_64/hyperv.c index efb7e7a1354d..b4a5e4ad7105 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/hyperv.c +++ b/tools/testing/selftests/kvm/lib/x86_64/hyperv.c @@ -8,6 +8,65 @@ #include "processor.h" #include "hyperv.h" +const struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void) +{ + static struct kvm_cpuid2 *cpuid; + int kvm_fd; + + if (cpuid) + return cpuid; + + cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); + kvm_fd = open_kvm_dev_path_or_exit(); + + kvm_ioctl(kvm_fd, KVM_GET_SUPPORTED_HV_CPUID, cpuid); + + close(kvm_fd); + return cpuid; +} + +void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu) +{ + static struct kvm_cpuid2 *cpuid_full; + const struct kvm_cpuid2 *cpuid_sys, *cpuid_hv; + int i, nent = 0; + + if (!cpuid_full) { + cpuid_sys = kvm_get_supported_cpuid(); + cpuid_hv = kvm_get_supported_hv_cpuid(); + + cpuid_full = 
allocate_kvm_cpuid2(cpuid_sys->nent + cpuid_hv->nent); + if (!cpuid_full) { + perror("malloc"); + abort(); + } + + /* Need to skip KVM CPUID leaves 0x400000xx */ + for (i = 0; i < cpuid_sys->nent; i++) { + if (cpuid_sys->entries[i].function >= 0x40000000 && + cpuid_sys->entries[i].function < 0x40000100) + continue; + cpuid_full->entries[nent] = cpuid_sys->entries[i]; + nent++; + } + + memcpy(&cpuid_full->entries[nent], cpuid_hv->entries, + cpuid_hv->nent * sizeof(struct kvm_cpuid_entry2)); + cpuid_full->nent = nent + cpuid_hv->nent; + } + + vcpu_init_cpuid(vcpu, cpuid_full); +} + +const struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid2 *cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); + + vcpu_ioctl(vcpu, KVM_GET_SUPPORTED_HV_CPUID, cpuid); + + return cpuid; +} + struct hyperv_test_pages *vcpu_alloc_hyperv_test_pages(struct kvm_vm *vm, vm_vaddr_t *p_hv_pages_gva) { diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 153739f2e201..7876f052ca39 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -19,8 +19,6 @@ #define KERNEL_DS 0x10 #define KERNEL_TSS 0x18 -#define MAX_NR_CPUID_ENTRIES 100 - vm_vaddr_t exception_handlers; bool host_cpu_is_amd; bool host_cpu_is_intel; @@ -1195,65 +1193,6 @@ void xen_hypercall(uint64_t nr, uint64_t a0, void *a1) GUEST_ASSERT(!__xen_hypercall(nr, a0, a1)); } -const struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void) -{ - static struct kvm_cpuid2 *cpuid; - int kvm_fd; - - if (cpuid) - return cpuid; - - cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); - kvm_fd = open_kvm_dev_path_or_exit(); - - kvm_ioctl(kvm_fd, KVM_GET_SUPPORTED_HV_CPUID, cpuid); - - close(kvm_fd); - return cpuid; -} - -void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu) -{ - static struct kvm_cpuid2 *cpuid_full; - const struct kvm_cpuid2 *cpuid_sys, *cpuid_hv; - int i, nent = 0; - - if (!cpuid_full) { - cpuid_sys = kvm_get_supported_cpuid(); - cpuid_hv = kvm_get_supported_hv_cpuid(); - - cpuid_full = allocate_kvm_cpuid2(cpuid_sys->nent + cpuid_hv->nent); - if (!cpuid_full) { - perror("malloc"); - abort(); - } - - /* Need to skip KVM CPUID leaves 0x400000xx */ - for (i = 0; i < cpuid_sys->nent; i++) { - if (cpuid_sys->entries[i].function >= 0x40000000 && - cpuid_sys->entries[i].function < 0x40000100) - continue; - cpuid_full->entries[nent] = cpuid_sys->entries[i]; - nent++; - } - - memcpy(&cpuid_full->entries[nent], cpuid_hv->entries, - cpuid_hv->nent * sizeof(struct kvm_cpuid_entry2)); - cpuid_full->nent = nent + cpuid_hv->nent; - } - - vcpu_init_cpuid(vcpu, cpuid_full); -} - -const struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid2 *cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); - - vcpu_ioctl(vcpu, KVM_GET_SUPPORTED_HV_CPUID, cpuid); - - return cpuid; -} - unsigned long vm_compute_max_gfn(struct kvm_vm *vm) { const unsigned long num_ht_pages = 12 << (30 - vm->page_shift); /* 12 GiB */ diff --git a/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c b/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c index e149d0574961..2585087cdf5c 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c @@ -10,6 +10,7 @@ #include "test_util.h" #include "kvm_util.h" #include "processor.h" +#include "hyperv.h" #define HCALL_REGION_GPA 0xc0000000ULL #define HCALL_REGION_SLOT 10 -- cgit v1.2.3 From 
d8414067cc17bd2070e6667763124754e2932251 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 16 Aug 2024 15:01:39 +0200 Subject: KVM: selftests: Re-enable hyperv_evmcs/hyperv_svm_test on bare metal KVM_CAP_HYPERV_DIRECT_TLBFLUSH is only reported when KVM runs on top of Hyper-V and hyperv_evmcs/hyperv_svm_test don't need that, these tests check that the feature is properly emulated for Hyper-V on KVM guests. There's no corresponding CAP for that, the feature is reported in KVM_GET_SUPPORTED_HV_CPUID. Hyper-V specific CPUIDs are not reported by KVM_GET_SUPPORTED_CPUID, implement dedicated kvm_hv_cpu_has() helper to do the job. Fixes: 6dac1195181c ("KVM: selftests: Make Hyper-V tests explicitly require KVM Hyper-V support") Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20240816130139.286246-3-vkuznets@redhat.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/x86_64/hyperv.h | 14 ++++++++++++++ tools/testing/selftests/kvm/lib/x86_64/hyperv.c | 8 ++++++++ tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c | 2 +- tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c | 2 +- 4 files changed, 24 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/hyperv.h b/tools/testing/selftests/kvm/include/x86_64/hyperv.h index a2e7cf7ee0ad..6849e2552f1b 100644 --- a/tools/testing/selftests/kvm/include/x86_64/hyperv.h +++ b/tools/testing/selftests/kvm/include/x86_64/hyperv.h @@ -186,6 +186,18 @@ #define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED \ KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 14) +/* HYPERV_CPUID_NESTED_FEATURES.EAX */ +#define HV_X64_NESTED_DIRECT_FLUSH \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_NESTED_FEATURES, 0, EAX, 17) +#define HV_X64_NESTED_GUEST_MAPPING_FLUSH \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_NESTED_FEATURES, 0, EAX, 18) +#define HV_X64_NESTED_MSR_BITMAP \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_NESTED_FEATURES, 0, EAX, 19) + +/* HYPERV_CPUID_NESTED_FEATURES.EBX */ +#define HV_X64_NESTED_EVMCS1_PERF_GLOBAL_CTRL \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_NESTED_FEATURES, 0, EBX, 0) + /* HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX */ #define HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING \ KVM_X86_CPU_FEATURE(HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES, 0, EAX, 1) @@ -347,4 +359,6 @@ const struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void); const struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu); void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu); +bool kvm_hv_cpu_has(struct kvm_x86_cpu_feature feature); + #endif /* !SELFTEST_KVM_HYPERV_H */ diff --git a/tools/testing/selftests/kvm/lib/x86_64/hyperv.c b/tools/testing/selftests/kvm/lib/x86_64/hyperv.c index b4a5e4ad7105..15bc8cd583aa 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/hyperv.c +++ b/tools/testing/selftests/kvm/lib/x86_64/hyperv.c @@ -67,6 +67,14 @@ const struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu) return cpuid; } +bool kvm_hv_cpu_has(struct kvm_x86_cpu_feature feature) +{ + if (!kvm_has_cap(KVM_CAP_SYS_HYPERV_CPUID)) + return false; + + return kvm_cpuid_has(kvm_get_supported_hv_cpuid(), feature); +} + struct hyperv_test_pages *vcpu_alloc_hyperv_test_pages(struct kvm_vm *vm, vm_vaddr_t *p_hv_pages_gva) { diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c b/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c index e192720bfe14..74cf19661309 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c @@ -242,7 +242,7 @@ int 
main(int argc, char *argv[]) TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE)); TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)); - TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_DIRECT_TLBFLUSH)); + TEST_REQUIRE(kvm_hv_cpu_has(HV_X64_NESTED_DIRECT_FLUSH)); vm = vm_create_with_one_vcpu(&vcpu, guest_code); diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c index b987a3d79715..0ddb63229bcb 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c @@ -157,7 +157,7 @@ int main(int argc, char *argv[]) int stage; TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); - TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_DIRECT_TLBFLUSH)); + TEST_REQUIRE(kvm_hv_cpu_has(HV_X64_NESTED_DIRECT_FLUSH)); /* Create VM */ vm = vm_create_with_one_vcpu(&vcpu, guest_code); -- cgit v1.2.3 From 00dc514612fe98cfa117193b9df28f15e7c9db9c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 22 Aug 2024 14:13:49 -0300 Subject: perf python: Disable -Wno-cast-function-type-mismatch if present on clang The -Wcast-function-type-mismatch option was introduced in clang 19 and its enabled by default, since we use -Werror, and python bindings do casts that are valid but trips this warning, disable it if present. Closes: https://lore.kernel.org/all/CA+icZUXoJ6BS3GMhJHV3aZWyb5Cz2haFneX0C5pUMUUhG-UVKQ@mail.gmail.com Reported-by: Sedat Dilek Tested-by: Sedat Dilek Cc: Ian Rogers Cc: Ingo Molnar Cc: Namhyung Kim Cc: Nathan Chancellor Cc: Peter Zijlstra Cc: stable@vger.kernel.org # To allow building with the upcoming clang 19 Link: https://lore.kernel.org/lkml/CA+icZUVtHn8X1Tb_Y__c-WswsO0K8U9uy3r2MzKXwTA5THtL7w@mail.gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/setup.py | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/setup.py b/tools/perf/util/setup.py index 26c0f2614fe9..649550e9b7aa 100644 --- a/tools/perf/util/setup.py +++ b/tools/perf/util/setup.py @@ -63,6 +63,8 @@ cflags = getenv('CFLAGS', '').split() cflags += ['-fno-strict-aliasing', '-Wno-write-strings', '-Wno-unused-parameter', '-Wno-redundant-decls' ] if cc_is_clang: cflags += ["-Wno-unused-command-line-argument" ] + if clang_has_option("-Wno-cast-function-type-mismatch"): + cflags += ["-Wno-cast-function-type-mismatch" ] else: cflags += ['-Wno-cast-function-type' ] -- cgit v1.2.3 From 7559a7a84ef83a2dd86caf623430b8d834843cec Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Sun, 28 Jul 2024 19:46:12 +0800 Subject: selftests/bpf: Add testcase for updating attached freplace prog to prog_array map Add a selftest to confirm the issue, which gets -EINVAL when update attached freplace prog to prog_array map, has been fixed. 
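For context on the asserted return value: the freplace program added below tail-calls itself through its own prog_array, so the counter is bumped once on the initial entry and once per tail call until the kernel's tail-call budget (MAX_TAIL_CALL_CNT, 33) is exhausted. An annotated restatement of that body:

    int count = 0;

    SEC("freplace")
    int entry_freplace(struct __sk_buff *skb)
    {
        count++;                                  /* 1 initial entry + 33 tail calls */
        bpf_tail_call_static(skb, &jmp_table, 0); /* re-enters this same program until
                                                     the tail-call budget runs out */
        return count;                             /* hence the expected retval of 34 */
    }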
cd tools/testing/selftests/bpf; ./test_progs -t tailcalls 328/25 tailcalls/tailcall_freplace:OK 328 tailcalls:OK Summary: 1/25 PASSED, 0 SKIPPED, 0 FAILED Acked-by: Yonghong Song Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20240728114612.48486-3-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/tailcalls.c | 65 +++++++++++++++++++++- .../selftests/bpf/progs/tailcall_freplace.c | 23 ++++++++ tools/testing/selftests/bpf/progs/tc_bpf2bpf.c | 22 ++++++++ 3 files changed, 109 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/progs/tailcall_freplace.c create mode 100644 tools/testing/selftests/bpf/progs/tc_bpf2bpf.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/tailcalls.c b/tools/testing/selftests/bpf/prog_tests/tailcalls.c index e01fabb8cc41..21c5a37846ad 100644 --- a/tools/testing/selftests/bpf/prog_tests/tailcalls.c +++ b/tools/testing/selftests/bpf/prog_tests/tailcalls.c @@ -5,7 +5,8 @@ #include "tailcall_poke.skel.h" #include "tailcall_bpf2bpf_hierarchy2.skel.h" #include "tailcall_bpf2bpf_hierarchy3.skel.h" - +#include "tailcall_freplace.skel.h" +#include "tc_bpf2bpf.skel.h" /* test_tailcall_1 checks basic functionality by patching multiple locations * in a single program for a single tail call slot with nop->jmp, jmp->nop @@ -1495,6 +1496,66 @@ static void test_tailcall_bpf2bpf_hierarchy_3(void) RUN_TESTS(tailcall_bpf2bpf_hierarchy3); } +/* test_tailcall_freplace checks that the attached freplace prog is OK to + * update the prog_array map. + */ +static void test_tailcall_freplace(void) +{ + struct tailcall_freplace *freplace_skel = NULL; + struct bpf_link *freplace_link = NULL; + struct bpf_program *freplace_prog; + struct tc_bpf2bpf *tc_skel = NULL; + int prog_fd, map_fd; + char buff[128] = {}; + int err, key; + + LIBBPF_OPTS(bpf_test_run_opts, topts, + .data_in = buff, + .data_size_in = sizeof(buff), + .repeat = 1, + ); + + freplace_skel = tailcall_freplace__open(); + if (!ASSERT_OK_PTR(freplace_skel, "tailcall_freplace__open")) + return; + + tc_skel = tc_bpf2bpf__open_and_load(); + if (!ASSERT_OK_PTR(tc_skel, "tc_bpf2bpf__open_and_load")) + goto out; + + prog_fd = bpf_program__fd(tc_skel->progs.entry_tc); + freplace_prog = freplace_skel->progs.entry_freplace; + err = bpf_program__set_attach_target(freplace_prog, prog_fd, "subprog"); + if (!ASSERT_OK(err, "set_attach_target")) + goto out; + + err = tailcall_freplace__load(freplace_skel); + if (!ASSERT_OK(err, "tailcall_freplace__load")) + goto out; + + freplace_link = bpf_program__attach_freplace(freplace_prog, prog_fd, + "subprog"); + if (!ASSERT_OK_PTR(freplace_link, "attach_freplace")) + goto out; + + map_fd = bpf_map__fd(freplace_skel->maps.jmp_table); + prog_fd = bpf_program__fd(freplace_prog); + key = 0; + err = bpf_map_update_elem(map_fd, &key, &prog_fd, BPF_ANY); + if (!ASSERT_OK(err, "update jmp_table")) + goto out; + + prog_fd = bpf_program__fd(tc_skel->progs.entry_tc); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "test_run"); + ASSERT_EQ(topts.retval, 34, "test_run retval"); + +out: + bpf_link__destroy(freplace_link); + tc_bpf2bpf__destroy(tc_skel); + tailcall_freplace__destroy(freplace_skel); +} + void test_tailcalls(void) { if (test__start_subtest("tailcall_1")) @@ -1543,4 +1604,6 @@ void test_tailcalls(void) test_tailcall_bpf2bpf_hierarchy_fentry_entry(); test_tailcall_bpf2bpf_hierarchy_2(); test_tailcall_bpf2bpf_hierarchy_3(); + if (test__start_subtest("tailcall_freplace")) + 
test_tailcall_freplace(); } diff --git a/tools/testing/selftests/bpf/progs/tailcall_freplace.c b/tools/testing/selftests/bpf/progs/tailcall_freplace.c new file mode 100644 index 000000000000..6713b809df44 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall_freplace.c @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} jmp_table SEC(".maps"); + +int count = 0; + +SEC("freplace") +int entry_freplace(struct __sk_buff *skb) +{ + count++; + bpf_tail_call_static(skb, &jmp_table, 0); + return count; +} + +char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/tc_bpf2bpf.c b/tools/testing/selftests/bpf/progs/tc_bpf2bpf.c new file mode 100644 index 000000000000..8a0632c37839 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tc_bpf2bpf.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "bpf_misc.h" + +__noinline +int subprog(struct __sk_buff *skb) +{ + int ret = 1; + + __sink(ret); + return ret; +} + +SEC("tc") +int entry_tc(struct __sk_buff *skb) +{ + return subprog(skb); +} + +char __license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 1820b84f3c61fbe5d9cf82a09cb9733a23cd463c Mon Sep 17 00:00:00 2001 From: Abhinav Jain Date: Wed, 21 Aug 2024 22:49:01 +0530 Subject: selftests: net: Create veth pair for testing in networkless kernel Check if the netdev list is empty and create veth pair to be used for feature on/off testing. Remove the veth pair after testing is complete. Signed-off-by: Abhinav Jain Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240821171903.118324-2-jain.abhinav177@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/netdevice.sh | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/netdevice.sh b/tools/testing/selftests/net/netdevice.sh index e3afcb424710..999d72b6670c 100755 --- a/tools/testing/selftests/net/netdevice.sh +++ b/tools/testing/selftests/net/netdevice.sh @@ -129,6 +129,7 @@ kci_netdev_ethtool() kci_netdev_ethtool_test 74 'dump' "ethtool -d $netdev" kci_netdev_ethtool_test 94 'stats' "ethtool -S $netdev" + return 0 } @@ -196,10 +197,24 @@ if [ ! -e "$TMP_LIST_NETDEV" ];then fi ip link show |grep '^[0-9]' | grep -oE '[[:space:]].*eth[0-9]*:|[[:space:]].*enp[0-9]s[0-9]:' | cut -d\ -f2 | cut -d: -f1> "$TMP_LIST_NETDEV" + +if [ ! -s "$TMP_LIST_NETDEV" ]; then + echo "No valid network device found, creating veth pair" + ip link add veth0 type veth peer name veth1 + echo "veth0" > "$TMP_LIST_NETDEV" + veth_created=1 +fi + while read netdev do kci_test_netdev "$netdev" done < "$TMP_LIST_NETDEV" +#clean up veth interface pair if it was created +if [ "$veth_created" ]; then + ip link delete veth0 + echo "Removed veth pair" +fi + rm "$TMP_LIST_NETDEV" exit 0 -- cgit v1.2.3 From 6ce7bdbc0d4bde7578727e95e7d138e364512b33 Mon Sep 17 00:00:00 2001 From: Abhinav Jain Date: Wed, 21 Aug 2024 22:49:02 +0530 Subject: selftests: net: Add on/off checks for non-fixed features of interface Implement on/off testing for all non-fixed features via while loop. 
Signed-off-by: Abhinav Jain Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240821171903.118324-3-jain.abhinav177@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/netdevice.sh | 35 +++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netdevice.sh b/tools/testing/selftests/net/netdevice.sh index 999d72b6670c..f7752e1ebe22 100755 --- a/tools/testing/selftests/net/netdevice.sh +++ b/tools/testing/selftests/net/netdevice.sh @@ -124,7 +124,40 @@ kci_netdev_ethtool() return 1 fi echo "PASS: $netdev: ethtool list features" - #TODO for each non fixed features, try to turn them on/off + + while read -r FEATURE VALUE FIXED; do + [ "$FEATURE" != "Features" ] || continue # Skip "Features" + [ "$FIXED" != "[fixed]" ] || continue # Skip fixed features + feature="${FEATURE%:*}" + + ethtool --offload "$netdev" "$feature" off + if [ $? -eq 0 ]; then + echo "PASS: $netdev: Turned off feature: $feature" + else + echo "FAIL: $netdev: Failed to turn off feature:" \ + "$feature" + fi + + ethtool --offload "$netdev" "$feature" on + if [ $? -eq 0 ]; then + echo "PASS: $netdev: Turned on feature: $feature" + else + echo "FAIL: $netdev: Failed to turn on feature:" \ + "$feature" + fi + + #restore the feature to its initial state + ethtool --offload "$netdev" "$feature" "$VALUE" + if [ $? -eq 0 ]; then + echo "PASS: $netdev: Restore feature $feature" \ + "to initial state $VALUE" + else + echo "FAIL: $netdev: Failed to restore feature" \ + "$feature to initial state $VALUE" + fi + + done < "$TMP_ETHTOOL_FEATURES" + rm "$TMP_ETHTOOL_FEATURES" kci_netdev_ethtool_test 74 'dump' "ethtool -d $netdev" -- cgit v1.2.3 From 8402a158028ffe030f3a02ec57b8c94cb94b137c Mon Sep 17 00:00:00 2001 From: Abhinav Jain Date: Wed, 21 Aug 2024 22:49:03 +0530 Subject: selftests: net: Use XFAIL for operations not supported by the driver Check if veth pair was created and if yes, xfail on setting IP address logging an informational message. Use XFAIL instead of SKIP for unsupported ethtool APIs. Signed-off-by: Abhinav Jain Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240821171903.118324-4-jain.abhinav177@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/netdevice.sh | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netdevice.sh b/tools/testing/selftests/net/netdevice.sh index f7752e1ebe22..438f7b2acc5f 100755 --- a/tools/testing/selftests/net/netdevice.sh +++ b/tools/testing/selftests/net/netdevice.sh @@ -67,8 +67,12 @@ kci_net_setup() return $ksft_skip fi - # TODO what ipaddr to set ? DHCP ? - echo "SKIP: $netdev: set IP address" + if [ "$veth_created" ]; then + echo "XFAIL: $netdev: set IP address unsupported for veth*" + else + # TODO what ipaddr to set ? DHCP ? + echo "SKIP: $netdev: set IP address" + fi return $ksft_skip } @@ -86,7 +90,7 @@ kci_netdev_ethtool_test() ret=$? 
if [ $ret -ne 0 ];then if [ $ret -eq "$1" ];then - echo "SKIP: $netdev: ethtool $2 not supported" + echo "XFAIL: $netdev: ethtool $2 not supported" return $ksft_skip else echo "FAIL: $netdev: ethtool $2" -- cgit v1.2.3 From 5225b6562b9a7dc808d5a1e465aaf5e2ebb220cd Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Wed, 21 Aug 2024 17:44:01 +0100 Subject: kselftest/arm64: signal: fix/refactor SVE vector length enumeration Currently a number of SVE/SME related tests have almost identical functions to enumerate all supported vector lengths. However over time the copy&pasted code has diverged, allowing some bugs to creep in: - fake_sigreturn_sme_change_vl reports a failure, not a SKIP if only one vector length is supported (but the SVE version is fine) - fake_sigreturn_sme_change_vl tries to set the SVE vector length, not the SME one (but the other SME tests are fine) - za_no_regs keeps iterating forever if only one vector length is supported (but za_regs is correct) Since those bugs seem to be mostly copy&paste ones, let's consolidate the enumeration loop into one shared function, and just call that from each test. That should fix the above bugs, and prevent similar issues from happening again. Fixes: 4963aeb35a9e ("kselftest/arm64: signal: Add SME signal handling tests") Signed-off-by: Andre Przywara Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240821164401.3598545-1-andre.przywara@arm.com Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/signal/Makefile | 2 +- tools/testing/selftests/arm64/signal/sve_helpers.c | 56 ++++++++++++++++++++++ tools/testing/selftests/arm64/signal/sve_helpers.h | 21 ++++++++ .../testcases/fake_sigreturn_sme_change_vl.c | 32 ++++--------- .../testcases/fake_sigreturn_sve_change_vl.c | 30 +++--------- .../selftests/arm64/signal/testcases/ssve_regs.c | 36 ++++---------- .../arm64/signal/testcases/ssve_za_regs.c | 36 ++++---------- .../selftests/arm64/signal/testcases/sve_regs.c | 32 ++++--------- .../selftests/arm64/signal/testcases/za_no_regs.c | 32 ++++--------- .../selftests/arm64/signal/testcases/za_regs.c | 36 ++++---------- 10 files changed, 132 insertions(+), 181 deletions(-) create mode 100644 tools/testing/selftests/arm64/signal/sve_helpers.c create mode 100644 tools/testing/selftests/arm64/signal/sve_helpers.h (limited to 'tools') diff --git a/tools/testing/selftests/arm64/signal/Makefile b/tools/testing/selftests/arm64/signal/Makefile index 8f5febaf1a9a..edb3613513b8 100644 --- a/tools/testing/selftests/arm64/signal/Makefile +++ b/tools/testing/selftests/arm64/signal/Makefile @@ -23,7 +23,7 @@ $(TEST_GEN_PROGS): $(PROGS) # Common test-unit targets to build common-layout test-cases executables # Needs secondary expansion to properly include the testcase c-file in pre-reqs COMMON_SOURCES := test_signals.c test_signals_utils.c testcases/testcases.c \ - signals.S + signals.S sve_helpers.c COMMON_HEADERS := test_signals.h test_signals_utils.h testcases/testcases.h .SECONDEXPANSION: diff --git a/tools/testing/selftests/arm64/signal/sve_helpers.c b/tools/testing/selftests/arm64/signal/sve_helpers.c new file mode 100644 index 000000000000..0acc121af306 --- /dev/null +++ b/tools/testing/selftests/arm64/signal/sve_helpers.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024 ARM Limited + * + * Common helper functions for SVE and SME functionality. 
+ */ + +#include +#include +#include +#include + +unsigned int vls[SVE_VQ_MAX]; +unsigned int nvls; + +int sve_fill_vls(bool use_sme, int min_vls) +{ + int vq, vl; + int pr_set_vl = use_sme ? PR_SME_SET_VL : PR_SVE_SET_VL; + int len_mask = use_sme ? PR_SME_VL_LEN_MASK : PR_SVE_VL_LEN_MASK; + + /* + * Enumerate up to SVE_VQ_MAX vector lengths + */ + for (vq = SVE_VQ_MAX; vq > 0; --vq) { + vl = prctl(pr_set_vl, vq * 16); + if (vl == -1) + return KSFT_FAIL; + + vl &= len_mask; + + /* + * Unlike SVE, SME does not require the minimum vector length + * to be implemented, or the VLs to be consecutive, so any call + * to the prctl might return the single implemented VL, which + * might be larger than 16. So to avoid this loop never + * terminating, bail out here when we find a higher VL than + * we asked for. + * See the ARM ARM, DDI 0487K.a, B1.4.2: I_QQRNR and I_NWYBP. + */ + if (vq < sve_vq_from_vl(vl)) + break; + + /* Skip missing VLs */ + vq = sve_vq_from_vl(vl); + + vls[nvls++] = vl; + } + + if (nvls < min_vls) { + fprintf(stderr, "Only %d VL supported\n", nvls); + return KSFT_SKIP; + } + + return KSFT_PASS; +} diff --git a/tools/testing/selftests/arm64/signal/sve_helpers.h b/tools/testing/selftests/arm64/signal/sve_helpers.h new file mode 100644 index 000000000000..50948ce471cc --- /dev/null +++ b/tools/testing/selftests/arm64/signal/sve_helpers.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2024 ARM Limited + * + * Common helper functions for SVE and SME functionality. + */ + +#ifndef __SVE_HELPERS_H__ +#define __SVE_HELPERS_H__ + +#include + +#define VLS_USE_SVE false +#define VLS_USE_SME true + +extern unsigned int vls[]; +extern unsigned int nvls; + +int sve_fill_vls(bool use_sme, int min_vls); + +#endif diff --git a/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sme_change_vl.c b/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sme_change_vl.c index ebd5815b54bb..cb8c051b5c8f 100644 --- a/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sme_change_vl.c +++ b/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sme_change_vl.c @@ -6,44 +6,28 @@ * handler, this is not supported and is expected to segfault. 
*/ +#include #include #include #include #include "test_signals_utils.h" +#include "sve_helpers.h" #include "testcases.h" struct fake_sigframe sf; -static unsigned int vls[SVE_VQ_MAX]; -unsigned int nvls = 0; static bool sme_get_vls(struct tdescr *td) { - int vq, vl; + int res = sve_fill_vls(VLS_USE_SME, 2); - /* - * Enumerate up to SVE_VQ_MAX vector lengths - */ - for (vq = SVE_VQ_MAX; vq > 0; --vq) { - vl = prctl(PR_SVE_SET_VL, vq * 16); - if (vl == -1) - return false; + if (!res) + return true; - vl &= PR_SME_VL_LEN_MASK; + if (res == KSFT_SKIP) + td->result = KSFT_SKIP; - /* Skip missing VLs */ - vq = sve_vq_from_vl(vl); - - vls[nvls++] = vl; - } - - /* We need at least two VLs */ - if (nvls < 2) { - fprintf(stderr, "Only %d VL supported\n", nvls); - return false; - } - - return true; + return false; } static int fake_sigreturn_ssve_change_vl(struct tdescr *td, diff --git a/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sve_change_vl.c b/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sve_change_vl.c index e2a452190511..e1ccf8f85a70 100644 --- a/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sve_change_vl.c +++ b/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sve_change_vl.c @@ -12,40 +12,22 @@ #include #include "test_signals_utils.h" +#include "sve_helpers.h" #include "testcases.h" struct fake_sigframe sf; -static unsigned int vls[SVE_VQ_MAX]; -unsigned int nvls = 0; static bool sve_get_vls(struct tdescr *td) { - int vq, vl; + int res = sve_fill_vls(VLS_USE_SVE, 2); - /* - * Enumerate up to SVE_VQ_MAX vector lengths - */ - for (vq = SVE_VQ_MAX; vq > 0; --vq) { - vl = prctl(PR_SVE_SET_VL, vq * 16); - if (vl == -1) - return false; + if (!res) + return true; - vl &= PR_SVE_VL_LEN_MASK; - - /* Skip missing VLs */ - vq = sve_vq_from_vl(vl); - - vls[nvls++] = vl; - } - - /* We need at least two VLs */ - if (nvls < 2) { - fprintf(stderr, "Only %d VL supported\n", nvls); + if (res == KSFT_SKIP) td->result = KSFT_SKIP; - return false; - } - return true; + return false; } static int fake_sigreturn_sve_change_vl(struct tdescr *td, diff --git a/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c b/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c index 3d37daafcff5..6dbe48cf8b09 100644 --- a/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c +++ b/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c @@ -6,51 +6,31 @@ * set up as expected. */ +#include #include #include #include #include "test_signals_utils.h" +#include "sve_helpers.h" #include "testcases.h" static union { ucontext_t uc; char buf[1024 * 64]; } context; -static unsigned int vls[SVE_VQ_MAX]; -unsigned int nvls = 0; static bool sme_get_vls(struct tdescr *td) { - int vq, vl; + int res = sve_fill_vls(VLS_USE_SME, 1); - /* - * Enumerate up to SVE_VQ_MAX vector lengths - */ - for (vq = SVE_VQ_MAX; vq > 0; --vq) { - vl = prctl(PR_SME_SET_VL, vq * 16); - if (vl == -1) - return false; - - vl &= PR_SME_VL_LEN_MASK; - - /* Did we find the lowest supported VL? 
*/ - if (vq < sve_vq_from_vl(vl)) - break; + if (!res) + return true; - /* Skip missing VLs */ - vq = sve_vq_from_vl(vl); - - vls[nvls++] = vl; - } - - /* We need at least one VL */ - if (nvls < 1) { - fprintf(stderr, "Only %d VL supported\n", nvls); - return false; - } + if (res == KSFT_SKIP) + td->result = KSFT_SKIP; - return true; + return false; } static void setup_ssve_regs(void) diff --git a/tools/testing/selftests/arm64/signal/testcases/ssve_za_regs.c b/tools/testing/selftests/arm64/signal/testcases/ssve_za_regs.c index 9dc5f128bbc0..5557e116e973 100644 --- a/tools/testing/selftests/arm64/signal/testcases/ssve_za_regs.c +++ b/tools/testing/selftests/arm64/signal/testcases/ssve_za_regs.c @@ -6,51 +6,31 @@ * signal frames is set up as expected when enabled simultaneously. */ +#include #include #include #include #include "test_signals_utils.h" +#include "sve_helpers.h" #include "testcases.h" static union { ucontext_t uc; char buf[1024 * 128]; } context; -static unsigned int vls[SVE_VQ_MAX]; -unsigned int nvls = 0; static bool sme_get_vls(struct tdescr *td) { - int vq, vl; + int res = sve_fill_vls(VLS_USE_SME, 1); - /* - * Enumerate up to SVE_VQ_MAX vector lengths - */ - for (vq = SVE_VQ_MAX; vq > 0; --vq) { - vl = prctl(PR_SME_SET_VL, vq * 16); - if (vl == -1) - return false; - - vl &= PR_SME_VL_LEN_MASK; - - /* Did we find the lowest supported VL? */ - if (vq < sve_vq_from_vl(vl)) - break; + if (!res) + return true; - /* Skip missing VLs */ - vq = sve_vq_from_vl(vl); - - vls[nvls++] = vl; - } - - /* We need at least one VL */ - if (nvls < 1) { - fprintf(stderr, "Only %d VL supported\n", nvls); - return false; - } + if (res == KSFT_SKIP) + td->result = KSFT_SKIP; - return true; + return false; } static void setup_regs(void) diff --git a/tools/testing/selftests/arm64/signal/testcases/sve_regs.c b/tools/testing/selftests/arm64/signal/testcases/sve_regs.c index 8b16eabbb769..8143eb1c58c1 100644 --- a/tools/testing/selftests/arm64/signal/testcases/sve_regs.c +++ b/tools/testing/selftests/arm64/signal/testcases/sve_regs.c @@ -6,47 +6,31 @@ * expected. */ +#include #include #include #include #include "test_signals_utils.h" +#include "sve_helpers.h" #include "testcases.h" static union { ucontext_t uc; char buf[1024 * 64]; } context; -static unsigned int vls[SVE_VQ_MAX]; -unsigned int nvls = 0; static bool sve_get_vls(struct tdescr *td) { - int vq, vl; + int res = sve_fill_vls(VLS_USE_SVE, 1); - /* - * Enumerate up to SVE_VQ_MAX vector lengths - */ - for (vq = SVE_VQ_MAX; vq > 0; --vq) { - vl = prctl(PR_SVE_SET_VL, vq * 16); - if (vl == -1) - return false; - - vl &= PR_SVE_VL_LEN_MASK; - - /* Skip missing VLs */ - vq = sve_vq_from_vl(vl); + if (!res) + return true; - vls[nvls++] = vl; - } - - /* We need at least one VL */ - if (nvls < 1) { - fprintf(stderr, "Only %d VL supported\n", nvls); - return false; - } + if (res == KSFT_SKIP) + td->result = KSFT_SKIP; - return true; + return false; } static void setup_sve_regs(void) diff --git a/tools/testing/selftests/arm64/signal/testcases/za_no_regs.c b/tools/testing/selftests/arm64/signal/testcases/za_no_regs.c index 4d6f94b6178f..ce26e9c2fa5e 100644 --- a/tools/testing/selftests/arm64/signal/testcases/za_no_regs.c +++ b/tools/testing/selftests/arm64/signal/testcases/za_no_regs.c @@ -6,47 +6,31 @@ * expected. 
*/ +#include #include #include #include #include "test_signals_utils.h" +#include "sve_helpers.h" #include "testcases.h" static union { ucontext_t uc; char buf[1024 * 128]; } context; -static unsigned int vls[SVE_VQ_MAX]; -unsigned int nvls = 0; static bool sme_get_vls(struct tdescr *td) { - int vq, vl; + int res = sve_fill_vls(VLS_USE_SME, 1); - /* - * Enumerate up to SME_VQ_MAX vector lengths - */ - for (vq = SVE_VQ_MAX; vq > 0; --vq) { - vl = prctl(PR_SME_SET_VL, vq * 16); - if (vl == -1) - return false; - - vl &= PR_SME_VL_LEN_MASK; - - /* Skip missing VLs */ - vq = sve_vq_from_vl(vl); + if (!res) + return true; - vls[nvls++] = vl; - } - - /* We need at least one VL */ - if (nvls < 1) { - fprintf(stderr, "Only %d VL supported\n", nvls); - return false; - } + if (res == KSFT_SKIP) + td->result = KSFT_SKIP; - return true; + return false; } static int do_one_sme_vl(struct tdescr *td, siginfo_t *si, ucontext_t *uc, diff --git a/tools/testing/selftests/arm64/signal/testcases/za_regs.c b/tools/testing/selftests/arm64/signal/testcases/za_regs.c index 174ad6656696..b9e13f27f1f9 100644 --- a/tools/testing/selftests/arm64/signal/testcases/za_regs.c +++ b/tools/testing/selftests/arm64/signal/testcases/za_regs.c @@ -6,51 +6,31 @@ * expected. */ +#include #include #include #include #include "test_signals_utils.h" +#include "sve_helpers.h" #include "testcases.h" static union { ucontext_t uc; char buf[1024 * 128]; } context; -static unsigned int vls[SVE_VQ_MAX]; -unsigned int nvls = 0; static bool sme_get_vls(struct tdescr *td) { - int vq, vl; + int res = sve_fill_vls(VLS_USE_SME, 1); - /* - * Enumerate up to SME_VQ_MAX vector lengths - */ - for (vq = SVE_VQ_MAX; vq > 0; --vq) { - vl = prctl(PR_SME_SET_VL, vq * 16); - if (vl == -1) - return false; - - vl &= PR_SME_VL_LEN_MASK; - - /* Did we find the lowest supported VL? */ - if (vq < sve_vq_from_vl(vl)) - break; + if (!res) + return true; - /* Skip missing VLs */ - vq = sve_vq_from_vl(vl); - - vls[nvls++] = vl; - } - - /* We need at least one VL */ - if (nvls < 1) { - fprintf(stderr, "Only %d VL supported\n", nvls); - return false; - } + if (res == KSFT_SKIP) + td->result = KSFT_SKIP; - return true; + return false; } static void setup_za_regs(void) -- cgit v1.2.3 From ec1f77f6557b46639fa47c6980ef9d38995c1e05 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 23 Aug 2024 01:06:42 -0700 Subject: selftests/bpf: test_loader.c:get_current_arch() should not return 0 At the moment, when test_loader.c:get_current_arch() can't determine the arch, it returns 0. The arch check in run_subtest() looks as follows: if ((get_current_arch() & spec->arch_mask) == 0) { test__skip(); return; } Which means that all test_loader based tests would be skipped if arch could not be determined. get_current_arch() recognizes x86_64, arm64 and riscv64. Which means that CI skips test_loader tests for s390. Fix this by making sure that get_current_arch() always returns non-zero value. In combination with default spec->arch_mask == -1 this should cover all possibilities. 
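The effect of the change can be checked in isolation; a standalone sketch of the arithmetic (the enum values match the patch, the rest is illustrative):

    #include <stdio.h>

    enum arch {
        ARCH_UNKNOWN = 0x1,  /* previously get_current_arch() returned 0 here */
        ARCH_X86_64  = 0x2,
        ARCH_ARM64   = 0x4,
        ARCH_RISCV64 = 0x8,
    };

    int main(void)
    {
        int arch_mask = -1;  /* default spec->arch_mask: all bits set */

        /* old behavior: 0 & -1 == 0, so the subtest was skipped (e.g. on s390);
         * new behavior: ARCH_UNKNOWN & -1 != 0, so it runs */
        printf("old: %s, new: %s\n",
               (0 & arch_mask) ? "run" : "skip",
               (ARCH_UNKNOWN & arch_mask) ? "run" : "skip");
        return 0;
    }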
Fixes: f406026fefa7 ("selftests/bpf: by default use arch mask allowing all archs") Fixes: 7d743e4c759c ("selftests/bpf: __jited test tag to check disassembly after jit") Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240823080644.263943-2-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_loader.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index 2ca9b73e5a6b..4223cffc090e 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -336,9 +336,10 @@ static const char *skip_dynamic_pfx(const char *s, const char *pfx) } enum arch { - ARCH_X86_64 = 0x1, - ARCH_ARM64 = 0x2, - ARCH_RISCV64 = 0x4, + ARCH_UNKNOWN = 0x1, + ARCH_X86_64 = 0x2, + ARCH_ARM64 = 0x4, + ARCH_RISCV64 = 0x8, }; static int get_current_arch(void) @@ -350,7 +351,7 @@ static int get_current_arch(void) #elif defined(__riscv) && __riscv_xlen == 64 return ARCH_RISCV64; #endif - return 0; + return ARCH_UNKNOWN; } /* Uses btf_decl_tag attributes to describe the expected test -- cgit v1.2.3 From c52a1e6eb74ffe4f217e9b53ed75440a45b08c4c Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 23 Aug 2024 01:06:43 -0700 Subject: selftests/bpf: match both retq/rethunk in verifier_tailcall_jit Depending on kernel parameters, x86 jit generates either retq or jump to rethunk for 'exit' instruction. The difference could be seen when kernel is booted with and without mitigations=off parameter. Relax the verifier_tailcall_jit test case to match both variants. Fixes: e5bdd6a8be78 ("selftests/bpf: validate jit behaviour for tail calls") Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240823080644.263943-3-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c b/tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c index 06d327cf1e1f..8d60c634a114 100644 --- a/tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c +++ b/tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c @@ -59,7 +59,7 @@ __jited(" movq -0x10(%rbp), %rax") __jited(" callq 0x{{.*}}") /* call to sub() */ __jited(" xorl %eax, %eax") __jited(" leave") -__jited(" retq") +__jited(" {{(retq|jmp 0x)}}") /* return or jump to rethunk */ __jited("...") /* subprogram entry for sub(), regular function prologue */ __jited(" endbr64") @@ -89,7 +89,7 @@ __jited(" popq %rax") __jited(" popq %rax") __jited(" jmp {{.*}}") /* jump to tail call tgt */ __jited("L0: leave") -__jited(" retq") +__jited(" {{(retq|jmp 0x)}}") /* return or jump to rethunk */ SEC("tc") __naked int main(void) { -- cgit v1.2.3 From 21a56fc503faae7589167fcd2771ab6dce36ab39 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 23 Aug 2024 01:06:44 -0700 Subject: selftests/bpf: #define LOCAL_LABEL_LEN for jit_disasm_helpers.c Extract local label length as a #define directive and elaborate why 'i % MAX_LOCAL_LABELS' expression is needed for local labels array initialization. 
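The warning in question is easy to reproduce outside the test harness; a minimal sketch using the same constants as the patch:

    #include <stdio.h>

    #define MAX_LOCAL_LABELS 32
    #define LOCAL_LABEL_LEN  4  /* "L" + at most two digits + NUL */

    int main(void)
    {
        char name[LOCAL_LABEL_LEN];
        int i = 31;  /* stands in for the labels.cnt loop index */

        /* without the modulo, gcc assumes i may take up to 10 decimal digits
         * and warns about truncation; i % MAX_LOCAL_LABELS bounds the value
         * to 0..31, which provably fits in the 4-byte buffer */
        snprintf(name, sizeof(name), "L%d", i % MAX_LOCAL_LABELS);
        printf("%s\n", name);
        return 0;
    }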
Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240823080644.263943-4-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/jit_disasm_helpers.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/jit_disasm_helpers.c b/tools/testing/selftests/bpf/jit_disasm_helpers.c index 1b0f1fd267c0..febd6b12e372 100644 --- a/tools/testing/selftests/bpf/jit_disasm_helpers.c +++ b/tools/testing/selftests/bpf/jit_disasm_helpers.c @@ -16,6 +16,11 @@ */ #define MAX_LOCAL_LABELS 32 +/* Local labels are encoded as 'L42', this requires 4 bytes of storage: + * 3 characters + zero byte + */ +#define LOCAL_LABEL_LEN 4 + static bool llvm_initialized; struct local_labels { @@ -23,7 +28,7 @@ struct local_labels { __u32 prog_len; __u32 cnt; __u32 pcs[MAX_LOCAL_LABELS]; - char names[MAX_LOCAL_LABELS][4]; + char names[MAX_LOCAL_LABELS][LOCAL_LABEL_LEN]; }; static const char *lookup_symbol(void *data, uint64_t ref_value, uint64_t *ref_type, @@ -118,8 +123,14 @@ static int disasm_one_func(FILE *text_out, uint8_t *image, __u32 len) } qsort(labels.pcs, labels.cnt, sizeof(*labels.pcs), cmp_u32); for (i = 0; i < labels.cnt; ++i) - /* use (i % 100) to avoid format truncation warning */ - snprintf(labels.names[i], sizeof(labels.names[i]), "L%d", i % 100); + /* gcc is unable to infer upper bound for labels.cnt and assumes + * it to be U32_MAX. U32_MAX takes 10 decimal digits. + * snprintf below prints into labels.names[*], + * which has space only for two digits and a letter. + * To avoid truncation warning use (i % MAX_LOCAL_LABELS), + * which informs gcc about printed value upper bound. + */ + snprintf(labels.names[i], sizeof(labels.names[i]), "L%d", i % MAX_LOCAL_LABELS); /* now print with labels */ labels.print_phase = true; -- cgit v1.2.3 From 4e9e07603ecdfd0816838132f354239771c51edd Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 6 Aug 2024 16:03:19 -0700 Subject: selftests/bpf: make use of PROCMAP_QUERY ioctl if available Instead of parsing text-based /proc//maps file, try to use PROCMAP_QUERY ioctl() to simplify and speed up data fetching. This logic is used to do uprobe file offset calculation, so any bugs in this logic would manifest as failing uprobe BPF selftests. This also serves as a simple demonstration of one of the intended uses. 
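The same query can be issued directly outside the selftests; a minimal sketch, assuming a kernel and uapi header (linux/fs.h) that provide PROCMAP_QUERY, which is exactly the availability the #ifdef guard above accounts for:

    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>  /* PROCMAP_QUERY, struct procmap_query */

    int main(void)
    {
        struct procmap_query q;
        int fd = open("/proc/self/maps", O_RDONLY);

        if (fd < 0)
            return 1;

        memset(&q, 0, sizeof(q));
        q.size        = sizeof(q);
        q.query_flags = PROCMAP_QUERY_VMA_EXECUTABLE;
        q.query_addr  = (unsigned long long)(unsigned long)&main;

        if (ioctl(fd, PROCMAP_QUERY, &q) == 0)
            printf("%llx-%llx offset %llx\n",
                   (unsigned long long)q.vma_start,
                   (unsigned long long)q.vma_end,
                   (unsigned long long)q.vma_offset);
        else
            perror("PROCMAP_QUERY");  /* ENOTTY if the kernel lacks the ioctl */
        return 0;
    }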
Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20240806230319.869734-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_progs.c | 3 + tools/testing/selftests/bpf/test_progs.h | 2 + tools/testing/selftests/bpf/trace_helpers.c | 104 ++++++++++++++++++++++++---- 3 files changed, 94 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index f45b06791444..83f390a31681 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -35,6 +35,8 @@ __weak void backtrace_symbols_fd(void *const *buffer, int size, int fd) dprintf(fd, "\n"); } +int env_verbosity = 0; + static bool verbose(void) { return env.verbosity > VERBOSE_NONE; @@ -973,6 +975,7 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) return -EINVAL; } } + env_verbosity = env->verbosity; if (verbose()) { if (setenv("SELFTESTS_VERBOSE", "1", 1) == -1) { diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index 3ad131de14c6..7767d9a825ae 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -96,6 +96,8 @@ struct test_state { FILE *stdout_saved; }; +extern int env_verbosity; + struct test_env { struct test_selector test_selector; struct test_selector subtest_selector; diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index 465d196c7165..1bfd881c0e07 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -10,6 +10,8 @@ #include #include #include +#include +#include #include #include "trace_helpers.h" #include @@ -244,29 +246,91 @@ out: return err; } +#ifdef PROCMAP_QUERY +int env_verbosity __weak = 0; + +int procmap_query(int fd, const void *addr, __u32 query_flags, size_t *start, size_t *offset, int *flags) +{ + char path_buf[PATH_MAX], build_id_buf[20]; + struct procmap_query q; + int err; + + memset(&q, 0, sizeof(q)); + q.size = sizeof(q); + q.query_flags = query_flags; + q.query_addr = (__u64)addr; + q.vma_name_addr = (__u64)path_buf; + q.vma_name_size = sizeof(path_buf); + q.build_id_addr = (__u64)build_id_buf; + q.build_id_size = sizeof(build_id_buf); + + err = ioctl(fd, PROCMAP_QUERY, &q); + if (err < 0) { + err = -errno; + if (err == -ENOTTY) + return -EOPNOTSUPP; /* ioctl() not implemented yet */ + if (err == -ENOENT) + return -ESRCH; /* vma not found */ + return err; + } + + if (env_verbosity >= 1) { + printf("VMA FOUND (addr %08lx): %08lx-%08lx %c%c%c%c %08lx %02x:%02x %ld %s (build ID: %s, %d bytes)\n", + (long)addr, (long)q.vma_start, (long)q.vma_end, + (q.vma_flags & PROCMAP_QUERY_VMA_READABLE) ? 'r' : '-', + (q.vma_flags & PROCMAP_QUERY_VMA_WRITABLE) ? 'w' : '-', + (q.vma_flags & PROCMAP_QUERY_VMA_EXECUTABLE) ? 'x' : '-', + (q.vma_flags & PROCMAP_QUERY_VMA_SHARED) ? 's' : 'p', + (long)q.vma_offset, q.dev_major, q.dev_minor, (long)q.inode, + q.vma_name_size ? path_buf : "", + q.build_id_size ? 
"YES" : "NO", + q.build_id_size); + } + + *start = q.vma_start; + *offset = q.vma_offset; + *flags = q.vma_flags; + return 0; +} +#else +int procmap_query(int fd, const void *addr, size_t *start, size_t *offset, int *flags) +{ + return -EOPNOTSUPP; +} +#endif + ssize_t get_uprobe_offset(const void *addr) { - size_t start, end, base; - char buf[256]; - bool found = false; + size_t start, base, end; FILE *f; + char buf[256]; + int err, flags; f = fopen("/proc/self/maps", "r"); if (!f) return -errno; - while (fscanf(f, "%zx-%zx %s %zx %*[^\n]\n", &start, &end, buf, &base) == 4) { - if (buf[2] == 'x' && (uintptr_t)addr >= start && (uintptr_t)addr < end) { - found = true; - break; + /* requested executable VMA only */ + err = procmap_query(fileno(f), addr, PROCMAP_QUERY_VMA_EXECUTABLE, &start, &base, &flags); + if (err == -EOPNOTSUPP) { + bool found = false; + + while (fscanf(f, "%zx-%zx %s %zx %*[^\n]\n", &start, &end, buf, &base) == 4) { + if (buf[2] == 'x' && (uintptr_t)addr >= start && (uintptr_t)addr < end) { + found = true; + break; + } + } + if (!found) { + fclose(f); + return -ESRCH; } + } else if (err) { + fclose(f); + return err; } - fclose(f); - if (!found) - return -ESRCH; - #if defined(__powerpc64__) && defined(_CALL_ELF) && _CALL_ELF == 2 #define OP_RT_RA_MASK 0xffff0000UL @@ -307,15 +371,25 @@ ssize_t get_rel_offset(uintptr_t addr) size_t start, end, offset; char buf[256]; FILE *f; + int err, flags; f = fopen("/proc/self/maps", "r"); if (!f) return -errno; - while (fscanf(f, "%zx-%zx %s %zx %*[^\n]\n", &start, &end, buf, &offset) == 4) { - if (addr >= start && addr < end) { - fclose(f); - return (size_t)addr - start + offset; + err = procmap_query(fileno(f), (const void *)addr, 0, &start, &offset, &flags); + if (err == 0) { + fclose(f); + return (size_t)addr - start + offset; + } else if (err != -EOPNOTSUPP) { + fclose(f); + return err; + } else if (err) { + while (fscanf(f, "%zx-%zx %s %zx %*[^\n]\n", &start, &end, buf, &offset) == 4) { + if (addr >= start && addr < end) { + fclose(f); + return (size_t)addr - start + offset; + } } } -- cgit v1.2.3 From f727b13dbea16c5e117e263aa8aea59d632d5660 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 5 Aug 2024 21:29:35 -0700 Subject: selftests/bpf: add multi-uprobe benchmarks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add multi-uprobe and multi-uretprobe benchmarks to bench tool. Multi- and classic uprobes/uretprobes have different low-level triggering code paths, so it's sometimes important to be able to benchmark both flavors of uprobes/uretprobes. Sample examples from my dev machine below. Single-threaded peformance almost doesn't differ, but with more parallel CPUs triggering the same uprobe/uretprobe the difference grows. This might be due to [0], but given the code is slightly different, there could be other sources of slowdown. Note, all these numbers will change due to ongoing work to improve uprobe/uretprobe scalability (e.g., [1]), but having benchmark like this is useful for measurements and debugging nevertheless. 
\#!/bin/bash set -eufo pipefail for p in 1 8 16 32; do for i in uprobe-nop uretprobe-nop uprobe-multi-nop uretprobe-multi-nop; do summary=$(sudo ./bench -w1 -d3 -p$p -a trig-$i | tail -n1) total=$(echo "$summary" | cut -d'(' -f1 | cut -d' ' -f3-) percpu=$(echo "$summary" | cut -d'(' -f2 | cut -d')' -f1 | cut -d'/' -f1) printf "%-21s (%2d cpus): %s (%s/s/cpu)\n" $i $p "$total" "$percpu" done echo done uprobe-nop ( 1 cpus): 1.020 ± 0.005M/s ( 1.020M/s/cpu) uretprobe-nop ( 1 cpus): 0.515 ± 0.009M/s ( 0.515M/s/cpu) uprobe-multi-nop ( 1 cpus): 1.036 ± 0.004M/s ( 1.036M/s/cpu) uretprobe-multi-nop ( 1 cpus): 0.512 ± 0.005M/s ( 0.512M/s/cpu) uprobe-nop ( 8 cpus): 3.481 ± 0.030M/s ( 0.435M/s/cpu) uretprobe-nop ( 8 cpus): 2.222 ± 0.008M/s ( 0.278M/s/cpu) uprobe-multi-nop ( 8 cpus): 3.769 ± 0.094M/s ( 0.471M/s/cpu) uretprobe-multi-nop ( 8 cpus): 2.482 ± 0.007M/s ( 0.310M/s/cpu) uprobe-nop (16 cpus): 2.968 ± 0.011M/s ( 0.185M/s/cpu) uretprobe-nop (16 cpus): 1.870 ± 0.002M/s ( 0.117M/s/cpu) uprobe-multi-nop (16 cpus): 3.541 ± 0.037M/s ( 0.221M/s/cpu) uretprobe-multi-nop (16 cpus): 2.123 ± 0.026M/s ( 0.133M/s/cpu) uprobe-nop (32 cpus): 2.524 ± 0.026M/s ( 0.079M/s/cpu) uretprobe-nop (32 cpus): 1.572 ± 0.003M/s ( 0.049M/s/cpu) uprobe-multi-nop (32 cpus): 2.717 ± 0.003M/s ( 0.085M/s/cpu) uretprobe-multi-nop (32 cpus): 1.687 ± 0.007M/s ( 0.053M/s/cpu) [0] https://lore.kernel.org/linux-trace-kernel/20240805202803.1813090-1-andrii@kernel.org/ [1] https://lore.kernel.org/linux-trace-kernel/20240731214256.3588718-1-andrii@kernel.org/ Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20240806042935.3867862-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bench.c | 12 ++++ tools/testing/selftests/bpf/benchs/bench_trigger.c | 81 ++++++++++++++++++---- tools/testing/selftests/bpf/progs/trigger_bench.c | 7 ++ 3 files changed, 85 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index 90dc3aca32bd..1bd403a5ef7b 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -520,6 +520,12 @@ extern const struct bench bench_trig_uprobe_push; extern const struct bench bench_trig_uretprobe_push; extern const struct bench bench_trig_uprobe_ret; extern const struct bench bench_trig_uretprobe_ret; +extern const struct bench bench_trig_uprobe_multi_nop; +extern const struct bench bench_trig_uretprobe_multi_nop; +extern const struct bench bench_trig_uprobe_multi_push; +extern const struct bench bench_trig_uretprobe_multi_push; +extern const struct bench bench_trig_uprobe_multi_ret; +extern const struct bench bench_trig_uretprobe_multi_ret; extern const struct bench bench_rb_libbpf; extern const struct bench bench_rb_custom; @@ -574,6 +580,12 @@ static const struct bench *benchs[] = { &bench_trig_uretprobe_push, &bench_trig_uprobe_ret, &bench_trig_uretprobe_ret, + &bench_trig_uprobe_multi_nop, + &bench_trig_uretprobe_multi_nop, + &bench_trig_uprobe_multi_push, + &bench_trig_uretprobe_multi_push, + &bench_trig_uprobe_multi_ret, + &bench_trig_uretprobe_multi_ret, /* ringbuf/perfbuf benchmarks */ &bench_rb_libbpf, &bench_rb_custom, diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c index 4b05539f167d..a220545a3238 100644 --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -332,7 +332,7 @@ static void 
*uprobe_producer_ret(void *input) return NULL; } -static void usetup(bool use_retprobe, void *target_addr) +static void usetup(bool use_retprobe, bool use_multi, void *target_addr) { size_t uprobe_offset; struct bpf_link *link; @@ -346,7 +346,10 @@ static void usetup(bool use_retprobe, void *target_addr) exit(1); } - bpf_program__set_autoload(ctx.skel->progs.bench_trigger_uprobe, true); + if (use_multi) + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_uprobe_multi, true); + else + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_uprobe, true); err = trigger_bench__load(ctx.skel); if (err) { @@ -355,16 +358,28 @@ static void usetup(bool use_retprobe, void *target_addr) } uprobe_offset = get_uprobe_offset(target_addr); - link = bpf_program__attach_uprobe(ctx.skel->progs.bench_trigger_uprobe, - use_retprobe, - -1 /* all PIDs */, - "/proc/self/exe", - uprobe_offset); + if (use_multi) { + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts, + .retprobe = use_retprobe, + .cnt = 1, + .offsets = &uprobe_offset, + ); + link = bpf_program__attach_uprobe_multi( + ctx.skel->progs.bench_trigger_uprobe_multi, + -1 /* all PIDs */, "/proc/self/exe", NULL, &opts); + ctx.skel->links.bench_trigger_uprobe_multi = link; + } else { + link = bpf_program__attach_uprobe(ctx.skel->progs.bench_trigger_uprobe, + use_retprobe, + -1 /* all PIDs */, + "/proc/self/exe", + uprobe_offset); + ctx.skel->links.bench_trigger_uprobe = link; + } if (!link) { - fprintf(stderr, "failed to attach uprobe!\n"); + fprintf(stderr, "failed to attach %s!\n", use_multi ? "multi-uprobe" : "uprobe"); exit(1); } - ctx.skel->links.bench_trigger_uprobe = link; } static void usermode_count_setup(void) @@ -374,32 +389,62 @@ static void usermode_count_setup(void) static void uprobe_nop_setup(void) { - usetup(false, &uprobe_target_nop); + usetup(false, false /* !use_multi */, &uprobe_target_nop); } static void uretprobe_nop_setup(void) { - usetup(true, &uprobe_target_nop); + usetup(true, false /* !use_multi */, &uprobe_target_nop); } static void uprobe_push_setup(void) { - usetup(false, &uprobe_target_push); + usetup(false, false /* !use_multi */, &uprobe_target_push); } static void uretprobe_push_setup(void) { - usetup(true, &uprobe_target_push); + usetup(true, false /* !use_multi */, &uprobe_target_push); } static void uprobe_ret_setup(void) { - usetup(false, &uprobe_target_ret); + usetup(false, false /* !use_multi */, &uprobe_target_ret); } static void uretprobe_ret_setup(void) { - usetup(true, &uprobe_target_ret); + usetup(true, false /* !use_multi */, &uprobe_target_ret); +} + +static void uprobe_multi_nop_setup(void) +{ + usetup(false, true /* use_multi */, &uprobe_target_nop); +} + +static void uretprobe_multi_nop_setup(void) +{ + usetup(true, true /* use_multi */, &uprobe_target_nop); +} + +static void uprobe_multi_push_setup(void) +{ + usetup(false, true /* use_multi */, &uprobe_target_push); +} + +static void uretprobe_multi_push_setup(void) +{ + usetup(true, true /* use_multi */, &uprobe_target_push); +} + +static void uprobe_multi_ret_setup(void) +{ + usetup(false, true /* use_multi */, &uprobe_target_ret); +} + +static void uretprobe_multi_ret_setup(void) +{ + usetup(true, true /* use_multi */, &uprobe_target_ret); } const struct bench bench_trig_syscall_count = { @@ -454,3 +499,9 @@ BENCH_TRIG_USERMODE(uprobe_ret, ret, "uprobe-ret"); BENCH_TRIG_USERMODE(uretprobe_nop, nop, "uretprobe-nop"); BENCH_TRIG_USERMODE(uretprobe_push, push, "uretprobe-push"); BENCH_TRIG_USERMODE(uretprobe_ret, ret, "uretprobe-ret"); 
+BENCH_TRIG_USERMODE(uprobe_multi_nop, nop, "uprobe-multi-nop"); +BENCH_TRIG_USERMODE(uprobe_multi_push, push, "uprobe-multi-push"); +BENCH_TRIG_USERMODE(uprobe_multi_ret, ret, "uprobe-multi-ret"); +BENCH_TRIG_USERMODE(uretprobe_multi_nop, nop, "uretprobe-multi-nop"); +BENCH_TRIG_USERMODE(uretprobe_multi_push, push, "uretprobe-multi-push"); +BENCH_TRIG_USERMODE(uretprobe_multi_ret, ret, "uretprobe-multi-ret"); diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c index 2619ed193c65..044a6d78923e 100644 --- a/tools/testing/selftests/bpf/progs/trigger_bench.c +++ b/tools/testing/selftests/bpf/progs/trigger_bench.c @@ -32,6 +32,13 @@ int bench_trigger_uprobe(void *ctx) return 0; } +SEC("?uprobe.multi") +int bench_trigger_uprobe_multi(void *ctx) +{ + inc_counter(); + return 0; +} + const volatile int batch_iters = 0; SEC("?raw_tp") -- cgit v1.2.3 From 91c96842ab1e9159c8129ab5ddfeb7dd97bf840e Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Tue, 13 Aug 2024 21:24:24 +0000 Subject: selftests/bpf: Test bpf_kptr_xchg stashing into local kptr Test stashing both referenced kptr and local kptr into local kptrs. Then, test unstashing them. Acked-by: Martin KaFai Lau Acked-by: Hou Tao Signed-off-by: Dave Marchevsky Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20240813212424.2871455-6-amery.hung@bytedance.com Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/progs/local_kptr_stash.c | 30 ++++++++++++++++++++-- .../selftests/bpf/progs/task_kfunc_success.c | 26 ++++++++++++++++++- 2 files changed, 53 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/local_kptr_stash.c b/tools/testing/selftests/bpf/progs/local_kptr_stash.c index 75043ffc5dad..b092a72b2c9d 100644 --- a/tools/testing/selftests/bpf/progs/local_kptr_stash.c +++ b/tools/testing/selftests/bpf/progs/local_kptr_stash.c @@ -8,9 +8,12 @@ #include "../bpf_experimental.h" #include "../bpf_testmod/bpf_testmod_kfunc.h" +struct plain_local; + struct node_data { long key; long data; + struct plain_local __kptr * stashed_in_local_kptr; struct bpf_rb_node node; }; @@ -85,6 +88,7 @@ static bool less(struct bpf_rb_node *a, const struct bpf_rb_node *b) static int create_and_stash(int idx, int val) { + struct plain_local *inner_local_kptr; struct map_value *mapval; struct node_data *res; @@ -92,11 +96,25 @@ static int create_and_stash(int idx, int val) if (!mapval) return 1; + inner_local_kptr = bpf_obj_new(typeof(*inner_local_kptr)); + if (!inner_local_kptr) + return 2; + res = bpf_obj_new(typeof(*res)); - if (!res) - return 1; + if (!res) { + bpf_obj_drop(inner_local_kptr); + return 3; + } res->key = val; + inner_local_kptr = bpf_kptr_xchg(&res->stashed_in_local_kptr, inner_local_kptr); + if (inner_local_kptr) { + /* Should never happen, we just obj_new'd res */ + bpf_obj_drop(inner_local_kptr); + bpf_obj_drop(res); + return 4; + } + res = bpf_kptr_xchg(&mapval->node, res); if (res) bpf_obj_drop(res); @@ -169,6 +187,7 @@ long stash_local_with_root(void *ctx) SEC("tc") long unstash_rb_node(void *ctx) { + struct plain_local *inner_local_kptr = NULL; struct map_value *mapval; struct node_data *res; long retval; @@ -180,6 +199,13 @@ long unstash_rb_node(void *ctx) res = bpf_kptr_xchg(&mapval->node, NULL); if (res) { + inner_local_kptr = bpf_kptr_xchg(&res->stashed_in_local_kptr, inner_local_kptr); + if (!inner_local_kptr) { + bpf_obj_drop(res); + return 1; + } + bpf_obj_drop(inner_local_kptr); + retval = res->key; 
bpf_obj_drop(res); return retval; diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_success.c b/tools/testing/selftests/bpf/progs/task_kfunc_success.c index 70df695312dc..3138bb689b0b 100644 --- a/tools/testing/selftests/bpf/progs/task_kfunc_success.c +++ b/tools/testing/selftests/bpf/progs/task_kfunc_success.c @@ -5,6 +5,7 @@ #include #include +#include "../bpf_experimental.h" #include "task_kfunc_common.h" char _license[] SEC("license") = "GPL"; @@ -143,7 +144,7 @@ SEC("tp_btf/task_newtask") int BPF_PROG(test_task_xchg_release, struct task_struct *task, u64 clone_flags) { struct task_struct *kptr; - struct __tasks_kfunc_map_value *v; + struct __tasks_kfunc_map_value *v, *local; long status; if (!is_test_kfunc_task()) @@ -167,6 +168,29 @@ int BPF_PROG(test_task_xchg_release, struct task_struct *task, u64 clone_flags) return 0; } + local = bpf_obj_new(typeof(*local)); + if (!local) { + err = 4; + bpf_task_release(kptr); + return 0; + } + + kptr = bpf_kptr_xchg(&local->task, kptr); + if (kptr) { + err = 5; + bpf_obj_drop(local); + bpf_task_release(kptr); + return 0; + } + + kptr = bpf_kptr_xchg(&local->task, NULL); + if (!kptr) { + err = 6; + bpf_obj_drop(local); + return 0; + } + + bpf_obj_drop(local); bpf_task_release(kptr); return 0; -- cgit v1.2.3 From 5772c3458bb8d17d763e0f411e1bae1bf4eda88d Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 23 Aug 2024 12:44:09 -0700 Subject: selftests/bpf: use simply-expanded variables for libpcap flags Save pkg-config output for libpcap as simply-expanded variables. For an obscure reason 'shell' call in LDLIBS/CFLAGS recursively expanded variables makes *.test.o files compilation non-parallel when make is executed with -j option. While at it, reuse 'pkg-config --cflags' call to define -DTRAFFIC_MONITOR=1 option, it's exit status is the same as for 'pkg-config --exists'. Fixes: f52403b6bfea ("selftests/bpf: Add traffic monitor functions.") Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240823194409.774815-1-eddyz87@gmail.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/Makefile | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index ec7d425c4022..c120617b64ad 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -48,9 +48,10 @@ CFLAGS += -g $(OPT_FLAGS) -rdynamic \ LDFLAGS += $(SAN_LDFLAGS) LDLIBS += $(LIBELF_LIBS) -lz -lrt -lpthread -LDLIBS += $(shell $(PKG_CONFIG) --libs libpcap 2>/dev/null) -CFLAGS += $(shell $(PKG_CONFIG) --cflags libpcap 2>/dev/null) -CFLAGS += $(shell $(PKG_CONFIG) --exists libpcap 2>/dev/null && echo "-DTRAFFIC_MONITOR=1") +PCAP_CFLAGS := $(shell $(PKG_CONFIG) --cflags libpcap 2>/dev/null && echo "-DTRAFFIC_MONITOR=1") +PCAP_LIBS := $(shell $(PKG_CONFIG) --libs libpcap 2>/dev/null) +LDLIBS += $(PCAP_LIBS) +CFLAGS += $(PCAP_CFLAGS) # The following tests perform type punning and they may break strict # aliasing rules, which are exploited by both GCC and clang by default -- cgit v1.2.3 From 65ab5ac4df012388481d0414fcac1d5ac1721fb3 Mon Sep 17 00:00:00 2001 From: Jordan Rome Date: Fri, 23 Aug 2024 12:51:00 -0700 Subject: bpf: Add bpf_copy_from_user_str kfunc This adds a kfunc wrapper around strncpy_from_user, which can be called from sleepable BPF programs. 
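For illustration, a hedged sketch of a sleepable BPF program calling the new kfunc (the attach point and buffer are illustrative; the declaration style mirrors the selftests added later in this series):

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	char _license[] SEC("license") = "GPL";

	int bpf_copy_from_user_str(void *dst, u32 dst__sz,
				   const void *unsafe_ptr, u64 flags) __weak __ksym;

	char buf[64];

	SEC("uprobe.s//proc/self/exe:trigger_func")	/* sleepable uprobe */
	int copy_user_str(struct pt_regs *ctx)
	{
		/* on success returns the copied length including the NUL;
		 * BPF_F_PAD_ZEROS additionally zeroes the tail of buf
		 */
		int ret = bpf_copy_from_user_str(buf, sizeof(buf),
						 (void *)PT_REGS_PARM1(ctx),
						 BPF_F_PAD_ZEROS);
		return ret < 0;
	}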
This matches the non-sleepable 'bpf_probe_read_user_str' helper except it includes an additional 'flags' param, which allows consumers to clear the entire destination buffer on success or failure. Signed-off-by: Jordan Rome Link: https://lore.kernel.org/r/20240823195101.3621028-1-linux@jordanrome.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 9 +++++++++ kernel/bpf/helpers.c | 42 ++++++++++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 9 +++++++++ 3 files changed, 60 insertions(+) (limited to 'tools') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e2629457d72d..c3a5728db115 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -7513,4 +7513,13 @@ struct bpf_iter_num { __u64 __opaque[1]; } __attribute__((aligned(8))); +/* + * Flags to control BPF kfunc behaviour. + * - BPF_F_PAD_ZEROS: Pad destination buffer with zeros. (See the respective + * helper documentation for details.) + */ +enum bpf_kfunc_flags { + BPF_F_PAD_ZEROS = (1ULL << 0), +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index f12f075b70c5..7e7761c537f8 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2962,6 +2962,47 @@ __bpf_kfunc void bpf_iter_bits_destroy(struct bpf_iter_bits *it) bpf_mem_free(&bpf_global_ma, kit->bits); } +/** + * bpf_copy_from_user_str() - Copy a string from an unsafe user address + * @dst: Destination address, in kernel space. This buffer must be + * at least @dst__sz bytes long. + * @dst__sz: Maximum number of bytes to copy, includes the trailing NUL. + * @unsafe_ptr__ign: Source address, in user space. + * @flags: The only supported flag is BPF_F_PAD_ZEROS + * + * Copies a NUL-terminated string from userspace to BPF space. If user string is + * too long this will still ensure zero termination in the dst buffer unless + * buffer size is 0. + * + * If BPF_F_PAD_ZEROS flag is set, memset the tail of @dst to 0 on success and + * memset all of @dst on failure. + */ +__bpf_kfunc int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void __user *unsafe_ptr__ign, u64 flags) +{ + int ret; + + if (unlikely(flags & ~BPF_F_PAD_ZEROS)) + return -EINVAL; + + if (unlikely(!dst__sz)) + return 0; + + ret = strncpy_from_user(dst, unsafe_ptr__ign, dst__sz - 1); + if (ret < 0) { + if (flags & BPF_F_PAD_ZEROS) + memset((char *)dst, 0, dst__sz); + + return ret; + } + + if (flags & BPF_F_PAD_ZEROS) + memset((char *)dst + ret, 0, dst__sz - ret); + else + ((char *)dst)[ret] = '\0'; + + return ret + 1; +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(generic_btf_ids) @@ -3047,6 +3088,7 @@ BTF_ID_FLAGS(func, bpf_preempt_enable) BTF_ID_FLAGS(func, bpf_iter_bits_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_bits_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_bits_destroy, KF_ITER_DESTROY) +BTF_ID_FLAGS(func, bpf_copy_from_user_str, KF_SLEEPABLE) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 35bcf52dbc65..f329ee44627a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -7512,4 +7512,13 @@ struct bpf_iter_num { __u64 __opaque[1]; } __attribute__((aligned(8))); +/* + * Flags to control BPF kfunc behaviour. + * - BPF_F_PAD_ZEROS: Pad destination buffer with zeros. (See the respective + * helper documentation for details.) 
+ */ +enum bpf_kfunc_flags { + BPF_F_PAD_ZEROS = (1ULL << 0), +}; + #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3 From ddc3d98807dca05ad10f4c76d13019c428302719 Mon Sep 17 00:00:00 2001 From: Jordan Rome Date: Fri, 23 Aug 2024 12:51:01 -0700 Subject: selftests/bpf: Add tests for bpf_copy_from_user_str kfunc. This adds tests for both the happy path and the error path. Signed-off-by: Jordan Rome Link: https://lore.kernel.org/r/20240823195101.3621028-2-linux@jordanrome.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/attach_probe.c | 8 ++- .../selftests/bpf/prog_tests/read_vsyscall.c | 1 + tools/testing/selftests/bpf/progs/read_vsyscall.c | 9 ++- .../selftests/bpf/progs/test_attach_probe.c | 64 +++++++++++++++++++++- 4 files changed, 75 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/attach_probe.c b/tools/testing/selftests/bpf/prog_tests/attach_probe.c index 7175af39134f..329c7862b52d 100644 --- a/tools/testing/selftests/bpf/prog_tests/attach_probe.c +++ b/tools/testing/selftests/bpf/prog_tests/attach_probe.c @@ -283,9 +283,11 @@ static void test_uprobe_sleepable(struct test_attach_probe *skel) trigger_func3(); ASSERT_EQ(skel->bss->uprobe_byname3_sleepable_res, 9, "check_uprobe_byname3_sleepable_res"); - ASSERT_EQ(skel->bss->uprobe_byname3_res, 10, "check_uprobe_byname3_res"); - ASSERT_EQ(skel->bss->uretprobe_byname3_sleepable_res, 11, "check_uretprobe_byname3_sleepable_res"); - ASSERT_EQ(skel->bss->uretprobe_byname3_res, 12, "check_uretprobe_byname3_res"); + ASSERT_EQ(skel->bss->uprobe_byname3_str_sleepable_res, 10, "check_uprobe_byname3_str_sleepable_res"); + ASSERT_EQ(skel->bss->uprobe_byname3_res, 11, "check_uprobe_byname3_res"); + ASSERT_EQ(skel->bss->uretprobe_byname3_sleepable_res, 12, "check_uretprobe_byname3_sleepable_res"); + ASSERT_EQ(skel->bss->uretprobe_byname3_str_sleepable_res, 13, "check_uretprobe_byname3_str_sleepable_res"); + ASSERT_EQ(skel->bss->uretprobe_byname3_res, 14, "check_uretprobe_byname3_res"); } void test_attach_probe(void) diff --git a/tools/testing/selftests/bpf/prog_tests/read_vsyscall.c b/tools/testing/selftests/bpf/prog_tests/read_vsyscall.c index 3405923fe4e6..c7b9ba8b1d06 100644 --- a/tools/testing/selftests/bpf/prog_tests/read_vsyscall.c +++ b/tools/testing/selftests/bpf/prog_tests/read_vsyscall.c @@ -23,6 +23,7 @@ struct read_ret_desc { { .name = "probe_read_user_str", .ret = -EFAULT }, { .name = "copy_from_user", .ret = -EFAULT }, { .name = "copy_from_user_task", .ret = -EFAULT }, + { .name = "copy_from_user_str", .ret = -EFAULT }, }; void test_read_vsyscall(void) diff --git a/tools/testing/selftests/bpf/progs/read_vsyscall.c b/tools/testing/selftests/bpf/progs/read_vsyscall.c index 986f96687ae1..39ebef430059 100644 --- a/tools/testing/selftests/bpf/progs/read_vsyscall.c +++ b/tools/testing/selftests/bpf/progs/read_vsyscall.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2024. 
Huawei Technologies Co., Ltd */ +#include "vmlinux.h" #include #include @@ -7,10 +8,15 @@ int target_pid = 0; void *user_ptr = 0; -int read_ret[8]; +int read_ret[9]; char _license[] SEC("license") = "GPL"; +/* + * This is the only kfunc, the others are helpers + */ +int bpf_copy_from_user_str(void *dst, u32, const void *, u64) __weak __ksym; + SEC("fentry/" SYS_PREFIX "sys_nanosleep") int do_probe_read(void *ctx) { @@ -40,6 +46,7 @@ int do_copy_from_user(void *ctx) read_ret[6] = bpf_copy_from_user(buf, sizeof(buf), user_ptr); read_ret[7] = bpf_copy_from_user_task(buf, sizeof(buf), user_ptr, bpf_get_current_task_btf(), 0); + read_ret[8] = bpf_copy_from_user_str((char *)buf, sizeof(buf), user_ptr, 0); return 0; } diff --git a/tools/testing/selftests/bpf/progs/test_attach_probe.c b/tools/testing/selftests/bpf/progs/test_attach_probe.c index 68466a6ad18c..fb79e6cab932 100644 --- a/tools/testing/selftests/bpf/progs/test_attach_probe.c +++ b/tools/testing/selftests/bpf/progs/test_attach_probe.c @@ -5,8 +5,10 @@ #include #include #include +#include #include "bpf_misc.h" +u32 dynamic_sz = 1; int kprobe2_res = 0; int kretprobe2_res = 0; int uprobe_byname_res = 0; @@ -14,11 +16,15 @@ int uretprobe_byname_res = 0; int uprobe_byname2_res = 0; int uretprobe_byname2_res = 0; int uprobe_byname3_sleepable_res = 0; +int uprobe_byname3_str_sleepable_res = 0; int uprobe_byname3_res = 0; int uretprobe_byname3_sleepable_res = 0; +int uretprobe_byname3_str_sleepable_res = 0; int uretprobe_byname3_res = 0; void *user_ptr = 0; +int bpf_copy_from_user_str(void *dst, u32, const void *, u64) __weak __ksym; + SEC("ksyscall/nanosleep") int BPF_KSYSCALL(handle_kprobe_auto, struct __kernel_timespec *req, struct __kernel_timespec *rem) { @@ -87,11 +93,61 @@ static __always_inline bool verify_sleepable_user_copy(void) return bpf_strncmp(data, sizeof(data), "test_data") == 0; } +static __always_inline bool verify_sleepable_user_copy_str(void) +{ + int ret; + char data_long[20]; + char data_long_pad[20]; + char data_long_err[20]; + char data_short[4]; + char data_short_pad[4]; + + ret = bpf_copy_from_user_str(data_short, sizeof(data_short), user_ptr, 0); + + if (bpf_strncmp(data_short, 4, "tes\0") != 0 || ret != 4) + return false; + + ret = bpf_copy_from_user_str(data_short_pad, sizeof(data_short_pad), user_ptr, BPF_F_PAD_ZEROS); + + if (bpf_strncmp(data_short, 4, "tes\0") != 0 || ret != 4) + return false; + + /* Make sure this passes the verifier */ + ret = bpf_copy_from_user_str(data_long, dynamic_sz & sizeof(data_long), user_ptr, 0); + + if (ret != 0) + return false; + + ret = bpf_copy_from_user_str(data_long, sizeof(data_long), user_ptr, 0); + + if (bpf_strncmp(data_long, 10, "test_data\0") != 0 || ret != 10) + return false; + + ret = bpf_copy_from_user_str(data_long_pad, sizeof(data_long_pad), user_ptr, BPF_F_PAD_ZEROS); + + if (bpf_strncmp(data_long_pad, 10, "test_data\0") != 0 || ret != 10 || data_long_pad[19] != '\0') + return false; + + ret = bpf_copy_from_user_str(data_long_err, sizeof(data_long_err), (void *)data_long, BPF_F_PAD_ZEROS); + + if (ret > 0 || data_long_err[19] != '\0') + return false; + + ret = bpf_copy_from_user_str(data_long, sizeof(data_long), user_ptr, 2); + + if (ret != -EINVAL) + return false; + + return true; +} + SEC("uprobe.s//proc/self/exe:trigger_func3") int handle_uprobe_byname3_sleepable(struct pt_regs *ctx) { if (verify_sleepable_user_copy()) uprobe_byname3_sleepable_res = 9; + if (verify_sleepable_user_copy_str()) + uprobe_byname3_str_sleepable_res = 10; return 0; } @@ -102,7 +158,7 @@ 
int handle_uprobe_byname3_sleepable(struct pt_regs *ctx) SEC("uprobe//proc/self/exe:trigger_func3") int handle_uprobe_byname3(struct pt_regs *ctx) { - uprobe_byname3_res = 10; + uprobe_byname3_res = 11; return 0; } @@ -110,14 +166,16 @@ SEC("uretprobe.s//proc/self/exe:trigger_func3") int handle_uretprobe_byname3_sleepable(struct pt_regs *ctx) { if (verify_sleepable_user_copy()) - uretprobe_byname3_sleepable_res = 11; + uretprobe_byname3_sleepable_res = 12; + if (verify_sleepable_user_copy_str()) + uretprobe_byname3_str_sleepable_res = 13; return 0; } SEC("uretprobe//proc/self/exe:trigger_func3") int handle_uretprobe_byname3(struct pt_regs *ctx) { - uretprobe_byname3_res = 12; + uretprobe_byname3_res = 14; return 0; } -- cgit v1.2.3 From 9c5b6d4e33dd78a0511ec34756d783b7e36028c2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 22 Aug 2024 15:04:29 +0200 Subject: selftests: add xfrm policy insertion speed test script Nothing special, just test how long insertion of x policies takes. This should ideally show linear insertion speeds. Do not run this by default, it has little value, but it can be useful to check for insertion speed changes when altering the xfrm policy db implementation. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- tools/testing/selftests/net/Makefile | 2 +- .../testing/selftests/net/xfrm_policy_add_speed.sh | 83 ++++++++++++++++++++++ 2 files changed, 84 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/net/xfrm_policy_add_speed.sh (limited to 'tools') diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 8eaffd7a641c..e127a80ff713 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -56,7 +56,7 @@ TEST_PROGS += ip_local_port_range.sh TEST_PROGS += rps_default_mask.sh TEST_PROGS += big_tcp.sh TEST_PROGS += netns-sysctl.sh -TEST_PROGS_EXTENDED := toeplitz_client.sh toeplitz.sh +TEST_PROGS_EXTENDED := toeplitz_client.sh toeplitz.sh xfrm_policy_add_speed.sh TEST_GEN_FILES = socket nettest TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any TEST_GEN_FILES += tcp_mmap tcp_inq psock_snd txring_overwrite diff --git a/tools/testing/selftests/net/xfrm_policy_add_speed.sh b/tools/testing/selftests/net/xfrm_policy_add_speed.sh new file mode 100755 index 000000000000..2fab29d3cb91 --- /dev/null +++ b/tools/testing/selftests/net/xfrm_policy_add_speed.sh @@ -0,0 +1,83 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +source lib.sh + +timeout=4m +ret=0 +tmp=$(mktemp) +cleanup() { + cleanup_all_ns + rm -f "$tmp" +} + +trap cleanup EXIT + +maxpolicies=100000 +[ "$KSFT_MACHINE_SLOW" = "yes" ] && maxpolicies=10000 + +do_dummies4() { + local dir="$1" + local max="$2" + + local policies + local pfx + pfx=30 + policies=0 + + ip netns exec "$ns" ip xfrm policy flush + + for i in $(seq 1 100);do + local s + local d + for j in $(seq 1 255);do + s=$((i+0)) + d=$((i+100)) + + for a in $(seq 1 8 255); do + policies=$((policies+1)) + [ "$policies" -gt "$max" ] && return + echo xfrm policy add src 10.$s.$j.0/30 dst 10.$d.$j.$a/$pfx dir $dir action block + done + for a in $(seq 1 8 255); do + policies=$((policies+1)) + [ "$policies" -gt "$max" ] && return + echo xfrm policy add src 10.$s.$j.$a/30 dst 10.$d.$j.0/$pfx dir $dir action block + done + done + done +}  + +setup_ns ns + +do_bench() +{ + local max="$1" + + start=$(date +%s%3N) + do_dummies4 "out" "$max" > "$tmp" + if !
timeout "$timeout" ip netns exec "$ns" ip -batch "$tmp";then + echo "WARNING: policy insertion cancelled after $timeout" + ret=1 + fi + stop=$(date +%s%3N) + + result=$((stop-start)) + + policies=$(wc -l < "$tmp") + printf "Inserted %-06s policies in $result ms\n" $policies + + have=$(ip netns exec "$ns" ip xfrm policy show | grep "action block" | wc -l) + if [ "$have" -ne "$policies" ]; then + echo "WARNING: mismatch, have $have policies, expected $policies" + ret=1 + fi +} + +p=100 +while [ $p -le "$maxpolicies" ]; do + do_bench "$p" + p="${p}0" +done + +exit $ret -- cgit v1.2.3 From 996dc53ac289b81957aa70d62ccadc6986d26a87 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 22 Aug 2024 11:45:54 -0300 Subject: iommufd: Do not allow creating areas without READ or WRITE This results in passing 0 or just IOMMU_CACHE to iommu_map(). Most of the page table formats don't like this: amdv1 - -EINVAL armv7s - returns 0, doesn't update mapped arm-lpae - returns 0 doesn't update mapped dart - returns 0, doesn't update mapped VT-D - returns -EINVAL Unfortunately the three formats that return 0 cause serious problems: - Returning ret = but not uppdating mapped from domain->map_pages() causes an infinite loop in __iommu_map() - Not writing ioptes means that VFIO/iommufd have no way to recover them and we will have memory leaks and worse during unmap Since almost nothing can support this, and it is a useless thing to do, block it early in iommufd. Cc: stable@kernel.org Fixes: aad37e71d5c4 ("iommufd: IOCTLs for the io_pagetable") Signed-off-by: Jason Gunthorpe Reviewed-by: Nicolin Chen Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/1-v1-1211e1294c27+4b1-iommu_no_prot_jgg@nvidia.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommufd/ioas.c | 8 ++++++++ tools/testing/selftests/iommu/iommufd.c | 6 +++--- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c index 742248276548..157a89b993e4 100644 --- a/drivers/iommu/iommufd/ioas.c +++ b/drivers/iommu/iommufd/ioas.c @@ -213,6 +213,10 @@ int iommufd_ioas_map(struct iommufd_ucmd *ucmd) if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX) return -EOVERFLOW; + if (!(cmd->flags & + (IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE))) + return -EINVAL; + ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id); if (IS_ERR(ioas)) return PTR_ERR(ioas); @@ -253,6 +257,10 @@ int iommufd_ioas_copy(struct iommufd_ucmd *ucmd) cmd->dst_iova >= ULONG_MAX) return -EOVERFLOW; + if (!(cmd->flags & + (IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE))) + return -EINVAL; + src_ioas = iommufd_get_ioas(ucmd->ictx, cmd->src_ioas_id); if (IS_ERR(src_ioas)) return PTR_ERR(src_ioas); diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 6343f4053bd4..4927b9add5ad 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -825,7 +825,7 @@ TEST_F(iommufd_ioas, copy_area) { struct iommu_ioas_copy copy_cmd = { .size = sizeof(copy_cmd), - .flags = IOMMU_IOAS_MAP_FIXED_IOVA, + .flags = IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE, .dst_ioas_id = self->ioas_id, .src_ioas_id = self->ioas_id, .length = PAGE_SIZE, @@ -1318,7 +1318,7 @@ TEST_F(iommufd_ioas, copy_sweep) { struct iommu_ioas_copy copy_cmd = { .size = sizeof(copy_cmd), - .flags = IOMMU_IOAS_MAP_FIXED_IOVA, + .flags = IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE, .src_ioas_id = self->ioas_id, .dst_iova = 
MOCK_APERTURE_START, .length = MOCK_PAGE_SIZE, @@ -1608,7 +1608,7 @@ TEST_F(iommufd_mock_domain, user_copy) }; struct iommu_ioas_copy copy_cmd = { .size = sizeof(copy_cmd), - .flags = IOMMU_IOAS_MAP_FIXED_IOVA, + .flags = IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE, .dst_ioas_id = self->ioas_id, .dst_iova = MOCK_APERTURE_START, .length = BUFFER_SIZE, -- cgit v1.2.3 From 052f3951640fd96d2e777b3272a925ec6c0c8100 Mon Sep 17 00:00:00 2001 From: Ryan Sullivan Date: Thu, 22 Aug 2024 13:31:22 -0400 Subject: selftests/livepatch: wait for atomic replace to occur On some machines with a large number of CPUs there is a sizable delay between an atomic replace occurring and when sysfs updates accordingly. This fix uses 'loop_until' to wait for the atomic replace to unload all previous livepatches. Reported-by: CKI Project Closes: https://datawarehouse.cki-project.org/kcidb/tests/redhat:1413102084-x86_64-kernel_upt_28 Signed-off-by: Ryan Sullivan Reviewed-by: Petr Mladek Acked-by: Joe Lawrence Link: https://lore.kernel.org/r/20240822173122.14760-1-rysulliv@redhat.com Signed-off-by: Petr Mladek --- tools/testing/selftests/livepatch/test-livepatch.sh | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/livepatch/test-livepatch.sh b/tools/testing/selftests/livepatch/test-livepatch.sh index 65c9c058458d..bd13257bfdfe 100755 --- a/tools/testing/selftests/livepatch/test-livepatch.sh +++ b/tools/testing/selftests/livepatch/test-livepatch.sh @@ -139,11 +139,8 @@ load_lp $MOD_REPLACE replace=1 grep 'live patched' /proc/cmdline > /dev/kmsg grep 'live patched' /proc/meminfo > /dev/kmsg -mods=(/sys/kernel/livepatch/*) -nmods=${#mods[@]} -if [ "$nmods" -ne 1 ]; then - die "Expecting only one moduled listed, found $nmods" -fi +loop_until 'mods=(/sys/kernel/livepatch/*); nmods=${#mods[@]}; [[ "$nmods" -eq 1 ]]' || + die "Expecting only one moduled listed, found $nmods" # These modules were disabled by the atomic replace for mod in $MOD_LIVEPATCH3 $MOD_LIVEPATCH2 $MOD_LIVEPATCH1; do -- cgit v1.2.3 From f133c76409c81653cb580903da49aecefca67013 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 13 Aug 2024 14:36:49 -0700 Subject: perf test: Support external tests for separate objdir Extend the searching for the test files so that it works when running perf from a separate objdir, and also when the perf executable is symlinked. 
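For reference, a hedged sketch of exercising the new lookup from a separate objdir (the O= build layout and the symlinked binary are assumptions for illustration, not part of the patch):

	$ make -C tools/perf O=/tmp/perf-objdir
	$ ln -s /tmp/perf-objdir/perf ~/bin/perf
	$ ~/bin/perf test list
	# shell tests are now resolved relative to /proc/self/exe:
	# first <dir-of-binary>/tests/shell, then
	# <dir-of-binary>/source/tests/shell, and finally the installed
	# exec path as before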
Signed-off-by: Andi Kleen Acked-by: Namhyung Kim Cc: Andi Kleen Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240813213651.1057362-2-ak@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/tests-scripts.c | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/tests-scripts.c b/tools/perf/tests/tests-scripts.c index e2042b368269..8dc1e398288c 100644 --- a/tools/perf/tests/tests-scripts.c +++ b/tools/perf/tests/tests-scripts.c @@ -29,16 +29,45 @@ static int shell_tests__dir_fd(void) { - char path[PATH_MAX], *exec_path; - static const char * const devel_dirs[] = { "./tools/perf/tests/shell", "./tests/shell", }; + struct stat st; + char path[PATH_MAX], path2[PATH_MAX], *exec_path; + static const char * const devel_dirs[] = { + "./tools/perf/tests/shell", + "./tests/shell", + "./source/tests/shell" + }; + int fd; + char *p; for (size_t i = 0; i < ARRAY_SIZE(devel_dirs); ++i) { - int fd = open(devel_dirs[i], O_PATH); + fd = open(devel_dirs[i], O_PATH); if (fd >= 0) return fd; } + /* Use directory of executable */ + if (readlink("/proc/self/exe", path2, sizeof path2) < 0) + return -1; + /* Follow another level of symlink if there */ + if (lstat(path2, &st) == 0 && (st.st_mode & S_IFMT) == S_IFLNK) { + scnprintf(path, sizeof(path), path2); + if (readlink(path, path2, sizeof path2) < 0) + return -1; + } + /* Get directory */ + p = strrchr(path2, '/'); + if (p) + *p = 0; + scnprintf(path, sizeof(path), "%s/tests/shell", path2); + fd = open(path, O_PATH); + if (fd >= 0) + return fd; + scnprintf(path, sizeof(path), "%s/source/tests/shell", path2); + fd = open(path, O_PATH); + if (fd >= 0) + return fd; + /* Then installed path. */ exec_path = get_argv_exec_path(); scnprintf(path, sizeof(path), "%s/tests/shell", exec_path); -- cgit v1.2.3 From b494b1673889c95ead43fb6978687fb61bb042d5 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 22 Aug 2024 02:56:39 -0700 Subject: net: netconsole: selftests: Create a new netconsole selftest Adds a selftest that creates two virtual interfaces, assigns one to a new namespace, and assigns IP addresses to both. It listens on the destination interface using socat and configures a dynamic target on netconsole, pointing to the destination IP address. The test then checks if the message was received properly on the destination interface. 
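For illustration, a hedged sketch of the configfs sequence the script automates (IPs, port and message mirror the script's defaults; the target name, interface name and MAC are illustrative):

	mkdir /sys/kernel/config/netconsole/netcons_test
	cd /sys/kernel/config/netconsole/netcons_test
	echo 192.168.1.1 > local_ip	# SRCIP in the script
	echo 192.168.1.2 > remote_ip	# DSTIP in the script
	echo aa:bb:cc:dd:ee:01 > remote_mac	# MAC of the destination interface
	echo eth0 > dev_name		# source interface
	echo 1 > enabled
	# every kernel message is now also sent to 192.168.1.2:6666 via UDP
	echo "netconsole selftest" > /dev/kmsg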
Signed-off-by: Breno Leitao Acked-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240822095652.3806208-1-leitao@debian.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 + tools/testing/selftests/drivers/net/Makefile | 5 +- tools/testing/selftests/drivers/net/config | 4 + .../testing/selftests/drivers/net/netcons_basic.sh | 234 +++++++++++++++++++++ 4 files changed, 243 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/drivers/net/netcons_basic.sh (limited to 'tools') diff --git a/MAINTAINERS b/MAINTAINERS index e4fa9010fcb6..30a9b9450e11 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15775,6 +15775,7 @@ M: Breno Leitao S: Maintained F: Documentation/networking/netconsole.rst F: drivers/net/netconsole.c +F: tools/testing/selftests/drivers/net/netcons_basic.sh NETDEVSIM M: Jakub Kicinski diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile index e54f382bcb02..39fb97a8c1df 100644 --- a/tools/testing/selftests/drivers/net/Makefile +++ b/tools/testing/selftests/drivers/net/Makefile @@ -1,8 +1,11 @@ # SPDX-License-Identifier: GPL-2.0 -TEST_INCLUDES := $(wildcard lib/py/*.py) +TEST_INCLUDES := $(wildcard lib/py/*.py) \ + ../../net/net_helper.sh \ + ../../net/lib.sh \ TEST_PROGS := \ + netcons_basic.sh \ ping.py \ queues.py \ stats.py \ diff --git a/tools/testing/selftests/drivers/net/config b/tools/testing/selftests/drivers/net/config index f6a58ce8a230..a2d8af60876d 100644 --- a/tools/testing/selftests/drivers/net/config +++ b/tools/testing/selftests/drivers/net/config @@ -1,2 +1,6 @@ CONFIG_IPV6=y CONFIG_NETDEVSIM=m +CONFIG_CONFIGFS_FS=y +CONFIG_NETCONSOLE=m +CONFIG_NETCONSOLE_DYNAMIC=y +CONFIG_NETCONSOLE_EXTENDED_LOG=y diff --git a/tools/testing/selftests/drivers/net/netcons_basic.sh b/tools/testing/selftests/drivers/net/netcons_basic.sh new file mode 100755 index 000000000000..06021b2059b7 --- /dev/null +++ b/tools/testing/selftests/drivers/net/netcons_basic.sh @@ -0,0 +1,234 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: GPL-2.0 + +# This test creates two netdevsim virtual interfaces, assigns one of them (the +# "destination interface") to a new namespace, and assigns IP addresses to both +# interfaces. +# +# It listens on the destination interface using socat and configures a dynamic +# target on netconsole, pointing to the destination IP address. +# +# Finally, it checks whether the message was received properly on the +# destination interface. Note that this test may pollute the kernel log buffer +# (dmesg) and relies on dynamic configuration and namespaces being configured. 
+# +# Author: Breno Leitao + +set -euo pipefail + +SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +# Simple script to test dynamic targets in netconsole +SRCIF="" # to be populated later +SRCIP=192.168.1.1 +DSTIF="" # to be populated later +DSTIP=192.168.1.2 + +PORT="6666" +MSG="netconsole selftest" +TARGET=$(mktemp -u netcons_XXXXX) +DEFAULT_PRINTK_VALUES=$(cat /proc/sys/kernel/printk) +NETCONS_CONFIGFS="/sys/kernel/config/netconsole" +NETCONS_PATH="${NETCONS_CONFIGFS}"/"${TARGET}" +# NAMESPACE will be populated by setup_ns with a random value +NAMESPACE="" + +# IDs for netdevsim +NSIM_DEV_1_ID=$((256 + RANDOM % 256)) +NSIM_DEV_2_ID=$((512 + RANDOM % 256)) + +# Used to create and delete namespaces +source "${SCRIPTDIR}"/../../net/lib.sh +source "${SCRIPTDIR}"/../../net/net_helper.sh + +# Create netdevsim interfaces +create_ifaces() { + local NSIM_DEV_SYS_NEW=/sys/bus/netdevsim/new_device + + echo "$NSIM_DEV_2_ID" > "$NSIM_DEV_SYS_NEW" + echo "$NSIM_DEV_1_ID" > "$NSIM_DEV_SYS_NEW" + udevadm settle 2> /dev/null || true + + local NSIM1=/sys/bus/netdevsim/devices/netdevsim"$NSIM_DEV_1_ID" + local NSIM2=/sys/bus/netdevsim/devices/netdevsim"$NSIM_DEV_2_ID" + + # These are global variables + SRCIF=$(find "$NSIM1"/net -maxdepth 1 -type d ! \ + -path "$NSIM1"/net -exec basename {} \;) + DSTIF=$(find "$NSIM2"/net -maxdepth 1 -type d ! \ + -path "$NSIM2"/net -exec basename {} \;) +} + +link_ifaces() { + local NSIM_DEV_SYS_LINK="/sys/bus/netdevsim/link_device" + local SRCIF_IFIDX=$(cat /sys/class/net/"$SRCIF"/ifindex) + local DSTIF_IFIDX=$(cat /sys/class/net/"$DSTIF"/ifindex) + + exec {NAMESPACE_FD}</var/run/netns/"${NAMESPACE}" + exec {INITNS_FD}</proc/self/ns/net + + # Bind the dst interface to namespace + ip link set "${DSTIF}" netns "${NAMESPACE}" + + # Link one netdevsim device to the other (across the two namespaces) + if ! echo "${INITNS_FD}:$SRCIF_IFIDX $NAMESPACE_FD:$DSTIF_IFIDX" \ + > $NSIM_DEV_SYS_LINK + then + echo "linking netdevsim1 with netdevsim2 should succeed" + cleanup + exit "${ksft_skip}" + fi +} + +function configure_ip() { + # Configure the IPs for both interfaces + ip netns exec "${NAMESPACE}" ip addr add "${DSTIP}"/24 dev "${DSTIF}" + ip netns exec "${NAMESPACE}" ip link set "${DSTIF}" up + + ip addr add "${SRCIP}"/24 dev "${SRCIF}" + ip link set "${SRCIF}" up +} + +function set_network() { + # setup_ns function is coming from lib.sh + setup_ns NAMESPACE + + # Create both interfaces, and assign the destination to a different + # namespace + create_ifaces + + # Link both interfaces back to back + link_ifaces + + configure_ip +} + +function create_dynamic_target() { + DSTMAC=$(ip netns exec "${NAMESPACE}" \ + ip link show "${DSTIF}" | awk '/ether/ {print $2}') + + # Create a dynamic target + mkdir "${NETCONS_PATH}" + + echo "${DSTIP}" > "${NETCONS_PATH}"/remote_ip + echo "${SRCIP}" > "${NETCONS_PATH}"/local_ip + echo "${DSTMAC}" > "${NETCONS_PATH}"/remote_mac + echo "${SRCIF}" > "${NETCONS_PATH}"/dev_name + + echo 1 > "${NETCONS_PATH}"/enabled +} + +function cleanup() { + local NSIM_DEV_SYS_DEL="/sys/bus/netdevsim/del_device" + + # delete netconsole dynamic reconfiguration + echo 0 > "${NETCONS_PATH}"/enabled + # Remove the configfs entry + rmdir "${NETCONS_PATH}" + + # Delete netdevsim devices + echo "$NSIM_DEV_2_ID" > "$NSIM_DEV_SYS_DEL" + echo "$NSIM_DEV_1_ID" > "$NSIM_DEV_SYS_DEL" + + # this is coming from lib.sh + cleanup_all_ns + + # Restoring printk configurations + echo "${DEFAULT_PRINTK_VALUES}" > /proc/sys/kernel/printk +} + +function listen_port_and_save_to() { + local OUTPUT=${1} + # Just wait for 2 seconds + timeout 2 ip netns exec "${NAMESPACE}" \ + socat UDP-LISTEN:"${PORT}",fork "${OUTPUT}" +} + +function validate_result() { + local TMPFILENAME="$1" + + # Check if the file exists + if [ !
-f "$TMPFILENAME" ]; then + echo "FAIL: File was not generated." >&2 + exit "${ksft_fail}" + fi + + if ! grep -q "${MSG}" "${TMPFILENAME}"; then + echo "FAIL: ${MSG} not found in ${TMPFILENAME}" >&2 + cat "${TMPFILENAME}" >&2 + exit "${ksft_fail}" + fi + + # Delete the file once it is validated, otherwise keep it + # for debugging purposes + rm "${TMPFILENAME}" + exit "${ksft_pass}" +} + +function check_for_dependencies() { + if [ "$(id -u)" -ne 0 ]; then + echo "This test must be run as root" >&2 + exit "${ksft_skip}" + fi + + if ! which socat > /dev/null ; then + echo "SKIP: socat(1) is not available" >&2 + exit "${ksft_skip}" + fi + + if ! which ip > /dev/null ; then + echo "SKIP: ip(1) is not available" >&2 + exit "${ksft_skip}" + fi + + if ! which udevadm > /dev/null ; then + echo "SKIP: udevadm(1) is not available" >&2 + exit "${ksft_skip}" + fi + + if [ ! -d "${NETCONS_CONFIGFS}" ]; then + echo "SKIP: directory ${NETCONS_CONFIGFS} does not exist. Check if NETCONSOLE_DYNAMIC is enabled" >&2 + exit "${ksft_skip}" + fi + + if ip link show "${DSTIF}" 2> /dev/null; then + echo "SKIP: interface ${DSTIF} exists in the system. Not overwriting it." >&2 + exit "${ksft_skip}" + fi + + if ip addr list | grep -E "inet.*(${SRCIP}|${DSTIP})" 2> /dev/null; then + echo "SKIP: IPs already in use. Skipping it" >&2 + exit "${ksft_skip}" + fi +} + +# ========== # +# Start here # +# ========== # +modprobe netdevsim 2> /dev/null || true +modprobe netconsole 2> /dev/null || true + +# The content of kmsg will be save to the following file +OUTPUT_FILE="/tmp/${TARGET}" + +# Check for basic system dependency and exit if not found +check_for_dependencies +# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) +echo "6 5" > /proc/sys/kernel/printk +# Remove the namespace, interfaces and netconsole target on exit +trap cleanup EXIT +# Create one namespace and two interfaces +set_network +# Create a dynamic target for netconsole +create_dynamic_target +# Listed for netconsole port inside the namespace and destination interface +listen_port_and_save_to "${OUTPUT_FILE}" & +# Wait for socat to start and listen to the port. +wait_local_port_listen "${NAMESPACE}" "${PORT}" udp +# Send the message +echo "${MSG}: ${TARGET}" > /dev/kmsg +# Wait until socat saves the file to disk +busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" + +# Make sure the message was received in the dst part +# and exit +validate_result "${OUTPUT_FILE}" -- cgit v1.2.3 From eb2dcde9f970ed8d3669444d47c8524b4bdf7d32 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Fri, 28 Jun 2024 11:46:11 +0100 Subject: ring-buffer: Align meta-page to sub-buffers for improved TLB usage Previously, the mapped ring-buffer layout caused misalignment between the meta-page and sub-buffers when the sub-buffer size was not a multiple of PAGE_SIZE. This prevented hardware with larger TLB entries from utilizing them effectively. Add a padding with the zero-page between the meta-page and sub-buffers. Also update the ring-buffer map_test to verify that padding. 
Link: https://lore.kernel.org/20240628104611.1443542-1-vdonnefort@google.com Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 33 ++++++++++++++++---------- tools/testing/selftests/ring-buffer/map_test.c | 14 +++++++++++ 2 files changed, 34 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c3a5e6cbb940..77dc0b25140e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -6852,10 +6852,10 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, /* install subbuf ID to kern VA translation */ cpu_buffer->subbuf_ids = subbuf_ids; - meta->meta_page_size = PAGE_SIZE; meta->meta_struct_len = sizeof(*meta); meta->nr_subbufs = nr_subbufs; meta->subbuf_size = cpu_buffer->buffer->subbuf_size + BUF_PAGE_HDR_SIZE; + meta->meta_page_size = meta->subbuf_size; rb_update_meta_page(cpu_buffer); } @@ -6949,6 +6949,12 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, !(vma->vm_flags & VM_MAYSHARE)) return -EPERM; + subbuf_order = cpu_buffer->buffer->subbuf_order; + subbuf_pages = 1 << subbuf_order; + + if (subbuf_order && pgoff % subbuf_pages) + return -EINVAL; + /* * Make sure the mapping cannot become writable later. Also tell the VM * to not touch these pages (VM_DONTCOPY | VM_DONTEXPAND). @@ -6958,11 +6964,8 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, lockdep_assert_held(&cpu_buffer->mapping_lock); - subbuf_order = cpu_buffer->buffer->subbuf_order; - subbuf_pages = 1 << subbuf_order; - nr_subbufs = cpu_buffer->nr_pages + 1; /* + reader-subbuf */ - nr_pages = ((nr_subbufs) << subbuf_order) - pgoff + 1; /* + meta-page */ + nr_pages = ((nr_subbufs + 1) << subbuf_order) - pgoff; /* + meta-page */ nr_vma_pages = vma_pages(vma); if (!nr_vma_pages || nr_vma_pages > nr_pages) @@ -6975,20 +6978,24 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, return -ENOMEM; if (!pgoff) { + unsigned long meta_page_padding; + pages[p++] = virt_to_page(cpu_buffer->meta_page); /* - * TODO: Align sub-buffers on their size, once - * vm_insert_pages() supports the zero-page. + * Pad with the zero-page to align the meta-page with the + * sub-buffers. 
*/ - } else { - /* Skip the meta-page */ - pgoff--; + meta_page_padding = subbuf_pages - 1; + while (meta_page_padding-- && p < nr_pages) { + unsigned long __maybe_unused zero_addr = + vma->vm_start + (PAGE_SIZE * p); - if (pgoff % subbuf_pages) { - err = -EINVAL; - goto out; + pages[p++] = ZERO_PAGE(zero_addr); } + } else { + /* Skip the meta-page */ + pgoff -= subbuf_pages; s += pgoff / subbuf_pages; } diff --git a/tools/testing/selftests/ring-buffer/map_test.c b/tools/testing/selftests/ring-buffer/map_test.c index a9006fa7097e..4bb0192e43f3 100644 --- a/tools/testing/selftests/ring-buffer/map_test.c +++ b/tools/testing/selftests/ring-buffer/map_test.c @@ -228,6 +228,20 @@ TEST_F(map, data_mmap) data = mmap(NULL, data_len, PROT_READ, MAP_SHARED, desc->cpu_fd, meta_len); ASSERT_EQ(data, MAP_FAILED); + + /* Verify meta-page padding */ + if (desc->meta->meta_page_size > getpagesize()) { + void *addr; + + data_len = desc->meta->meta_page_size; + data = mmap(NULL, data_len, + PROT_READ, MAP_SHARED, desc->cpu_fd, 0); + ASSERT_NE(data, MAP_FAILED); + + addr = (void *)((unsigned long)data + getpagesize()); + ASSERT_EQ(*((int *)addr), 0); + munmap(data, data_len); + } } FIXTURE(snapshot) { -- cgit v1.2.3 From 40153505259d8dc0e4ea6889fca5e567c42b76a9 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 31 Jul 2024 18:05:31 +0200 Subject: Documentation/srso: Document a method for checking safe RET operates properly Add a method to quickly verify whether safe RET operates properly on a given system using perf tool. Also, add a selftest which does the same thing. Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240731160531.28640-1-bp@kernel.org --- Documentation/admin-guide/hw-vuln/srso.rst | 69 +++++++++++++++++++++++++++++ tools/testing/selftests/x86/Makefile | 2 +- tools/testing/selftests/x86/srso.c | 70 ++++++++++++++++++++++++++++++ 3 files changed, 140 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/x86/srso.c (limited to 'tools') diff --git a/Documentation/admin-guide/hw-vuln/srso.rst b/Documentation/admin-guide/hw-vuln/srso.rst index 4bd3ce3ba171..2ad1c05b8c88 100644 --- a/Documentation/admin-guide/hw-vuln/srso.rst +++ b/Documentation/admin-guide/hw-vuln/srso.rst @@ -158,3 +158,72 @@ poisoned BTB entry and using that safe one for all function returns. In older Zen1 and Zen2, this is accomplished using a reinterpretation technique similar to Retbleed one: srso_untrain_ret() and srso_safe_ret(). + +Checking the safe RET mitigation actually works +----------------------------------------------- + +In case one wants to validate whether the SRSO safe RET mitigation works +on a kernel, one could use two performance counters + +* PMC_0xc8 - Count of RET/RET lw retired +* PMC_0xc9 - Count of RET/RET lw retired mispredicted + +and compare the number of RETs retired properly vs those retired +mispredicted, in kernel mode. Another way of specifying those events +is:: + + # perf list ex_ret_near_ret + + List of pre-defined events (to be used in -e or -M): + + core: + ex_ret_near_ret + [Retired Near Returns] + ex_ret_near_ret_mispred + [Retired Near Returns Mispredicted] + +Either the command using the event mnemonics:: + + # perf stat -e ex_ret_near_ret:k -e ex_ret_near_ret_mispred:k sleep 10s + +or using the raw PMC numbers:: + + # perf stat -e cpu/event=0xc8,umask=0/k -e cpu/event=0xc9,umask=0/k sleep 10s + +should give the same amount. 
I.e., every RET retired should be +mispredicted:: + + [root@brent: ~/kernel/linux/tools/perf> ./perf stat -e cpu/event=0xc8,umask=0/k -e cpu/event=0xc9,umask=0/k sleep 10s + + Performance counter stats for 'sleep 10s': + + 137,167 cpu/event=0xc8,umask=0/k + 137,173 cpu/event=0xc9,umask=0/k + + 10.004110303 seconds time elapsed + + 0.000000000 seconds user + 0.004462000 seconds sys + +vs the case when the mitigation is disabled (spec_rstack_overflow=off) +or not functioning properly, showing usually a lot smaller number of +mispredicted retired RETs vs the overall count of retired RETs during +a workload:: + + [root@brent: ~/kernel/linux/tools/perf> ./perf stat -e cpu/event=0xc8,umask=0/k -e cpu/event=0xc9,umask=0/k sleep 10s + + Performance counter stats for 'sleep 10s': + + 201,627 cpu/event=0xc8,umask=0/k + 4,074 cpu/event=0xc9,umask=0/k + + 10.003267252 seconds time elapsed + + 0.002729000 seconds user + 0.000000000 seconds sys + +Also, there is a selftest which performs the above, go to +tools/testing/selftests/x86/ and do:: + + make srso + ./srso diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index 5c8757a25998..d51249f14e2f 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -77,7 +77,7 @@ all_32: $(BINARIES_32) all_64: $(BINARIES_64) -EXTRA_CLEAN := $(BINARIES_32) $(BINARIES_64) +EXTRA_CLEAN := $(BINARIES_32) $(BINARIES_64) srso $(BINARIES_32): $(OUTPUT)/%_32: %.c helpers.h $(CC) -m32 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $< $(EXTRA_FILES) -lrt -ldl -lm diff --git a/tools/testing/selftests/x86/srso.c b/tools/testing/selftests/x86/srso.c new file mode 100644 index 000000000000..394ec8bdeb00 --- /dev/null +++ b/tools/testing/selftests/x86/srso.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int main(void) +{ + struct perf_event_attr ret_attr, mret_attr; + long long count_rets, count_rets_mispred; + int rrets_fd, mrrets_fd; + unsigned int cpuid1_eax, b, c, d; + + __cpuid(1, cpuid1_eax, b, c, d); + + if (cpuid1_eax < 0x00800f00 || + cpuid1_eax > 0x00afffff) { + fprintf(stderr, "This needs to run on a Zen[1-4] machine (CPUID(1).EAX: 0x%x). 
Exiting...\n", cpuid1_eax); + exit(EXIT_FAILURE); + } + + memset(&ret_attr, 0, sizeof(struct perf_event_attr)); + memset(&mret_attr, 0, sizeof(struct perf_event_attr)); + + ret_attr.type = mret_attr.type = PERF_TYPE_RAW; + ret_attr.size = mret_attr.size = sizeof(struct perf_event_attr); + ret_attr.config = 0xc8; + mret_attr.config = 0xc9; + ret_attr.disabled = mret_attr.disabled = 1; + ret_attr.exclude_user = mret_attr.exclude_user = 1; + ret_attr.exclude_hv = mret_attr.exclude_hv = 1; + + rrets_fd = syscall(SYS_perf_event_open, &ret_attr, 0, -1, -1, 0); + if (rrets_fd == -1) { + perror("opening retired RETs fd"); + exit(EXIT_FAILURE); + } + + mrrets_fd = syscall(SYS_perf_event_open, &mret_attr, 0, -1, -1, 0); + if (mrrets_fd == -1) { + perror("opening retired mispredicted RETs fd"); + exit(EXIT_FAILURE); + } + + ioctl(rrets_fd, PERF_EVENT_IOC_RESET, 0); + ioctl(mrrets_fd, PERF_EVENT_IOC_RESET, 0); + + ioctl(rrets_fd, PERF_EVENT_IOC_ENABLE, 0); + ioctl(mrrets_fd, PERF_EVENT_IOC_ENABLE, 0); + + printf("Sleeping for 10 seconds\n"); + sleep(10); + + ioctl(rrets_fd, PERF_EVENT_IOC_DISABLE, 0); + ioctl(mrrets_fd, PERF_EVENT_IOC_DISABLE, 0); + + read(rrets_fd, &count_rets, sizeof(long long)); + read(mrrets_fd, &count_rets_mispred, sizeof(long long)); + + printf("RETs: (%lld retired <-> %lld mispredicted)\n", + count_rets, count_rets_mispred); + printf("SRSO Safe-RET mitigation works correctly if both counts are almost equal.\n"); + + return 0; +} -- cgit v1.2.3 From 59c330eccee82f9e53421dd8a83b1bc236f4557a Mon Sep 17 00:00:00 2001 From: Boris Sukholitko Date: Thu, 22 Aug 2024 13:35:09 +0300 Subject: selftests: tc_actions: test ingress 2nd vlan push Add new test checking the correctness of inner vlan flushing to the skb data when outer vlan tag is added through act_vlan on ingress. Signed-off-by: Boris Sukholitko Signed-off-by: Paolo Abeni --- .../testing/selftests/net/forwarding/tc_actions.sh | 23 +++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/tc_actions.sh b/tools/testing/selftests/net/forwarding/tc_actions.sh index 589629636502..f2f1e99a90b2 100755 --- a/tools/testing/selftests/net/forwarding/tc_actions.sh +++ b/tools/testing/selftests/net/forwarding/tc_actions.sh @@ -4,7 +4,7 @@ ALL_TESTS="gact_drop_and_ok_test mirred_egress_redirect_test \ mirred_egress_mirror_test matchall_mirred_egress_mirror_test \ gact_trap_test mirred_egress_to_ingress_test \ - mirred_egress_to_ingress_tcp_test" + mirred_egress_to_ingress_tcp_test ingress_2nd_vlan_push" NUM_NETIFS=4 source tc_common.sh source lib.sh @@ -244,6 +244,27 @@ mirred_egress_to_ingress_tcp_test() log_test "mirred_egress_to_ingress_tcp ($tcflags)" } +ingress_2nd_vlan_push() +{ + tc filter add dev $swp1 ingress pref 20 chain 0 handle 20 flower \ + $tcflags num_of_vlans 1 \ + action vlan push id 100 protocol 0x8100 action goto chain 5 + tc filter add dev $swp1 ingress pref 30 chain 5 handle 30 flower \ + $tcflags num_of_vlans 2 \ + cvlan_ethtype 0x800 action pass + + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \ + -t ip -Q 10 -q + + tc_check_packets "dev $swp1 ingress" 30 1 + check_err $? 
"No double-vlan packets received" + + tc filter del dev $swp1 ingress pref 20 chain 0 handle 20 flower + tc filter del dev $swp1 ingress pref 30 chain 5 handle 30 flower + + log_test "ingress_2nd_vlan_push ($tcflags)" +} + setup_prepare() { h1=${NETIFS[p1]} -- cgit v1.2.3 From 2da44703a54403b9048ba268b2896dc0537a154f Mon Sep 17 00:00:00 2001 From: Boris Sukholitko Date: Thu, 22 Aug 2024 13:35:10 +0300 Subject: selftests: tc_actions: test egress 2nd vlan push Add new test checking the correctness of inner vlan flushing to the skb data when outer vlan tag is added through act_vlan on egress. Signed-off-by: Boris Sukholitko Signed-off-by: Paolo Abeni --- .../testing/selftests/net/forwarding/tc_actions.sh | 25 +++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/tc_actions.sh b/tools/testing/selftests/net/forwarding/tc_actions.sh index f2f1e99a90b2..ea89e558672d 100755 --- a/tools/testing/selftests/net/forwarding/tc_actions.sh +++ b/tools/testing/selftests/net/forwarding/tc_actions.sh @@ -4,7 +4,8 @@ ALL_TESTS="gact_drop_and_ok_test mirred_egress_redirect_test \ mirred_egress_mirror_test matchall_mirred_egress_mirror_test \ gact_trap_test mirred_egress_to_ingress_test \ - mirred_egress_to_ingress_tcp_test ingress_2nd_vlan_push" + mirred_egress_to_ingress_tcp_test \ + ingress_2nd_vlan_push egress_2nd_vlan_push" NUM_NETIFS=4 source tc_common.sh source lib.sh @@ -265,6 +266,28 @@ ingress_2nd_vlan_push() log_test "ingress_2nd_vlan_push ($tcflags)" } +egress_2nd_vlan_push() +{ + tc filter add dev $h1 egress pref 20 chain 0 handle 20 flower \ + $tcflags num_of_vlans 0 \ + action vlan push id 10 protocol 0x8100 \ + pipe action vlan push id 100 protocol 0x8100 action goto chain 5 + tc filter add dev $h1 egress pref 30 chain 5 handle 30 flower \ + $tcflags num_of_vlans 2 \ + cvlan_ethtype 0x800 action pass + + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \ + -t ip -q + + tc_check_packets "dev $h1 egress" 30 1 + check_err $? "No double-vlan packets received" + + tc filter del dev $h1 egress pref 20 chain 0 handle 20 flower + tc filter del dev $h1 egress pref 30 chain 5 handle 30 flower + + log_test "egress_2nd_vlan_push ($tcflags)" +} + setup_prepare() { h1=${NETIFS[p1]} -- cgit v1.2.3 From de2e75209303b98d3169a249a1bc847be9657d9b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 27 Aug 2024 16:25:17 +0100 Subject: KVM: arm64: Add selftest checking how the absence of GICv3 is handled Given how tortuous and fragile the whole lack-of-GICv3 story is, add a selftest checking that we don't regress it. 
Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20240827152517.3909653-12-maz@kernel.org Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/Makefile | 1 + tools/testing/selftests/kvm/aarch64/no-vgic-v3.c | 175 +++++++++++++++++++++++ 2 files changed, 176 insertions(+) create mode 100644 tools/testing/selftests/kvm/aarch64/no-vgic-v3.c (limited to 'tools') diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 48d32c5aa3eb..f66b37acc0b0 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -163,6 +163,7 @@ TEST_GEN_PROGS_aarch64 += aarch64/vgic_init TEST_GEN_PROGS_aarch64 += aarch64/vgic_irq TEST_GEN_PROGS_aarch64 += aarch64/vgic_lpi_stress TEST_GEN_PROGS_aarch64 += aarch64/vpmu_counter_access +TEST_GEN_PROGS_aarch64 += aarch64/no-vgic-v3 TEST_GEN_PROGS_aarch64 += access_tracking_perf_test TEST_GEN_PROGS_aarch64 += arch_timer TEST_GEN_PROGS_aarch64 += demand_paging_test diff --git a/tools/testing/selftests/kvm/aarch64/no-vgic-v3.c b/tools/testing/selftests/kvm/aarch64/no-vgic-v3.c new file mode 100644 index 000000000000..943d65fc6b0b --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/no-vgic-v3.c @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Check that, on a GICv3 system, not configuring GICv3 correctly +// results in all of the sysregs generating an UNDEF exception. + +#include +#include +#include + +static volatile bool handled; + +#define __check_sr_read(r) \ + ({ \ + uint64_t val; \ + \ + handled = false; \ + dsb(sy); \ + val = read_sysreg_s(SYS_ ## r); \ + val; \ + }) + +#define __check_sr_write(r) \ + do { \ + handled = false; \ + dsb(sy); \ + write_sysreg_s(0, SYS_ ## r); \ + isb(); \ + } while(0) + +/* Fatal checks */ +#define check_sr_read(r) \ + do { \ + __check_sr_read(r); \ + __GUEST_ASSERT(handled, #r " no read trap"); \ + } while(0) + +#define check_sr_write(r) \ + do { \ + __check_sr_write(r); \ + __GUEST_ASSERT(handled, #r " no write trap"); \ + } while(0) + +#define check_sr_rw(r) \ + do { \ + check_sr_read(r); \ + check_sr_write(r); \ + } while(0) + +static void guest_code(void) +{ + uint64_t val; + + /* + * Check that we advertise that ID_AA64PFR0_EL1.GIC == 0, having + * hidden the feature at runtime without any other userspace action. + */ + __GUEST_ASSERT(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC), + read_sysreg(id_aa64pfr0_el1)) == 0, + "GICv3 wrongly advertised"); + + /* + * Access all GICv3 registers, and fail if we don't get an UNDEF. + * Note that we happily access all the APxRn registers without + * checking their existence, as all we want to see is a failure. + */ + check_sr_rw(ICC_PMR_EL1); + check_sr_read(ICC_IAR0_EL1); + check_sr_write(ICC_EOIR0_EL1); + check_sr_rw(ICC_HPPIR0_EL1); + check_sr_rw(ICC_BPR0_EL1); + check_sr_rw(ICC_AP0R0_EL1); + check_sr_rw(ICC_AP0R1_EL1); + check_sr_rw(ICC_AP0R2_EL1); + check_sr_rw(ICC_AP0R3_EL1); + check_sr_rw(ICC_AP1R0_EL1); + check_sr_rw(ICC_AP1R1_EL1); + check_sr_rw(ICC_AP1R2_EL1); + check_sr_rw(ICC_AP1R3_EL1); + check_sr_write(ICC_DIR_EL1); + check_sr_read(ICC_RPR_EL1); + check_sr_write(ICC_SGI1R_EL1); + check_sr_write(ICC_ASGI1R_EL1); + check_sr_write(ICC_SGI0R_EL1); + check_sr_read(ICC_IAR1_EL1); + check_sr_write(ICC_EOIR1_EL1); + check_sr_rw(ICC_HPPIR1_EL1); + check_sr_rw(ICC_BPR1_EL1); + check_sr_rw(ICC_CTLR_EL1); + check_sr_rw(ICC_IGRPEN0_EL1); + check_sr_rw(ICC_IGRPEN1_EL1); + + /* + * ICC_SRE_EL1 may not be trappable, as ICC_SRE_EL2.Enable can + * be RAO/WI.
Engage in non-fatal accesses, starting with a + * write of 0 to try and disable SRE, and let's see if it + * sticks. + */ + __check_sr_write(ICC_SRE_EL1); + if (!handled) + GUEST_PRINTF("ICC_SRE_EL1 write not trapping (OK)\n"); + + val = __check_sr_read(ICC_SRE_EL1); + if (!handled) { + __GUEST_ASSERT((val & BIT(0)), + "ICC_SRE_EL1 not trapped but ICC_SRE_EL1.SRE not set\n"); + GUEST_PRINTF("ICC_SRE_EL1 read not trapping (OK)\n"); + } + + GUEST_DONE(); +} + +static void guest_undef_handler(struct ex_regs *regs) +{ + /* Success, we've gracefully exploded! */ + handled = true; + regs->pc += 4; +} + +static void test_run_vcpu(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + do { + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_PRINTF: + printf("%s", uc.buffer); + break; + case UCALL_DONE: + break; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } while (uc.cmd != UCALL_DONE); +} + +static void test_guest_no_gicv3(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + /* Create a VM without a GICv3 */ + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vcpu); + + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_EC_UNKNOWN, guest_undef_handler); + + test_run_vcpu(vcpu); + + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + uint64_t pfr0; + + vm = vm_create_with_one_vcpu(&vcpu, NULL); + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1), &pfr0); + __TEST_REQUIRE(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC), pfr0), + "GICv3 not supported."); + kvm_vm_free(vm); + + test_guest_no_gicv3(); + + return 0; +} -- cgit v1.2.3 From e8497d6951ee8541d73784f9aac9942a7f239980 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 23 Aug 2024 18:25:37 +0200 Subject: selftests: forwarding: no_forwarding: Down ports on cleanup This test neglects to put ports down on cleanup. Fix it. Fixes: 476a4f05d9b8 ("selftests: forwarding: add a no_forwarding.sh test") Signed-off-by: Petr Machata Reviewed-by: Simon Horman Link: https://patch.msgid.link/0baf91dc24b95ae0cadfdf5db05b74888e6a228a.1724430120.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/no_forwarding.sh | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/no_forwarding.sh b/tools/testing/selftests/net/forwarding/no_forwarding.sh index af3b398d13f0..9e677aa64a06 100755 --- a/tools/testing/selftests/net/forwarding/no_forwarding.sh +++ b/tools/testing/selftests/net/forwarding/no_forwarding.sh @@ -233,6 +233,9 @@ cleanup() { pre_cleanup + ip link set dev $swp2 down + ip link set dev $swp1 down + h2_destroy h1_destroy -- cgit v1.2.3 From 65a3cce43d5b4c53cf16b0be1a03991f665a0806 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Mon, 26 Aug 2024 19:15:11 +0200 Subject: selftests: forwarding: local_termination: Down ports on cleanup This test neglects to put ports down on cleanup. Fix it. 
Fixes: 90b9566aa5cd ("selftests: forwarding: add a test for local_termination.sh") Signed-off-by: Petr Machata Link: https://patch.msgid.link/bf9b79f45de378f88344d44550f0a5052b386199.1724692132.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/local_termination.sh | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh index 648868f74604..c35548767756 100755 --- a/tools/testing/selftests/net/forwarding/local_termination.sh +++ b/tools/testing/selftests/net/forwarding/local_termination.sh @@ -571,6 +571,10 @@ vlan_over_vlan_aware_bridge() cleanup() { pre_cleanup + + ip link set $h2 down + ip link set $h1 down + vrf_cleanup } -- cgit v1.2.3 From 59cfdf3f3349019fbfc986a285afcc3873d155f4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Aug 2024 10:05:58 -1000 Subject: scx_central: Fix smatch checker warning ARRAY_ELEM_PTR() is an access macro used to help the BPF verifier not be confused by offset memory accesses, by yielding a valid pointer or NULL in a way that's clear to the verifier. As such, the canonical usage involves checking the NULL return from the macro. Note that in many cases the NULL condition can never happen - the checks are there just to hint the verifier. In a bpf_loop in scx_central.bpf.c::central_dispatch(), the NULL check was incorrect in that there was another dereference of the pointer in addition to the NULL-checked access. This worked as the pointer can never be NULL and the verifier could tell it would never be NULL in this case. However, this still looks wrong and trips smatch: ./tools/sched_ext/scx_central.bpf.c:205 ____central_dispatch() error: we previously assumed 'gimme' could be null (see line 201) ./tools/sched_ext/scx_central.bpf.c 195 196 if (!scx_bpf_dispatch_nr_slots()) 197 break; 198 199 /* central's gimme is never set */ 200 gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids); 201 if (gimme && !*gimme) ^^^^^ If gimme is NULL 202 continue; 203 204 if (dispatch_to_cpu(cpu)) --> 205 *gimme = false; Fix the NULL check so that there are no derefs when the pointer is NULL. This doesn't change the actual behavior. Signed-off-by: Tejun Heo Reported-by: Dan Carpenter Link: http://lkml.kernel.org/r/<955e1c3c-ace2-4a1d-b246-15b8196038a3@stanley.mountain> --- tools/sched_ext/scx_central.bpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c index 1d8fd570eaa7..8dd8eb73b6b8 100644 --- a/tools/sched_ext/scx_central.bpf.c +++ b/tools/sched_ext/scx_central.bpf.c @@ -198,7 +198,7 @@ void BPF_STRUCT_OPS(central_dispatch, s32 cpu, struct task_struct *prev) /* central's gimme is never set */ gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids); - if (gimme && !*gimme) + if (!gimme || !*gimme) continue; if (dispatch_to_cpu(cpu)) -- cgit v1.2.3 From 79504a47339cd9a1574d974869aecaba838e1213 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Fri, 23 Aug 2024 23:04:51 +0100 Subject: selftests/net: Clean-up double assignment Correct copy'n'paste typo: the previous line already initialises get_all to 1.
Reported-by: Nassiri, Mohammad Closes: https://lore.kernel.org/all/DM6PR04MB4202BC58A9FD5BDD24A16E8EC56F2@DM6PR04MB4202.namprd04.prod.outlook.com/ Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Link: https://patch.msgid.link/20240823-tcp-ao-selftests-upd-6-12-v4-1-05623636fe8c@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tcp_ao/lib/sock.c | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/tcp_ao/lib/sock.c b/tools/testing/selftests/net/tcp_ao/lib/sock.c index 15aeb0963058..0ffda966c677 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/sock.c +++ b/tools/testing/selftests/net/tcp_ao/lib/sock.c @@ -379,7 +379,6 @@ int test_get_tcp_ao_counters(int sk, struct tcp_ao_counters *out) key_dump[0].nkeys = nr_keys; key_dump[0].get_all = 1; - key_dump[0].get_all = 1; err = getsockopt(sk, IPPROTO_TCP, TCP_AO_GET_KEYS, key_dump, &key_dump_sz); if (err) { -- cgit v1.2.3 From 7053e788ded5f8ec589e4507d764d6a894780d6c Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Fri, 23 Aug 2024 23:04:52 +0100 Subject: selftests/net: Provide test_snprintf() helper Instead of pre-allocating a fixed-sized buffer of TEST_MSG_BUFFER_SIZE and printing into it, call vsnprintf() with str = NULL, which will return the needed size of the buffer. This hack is documented in man 3 vsnprintf. Essentially, in C++ terms, it re-invents std::stringstream, which is going to be used to print different tracing paths and formatted strings. Use it straight away in __test_print() - which is thread-safe version of printing in selftests. Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Link: https://patch.msgid.link/20240823-tcp-ao-selftests-upd-6-12-v4-2-05623636fe8c@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tcp_ao/lib/aolib.h | 61 +++++++++++++++++++++----- 1 file changed, 51 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/tcp_ao/lib/aolib.h b/tools/testing/selftests/net/tcp_ao/lib/aolib.h index fbc7f6111815..78466863435f 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/aolib.h +++ b/tools/testing/selftests/net/tcp_ao/lib/aolib.h @@ -37,17 +37,58 @@ extern void __test_xfail(const char *buf); extern void __test_error(const char *buf); extern void __test_skip(const char *buf); -__attribute__((__format__(__printf__, 2, 3))) -static inline void __test_print(void (*fn)(const char *), const char *fmt, ...) +static inline char *test_snprintf(const char *fmt, va_list vargs) { -#define TEST_MSG_BUFFER_SIZE 4096 - char buf[TEST_MSG_BUFFER_SIZE]; - va_list arg; - - va_start(arg, fmt); - vsnprintf(buf, sizeof(buf), fmt, arg); - va_end(arg); - fn(buf); + char *ret = NULL; + size_t size = 0; + va_list tmp; + int n = 0; + + va_copy(tmp, vargs); + n = vsnprintf(ret, size, fmt, tmp); + if (n < 0) + return NULL; + + size = n + 1; + ret = malloc(size); + if (!ret) + return NULL; + + n = vsnprintf(ret, size, fmt, vargs); + if (n < 0 || n > size - 1) { + free(ret); + return NULL; + } + return ret; +} + +static __printf(1, 2) inline char *test_sprintf(const char *fmt, ...) +{ + va_list vargs; + char *ret; + + va_start(vargs, fmt); + ret = test_snprintf(fmt, vargs); + va_end(vargs); + + return ret; +} + +static __printf(2, 3) inline void __test_print(void (*fn)(const char *), + const char *fmt, ...) 
+{ + va_list vargs; + char *msg; + + va_start(vargs, fmt); + msg = test_snprintf(fmt, vargs); + va_end(vargs); + + if (!msg) + return; + + fn(msg); + free(msg); } #define test_print(fmt, ...) \ -- cgit v1.2.3 From bc2468f98221a194769cac80051da6b86cc39c2f Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Fri, 23 Aug 2024 23:04:53 +0100 Subject: selftests/net: Be consistent in kconfig checks Most of the functions in tcp-ao lib/ return a negative errno or -1 in case of a failure. That creates inconsistencies in lib/kconfig, which caches what the error code was. On top of that, the uninitialized kconfig value is -1, which may also be the result of a check. Define KCONFIG_UNKNOWN and save the negative return code, rather than the libc-style errno. Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Link: https://patch.msgid.link/20240823-tcp-ao-selftests-upd-6-12-v4-3-05623636fe8c@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tcp_ao/lib/kconfig.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/tcp_ao/lib/kconfig.c b/tools/testing/selftests/net/tcp_ao/lib/kconfig.c index f279ffc3843b..3bf4a7e4b3c9 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/kconfig.c +++ b/tools/testing/selftests/net/tcp_ao/lib/kconfig.c @@ -6,7 +6,7 @@ #include "aolib.h" struct kconfig_t { - int _errno; /* the returned error if not supported */ + int _error; /* negative errno if not supported */ int (*check_kconfig)(int *error); }; @@ -62,7 +62,7 @@ static int has_tcp_ao(int *err) memcpy(&tmp.addr, &addr, sizeof(addr)); *err = 0; if (setsockopt(sk, IPPROTO_TCP, TCP_AO_ADD_KEY, &tmp, sizeof(tmp)) < 0) { - *err = errno; + *err = -errno; if (errno != ENOPROTOOPT) ret = -errno; } @@ -87,7 +87,7 @@ static int has_tcp_md5(int *err) */ *err = 0; if (test_set_md5(sk, addr_any, 0, -1, DEFAULT_TEST_PASSWORD)) { - *err = errno; + *err = -errno; if (errno != ENOPROTOOPT && errno == ENOMEM) { test_print("setsockopt(TCP_MD5SIG_EXT): %m"); ret = -errno; @@ -116,13 +116,14 @@ static int has_vrfs(int *err) return ret; } +#define KCONFIG_UNKNOWN 1 static pthread_mutex_t kconfig_lock = PTHREAD_MUTEX_INITIALIZER; static struct kconfig_t kconfig[__KCONFIG_LAST__] = { - { -1, has_net_ns }, - { -1, has_veth }, - { -1, has_tcp_ao }, - { -1, has_tcp_md5 }, - { -1, has_vrfs }, + { KCONFIG_UNKNOWN, has_net_ns }, + { KCONFIG_UNKNOWN, has_veth }, + { KCONFIG_UNKNOWN, has_tcp_ao }, + { KCONFIG_UNKNOWN, has_tcp_md5 }, + { KCONFIG_UNKNOWN, has_vrfs }, }; const char *tests_skip_reason[__KCONFIG_LAST__] = { @@ -138,11 +139,11 @@ bool kernel_config_has(enum test_needs_kconfig k) bool ret; pthread_mutex_lock(&kconfig_lock); - if (kconfig[k]._errno == -1) { - if (kconfig[k].check_kconfig(&kconfig[k]._errno)) + if (kconfig[k]._error == KCONFIG_UNKNOWN) { + if (kconfig[k].check_kconfig(&kconfig[k]._error)) test_error("Failed to initialize kconfig %u", k); } - ret = kconfig[k]._errno == 0; + ret = kconfig[k]._error == 0; pthread_mutex_unlock(&kconfig_lock); return ret; } -- cgit v1.2.3 From 8acb1806e8c2846ae619f500a3229a325c3304f5 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Fri, 23 Aug 2024 23:04:54 +0100 Subject: selftests/net: Open /proc/thread-self in open_netns() It turns out that open_netns() is rarely called from the child thread and more often from the parent thread. Yet, on initialization of the kconfig checks, either of the threads may reach the kconfig_lock mutex first.
VRF-related checks create a temporary ksft-check VRF in an unshare()'d namespace and then setns() back to the original one. As the original was opened from "/proc/self/ns/net", it's valid for the thread leader (parent), but invalid for the child, resulting in the following failure on tests that check has_vrfs() support: > # ok 54 TCP-AO required on socket + TCP-MD5 key: prefailed as expected: Key was rejected by service > # not ok 55 # error 381[unsigned-md5.c:24] Failed to add a VRF: -17 > # not ok 56 # error 383[unsigned-md5.c:33] Failed to add a route to VRF: -22: Key was rejected by service > not ok 1 selftests: net/tcp_ao: unsigned-md5_ipv6 # exit=1 Use "/proc/thread-self/ns/net", which is valid for any thread. Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Link: https://patch.msgid.link/20240823-tcp-ao-selftests-upd-6-12-v4-4-05623636fe8c@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tcp_ao/lib/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/tcp_ao/lib/setup.c b/tools/testing/selftests/net/tcp_ao/lib/setup.c index e408b9243b2c..d5212ffe9489 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/setup.c +++ b/tools/testing/selftests/net/tcp_ao/lib/setup.c @@ -111,7 +111,7 @@ static void sig_int(int signo) int open_netns(void) { - const char *netns_path = "/proc/self/ns/net"; + const char *netns_path = "/proc/thread-self/ns/net"; int fd; fd = open(netns_path, O_RDONLY); -- cgit v1.2.3 From a9e1693406f96f3739611dd08518c9eda3acecfd Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Fri, 23 Aug 2024 23:04:55 +0100 Subject: selftests/net: Don't forget to close nsfd after switch_save_ns() The switch_save_ns() helper is supposed to switch to another namespace for some action and then switch back to the original namespace. The fd it saves should be closed afterwards.
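The resulting calling pattern, sketched from the helpers in the diff below (switch_save_ns() already existed; switch_close_ns() is the new counterpart):

	int old_ns = switch_save_ns(nsfd_child);	/* setns() in, keep an fd to the old netns */

	/* ... do something inside the child namespace ... */

	switch_close_ns(old_ns);			/* setns() back to the old netns and close the fd */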
Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Link: https://patch.msgid.link/20240823-tcp-ao-selftests-upd-6-12-v4-5-05623636fe8c@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tcp_ao/lib/setup.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/tcp_ao/lib/setup.c b/tools/testing/selftests/net/tcp_ao/lib/setup.c index d5212ffe9489..86a4f6e20450 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/setup.c +++ b/tools/testing/selftests/net/tcp_ao/lib/setup.c @@ -142,6 +142,13 @@ int switch_save_ns(int new_ns) return ret; } +void switch_close_ns(int fd) +{ + if (setns(fd, CLONE_NEWNET)) + test_error("setns()"); + close(fd); +} + static int nsfd_outside = -1; static int nsfd_parent = -1; static int nsfd_child = -1; @@ -296,7 +303,7 @@ static bool is_optmem_namespaced(void) int old_ns = switch_save_ns(nsfd_child); optmem_ns = !access(optmem_file, F_OK); - switch_ns(old_ns); + switch_close_ns(old_ns); } return !!optmem_ns; } @@ -317,7 +324,7 @@ size_t test_get_optmem(void) test_error("can't read from %s", optmem_file); fclose(foptmem); if (!is_optmem_namespaced()) - switch_ns(old_ns); + switch_close_ns(old_ns); return ret; } @@ -339,7 +346,7 @@ static void __test_set_optmem(size_t new, size_t *old) test_error("can't write %zu to %s", new, optmem_file); fclose(foptmem); if (!is_optmem_namespaced()) - switch_ns(old_ns); + switch_close_ns(old_ns); } static void test_revert_optmem(void) -- cgit v1.2.3 From 1c69e1f433990a81b5a5d415d8892ebcaacb985b Mon Sep 17 00:00:00 2001 From: Mohammad Nassiri Date: Fri, 23 Aug 2024 23:04:56 +0100 Subject: selftests/tcp_ao: Fix printing format for uint64_t It's not safe to use '%zu' specifier for printing uint64_t on 32-bit systems. For uint64_t, we should use the 'PRIu64' macro from the inttypes.h library. This ensures that the uint64_t is printed correctly from the selftests regardless of the system architecture. Signed-off-by: Mohammad Nassiri [Added missing spaces in fail/ok messages and uint64_t cast in setsockopt-closed, as otherwise it was giving warnings on 64bit. 
And carried it to netdev ml] Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Link: https://patch.msgid.link/20240823-tcp-ao-selftests-upd-6-12-v4-6-05623636fe8c@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tcp_ao/connect-deny.c | 4 ++-- tools/testing/selftests/net/tcp_ao/connect.c | 4 ++-- tools/testing/selftests/net/tcp_ao/restore.c | 8 ++++---- tools/testing/selftests/net/tcp_ao/self-connect.c | 4 ++-- tools/testing/selftests/net/tcp_ao/seq-ext.c | 12 ++++++------ tools/testing/selftests/net/tcp_ao/setsockopt-closed.c | 4 ++-- tools/testing/selftests/net/tcp_ao/unsigned-md5.c | 4 ++-- 7 files changed, 20 insertions(+), 20 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/tcp_ao/connect-deny.c b/tools/testing/selftests/net/tcp_ao/connect-deny.c index 185a2f6e5ff3..5691f3d00603 100644 --- a/tools/testing/selftests/net/tcp_ao/connect-deny.c +++ b/tools/testing/selftests/net/tcp_ao/connect-deny.c @@ -84,10 +84,10 @@ static void try_accept(const char *tst_name, unsigned int port, const char *pwd, after_cnt = netstat_get_one(cnt_name, NULL); if (after_cnt <= before_cnt) { - test_fail("%s: %s counter did not increase: %zu <= %zu", + test_fail("%s: %s counter did not increase: %" PRIu64 " <= %" PRIu64, tst_name, cnt_name, after_cnt, before_cnt); } else { - test_ok("%s: counter %s increased %zu => %zu", + test_ok("%s: counter %s increased %" PRIu64 " => %" PRIu64, tst_name, cnt_name, before_cnt, after_cnt); } diff --git a/tools/testing/selftests/net/tcp_ao/connect.c b/tools/testing/selftests/net/tcp_ao/connect.c index 81653b47f303..9eecc4a1072e 100644 --- a/tools/testing/selftests/net/tcp_ao/connect.c +++ b/tools/testing/selftests/net/tcp_ao/connect.c @@ -67,14 +67,14 @@ static void *client_fn(void *arg) netstat_free(ns_after); if (nr_packets > (after_aogood - before_aogood)) { - test_fail("TCPAOGood counter mismatch: %zu > (%zu - %zu)", + test_fail("TCPAOGood counter mismatch: %zu > (%" PRIu64 " - %" PRIu64 ")", nr_packets, after_aogood, before_aogood); return NULL; } if (test_tcp_ao_counters_cmp("connect", &ao1, &ao2, TEST_CNT_GOOD)) return NULL; - test_ok("connect TCPAOGood %" PRIu64 "/%" PRIu64 "/%" PRIu64 " => %" PRIu64 "/%" PRIu64 "/%" PRIu64 ", sent %" PRIu64, + test_ok("connect TCPAOGood %" PRIu64 "/%" PRIu64 "/%" PRIu64 " => %" PRIu64 "/%" PRIu64 "/%" PRIu64 ", sent %zu", before_aogood, ao1.ao_info_pkt_good, ao1.key_cnts[0].pkt_good, after_aogood, ao2.ao_info_pkt_good, diff --git a/tools/testing/selftests/net/tcp_ao/restore.c b/tools/testing/selftests/net/tcp_ao/restore.c index 8fdc808df325..7b91d7fde2bc 100644 --- a/tools/testing/selftests/net/tcp_ao/restore.c +++ b/tools/testing/selftests/net/tcp_ao/restore.c @@ -71,10 +71,10 @@ static void try_server_run(const char *tst_name, unsigned int port, test_tcp_ao_counters_cmp(tst_name, &ao1, &ao2, cnt_expected); if (after_cnt <= before_cnt) { - test_fail("%s: %s counter did not increase: %zu <= %zu", + test_fail("%s: %s counter did not increase: %" PRIu64 " <= %" PRIu64, tst_name, cnt_name, after_cnt, before_cnt); } else { - test_ok("%s: counter %s increased %zu => %zu", + test_ok("%s: counter %s increased %" PRIu64 " => %" PRIu64, tst_name, cnt_name, before_cnt, after_cnt); } @@ -183,10 +183,10 @@ static void test_sk_restore(const char *tst_name, unsigned int server_port, test_tcp_ao_counters_cmp(tst_name, &ao1, &ao2, cnt_expected); if (after_cnt <= before_cnt) { - test_fail("%s: %s counter did not increase: %zu <= %zu", + test_fail("%s: %s counter did not increase: %" PRIu64 " <= %" 
PRIu64, tst_name, cnt_name, after_cnt, before_cnt); } else { - test_ok("%s: counter %s increased %zu => %zu", + test_ok("%s: counter %s increased %" PRIu64 " => %" PRIu64, tst_name, cnt_name, before_cnt, after_cnt); } synchronize_threads(); /* 3: verified => closed */ diff --git a/tools/testing/selftests/net/tcp_ao/self-connect.c b/tools/testing/selftests/net/tcp_ao/self-connect.c index a5698b0a3718..e56931d38e06 100644 --- a/tools/testing/selftests/net/tcp_ao/self-connect.c +++ b/tools/testing/selftests/net/tcp_ao/self-connect.c @@ -87,7 +87,7 @@ static void tcp_self_connect(const char *tst, unsigned int port, netstat_free(ns_after); if (after_aogood <= before_aogood) { - test_fail("%s: TCPAOGood counter mismatch: %zu <= %zu", + test_fail("%s: TCPAOGood counter mismatch: %" PRIu64 " <= %" PRIu64, tst, after_aogood, before_aogood); close(sk); return; @@ -148,7 +148,7 @@ static void tcp_self_connect(const char *tst, unsigned int port, netstat_free(ns_after); close(sk); if (after_aogood <= before_aogood) { - test_fail("%s: TCPAOGood counter mismatch: %zu <= %zu", + test_fail("%s: TCPAOGood counter mismatch: %" PRIu64 " <= %" PRIu64, tst, after_aogood, before_aogood); return; } diff --git a/tools/testing/selftests/net/tcp_ao/seq-ext.c b/tools/testing/selftests/net/tcp_ao/seq-ext.c index ad4e77d6823e..9c7dde7fd776 100644 --- a/tools/testing/selftests/net/tcp_ao/seq-ext.c +++ b/tools/testing/selftests/net/tcp_ao/seq-ext.c @@ -134,15 +134,15 @@ static void *server_fn(void *arg) test_tcp_ao_counters_cmp(NULL, &ao1, &ao2, TEST_CNT_GOOD); if (after_good <= before_good) { - test_fail("TCPAOGood counter did not increase: %zu <= %zu", + test_fail("TCPAOGood counter did not increase: %" PRIu64 " <= %" PRIu64, after_good, before_good); } else { - test_ok("TCPAOGood counter increased %zu => %zu", + test_ok("TCPAOGood counter increased %" PRIu64 " => %" PRIu64, before_good, after_good); } after_bad = netstat_get_one("TCPAOBad", NULL); if (after_bad) - test_fail("TCPAOBad counter is non-zero: %zu", after_bad); + test_fail("TCPAOBad counter is non-zero: %" PRIu64, after_bad); else test_ok("TCPAOBad counter didn't increase"); test_enable_repair(sk); @@ -219,15 +219,15 @@ static void *client_fn(void *arg) test_tcp_ao_counters_cmp(NULL, &ao1, &ao2, TEST_CNT_GOOD); if (after_good <= before_good) { - test_fail("TCPAOGood counter did not increase: %zu <= %zu", + test_fail("TCPAOGood counter did not increase: %" PRIu64 " <= %" PRIu64, after_good, before_good); } else { - test_ok("TCPAOGood counter increased %zu => %zu", + test_ok("TCPAOGood counter increased %" PRIu64 " => %" PRIu64, before_good, after_good); } after_bad = netstat_get_one("TCPAOBad", NULL); if (after_bad) - test_fail("TCPAOBad counter is non-zero: %zu", after_bad); + test_fail("TCPAOBad counter is non-zero: %" PRIu64, after_bad); else test_ok("TCPAOBad counter didn't increase"); diff --git a/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c b/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c index 517930f9721b..5eee826c37aa 100644 --- a/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c +++ b/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c @@ -30,8 +30,8 @@ static void test_vefify_ao_info(int sk, struct tcp_ao_info_opt *info, #define __cmp_ao(member) \ do { \ if (info->member != tmp.member) { \ - test_fail("%s: getsockopt(): " __stringify(member) " %zu != %zu", \ - tst, (size_t)info->member, (size_t)tmp.member); \ + test_fail("%s: getsockopt(): " __stringify(member) " %" PRIu64 " != %" PRIu64, \ + tst, (uint64_t)info->member, 
(uint64_t)tmp.member, \ return; \ } \ } while(0) diff --git a/tools/testing/selftests/net/tcp_ao/unsigned-md5.c b/tools/testing/selftests/net/tcp_ao/unsigned-md5.c index 6b59a652159f..ec2848036341 100644 --- a/tools/testing/selftests/net/tcp_ao/unsigned-md5.c +++ b/tools/testing/selftests/net/tcp_ao/unsigned-md5.c @@ -100,10 +100,10 @@ static void try_accept(const char *tst_name, unsigned int port, after_cnt = netstat_get_one(cnt_name, NULL); if (after_cnt <= before_cnt) { - test_fail("%s: %s counter did not increase: %zu <= %zu", + test_fail("%s: %s counter did not increase: %" PRIu64 " <= %" PRIu64, tst_name, cnt_name, after_cnt, before_cnt); } else { - test_ok("%s: counter %s increased %zu => %zu", + test_ok("%s: counter %s increased %" PRIu64 " => %" PRIu64, tst_name, cnt_name, before_cnt, after_cnt); } if (ao_addr) -- cgit v1.2.3 From 044e037051252ca8df07e1355bf4d7964645a6e8 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Fri, 23 Aug 2024 23:04:57 +0100 Subject: selftests/net: Synchronize client/server before counters checks On tests that expect failure, the timeout value is TEST_RETRANSMIT_SEC == 1 second, which is big enough for most devices under test. But on a particularly slow machine/VM, 1 second might not be enough for the other thread to be scheduled and attempt to connect(). It is not a problem for tests that expect connect() to succeed, as the timeout value for them (TEST_TIMEOUT_SEC) is intentionally bigger. One obvious way to solve this would be to increase TEST_RETRANSMIT_SEC, but as all tests would get bigger timeouts, that would add up. Here is a less obvious way that keeps the timeouts for expected connect() failures low: just synchronize the two threads, which ensures that, before the counter checks, the other thread got a chance to run and time out on connect(). The expected increase of the related counter on the listen() socket still tests for the expected failure. This never happens on my machine, but I suppose the majority of netdev's connect-deny-* flakes [1] are caused by this. Prevents the following testing issue: > # selftests: net/tcp_ao: connect-deny_ipv6 > # 1..21 > # # 462[lib/setup.c:243] rand seed 1720905426 > # TAP version 13 > # ok 1 Non-AO server + AO client > # not ok 2 Non-AO server + AO client: TCPAOKeyNotFound counter did not increase: 0 <= 0 > # ok 3 AO server + Non-AO client > # ok 4 AO server + Non-AO client: counter TCPAORequired increased 0 => 1 ...
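The client-side shape of the fix, as a short sketch of what the diff below adds:

	ret = _test_connect_socket(sk, this_ip_dest, port, timeout);
	synchronize_threads();	/* rendezvous: the peer has also observed the connect() outcome */
	/* only now read and compare the TCP-AO counters */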
[1]: https://netdev-3.bots.linux.dev/vmksft-tcp-ao/results/681741/6-connect-deny-ipv6/stdout Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Link: https://patch.msgid.link/20240823-tcp-ao-selftests-upd-6-12-v4-7-05623636fe8c@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tcp_ao/connect-deny.c | 3 +++ tools/testing/selftests/net/tcp_ao/restore.c | 6 ++++-- tools/testing/selftests/net/tcp_ao/seq-ext.c | 6 ++++-- tools/testing/selftests/net/tcp_ao/unsigned-md5.c | 3 +++ 4 files changed, 14 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/tcp_ao/connect-deny.c b/tools/testing/selftests/net/tcp_ao/connect-deny.c index 5691f3d00603..166ad4549ef2 100644 --- a/tools/testing/selftests/net/tcp_ao/connect-deny.c +++ b/tools/testing/selftests/net/tcp_ao/connect-deny.c @@ -71,10 +71,12 @@ static void try_accept(const char *tst_name, unsigned int port, const char *pwd, } } + synchronize_threads(); /* before counter checks */ if (pwd && test_get_tcp_ao_counters(lsk, &ao_cnt2)) test_error("test_get_tcp_ao_counters()"); close(lsk); + if (pwd) test_tcp_ao_counters_cmp(tst_name, &ao_cnt1, &ao_cnt2, cnt_expected); @@ -180,6 +182,7 @@ static void try_connect(const char *tst_name, unsigned int port, timeout = fault(TIMEOUT) ? TEST_RETRANSMIT_SEC : TEST_TIMEOUT_SEC; ret = _test_connect_socket(sk, this_ip_dest, port, timeout); + synchronize_threads(); /* before counter checks */ if (ret < 0) { if (fault(KEYREJECT) && ret == -EKEYREJECTED) { test_ok("%s: connect() was prevented", tst_name); diff --git a/tools/testing/selftests/net/tcp_ao/restore.c b/tools/testing/selftests/net/tcp_ao/restore.c index 7b91d7fde2bc..f6ea2190f43d 100644 --- a/tools/testing/selftests/net/tcp_ao/restore.c +++ b/tools/testing/selftests/net/tcp_ao/restore.c @@ -64,6 +64,7 @@ static void try_server_run(const char *tst_name, unsigned int port, else test_ok("%s: server alive", tst_name); } + synchronize_threads(); /* 3: counters checks */ if (test_get_tcp_ao_counters(sk, &ao2)) test_error("test_get_tcp_ao_counters()"); after_cnt = netstat_get_one(cnt_name, NULL); @@ -82,7 +83,7 @@ static void try_server_run(const char *tst_name, unsigned int port, * Before close() as that will send FIN and move the peer in TCP_CLOSE * and that will prevent reading AO counters from the peer's socket. 
*/ - synchronize_threads(); /* 3: verified => closed */ + synchronize_threads(); /* 4: verified => closed */ out: close(sk); } @@ -176,6 +177,7 @@ static void test_sk_restore(const char *tst_name, unsigned int server_port, else test_ok("%s: post-migrate connection is alive", tst_name); } + synchronize_threads(); /* 3: counters checks */ if (test_get_tcp_ao_counters(sk, &ao2)) test_error("test_get_tcp_ao_counters()"); after_cnt = netstat_get_one(cnt_name, NULL); @@ -189,7 +191,7 @@ static void test_sk_restore(const char *tst_name, unsigned int server_port, test_ok("%s: counter %s increased %" PRIu64 " => %" PRIu64, tst_name, cnt_name, before_cnt, after_cnt); } - synchronize_threads(); /* 3: verified => closed */ + synchronize_threads(); /* 4: verified => closed */ close(sk); } diff --git a/tools/testing/selftests/net/tcp_ao/seq-ext.c b/tools/testing/selftests/net/tcp_ao/seq-ext.c index 9c7dde7fd776..885866cc193c 100644 --- a/tools/testing/selftests/net/tcp_ao/seq-ext.c +++ b/tools/testing/selftests/net/tcp_ao/seq-ext.c @@ -116,7 +116,7 @@ static void *server_fn(void *arg) sk = test_sk_restore(&img, &ao_img, &saddr, this_ip_dest, client_new_port, &ao1); - synchronize_threads(); /* 5: verify counters during SEQ-number rollover */ + synchronize_threads(); /* 5: verify the connection during SEQ-number rollover */ bytes = test_server_run(sk, quota, TEST_TIMEOUT_SEC); if (bytes != quota) { if (bytes > 0) @@ -127,6 +127,7 @@ static void *server_fn(void *arg) test_ok("server alive"); } + synchronize_threads(); /* 6: verify counters after SEQ-number rollover */ if (test_get_tcp_ao_counters(sk, &ao2)) test_error("test_get_tcp_ao_counters()"); after_good = netstat_get_one("TCPAOGood", NULL); @@ -206,12 +207,13 @@ static void *client_fn(void *arg) sk = test_sk_restore(&img, &ao_img, &saddr, this_ip_dest, test_server_port + 1, &ao1); - synchronize_threads(); /* 5: verify counters during SEQ-number rollover */ + synchronize_threads(); /* 5: verify the connection during SEQ-number rollover */ if (test_client_verify(sk, msg_len, nr_packets, TEST_TIMEOUT_SEC)) test_fail("post-migrate verify failed"); else test_ok("post-migrate connection alive"); + synchronize_threads(); /* 5: verify counters after SEQ-number rollover */ if (test_get_tcp_ao_counters(sk, &ao2)) test_error("test_get_tcp_ao_counters()"); after_good = netstat_get_one("TCPAOGood", NULL); diff --git a/tools/testing/selftests/net/tcp_ao/unsigned-md5.c b/tools/testing/selftests/net/tcp_ao/unsigned-md5.c index ec2848036341..02346b58efbd 100644 --- a/tools/testing/selftests/net/tcp_ao/unsigned-md5.c +++ b/tools/testing/selftests/net/tcp_ao/unsigned-md5.c @@ -70,6 +70,7 @@ static void try_accept(const char *tst_name, unsigned int port, timeout = fault(TIMEOUT) ? TEST_RETRANSMIT_SEC : TEST_TIMEOUT_SEC; err = test_wait_fd(lsk, timeout, 0); + synchronize_threads(); /* connect()/accept() timeouts */ if (err == -ETIMEDOUT) { if (!fault(TIMEOUT)) test_fail("timed out for accept()"); @@ -283,6 +284,7 @@ static void try_connect(const char *tst_name, unsigned int port, timeout = fault(TIMEOUT) ? TEST_RETRANSMIT_SEC : TEST_TIMEOUT_SEC; ret = _test_connect_socket(sk, this_ip_dest, port, timeout); + synchronize_threads(); /* connect()/accept() timeouts */ if (ret < 0) { if (fault(KEYREJECT) && ret == -EKEYREJECTED) test_ok("%s: connect() was prevented", tst_name); @@ -451,6 +453,7 @@ static void try_to_add(const char *tst_name, unsigned int port, timeout = fault(TIMEOUT) ? 
TEST_RETRANSMIT_SEC : TEST_TIMEOUT_SEC; ret = _test_connect_socket(sk, this_ip_dest, port, timeout); + synchronize_threads(); /* connect()/accept() timeouts */ if (ret <= 0) { test_error("%s: connect() returned %d", tst_name, ret); goto out; -- cgit v1.2.3 From 586d87021f224b0434372f5b4418216e29b0a544 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Fri, 23 Aug 2024 23:04:58 +0100 Subject: selftests/net: Add trace events matching to tcp_ao Set up trace points and add a new ftrace instance, in order not to interfere with the rest of the system, filtering by net namespace cookies. Raise a new background thread that parses trace_pipe and matches its lines against the list of expected events. Wiring up trace events to the selftests provides another insight into whether anything unexpected is happening in the tcp-ao code (i.e. a key rotation when it's not expected). Note: in real programs libtraceevent should be used instead of this manual labor of setting up ftrace and parsing. I'm not using it here as I don't want an .so library dependency that one would have to bring into the VM or DUT (Device Under Test). Please don't copy this into any real-world programs that aren't tests. Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Link: https://patch.msgid.link/20240823-tcp-ao-selftests-upd-6-12-v4-8-05623636fe8c@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tcp_ao/Makefile | 3 +- tools/testing/selftests/net/tcp_ao/bench-lookups.c | 2 +- tools/testing/selftests/net/tcp_ao/config | 1 + tools/testing/selftests/net/tcp_ao/connect-deny.c | 18 +- tools/testing/selftests/net/tcp_ao/connect.c | 2 +- tools/testing/selftests/net/tcp_ao/icmps-discard.c | 2 +- .../testing/selftests/net/tcp_ao/key-management.c | 18 +- tools/testing/selftests/net/tcp_ao/lib/aolib.h | 119 +++++ .../testing/selftests/net/tcp_ao/lib/ftrace-tcp.c | 559 +++++++++++++++++++++ tools/testing/selftests/net/tcp_ao/lib/ftrace.c | 543 ++++++++++++++++++++ tools/testing/selftests/net/tcp_ao/lib/kconfig.c | 8 + tools/testing/selftests/net/tcp_ao/lib/setup.c | 2 +- tools/testing/selftests/net/tcp_ao/lib/utils.c | 26 + tools/testing/selftests/net/tcp_ao/restore.c | 16 +- tools/testing/selftests/net/tcp_ao/rst.c | 2 +- tools/testing/selftests/net/tcp_ao/self-connect.c | 15 +- tools/testing/selftests/net/tcp_ao/seq-ext.c | 10 +- .../selftests/net/tcp_ao/setsockopt-closed.c | 2 +- tools/testing/selftests/net/tcp_ao/unsigned-md5.c | 28 +- 19 files changed, 1358 insertions(+), 18 deletions(-) create mode 100644 tools/testing/selftests/net/tcp_ao/lib/ftrace-tcp.c create mode 100644 tools/testing/selftests/net/tcp_ao/lib/ftrace.c (limited to 'tools') diff --git a/tools/testing/selftests/net/tcp_ao/Makefile b/tools/testing/selftests/net/tcp_ao/Makefile index bd88b90b902b..5b0205c70c39 100644 --- a/tools/testing/selftests/net/tcp_ao/Makefile +++ b/tools/testing/selftests/net/tcp_ao/Makefile @@ -31,7 +31,8 @@ CFLAGS += $(KHDR_INCLUDES) CFLAGS += -iquote ./lib/ -I ../../../../include/ # Library -LIBSRC := kconfig.c netlink.c proc.c repair.c setup.c sock.c utils.c +LIBSRC := ftrace.c ftrace-tcp.c kconfig.c netlink.c +LIBSRC += proc.c repair.c setup.c sock.c utils.c LIBOBJ := $(LIBSRC:%.c=$(LIBDIR)/%.o) EXTRA_CLEAN += $(LIBOBJ) $(LIB) diff --git a/tools/testing/selftests/net/tcp_ao/bench-lookups.c b/tools/testing/selftests/net/tcp_ao/bench-lookups.c index a1e6e007c291..6736484996a3 100644 --- a/tools/testing/selftests/net/tcp_ao/bench-lookups.c +++ b/tools/testing/selftests/net/tcp_ao/bench-lookups.c @@ -355,6 +355,6
@@ static void *client_fn(void *arg) int main(int argc, char *argv[]) { - test_init(30, server_fn, client_fn); + test_init(31, server_fn, client_fn); return 0; } diff --git a/tools/testing/selftests/net/tcp_ao/config b/tools/testing/selftests/net/tcp_ao/config index d3277a9de987..3605e38711cb 100644 --- a/tools/testing/selftests/net/tcp_ao/config +++ b/tools/testing/selftests/net/tcp_ao/config @@ -7,4 +7,5 @@ CONFIG_NET_L3_MASTER_DEV=y CONFIG_NET_VRF=y CONFIG_TCP_AO=y CONFIG_TCP_MD5SIG=y +CONFIG_TRACEPOINTS=y CONFIG_VETH=m diff --git a/tools/testing/selftests/net/tcp_ao/connect-deny.c b/tools/testing/selftests/net/tcp_ao/connect-deny.c index 166ad4549ef2..d418162d335f 100644 --- a/tools/testing/selftests/net/tcp_ao/connect-deny.c +++ b/tools/testing/selftests/net/tcp_ao/connect-deny.c @@ -215,30 +215,44 @@ out: static void *client_fn(void *arg) { - union tcp_addr wrong_addr, network_addr; + union tcp_addr wrong_addr, network_addr, addr_any = {}; unsigned int port = test_server_port; if (inet_pton(TEST_FAMILY, TEST_WRONG_IP, &wrong_addr) != 1) test_error("Can't convert ip address %s", TEST_WRONG_IP); + trace_ao_event_expect(TCP_AO_KEY_NOT_FOUND, this_ip_addr, this_ip_dest, + -1, port, 0, 0, 1, 0, 0, 0, 100, 100, -1); try_connect("Non-AO server + AO client", port++, DEFAULT_TEST_PASSWORD, this_ip_dest, -1, 100, 100, 0, FAULT_TIMEOUT); + trace_hash_event_expect(TCP_HASH_AO_REQUIRED, this_ip_addr, this_ip_dest, + -1, port, 0, 0, 1, 0, 0, 0); try_connect("AO server + Non-AO client", port++, NULL, this_ip_dest, -1, 100, 100, 0, FAULT_TIMEOUT); + trace_ao_event_expect(TCP_AO_MISMATCH, this_ip_addr, this_ip_dest, + -1, port, 0, 0, 1, 0, 0, 0, 100, 100, -1); try_connect("Wrong password", port++, DEFAULT_TEST_PASSWORD, this_ip_dest, -1, 100, 100, 0, FAULT_TIMEOUT); + trace_ao_event_expect(TCP_AO_KEY_NOT_FOUND, this_ip_addr, this_ip_dest, + -1, port, 0, 0, 1, 0, 0, 0, 100, 100, -1); try_connect("Wrong rcv id", port++, DEFAULT_TEST_PASSWORD, this_ip_dest, -1, 100, 100, 0, FAULT_TIMEOUT); + trace_ao_event_sk_expect(TCP_AO_SYNACK_NO_KEY, this_ip_dest, addr_any, + port, 0, 100, 100); try_connect("Wrong snd id", port++, DEFAULT_TEST_PASSWORD, this_ip_dest, -1, 100, 100, 0, FAULT_TIMEOUT); + trace_ao_event_expect(TCP_AO_WRONG_MACLEN, this_ip_addr, this_ip_dest, + -1, port, 0, 0, 1, 0, 0, 0, 100, 100, -1); try_connect("Different maclen", port++, DEFAULT_TEST_PASSWORD, this_ip_dest, -1, 100, 100, 0, FAULT_TIMEOUT); + trace_ao_event_expect(TCP_AO_KEY_NOT_FOUND, this_ip_addr, this_ip_dest, + -1, port, 0, 0, 1, 0, 0, 0, 100, 100, -1); try_connect("Server: Wrong addr", port++, DEFAULT_TEST_PASSWORD, this_ip_dest, -1, 100, 100, 0, FAULT_TIMEOUT); @@ -262,6 +276,6 @@ static void *client_fn(void *arg) int main(int argc, char *argv[]) { - test_init(21, server_fn, client_fn); + test_init(22, server_fn, client_fn); return 0; } diff --git a/tools/testing/selftests/net/tcp_ao/connect.c b/tools/testing/selftests/net/tcp_ao/connect.c index 9eecc4a1072e..f1d8d29e393f 100644 --- a/tools/testing/selftests/net/tcp_ao/connect.c +++ b/tools/testing/selftests/net/tcp_ao/connect.c @@ -85,6 +85,6 @@ static void *client_fn(void *arg) int main(int argc, char *argv[]) { - test_init(1, server_fn, client_fn); + test_init(2, server_fn, client_fn); return 0; } diff --git a/tools/testing/selftests/net/tcp_ao/icmps-discard.c b/tools/testing/selftests/net/tcp_ao/icmps-discard.c index d69bcba3c929..a1614f0d8c44 100644 --- a/tools/testing/selftests/net/tcp_ao/icmps-discard.c +++ b/tools/testing/selftests/net/tcp_ao/icmps-discard.c @@ -444,6 
+444,6 @@ static void *client_fn(void *arg) int main(int argc, char *argv[]) { - test_init(3, server_fn, client_fn); + test_init(4, server_fn, client_fn); return 0; } diff --git a/tools/testing/selftests/net/tcp_ao/key-management.c b/tools/testing/selftests/net/tcp_ao/key-management.c index 24e62120b792..d4385b52c10b 100644 --- a/tools/testing/selftests/net/tcp_ao/key-management.c +++ b/tools/testing/selftests/net/tcp_ao/key-management.c @@ -965,7 +965,7 @@ static void end_client(const char *tst_name, int sk, unsigned int nr_keys, synchronize_threads(); /* 5: counters */ } -static void try_unmatched_keys(int sk, int *rnext_index) +static void try_unmatched_keys(int sk, int *rnext_index, unsigned int port) { struct test_key *key; unsigned int i = 0; @@ -1013,6 +1013,9 @@ static void try_unmatched_keys(int sk, int *rnext_index) test_error("all keys on server match the client"); if (test_set_key(sk, -1, key->server_keyid)) test_error("Can't change the current key"); + trace_ao_event_expect(TCP_AO_RNEXT_REQUEST, this_ip_addr, this_ip_dest, + -1, port, 0, -1, -1, -1, -1, -1, + -1, key->server_keyid, -1); if (test_client_verify(sk, msg_len, nr_packets, TEST_TIMEOUT_SEC)) test_fail("verify failed"); *rnext_index = i; @@ -1054,6 +1057,10 @@ static void check_current_back(const char *tst_name, unsigned int port, return; if (test_set_key(sk, collection.keys[rotate_to_index].client_keyid, -1)) test_error("Can't change the current key"); + trace_ao_event_expect(TCP_AO_RNEXT_REQUEST, this_ip_dest, this_ip_addr, + port, -1, 0, -1, -1, -1, -1, -1, + collection.keys[rotate_to_index].client_keyid, + collection.keys[current_index].client_keyid, -1); if (test_client_verify(sk, msg_len, nr_packets, TEST_TIMEOUT_SEC)) test_fail("verify failed"); /* There is a race here: between setting the current_key with @@ -1085,6 +1092,11 @@ static void roll_over_keys(const char *tst_name, unsigned int port, for (i = rnext_index + 1; rotations > 0; i++, rotations--) { if (i >= collection.nr_keys) i = 0; + trace_ao_event_expect(TCP_AO_RNEXT_REQUEST, + this_ip_addr, this_ip_dest, + -1, port, 0, -1, -1, -1, -1, -1, + i == 0 ? 
-1 : collection.keys[i - 1].server_keyid, + collection.keys[i].server_keyid, -1); if (test_set_key(sk, -1, collection.keys[i].server_keyid)) test_error("Can't change the Rnext key"); if (test_client_verify(sk, msg_len, nr_packets, TEST_TIMEOUT_SEC)) { @@ -1124,7 +1136,7 @@ static void try_client_match(const char *tst_name, unsigned int port, rnext_index, msg_len, nr_packets); if (sk < 0) return; - try_unmatched_keys(sk, &rnext_index); + try_unmatched_keys(sk, &rnext_index, port); end_client(tst_name, sk, nr_keys, current_index, rnext_index, NULL); } @@ -1181,6 +1193,6 @@ static void *client_fn(void *arg) int main(int argc, char *argv[]) { - test_init(120, server_fn, client_fn); + test_init(121, server_fn, client_fn); return 0; } diff --git a/tools/testing/selftests/net/tcp_ao/lib/aolib.h b/tools/testing/selftests/net/tcp_ao/lib/aolib.h index 78466863435f..db44e77428dd 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/aolib.h +++ b/tools/testing/selftests/net/tcp_ao/lib/aolib.h @@ -144,6 +144,7 @@ enum test_needs_kconfig { KCONFIG_TCP_AO, /* required */ KCONFIG_TCP_MD5, /* optional, for TCP-MD5 features */ KCONFIG_NET_VRF, /* optional, for L3/VRF testing */ + KCONFIG_FTRACE, /* optional, for tracepoints checks */ __KCONFIG_LAST__ }; extern bool kernel_config_has(enum test_needs_kconfig k); @@ -183,6 +184,8 @@ static inline void test_init2(unsigned int ntests, __test_init(ntests, family, prefix, taddr1, taddr2, peer1, peer2); } extern void test_add_destructor(void (*d)(void)); +extern void test_init_ftrace(int nsfd1, int nsfd2); +extern int test_setup_tracing(void); /* To adjust optmem socket limit, approximately estimate a number, * that is bigger than sizeof(struct tcp_ao_key). @@ -257,12 +260,17 @@ static inline void test_init(unsigned int ntests, } extern void synchronize_threads(void); extern void switch_ns(int fd); +extern int switch_save_ns(int fd); +extern void switch_close_ns(int fd); extern __thread union tcp_addr this_ip_addr; extern __thread union tcp_addr this_ip_dest; extern int test_family; extern void randomize_buffer(void *buf, size_t buflen); +extern __printf(3, 4) int test_echo(const char *fname, bool append, + const char *fmt, ...); + extern int open_netns(void); extern int unshare_open_netns(void); extern const char veth_name[]; @@ -643,4 +651,115 @@ static inline int test_add_repaired_key(int sk, return test_verify_socket_key(sk, &tmp); } +#define DEFAULT_FTRACE_BUFFER_KB 10000 +#define DEFAULT_TRACER_LINES_ARR 200 +struct test_ftracer; +extern uint64_t ns_cookie1, ns_cookie2; + +enum ftracer_op { + FTRACER_LINE_DISCARD = 0, + FTRACER_LINE_PRESERVE, + FTRACER_EXIT, +}; + +extern struct test_ftracer *create_ftracer(const char *name, + enum ftracer_op (*process_line)(const char *line), + void (*destructor)(struct test_ftracer *tracer), + bool (*expecting_more)(void), + size_t lines_buf_sz, size_t buffer_size_kb); +extern int setup_trace_event(struct test_ftracer *tracer, + const char *event, const char *filter); +extern void destroy_ftracer(struct test_ftracer *tracer); +extern const size_t tracer_get_savedlines_nr(struct test_ftracer *tracer); +extern const char **tracer_get_savedlines(struct test_ftracer *tracer); + +enum trace_events { + /* TCP_HASH_EVENT */ + TCP_HASH_BAD_HEADER = 0, + TCP_HASH_MD5_REQUIRED, + TCP_HASH_MD5_UNEXPECTED, + TCP_HASH_MD5_MISMATCH, + TCP_HASH_AO_REQUIRED, + /* TCP_AO_EVENT */ + TCP_AO_HANDSHAKE_FAILURE, + TCP_AO_WRONG_MACLEN, + TCP_AO_MISMATCH, + TCP_AO_KEY_NOT_FOUND, + TCP_AO_RNEXT_REQUEST, + /* TCP_AO_EVENT_SK */ + TCP_AO_SYNACK_NO_KEY, 
+ /* TCP_AO_EVENT_SNE */ + TCP_AO_SND_SNE_UPDATE, + TCP_AO_RCV_SNE_UPDATE, + __MAX_TRACE_EVENTS +}; + +extern int __trace_event_expect(enum trace_events type, int family, + union tcp_addr src, union tcp_addr dst, + int src_port, int dst_port, int L3index, + int fin, int syn, int rst, int psh, int ack, + int keyid, int rnext, int maclen, int sne); + +static inline void trace_hash_event_expect(enum trace_events type, + union tcp_addr src, union tcp_addr dst, + int src_port, int dst_port, int L3index, + int fin, int syn, int rst, int psh, int ack) +{ + int err; + + err = __trace_event_expect(type, TEST_FAMILY, src, dst, + src_port, dst_port, L3index, + fin, syn, rst, psh, ack, + -1, -1, -1, -1); + if (err) + test_error("Couldn't add a trace event: %d", err); +} + +static inline void trace_ao_event_expect(enum trace_events type, + union tcp_addr src, union tcp_addr dst, + int src_port, int dst_port, int L3index, + int fin, int syn, int rst, int psh, int ack, + int keyid, int rnext, int maclen) +{ + int err; + + err = __trace_event_expect(type, TEST_FAMILY, src, dst, + src_port, dst_port, L3index, + fin, syn, rst, psh, ack, + keyid, rnext, maclen, -1); + if (err) + test_error("Couldn't add a trace event: %d", err); +} + +static inline void trace_ao_event_sk_expect(enum trace_events type, + union tcp_addr src, union tcp_addr dst, + int src_port, int dst_port, + int keyid, int rnext) +{ + int err; + + err = __trace_event_expect(type, TEST_FAMILY, src, dst, + src_port, dst_port, -1, + -1, -1, -1, -1, -1, + keyid, rnext, -1, -1); + if (err) + test_error("Couldn't add a trace event: %d", err); +} + +static inline void trace_ao_event_sne_expect(enum trace_events type, + union tcp_addr src, union tcp_addr dst, + int src_port, int dst_port, int sne) +{ + int err; + + err = __trace_event_expect(type, TEST_FAMILY, src, dst, + src_port, dst_port, -1, + -1, -1, -1, -1, -1, + -1, -1, -1, sne); + if (err) + test_error("Couldn't add a trace event: %d", err); +} + +extern int setup_aolib_ftracer(void); + #endif /* _AOLIB_H_ */ diff --git a/tools/testing/selftests/net/tcp_ao/lib/ftrace-tcp.c b/tools/testing/selftests/net/tcp_ao/lib/ftrace-tcp.c new file mode 100644 index 000000000000..24380c68fec6 --- /dev/null +++ b/tools/testing/selftests/net/tcp_ao/lib/ftrace-tcp.c @@ -0,0 +1,559 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "aolib.h" + +static const char *trace_event_names[__MAX_TRACE_EVENTS] = { + /* TCP_HASH_EVENT */ + "tcp_hash_bad_header", + "tcp_hash_md5_required", + "tcp_hash_md5_unexpected", + "tcp_hash_md5_mismatch", + "tcp_hash_ao_required", + /* TCP_AO_EVENT */ + "tcp_ao_handshake_failure", + "tcp_ao_wrong_maclen", + "tcp_ao_mismatch", + "tcp_ao_key_not_found", + "tcp_ao_rnext_request", + /* TCP_AO_EVENT_SK */ + "tcp_ao_synack_no_key", + /* TCP_AO_EVENT_SNE */ + "tcp_ao_snd_sne_update", + "tcp_ao_rcv_sne_update" +}; + +struct expected_trace_point { + /* required */ + enum trace_events type; + int family; + union tcp_addr src; + union tcp_addr dst; + + /* optional */ + int src_port; + int dst_port; + int L3index; + + int fin; + int syn; + int rst; + int psh; + int ack; + + int keyid; + int rnext; + int maclen; + int sne; + + size_t matched; +}; + +static struct expected_trace_point *exp_tps; +static size_t exp_tps_nr; +static size_t exp_tps_size; +static pthread_mutex_t exp_tps_mutex = PTHREAD_MUTEX_INITIALIZER; + +int __trace_event_expect(enum trace_events type, int family, + union tcp_addr src, union tcp_addr dst, + int src_port, int dst_port, int L3index, + int fin, int 
syn, int rst, int psh, int ack, + int keyid, int rnext, int maclen, int sne) +{ + struct expected_trace_point new_tp = { + .type = type, + .family = family, + .src = src, + .dst = dst, + .src_port = src_port, + .dst_port = dst_port, + .L3index = L3index, + .fin = fin, + .syn = syn, + .rst = rst, + .psh = psh, + .ack = ack, + .keyid = keyid, + .rnext = rnext, + .maclen = maclen, + .sne = sne, + .matched = 0, + }; + int ret = 0; + + if (!kernel_config_has(KCONFIG_FTRACE)) + return 0; + + pthread_mutex_lock(&exp_tps_mutex); + if (exp_tps_nr == exp_tps_size) { + struct expected_trace_point *tmp; + + if (exp_tps_size == 0) + exp_tps_size = 10; + else + exp_tps_size = exp_tps_size * 1.6; + + tmp = reallocarray(exp_tps, exp_tps_size, sizeof(exp_tps[0])); + if (!tmp) { + ret = -ENOMEM; + goto out; + } + exp_tps = tmp; + } + exp_tps[exp_tps_nr] = new_tp; + exp_tps_nr++; +out: + pthread_mutex_unlock(&exp_tps_mutex); + return ret; +} + +static void free_expected_events(void) +{ + /* We're from the process destructor - not taking the mutex */ + exp_tps_size = 0; + exp_tps = NULL; + free(exp_tps); +} + +struct trace_point { + int family; + union tcp_addr src; + union tcp_addr dst; + unsigned int src_port; + unsigned int dst_port; + int L3index; + unsigned int fin:1, + syn:1, + rst:1, + psh:1, + ack:1; + + unsigned int keyid; + unsigned int rnext; + unsigned int maclen; + + unsigned int sne; +}; + +static bool lookup_expected_event(int event_type, struct trace_point *e) +{ + size_t i; + + pthread_mutex_lock(&exp_tps_mutex); + for (i = 0; i < exp_tps_nr; i++) { + struct expected_trace_point *p = &exp_tps[i]; + size_t sk_size; + + if (p->type != event_type) + continue; + if (p->family != e->family) + continue; + if (p->family == AF_INET) + sk_size = sizeof(p->src.a4); + else + sk_size = sizeof(p->src.a6); + if (memcmp(&p->src, &e->src, sk_size)) + continue; + if (memcmp(&p->dst, &e->dst, sk_size)) + continue; + if (p->src_port >= 0 && p->src_port != e->src_port) + continue; + if (p->dst_port >= 0 && p->dst_port != e->dst_port) + continue; + if (p->L3index >= 0 && p->L3index != e->L3index) + continue; + + if (p->fin >= 0 && p->fin != e->fin) + continue; + if (p->syn >= 0 && p->syn != e->syn) + continue; + if (p->rst >= 0 && p->rst != e->rst) + continue; + if (p->psh >= 0 && p->psh != e->psh) + continue; + if (p->ack >= 0 && p->ack != e->ack) + continue; + + if (p->keyid >= 0 && p->keyid != e->keyid) + continue; + if (p->rnext >= 0 && p->rnext != e->rnext) + continue; + if (p->maclen >= 0 && p->maclen != e->maclen) + continue; + if (p->sne >= 0 && p->sne != e->sne) + continue; + p->matched++; + pthread_mutex_unlock(&exp_tps_mutex); + return true; + } + pthread_mutex_unlock(&exp_tps_mutex); + return false; +} + +static int check_event_type(const char *line) +{ + size_t i; + + /* + * This should have been a set or hashmap, but it's a selftest, + * so... KISS. 
+ */ + for (i = 0; i < __MAX_TRACE_EVENTS; i++) { + if (!strncmp(trace_event_names[i], line, strlen(trace_event_names[i]))) + return i; + } + return -1; +} + +static bool event_has_flags(enum trace_events event) +{ + switch (event) { + case TCP_HASH_BAD_HEADER: + case TCP_HASH_MD5_REQUIRED: + case TCP_HASH_MD5_UNEXPECTED: + case TCP_HASH_MD5_MISMATCH: + case TCP_HASH_AO_REQUIRED: + case TCP_AO_HANDSHAKE_FAILURE: + case TCP_AO_WRONG_MACLEN: + case TCP_AO_MISMATCH: + case TCP_AO_KEY_NOT_FOUND: + case TCP_AO_RNEXT_REQUEST: + return true; + default: + return false; + } +} + +static int tracer_ip_split(int family, char *src, char **addr, char **port) +{ + char *p; + + if (family == AF_INET) { + /* fomat is :port, i.e.: 10.0.254.1:7015 */ + *addr = src; + p = strchr(src, ':'); + if (!p) { + test_print("Couldn't parse trace event addr:port %s", src); + return -EINVAL; + } + *p++ = '\0'; + *port = p; + return 0; + } + if (family != AF_INET6) + return -EAFNOSUPPORT; + + /* format is []:port, i.e.: [2001:db8:254::1]:7013 */ + *addr = strchr(src, '['); + p = strchr(src, ']'); + + if (!p || !*addr) { + test_print("Couldn't parse trace event [addr]:port %s", src); + return -EINVAL; + } + + *addr = *addr + 1; /* '[' */ + *p++ = '\0'; /* ']' */ + if (*p != ':') { + test_print("Couldn't parse trace event :port %s", p); + return -EINVAL; + } + *p++ = '\0'; /* ':' */ + *port = p; + return 0; +} + +static int tracer_scan_address(int family, char *src, + union tcp_addr *dst, unsigned int *port) +{ + char *addr, *port_str; + int ret; + + ret = tracer_ip_split(family, src, &addr, &port_str); + if (ret) + return ret; + + if (inet_pton(family, addr, dst) != 1) { + test_print("Couldn't parse trace event addr %s", addr); + return -EINVAL; + } + errno = 0; + *port = (unsigned int)strtoul(port_str, NULL, 10); + if (errno != 0) { + test_print("Couldn't parse trace event port %s", port_str); + return -errno; + } + return 0; +} + +static int tracer_scan_event(const char *line, enum trace_events event, + struct trace_point *out) +{ + char *src = NULL, *dst = NULL, *family = NULL; + char fin, syn, rst, psh, ack; + int nr_matched, ret = 0; + uint64_t netns_cookie; + + switch (event) { + case TCP_HASH_BAD_HEADER: + case TCP_HASH_MD5_REQUIRED: + case TCP_HASH_MD5_UNEXPECTED: + case TCP_HASH_MD5_MISMATCH: + case TCP_HASH_AO_REQUIRED: { + nr_matched = sscanf(line, "%*s net=%" PRIu64 " state%*s family=%ms src=%ms dest=%ms L3index=%d [%c%c%c%c%c]", + &netns_cookie, &family, + &src, &dst, &out->L3index, + &fin, &syn, &rst, &psh, &ack); + if (nr_matched != 10) + test_print("Couldn't parse trace event, matched = %d/10", + nr_matched); + break; + } + case TCP_AO_HANDSHAKE_FAILURE: + case TCP_AO_WRONG_MACLEN: + case TCP_AO_MISMATCH: + case TCP_AO_KEY_NOT_FOUND: + case TCP_AO_RNEXT_REQUEST: { + nr_matched = sscanf(line, "%*s net=%" PRIu64 " state%*s family=%ms src=%ms dest=%ms L3index=%d [%c%c%c%c%c] keyid=%u rnext=%u maclen=%u", + &netns_cookie, &family, + &src, &dst, &out->L3index, + &fin, &syn, &rst, &psh, &ack, + &out->keyid, &out->rnext, &out->maclen); + if (nr_matched != 13) + test_print("Couldn't parse trace event, matched = %d/13", + nr_matched); + break; + } + case TCP_AO_SYNACK_NO_KEY: { + nr_matched = sscanf(line, "%*s net=%" PRIu64 " state%*s family=%ms src=%ms dest=%ms keyid=%u rnext=%u", + &netns_cookie, &family, + &src, &dst, &out->keyid, &out->rnext); + if (nr_matched != 6) + test_print("Couldn't parse trace event, matched = %d/6", + nr_matched); + break; + } + case TCP_AO_SND_SNE_UPDATE: + case TCP_AO_RCV_SNE_UPDATE: 
{ + nr_matched = sscanf(line, "%*s net=%" PRIu64 " state%*s family=%ms src=%ms dest=%ms sne=%u", + &netns_cookie, &family, + &src, &dst, &out->sne); + if (nr_matched != 5) + test_print("Couldn't parse trace event, matched = %d/5", + nr_matched); + break; + } + default: + return -1; + } + + if (family) { + if (!strcmp(family, "AF_INET")) { + out->family = AF_INET; + } else if (!strcmp(family, "AF_INET6")) { + out->family = AF_INET6; + } else { + test_print("Couldn't parse trace event family %s", family); + ret = -EINVAL; + goto out_free; + } + } + + if (event_has_flags(event)) { + out->fin = (fin == 'F'); + out->syn = (syn == 'S'); + out->rst = (rst == 'R'); + out->psh = (psh == 'P'); + out->ack = (ack == '.'); + + if ((fin != 'F' && fin != ' ') || + (syn != 'S' && syn != ' ') || + (rst != 'R' && rst != ' ') || + (psh != 'P' && psh != ' ') || + (ack != '.' && ack != ' ')) { + test_print("Couldn't parse trace event flags %c%c%c%c%c", + fin, syn, rst, psh, ack); + ret = -EINVAL; + goto out_free; + } + } + + if (src && tracer_scan_address(out->family, src, &out->src, &out->src_port)) { + ret = -EINVAL; + goto out_free; + } + + if (dst && tracer_scan_address(out->family, dst, &out->dst, &out->dst_port)) { + ret = -EINVAL; + goto out_free; + } + + if (netns_cookie != ns_cookie1 && netns_cookie != ns_cookie2) { + test_print("Net namespace filter for trace event didn't work: %" PRIu64 " != %" PRIu64 " OR %" PRIu64, + netns_cookie, ns_cookie1, ns_cookie2); + ret = -EINVAL; + } + +out_free: + free(src); + free(dst); + free(family); + return ret; +} + +static enum ftracer_op aolib_tracer_process_event(const char *line) +{ + int event_type = check_event_type(line); + struct trace_point tmp = {}; + + if (event_type < 0) + return FTRACER_LINE_PRESERVE; + + if (tracer_scan_event(line, event_type, &tmp)) + return FTRACER_LINE_PRESERVE; + + return lookup_expected_event(event_type, &tmp) ? + FTRACER_LINE_DISCARD : FTRACER_LINE_PRESERVE; +} + +static void dump_trace_event(struct expected_trace_point *e) +{ + char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN]; + + if (!inet_ntop(e->family, &e->src, src, INET6_ADDRSTRLEN)) + test_error("inet_ntop()"); + if (!inet_ntop(e->family, &e->dst, dst, INET6_ADDRSTRLEN)) + test_error("inet_ntop()"); + test_print("trace event filter %s [%s:%d => %s:%d, L3index %d, flags: %s%s%s%s%s, keyid: %d, rnext: %d, maclen: %d, sne: %d] = %zu", + trace_event_names[e->type], + src, e->src_port, dst, e->dst_port, e->L3index, + (e->fin > 0) ? "F" : (e->fin == 0) ? "!F" : "", + (e->syn > 0) ? "S" : (e->syn == 0) ? "!S" : "", + (e->rst > 0) ? "R" : (e->rst == 0) ? "!R" : "", + (e->psh > 0) ? "P" : (e->psh == 0) ? "!P" : "", + (e->ack > 0) ? "." : (e->ack == 0) ? "!." 
: "", + e->keyid, e->rnext, e->maclen, e->sne, e->matched); +} + +static void print_match_stats(bool unexpected_events) +{ + size_t matches_per_type[__MAX_TRACE_EVENTS] = {}; + bool expected_but_none = false; + size_t i, total_matched = 0; + char *stat_line = NULL; + + for (i = 0; i < exp_tps_nr; i++) { + struct expected_trace_point *e = &exp_tps[i]; + + total_matched += e->matched; + matches_per_type[e->type] += e->matched; + if (!e->matched) + expected_but_none = true; + } + for (i = 0; i < __MAX_TRACE_EVENTS; i++) { + if (!matches_per_type[i]) + continue; + stat_line = test_sprintf("%s%s[%zu] ", stat_line ?: "", + trace_event_names[i], + matches_per_type[i]); + if (!stat_line) + test_error("test_sprintf()"); + } + + if (unexpected_events || expected_but_none) { + for (i = 0; i < exp_tps_nr; i++) + dump_trace_event(&exp_tps[i]); + } + + if (unexpected_events) + return; + + if (expected_but_none) + test_fail("Some trace events were expected, but didn't occur"); + else if (total_matched) + test_ok("Trace events matched expectations: %zu %s", + total_matched, stat_line); + else + test_ok("No unexpected trace events during the test run"); +} + +#define dump_events(fmt, ...) \ + __test_print(__test_msg, fmt, ##__VA_ARGS__) +static void check_free_events(struct test_ftracer *tracer) +{ + const char **lines; + size_t nr; + + if (!kernel_config_has(KCONFIG_FTRACE)) { + test_skip("kernel config doesn't have ftrace - no checks"); + return; + } + + nr = tracer_get_savedlines_nr(tracer); + lines = tracer_get_savedlines(tracer); + print_match_stats(!!nr); + if (!nr) + return; + + errno = 0; + test_xfail("Trace events [%zu] were not expected:", nr); + while (nr) + dump_events("\t%s", lines[--nr]); +} + +static int setup_tcp_trace_events(struct test_ftracer *tracer) +{ + char *filter; + size_t i; + int ret; + + filter = test_sprintf("net_cookie == %zu || net_cookie == %zu", + ns_cookie1, ns_cookie2); + if (!filter) + return -ENOMEM; + + for (i = 0; i < __MAX_TRACE_EVENTS; i++) { + char *event_name = test_sprintf("tcp/%s", trace_event_names[i]); + + if (!event_name) { + ret = -ENOMEM; + break; + } + ret = setup_trace_event(tracer, event_name, filter); + free(event_name); + if (ret) + break; + } + + free(filter); + return ret; +} + +static void aolib_tracer_destroy(struct test_ftracer *tracer) +{ + check_free_events(tracer); + free_expected_events(); +} + +static bool aolib_tracer_expecting_more(void) +{ + size_t i; + + for (i = 0; i < exp_tps_nr; i++) + if (!exp_tps[i].matched) + return true; + return false; +} + +int setup_aolib_ftracer(void) +{ + struct test_ftracer *f; + + f = create_ftracer("aolib", aolib_tracer_process_event, + aolib_tracer_destroy, aolib_tracer_expecting_more, + DEFAULT_FTRACE_BUFFER_KB, DEFAULT_TRACER_LINES_ARR); + if (!f) + return -1; + + return setup_tcp_trace_events(f); +} diff --git a/tools/testing/selftests/net/tcp_ao/lib/ftrace.c b/tools/testing/selftests/net/tcp_ao/lib/ftrace.c new file mode 100644 index 000000000000..e4d0b173bc94 --- /dev/null +++ b/tools/testing/selftests/net/tcp_ao/lib/ftrace.c @@ -0,0 +1,543 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include "../../../../../include/linux/kernel.h" +#include "aolib.h" + +static char ftrace_path[] = "ksft-ftrace-XXXXXX"; +static bool ftrace_mounted; +uint64_t ns_cookie1, ns_cookie2; + +struct test_ftracer { + pthread_t tracer_thread; + int error; + char *instance_path; + FILE *trace_pipe; + + enum ftracer_op (*process_line)(const char *line); 
+ void (*destructor)(struct test_ftracer *tracer); + bool (*expecting_more)(void); + + char **saved_lines; + size_t saved_lines_size; + size_t next_line_ind; + + pthread_cond_t met_all_expected; + pthread_mutex_t met_all_expected_lock; + + struct test_ftracer *next; +}; + +static struct test_ftracer *ftracers; +static pthread_mutex_t ftracers_lock = PTHREAD_MUTEX_INITIALIZER; + +static int mount_ftrace(void) +{ + if (!mkdtemp(ftrace_path)) + test_error("Can't create temp dir"); + + if (mount("tracefs", ftrace_path, "tracefs", 0, "rw")) + return -errno; + + ftrace_mounted = true; + + return 0; +} + +static void unmount_ftrace(void) +{ + if (ftrace_mounted && umount(ftrace_path)) + test_print("Failed on cleanup: can't unmount tracefs: %m"); + + if (rmdir(ftrace_path)) + test_error("Failed on cleanup: can't remove ftrace dir %s", + ftrace_path); +} + +struct opts_list_t { + char *opt_name; + struct opts_list_t *next; +}; + +static int disable_trace_options(const char *ftrace_path) +{ + struct opts_list_t *opts_list = NULL; + char *fopts, *line = NULL; + size_t buf_len = 0; + ssize_t line_len; + int ret = 0; + FILE *opts; + + fopts = test_sprintf("%s/%s", ftrace_path, "trace_options"); + if (!fopts) + return -ENOMEM; + + opts = fopen(fopts, "r+"); + if (!opts) { + ret = -errno; + goto out_free; + } + + while ((line_len = getline(&line, &buf_len, opts)) != -1) { + struct opts_list_t *tmp; + + if (!strncmp(line, "no", 2)) + continue; + + tmp = malloc(sizeof(*tmp)); + if (!tmp) { + ret = -ENOMEM; + goto out_free_opts_list; + } + tmp->next = opts_list; + tmp->opt_name = test_sprintf("no%s", line); + if (!tmp->opt_name) { + ret = -ENOMEM; + free(tmp); + goto out_free_opts_list; + } + opts_list = tmp; + } + + while (opts_list) { + struct opts_list_t *tmp = opts_list; + + fseek(opts, 0, SEEK_SET); + fwrite(tmp->opt_name, 1, strlen(tmp->opt_name), opts); + + opts_list = opts_list->next; + free(tmp->opt_name); + free(tmp); + } + +out_free_opts_list: + while (opts_list) { + struct opts_list_t *tmp = opts_list; + + opts_list = opts_list->next; + free(tmp->opt_name); + free(tmp); + } + free(line); + fclose(opts); +out_free: + free(fopts); + return ret; +} + +static int setup_buffer_size(const char *ftrace_path, size_t sz) +{ + char *fbuf_size = test_sprintf("%s/buffer_size_kb", ftrace_path); + int ret; + + if (!fbuf_size) + return -1; + + ret = test_echo(fbuf_size, 0, "%zu", sz); + free(fbuf_size); + return ret; +} + +static int setup_ftrace_instance(struct test_ftracer *tracer, const char *name) +{ + char *tmp; + + tmp = test_sprintf("%s/instances/ksft-%s-XXXXXX", ftrace_path, name); + if (!tmp) + return -ENOMEM; + + tracer->instance_path = mkdtemp(tmp); + if (!tracer->instance_path) { + free(tmp); + return -errno; + } + + return 0; +} + +static void remove_ftrace_instance(struct test_ftracer *tracer) +{ + if (rmdir(tracer->instance_path)) + test_print("Failed on cleanup: can't remove ftrace instance %s", + tracer->instance_path); + free(tracer->instance_path); +} + +static void tracer_cleanup(void *arg) +{ + struct test_ftracer *tracer = arg; + + fclose(tracer->trace_pipe); +} + +static void tracer_set_error(struct test_ftracer *tracer, int error) +{ + if (!tracer->error) + tracer->error = error; +} + +const size_t tracer_get_savedlines_nr(struct test_ftracer *tracer) +{ + return tracer->next_line_ind; +} + +const char **tracer_get_savedlines(struct test_ftracer *tracer) +{ + return (const char **)tracer->saved_lines; +} + +static void *tracer_thread_func(void *arg) +{ + struct test_ftracer *tracer = 
arg; + + pthread_cleanup_push(tracer_cleanup, arg); + + while (tracer->next_line_ind < tracer->saved_lines_size) { + char **lp = &tracer->saved_lines[tracer->next_line_ind]; + enum ftracer_op op; + size_t buf_len = 0; + ssize_t line_len; + + line_len = getline(lp, &buf_len, tracer->trace_pipe); + if (line_len == -1) + break; + + pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL); + op = tracer->process_line(*lp); + pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); + + if (tracer->expecting_more) { + pthread_mutex_lock(&tracer->met_all_expected_lock); + if (!tracer->expecting_more()) + pthread_cond_signal(&tracer->met_all_expected); + pthread_mutex_unlock(&tracer->met_all_expected_lock); + } + + if (op == FTRACER_LINE_DISCARD) + continue; + if (op == FTRACER_EXIT) + break; + if (op != FTRACER_LINE_PRESERVE) + test_error("unexpected tracer command %d", op); + + tracer->next_line_ind++; + buf_len = 0; + } + test_print("too many lines in ftracer buffer %zu, exiting tracer", + tracer->next_line_ind); + + pthread_cleanup_pop(1); + return NULL; +} + +static int setup_trace_thread(struct test_ftracer *tracer) +{ + int ret = 0; + char *path; + + path = test_sprintf("%s/trace_pipe", tracer->instance_path); + if (!path) + return -ENOMEM; + + tracer->trace_pipe = fopen(path, "r"); + if (!tracer->trace_pipe) { + ret = -errno; + goto out_free; + } + + if (pthread_create(&tracer->tracer_thread, NULL, + tracer_thread_func, (void *)tracer)) { + ret = -errno; + fclose(tracer->trace_pipe); + } + +out_free: + free(path); + return ret; +} + +static void stop_trace_thread(struct test_ftracer *tracer) +{ + void *res; + + if (pthread_cancel(tracer->tracer_thread)) { + test_print("Can't stop tracer pthread: %m"); + tracer_set_error(tracer, -errno); + } + if (pthread_join(tracer->tracer_thread, &res)) { + test_print("Can't join tracer pthread: %m"); + tracer_set_error(tracer, -errno); + } + if (res != PTHREAD_CANCELED) { + test_print("Tracer thread wasn't canceled"); + tracer_set_error(tracer, -errno); + } + if (tracer->error) + test_fail("tracer errored by %s", strerror(tracer->error)); +} + +static void final_wait_for_events(struct test_ftracer *tracer, + unsigned timeout_sec) +{ + struct timespec timeout; + struct timeval now; + int ret = 0; + + if (!tracer->expecting_more) + return; + + pthread_mutex_lock(&tracer->met_all_expected_lock); + gettimeofday(&now, NULL); + timeout.tv_sec = now.tv_sec + timeout_sec; + timeout.tv_nsec = now.tv_usec * 1000; + + while (tracer->expecting_more() && ret != ETIMEDOUT) + ret = pthread_cond_timedwait(&tracer->met_all_expected, + &tracer->met_all_expected_lock, &timeout); + pthread_mutex_unlock(&tracer->met_all_expected_lock); +} + +int setup_trace_event(struct test_ftracer *tracer, + const char *event, const char *filter) +{ + char *enable_path, *filter_path, *instance = tracer->instance_path; + int ret; + + enable_path = test_sprintf("%s/events/%s/enable", instance, event); + if (!enable_path) + return -ENOMEM; + + filter_path = test_sprintf("%s/events/%s/filter", instance, event); + if (!filter_path) { + ret = -ENOMEM; + goto out_free; + } + + ret = test_echo(filter_path, 0, "%s", filter); + if (!ret) + ret = test_echo(enable_path, 0, "1"); + +out_free: + free(filter_path); + free(enable_path); + return ret; +} + +struct test_ftracer *create_ftracer(const char *name, + enum ftracer_op (*process_line)(const char *line), + void (*destructor)(struct test_ftracer *tracer), + bool (*expecting_more)(void), + size_t lines_buf_sz, size_t buffer_size_kb) +{ + struct test_ftracer 
*tracer; + int err; + + /* XXX: separate __create_ftracer() helper and do here + * if (!kernel_config_has(KCONFIG_FTRACE)) + * return NULL; + */ + + tracer = malloc(sizeof(*tracer)); + if (!tracer) { + test_print("malloc()"); + return NULL; + } + + memset(tracer, 0, sizeof(*tracer)); + + err = setup_ftrace_instance(tracer, name); + if (err) { + test_print("setup_ftrace_instance(): %d", err); + goto err_free; + } + + err = disable_trace_options(tracer->instance_path); + if (err) { + test_print("disable_trace_options(): %d", err); + goto err_remove; + } + + err = setup_buffer_size(tracer->instance_path, buffer_size_kb); + if (err) { + test_print("disable_trace_options(): %d", err); + goto err_remove; + } + + tracer->saved_lines = calloc(lines_buf_sz, sizeof(tracer->saved_lines[0])); + if (!tracer->saved_lines) { + test_print("calloc()"); + goto err_remove; + } + tracer->saved_lines_size = lines_buf_sz; + + tracer->process_line = process_line; + tracer->destructor = destructor; + tracer->expecting_more = expecting_more; + + err = pthread_cond_init(&tracer->met_all_expected, NULL); + if (err) { + test_print("pthread_cond_init(): %d", err); + goto err_free_lines; + } + + err = pthread_mutex_init(&tracer->met_all_expected_lock, NULL); + if (err) { + test_print("pthread_mutex_init(): %d", err); + goto err_cond_destroy; + } + + err = setup_trace_thread(tracer); + if (err) { + test_print("setup_trace_thread(): %d", err); + goto err_mutex_destroy; + } + + pthread_mutex_lock(&ftracers_lock); + tracer->next = ftracers; + ftracers = tracer; + pthread_mutex_unlock(&ftracers_lock); + + return tracer; + +err_mutex_destroy: + pthread_mutex_destroy(&tracer->met_all_expected_lock); +err_cond_destroy: + pthread_cond_destroy(&tracer->met_all_expected); +err_free_lines: + free(tracer->saved_lines); +err_remove: + remove_ftrace_instance(tracer); +err_free: + free(tracer); + return NULL; +} + +static void __destroy_ftracer(struct test_ftracer *tracer) +{ + size_t i; + + final_wait_for_events(tracer, TEST_TIMEOUT_SEC); + stop_trace_thread(tracer); + remove_ftrace_instance(tracer); + if (tracer->destructor) + tracer->destructor(tracer); + for (i = 0; i < tracer->saved_lines_size; i++) + free(tracer->saved_lines[i]); + pthread_cond_destroy(&tracer->met_all_expected); + pthread_mutex_destroy(&tracer->met_all_expected_lock); + free(tracer); +} + +void destroy_ftracer(struct test_ftracer *tracer) +{ + pthread_mutex_lock(&ftracers_lock); + if (tracer == ftracers) { + ftracers = tracer->next; + } else { + struct test_ftracer *f = ftracers; + + while (f->next != tracer) { + if (!f->next) + test_error("tracers list corruption or double free %p", tracer); + f = f->next; + } + f->next = tracer->next; + } + tracer->next = NULL; + pthread_mutex_unlock(&ftracers_lock); + __destroy_ftracer(tracer); +} + +static void destroy_all_ftracers(void) +{ + struct test_ftracer *f; + + pthread_mutex_lock(&ftracers_lock); + f = ftracers; + ftracers = NULL; + pthread_mutex_unlock(&ftracers_lock); + + while (f) { + struct test_ftracer *n = f->next; + + f->next = NULL; + __destroy_ftracer(f); + f = n; + } +} + +static void test_unset_tracing(void) +{ + destroy_all_ftracers(); + unmount_ftrace(); +} + +int test_setup_tracing(void) +{ + /* + * Just a basic protection - this should be called only once from + * lib/kconfig. Not thread safe, which is fine as it's early, before + * threads are created. 
+ */ + static int already_set; + int err; + + if (already_set) + return -1; + + /* Needs net-namespace cookies for filters */ + if (ns_cookie1 == ns_cookie2) { + test_print("net-namespace cookies: %" PRIu64 " == %" PRIu64 ", can't set up tracing", + ns_cookie1, ns_cookie2); + return -1; + } + + already_set = 1; + + test_add_destructor(test_unset_tracing); + + err = mount_ftrace(); + if (err) { + test_print("failed to mount_ftrace(): %d", err); + return err; + } + + return setup_aolib_ftracer(); +} + +static int get_ns_cookie(int nsfd, uint64_t *out) +{ + int old_ns = switch_save_ns(nsfd); + socklen_t size = sizeof(*out); + int sk; + + sk = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (sk < 0) { + test_print("socket(): %m"); + return -errno; + } + + if (getsockopt(sk, SOL_SOCKET, SO_NETNS_COOKIE, out, &size)) { + test_print("getsockopt(SO_NETNS_COOKIE): %m"); + close(sk); + return -errno; + } + + close(sk); + switch_close_ns(old_ns); + return 0; +} + +void test_init_ftrace(int nsfd1, int nsfd2) +{ + get_ns_cookie(nsfd1, &ns_cookie1); + get_ns_cookie(nsfd2, &ns_cookie2); + /* Populate kernel config state */ + kernel_config_has(KCONFIG_FTRACE); +} diff --git a/tools/testing/selftests/net/tcp_ao/lib/kconfig.c b/tools/testing/selftests/net/tcp_ao/lib/kconfig.c index 3bf4a7e4b3c9..9f1c175846f8 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/kconfig.c +++ b/tools/testing/selftests/net/tcp_ao/lib/kconfig.c @@ -116,6 +116,12 @@ static int has_vrfs(int *err) return ret; } +static int has_ftrace(int *err) +{ + *err = test_setup_tracing(); + return 0; +} + #define KCONFIG_UNKNOWN 1 static pthread_mutex_t kconfig_lock = PTHREAD_MUTEX_INITIALIZER; static struct kconfig_t kconfig[__KCONFIG_LAST__] = { @@ -124,6 +130,7 @@ static struct kconfig_t kconfig[__KCONFIG_LAST__] = { { KCONFIG_UNKNOWN, has_tcp_ao }, { KCONFIG_UNKNOWN, has_tcp_md5 }, { KCONFIG_UNKNOWN, has_vrfs }, + { KCONFIG_UNKNOWN, has_ftrace }, }; const char *tests_skip_reason[__KCONFIG_LAST__] = { @@ -132,6 +139,7 @@ const char *tests_skip_reason[__KCONFIG_LAST__] = { "Tests require TCP-AO support (CONFIG_TCP_AO)", "setsockopt(TCP_MD5SIG_EXT) is not supported (CONFIG_TCP_MD5)", "VRFs are not supported (CONFIG_NET_VRF)", + "Ftrace points are not supported (CONFIG_TRACEPOINTS)", }; bool kernel_config_has(enum test_needs_kconfig k) diff --git a/tools/testing/selftests/net/tcp_ao/lib/setup.c b/tools/testing/selftests/net/tcp_ao/lib/setup.c index 86a4f6e20450..a27cc03c9fbd 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/setup.c +++ b/tools/testing/selftests/net/tcp_ao/lib/setup.c @@ -250,9 +250,9 @@ void __test_init(unsigned int ntests, int family, unsigned int prefix, test_print("rand seed %u", (unsigned int)seed); srand(seed); - ksft_print_header(); init_namespaces(); + test_init_ftrace(nsfd_parent, nsfd_child); if (add_veth(veth_name, nsfd_parent, nsfd_child)) test_error("Failed to add veth"); diff --git a/tools/testing/selftests/net/tcp_ao/lib/utils.c b/tools/testing/selftests/net/tcp_ao/lib/utils.c index 372daca525f5..bdf5522c9213 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/utils.c +++ b/tools/testing/selftests/net/tcp_ao/lib/utils.c @@ -21,6 +21,32 @@ void randomize_buffer(void *buf, size_t buflen) } } +__printf(3, 4) int test_echo(const char *fname, bool append, + const char *fmt, ...) +{ + size_t len, written; + va_list vargs; + char *msg; + FILE *f; + + f = fopen(fname, append ? 
"a" : "w"); + if (!f) + return -errno; + + va_start(vargs, fmt); + msg = test_snprintf(fmt, vargs); + va_end(vargs); + if (!msg) { + fclose(f); + return -1; + } + len = strlen(msg); + written = fwrite(msg, 1, len, f); + fclose(f); + free(msg); + return written == len ? 0 : -1; +} + const struct sockaddr_in6 addr_any6 = { .sin6_family = AF_INET6, }; diff --git a/tools/testing/selftests/net/tcp_ao/restore.c b/tools/testing/selftests/net/tcp_ao/restore.c index f6ea2190f43d..ecc6f1e3a414 100644 --- a/tools/testing/selftests/net/tcp_ao/restore.c +++ b/tools/testing/selftests/net/tcp_ao/restore.c @@ -208,22 +208,36 @@ static void *client_fn(void *arg) test_get_sk_checkpoint(port, &saddr, &tcp_img, &ao_img); ao_img.snt_isn += 1; + trace_ao_event_expect(TCP_AO_MISMATCH, this_ip_addr, this_ip_dest, + -1, port, 0, -1, -1, -1, -1, -1, 100, 100, -1); + trace_ao_event_expect(TCP_AO_MISMATCH, this_ip_dest, this_ip_addr, + port, -1, 0, -1, -1, -1, -1, -1, 100, 100, -1); test_sk_restore("TCP-AO with wrong send ISN", port++, &saddr, &tcp_img, &ao_img, FAULT_TIMEOUT, TEST_CNT_BAD); test_get_sk_checkpoint(port, &saddr, &tcp_img, &ao_img); ao_img.rcv_isn += 1; + trace_ao_event_expect(TCP_AO_MISMATCH, this_ip_addr, this_ip_dest, + -1, port, 0, -1, -1, -1, -1, -1, 100, 100, -1); + trace_ao_event_expect(TCP_AO_MISMATCH, this_ip_dest, this_ip_addr, + port, -1, 0, -1, -1, -1, -1, -1, 100, 100, -1); test_sk_restore("TCP-AO with wrong receive ISN", port++, &saddr, &tcp_img, &ao_img, FAULT_TIMEOUT, TEST_CNT_BAD); test_get_sk_checkpoint(port, &saddr, &tcp_img, &ao_img); ao_img.snd_sne += 1; + trace_ao_event_expect(TCP_AO_MISMATCH, this_ip_addr, this_ip_dest, + -1, port, 0, -1, -1, -1, -1, -1, 100, 100, -1); + /* not expecting server => client mismatches as only snd sne is broken */ test_sk_restore("TCP-AO with wrong send SEQ ext number", port++, &saddr, &tcp_img, &ao_img, FAULT_TIMEOUT, TEST_CNT_NS_BAD | TEST_CNT_GOOD); test_get_sk_checkpoint(port, &saddr, &tcp_img, &ao_img); ao_img.rcv_sne += 1; + /* not expecting client => server mismatches as only rcv sne is broken */ + trace_ao_event_expect(TCP_AO_MISMATCH, this_ip_dest, this_ip_addr, + port, -1, 0, -1, -1, -1, -1, -1, 100, 100, -1); test_sk_restore("TCP-AO with wrong receive SEQ ext number", port++, &saddr, &tcp_img, &ao_img, FAULT_TIMEOUT, TEST_CNT_NS_GOOD | TEST_CNT_BAD); @@ -233,6 +247,6 @@ static void *client_fn(void *arg) int main(int argc, char *argv[]) { - test_init(20, server_fn, client_fn); + test_init(21, server_fn, client_fn); return 0; } diff --git a/tools/testing/selftests/net/tcp_ao/rst.c b/tools/testing/selftests/net/tcp_ao/rst.c index a2fe88d35ac0..6364facaa63e 100644 --- a/tools/testing/selftests/net/tcp_ao/rst.c +++ b/tools/testing/selftests/net/tcp_ao/rst.c @@ -455,6 +455,6 @@ static void *client_fn(void *arg) int main(int argc, char *argv[]) { - test_init(14, server_fn, client_fn); + test_init(15, server_fn, client_fn); return 0; } diff --git a/tools/testing/selftests/net/tcp_ao/self-connect.c b/tools/testing/selftests/net/tcp_ao/self-connect.c index e56931d38e06..3ecd2b58de6a 100644 --- a/tools/testing/selftests/net/tcp_ao/self-connect.c +++ b/tools/testing/selftests/net/tcp_ao/self-connect.c @@ -163,17 +163,26 @@ static void *client_fn(void *arg) setup_lo_intf("lo"); tcp_self_connect("self-connect(same keyids)", port++, false, false); + + /* expecting rnext to change based on the first segment RNext != Current */ + trace_ao_event_expect(TCP_AO_RNEXT_REQUEST, local_addr, local_addr, + port, port, 0, -1, -1, -1, -1, -1, 7, 5, -1); 
tcp_self_connect("self-connect(different keyids)", port++, true, false); tcp_self_connect("self-connect(restore)", port, false, true); - port += 2; + port += 2; /* restore test restores over different port */ + trace_ao_event_expect(TCP_AO_RNEXT_REQUEST, local_addr, local_addr, + port, port, 0, -1, -1, -1, -1, -1, 7, 5, -1); + /* intentionally on restore they are added to the socket in different order */ + trace_ao_event_expect(TCP_AO_RNEXT_REQUEST, local_addr, local_addr, + port + 1, port + 1, 0, -1, -1, -1, -1, -1, 5, 7, -1); tcp_self_connect("self-connect(restore, different keyids)", port, true, true); - port += 2; + port += 2; /* restore test restores over different port */ return NULL; } int main(int argc, char *argv[]) { - test_init(4, client_fn, NULL); + test_init(5, client_fn, NULL); return 0; } diff --git a/tools/testing/selftests/net/tcp_ao/seq-ext.c b/tools/testing/selftests/net/tcp_ao/seq-ext.c index 885866cc193c..8901a6785dc8 100644 --- a/tools/testing/selftests/net/tcp_ao/seq-ext.c +++ b/tools/testing/selftests/net/tcp_ao/seq-ext.c @@ -116,6 +116,14 @@ static void *server_fn(void *arg) sk = test_sk_restore(&img, &ao_img, &saddr, this_ip_dest, client_new_port, &ao1); + trace_ao_event_sne_expect(TCP_AO_SND_SNE_UPDATE, this_ip_addr, + this_ip_dest, test_server_port + 1, client_new_port, 1); + trace_ao_event_sne_expect(TCP_AO_SND_SNE_UPDATE, this_ip_dest, + this_ip_addr, client_new_port, test_server_port + 1, 1); + trace_ao_event_sne_expect(TCP_AO_RCV_SNE_UPDATE, this_ip_addr, + this_ip_dest, test_server_port + 1, client_new_port, 1); + trace_ao_event_sne_expect(TCP_AO_RCV_SNE_UPDATE, this_ip_dest, + this_ip_addr, client_new_port, test_server_port + 1, 1); synchronize_threads(); /* 5: verify the connection during SEQ-number rollover */ bytes = test_server_run(sk, quota, TEST_TIMEOUT_SEC); if (bytes != quota) { @@ -242,6 +250,6 @@ static void *client_fn(void *arg) int main(int argc, char *argv[]) { - test_init(7, server_fn, client_fn); + test_init(8, server_fn, client_fn); return 0; } diff --git a/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c b/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c index 5eee826c37aa..084db4ecdff6 100644 --- a/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c +++ b/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c @@ -830,6 +830,6 @@ static void *client_fn(void *arg) int main(int argc, char *argv[]) { - test_init(120, client_fn, NULL); + test_init(121, client_fn, NULL); return 0; } diff --git a/tools/testing/selftests/net/tcp_ao/unsigned-md5.c b/tools/testing/selftests/net/tcp_ao/unsigned-md5.c index 02346b58efbd..f779e5892bc1 100644 --- a/tools/testing/selftests/net/tcp_ao/unsigned-md5.c +++ b/tools/testing/selftests/net/tcp_ao/unsigned-md5.c @@ -674,24 +674,38 @@ static void *client_fn(void *arg) try_connect("AO server (INADDR_ANY): AO client", port++, NULL, 0, &addr_any, 0, 100, 100, 0, 0, 0, &this_ip_addr); + trace_hash_event_expect(TCP_HASH_MD5_UNEXPECTED, this_ip_addr, + this_ip_dest, -1, port, 0, 0, 1, 0, 0, 0); try_connect("AO server (INADDR_ANY): MD5 client", port++, &addr_any, 0, NULL, 0, 100, 100, 0, FAULT_TIMEOUT, 1, &this_ip_addr); + trace_hash_event_expect(TCP_HASH_AO_REQUIRED, this_ip_addr, + this_ip_dest, -1, port, 0, 0, 1, 0, 0, 0); try_connect("AO server (INADDR_ANY): unsigned client", port++, NULL, 0, NULL, 0, 100, 100, 0, FAULT_TIMEOUT, 0, &this_ip_addr); try_connect("AO server (AO_REQUIRED): AO client", port++, NULL, 0, &addr_any, 0, 100, 100, 0, 0, 0, &this_ip_addr); + 
trace_hash_event_expect(TCP_HASH_AO_REQUIRED, client2, + this_ip_dest, -1, port, 0, 0, 1, 0, 0, 0); try_connect("AO server (AO_REQUIRED): unsigned client", port++, NULL, 0, NULL, 0, 100, 100, 0, FAULT_TIMEOUT, 0, &client2); + trace_ao_event_expect(TCP_AO_KEY_NOT_FOUND, this_ip_addr, this_ip_dest, + -1, port, 0, 0, 1, 0, 0, 0, 100, 100, -1); try_connect("MD5 server (INADDR_ANY): AO client", port++, NULL, 0, &addr_any, 0, 100, 100, 0, FAULT_TIMEOUT, 1, &this_ip_addr); try_connect("MD5 server (INADDR_ANY): MD5 client", port++, &addr_any, 0, NULL, 0, 100, 100, 0, 0, 1, &this_ip_addr); + trace_hash_event_expect(TCP_HASH_MD5_REQUIRED, this_ip_addr, + this_ip_dest, -1, port, 0, 0, 1, 0, 0, 0); try_connect("MD5 server (INADDR_ANY): no sign client", port++, NULL, 0, NULL, 0, 100, 100, 0, FAULT_TIMEOUT, 1, &this_ip_addr); + trace_ao_event_expect(TCP_AO_KEY_NOT_FOUND, this_ip_addr, this_ip_dest, + -1, port, 0, 0, 1, 0, 0, 0, 100, 100, -1); try_connect("no sign server: AO client", port++, NULL, 0, &addr_any, 0, 100, 100, 0, FAULT_TIMEOUT, 0, &this_ip_addr); + trace_hash_event_expect(TCP_HASH_MD5_UNEXPECTED, this_ip_addr, + this_ip_dest, -1, port, 0, 0, 1, 0, 0, 0); try_connect("no sign server: MD5 client", port++, &addr_any, 0, NULL, 0, 100, 100, 0, FAULT_TIMEOUT, 1, &this_ip_addr); try_connect("no sign server: no sign client", port++, NULL, 0, @@ -699,25 +713,37 @@ static void *client_fn(void *arg) try_connect("AO+MD5 server: AO client (matching)", port++, NULL, 0, &addr_any, 0, 100, 100, 0, 0, 1, &client2); + trace_ao_event_expect(TCP_AO_KEY_NOT_FOUND, this_ip_addr, this_ip_dest, + -1, port, 0, 0, 1, 0, 0, 0, 100, 100, -1); try_connect("AO+MD5 server: AO client (misconfig, matching MD5)", port++, NULL, 0, &addr_any, 0, 100, 100, 0, FAULT_TIMEOUT, 1, &this_ip_addr); + trace_ao_event_expect(TCP_AO_KEY_NOT_FOUND, client3, this_ip_dest, + -1, port, 0, 0, 1, 0, 0, 0, 100, 100, -1); try_connect("AO+MD5 server: AO client (misconfig, non-matching)", port++, NULL, 0, &addr_any, 0, 100, 100, 0, FAULT_TIMEOUT, 1, &client3); try_connect("AO+MD5 server: MD5 client (matching)", port++, &addr_any, 0, NULL, 0, 100, 100, 0, 0, 1, &this_ip_addr); + trace_hash_event_expect(TCP_HASH_MD5_UNEXPECTED, client2, + this_ip_dest, -1, port, 0, 0, 1, 0, 0, 0); try_connect("AO+MD5 server: MD5 client (misconfig, matching AO)", port++, &addr_any, 0, NULL, 0, 100, 100, 0, FAULT_TIMEOUT, 1, &client2); + trace_hash_event_expect(TCP_HASH_MD5_UNEXPECTED, client3, + this_ip_dest, -1, port, 0, 0, 1, 0, 0, 0); try_connect("AO+MD5 server: MD5 client (misconfig, non-matching)", port++, &addr_any, 0, NULL, 0, 100, 100, 0, FAULT_TIMEOUT, 1, &client3); try_connect("AO+MD5 server: no sign client (unmatched)", port++, NULL, 0, NULL, 0, 100, 100, 0, 0, 1, &client3); + trace_hash_event_expect(TCP_HASH_AO_REQUIRED, client2, + this_ip_dest, -1, port, 0, 0, 1, 0, 0, 0); try_connect("AO+MD5 server: no sign client (misconfig, matching AO)", port++, NULL, 0, NULL, 0, 100, 100, 0, FAULT_TIMEOUT, 1, &client2); + trace_hash_event_expect(TCP_HASH_MD5_REQUIRED, this_ip_addr, + this_ip_dest, -1, port, 0, 0, 1, 0, 0, 0); try_connect("AO+MD5 server: no sign client (misconfig, matching MD5)", port++, NULL, 0, NULL, 0, 100, 100, 0, FAULT_TIMEOUT, 1, &this_ip_addr); @@ -739,6 +765,6 @@ static void *client_fn(void *arg) int main(int argc, char *argv[]) { - test_init(72, server_fn, client_fn); + test_init(73, server_fn, client_fn); return 0; } -- cgit v1.2.3 From e93681afcb96864ec26c3b2ce94008ce93577373 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: 
Mon, 26 Aug 2024 19:11:19 +0200 Subject: selftests: mptcp: join: cannot rm sf if closed Thanks to the previous commit, the MPTCP subflows are now closed on both directions even when only the MPTCP path-manager of one peer asks for their closure. In the two tests modified here -- "userspace pm add & remove address" and "userspace pm create destroy subflow" -- one peer is controlled by the userspace PM, and the other one by the in-kernel PM. When the userspace PM sends a RM_ADDR notification, the in-kernel PM will automatically react by closing all subflows using this address. Now, thanks to the previous commit, the subflows are properly closed on both directions, the userspace PM can then no longer closes the same subflows if they are already closed. Before, it was OK to do that, because the subflows were still half-opened, still OK to send a RM_ADDR. In other words, thanks to the previous commit closing the subflows, an error will be returned to the userspace if it tries to close a subflow that has already been closed. So no need to run this command, which mean that the linked counters will then not be incremented. These tests are then no longer sending both a RM_ADDR, then closing the linked subflow just after. The test with the userspace PM on the server side is now removing one subflow linked to one address, then sending a RM_ADDR for another address. The test with the userspace PM on the client side is now only removing the subflow that was previously created. Fixes: 4369c198e599 ("selftests: mptcp: test userspace pm out of transfer") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240826-net-mptcp-close-extra-sf-fin-v1-2-905199fe1172@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 89e553e0e0c2..264040a760c6 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3429,14 +3429,12 @@ userspace_tests() "signal" userspace_pm_chk_get_addr "${ns1}" "10" "id 10 flags signal 10.0.2.1" userspace_pm_chk_get_addr "${ns1}" "20" "id 20 flags signal 10.0.3.1" - userspace_pm_rm_addr $ns1 10 userspace_pm_rm_sf $ns1 "::ffff:10.0.2.1" $MPTCP_LIB_EVENT_SUB_ESTABLISHED userspace_pm_chk_dump_addr "${ns1}" \ - "id 20 flags signal 10.0.3.1" "after rm_addr 10" + "id 20 flags signal 10.0.3.1" "after rm_sf 10" userspace_pm_rm_addr $ns1 20 - userspace_pm_rm_sf $ns1 10.0.3.1 $MPTCP_LIB_EVENT_SUB_ESTABLISHED userspace_pm_chk_dump_addr "${ns1}" "" "after rm_addr 20" - chk_rm_nr 2 2 invert + chk_rm_nr 1 1 invert chk_mptcp_info subflows 0 subflows 0 chk_subflows_total 1 1 kill_events_pids @@ -3460,12 +3458,11 @@ userspace_tests() "id 20 flags subflow 10.0.3.2" \ "subflow" userspace_pm_chk_get_addr "${ns2}" "20" "id 20 flags subflow 10.0.3.2" - userspace_pm_rm_addr $ns2 20 userspace_pm_rm_sf $ns2 10.0.3.2 $MPTCP_LIB_EVENT_SUB_ESTABLISHED userspace_pm_chk_dump_addr "${ns2}" \ "" \ - "after rm_addr 20" - chk_rm_nr 1 1 + "after rm_sf 20" + chk_rm_nr 0 1 chk_mptcp_info subflows 0 subflows 0 chk_subflows_total 1 1 kill_events_pids -- cgit v1.2.3 From c264487e5410e5a72db8a414566ab7d144223e6c Mon Sep 17 00:00:00 2001 From: Hao Ge Date: Tue, 20 Aug 2024 10:36:22 +0800 Subject: selftests/bpf: Fix incorrect parameters in NULL pointer checking 
Smatch reported the following warning: ./tools/testing/selftests/bpf/testing_helpers.c:455 get_xlated_program() warn: variable dereferenced before check 'buf' (see line 454) It seems correct,so let's modify it based on it's suggestion. Actually,commit b23ed4d74c4d ("selftests/bpf: Fix invalid pointer check in get_xlated_program()") fixed an issue in the test_verifier.c once,but it was reverted this time. Let's solve this issue with the minimal changes possible. Reported-by: Dan Carpenter Closes: https://lore.kernel.org/all/1eb3732f-605a-479d-ba64-cd14250cbf91@stanley.mountain/ Fixes: b4b7a4099b8c ("selftests/bpf: Factor out get_xlated_program() helper") Signed-off-by: Hao Ge Link: https://lore.kernel.org/r/20240820023622.29190-1-hao.ge@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/testing_helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c index c217e12bd9da..d3c3c3a24150 100644 --- a/tools/testing/selftests/bpf/testing_helpers.c +++ b/tools/testing/selftests/bpf/testing_helpers.c @@ -452,7 +452,7 @@ int get_xlated_program(int fd_prog, struct bpf_insn **buf, __u32 *cnt) *cnt = xlated_prog_len / buf_element_size; *buf = calloc(*cnt, buf_element_size); - if (!buf) { + if (!*buf) { perror("can't allocate xlated program buffer"); return -ENOMEM; } -- cgit v1.2.3 From 4f3affe0abf5d5910dc469a1f63257629605d3c3 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 20 Aug 2024 11:32:02 -0700 Subject: perf hist: Don't set hpp_fmt_value for members in --no-group Perf crashes as below when applying --no-group # perf record -e "{cache-misses,branches"} -b sleep 1 # perf report --stdio --no-group free(): invalid next size (fast) Aborted (core dumped) # In the __hpp__fmt(), only 1 hpp_fmt_value is allocated for the current event when --no-group is applied. However, the current implementation tries to assign the hists from all members to the hpp_fmt_value, which exceeds the allocated memory. 
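In other words, the values array is sized by nr_members (which stays 1 with --no-group) while the old loop walked every member of the group. A minimal standalone sketch of the corrected pattern -- hypothetical names, not the perf sources themselves:

	#include <stdlib.h>

	struct fmt_value { int val; };

	/* Size the allocation and the member loop by the same count. */
	static struct fmt_value *collect(int nr_members, int is_group)
	{
		struct fmt_value *values = calloc(nr_members, sizeof(*values));
		int i = 0;

		if (!values)
			return NULL;

		values[0].val = 1;	/* the leader entry always exists */
		if (is_group) {
			/* only touch members the allocation covered */
			while (++i < nr_members)
				values[i].val = 1;
		}
		return values;
	}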
Fixes: 8f6071a3dce40e69 ("perf hist: Simplify __hpp_fmt() using hpp_fmt_data") Signed-off-by: Kan Liang Acked-by: Namhyung Kim Cc: Ian Rogers Cc: Ingo Molnar Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240820183202.3174323-1-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/hist.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index 5d1f04f66a5a..e5491995adf0 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -62,7 +62,7 @@ static int __hpp__fmt(struct perf_hpp *hpp, struct hist_entry *he, struct evsel *pos; char *buf = hpp->buf; size_t size = hpp->size; - int i, nr_members = 1; + int i = 0, nr_members = 1; struct hpp_fmt_value *values; if (evsel__is_group_event(evsel)) @@ -72,16 +72,16 @@ static int __hpp__fmt(struct perf_hpp *hpp, struct hist_entry *he, if (values == NULL) return 0; - i = 0; - for_each_group_evsel(pos, evsel) - values[i++].hists = evsel__hists(pos); - + values[0].hists = evsel__hists(evsel); values[0].val = get_field(he); values[0].samples = he->stat.nr_events; if (evsel__is_group_event(evsel)) { struct hist_entry *pair; + for_each_group_member(pos, evsel) + values[++i].hists = evsel__hists(pos); + list_for_each_entry(pair, &he->pairs.head, pairs.node) { for (i = 0; i < nr_members; i++) { if (values[i].hists != pair->hists) -- cgit v1.2.3 From 0fe2b18ddc40bf8fbcd96dab651822dbadc1cd85 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 20 Aug 2024 08:45:02 -0700 Subject: perf bpf-filter: Support multiple events properly So far it used tgid as a key to get the filter expressions in the pinned filters map for regular users but it won't work well if the has more than one filters at the same time. Let's add the event id to the key of the filter hash map so that it can identify the right filter expression in the BPF program. As the event can be inherited to child tasks, it should use the primary id which belongs to the parent (original) event. Since evsel opens the event for multiple CPUs and tasks, it needs to maintain a separate hash map for the event id. In the user space, it keeps a list for the multiple evsel and release the entries in the both hash map when it closes the event. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: KP Singh Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Link: https://lore.kernel.org/r/20240820154504.128923-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf-filter.c | 288 ++++++++++++++++++++++++--- tools/perf/util/bpf_skel/sample-filter.h | 11 +- tools/perf/util/bpf_skel/sample_filter.bpf.c | 42 +++- tools/perf/util/bpf_skel/vmlinux/vmlinux.h | 5 + 4 files changed, 304 insertions(+), 42 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf-filter.c b/tools/perf/util/bpf-filter.c index c5eb0b7eec19..0a1832564dd2 100644 --- a/tools/perf/util/bpf-filter.c +++ b/tools/perf/util/bpf-filter.c @@ -1,4 +1,45 @@ /* SPDX-License-Identifier: GPL-2.0 */ +/** + * Generic event filter for sampling events in BPF. + * + * The BPF program is fixed and just to read filter expressions in the 'filters' + * map and compare the sample data in order to reject samples that don't match. + * Each filter expression contains a sample flag (term) to compare, an operation + * (==, >=, and so on) and a value. 
+ * + * Note that each entry has an array of filter expressions and it only succeeds + * when all of the expressions are satisfied. But it supports the logical OR + * using a GROUP operation which is satisfied when any of its member expression + * is evaluated to true. But it doesn't allow nested GROUP operations for now. + * + * To support non-root users, the filters map can be loaded and pinned in the BPF + * filesystem by root (perf record --setup-filter pin). Then each user will get + * a new entry in the shared filters map to fill the filter expressions. And the + * BPF program will find the filter using (task-id, event-id) as a key. + * + * The pinned BPF object (shared for regular users) has: + * + * event_hash | + * | | | + * event->id ---> | id | ---+ idx_hash | filters + * | | | | | | | | + * | .... | +-> | idx | --+--> | exprs | ---> perf_bpf_filter_entry[] + * | | | | | | .op + * task id (tgid) --------------+ | .... | | | ... | .term (+ part) + * | .value + * | + * ======= (root would skip this part) ======== (compares it in a loop) + * + * This is used for per-task use cases while system-wide profiling (normally from + * root user) uses a separate copy of the program and the maps for its own so that + * it can proceed even if a lot of non-root users are using the filters at the + * same time. In this case the filters map has a single entry and no need to use + * the hash maps to get the index (key) of the filters map (IOW it's always 0). + * + * The BPF program returns 1 to accept the sample or 0 to drop it. + * The 'dropped' map is to keep how many samples it dropped by the filter and + * it will be reported as lost samples. + */ #include #include #include @@ -6,6 +47,7 @@ #include #include +#include #include #include #include @@ -27,7 +69,14 @@ #define PERF_SAMPLE_TYPE(_st, opt) __PERF_SAMPLE_TYPE(PBF_TERM_##_st, PERF_SAMPLE_##_st, opt) /* Index in the pinned 'filters' map. Should be released after use. */ -static int pinned_filter_idx = -1; +struct pinned_filter_idx { + struct list_head list; + struct evsel *evsel; + u64 event_id; + int hash_idx; +}; + +static LIST_HEAD(pinned_filters); static const struct perf_sample_info { enum perf_bpf_filter_term type; @@ -175,24 +224,145 @@ static int convert_to_tgid(int tid) return tgid; } -static int update_pid_hash(struct evsel *evsel, struct perf_bpf_filter_entry *entry) +/* + * The event might be closed already so we cannot get the list of ids using FD + * like in create_event_hash() below, let's iterate the event_hash map and + * delete all entries that have the event id as a key. + */ +static void destroy_event_hash(u64 event_id) +{ + int fd; + u64 key, *prev_key = NULL; + int num = 0, alloced = 32; + u64 *ids = calloc(alloced, sizeof(*ids)); + + if (ids == NULL) + return; + + fd = get_pinned_fd("event_hash"); + if (fd < 0) { + pr_debug("cannot get fd for 'event_hash' map\n"); + free(ids); + return; + } + + /* Iterate the whole map to collect keys for the event id. */ + while (!bpf_map_get_next_key(fd, prev_key, &key)) { + u64 id; + + if (bpf_map_lookup_elem(fd, &key, &id) == 0 && id == event_id) { + if (num == alloced) { + void *tmp; + + alloced *= 2; + tmp = realloc(ids, alloced * sizeof(*ids)); + if (tmp == NULL) + break; + + ids = tmp; + } + ids[num++] = key; + } + + prev_key = &key; + } + + for (int i = 0; i < num; i++) + bpf_map_delete_elem(fd, &ids[i]); + + free(ids); + close(fd); +} + +/* + * Return a representative id if ok, or 0 for failures. 
+ * + * The perf_event->id is good for this, but an evsel would have multiple + * instances for CPUs and tasks. So pick up the first id and setup a hash + * from id of each instance to the representative id (the first one). + */ +static u64 create_event_hash(struct evsel *evsel) +{ + int x, y, fd; + u64 the_id = 0, id; + + fd = get_pinned_fd("event_hash"); + if (fd < 0) { + pr_err("cannot get fd for 'event_hash' map\n"); + return 0; + } + + for (x = 0; x < xyarray__max_x(evsel->core.fd); x++) { + for (y = 0; y < xyarray__max_y(evsel->core.fd); y++) { + int ret = ioctl(FD(evsel, x, y), PERF_EVENT_IOC_ID, &id); + + if (ret < 0) { + pr_err("Failed to get the event id\n"); + if (the_id) + destroy_event_hash(the_id); + return 0; + } + + if (the_id == 0) + the_id = id; + + bpf_map_update_elem(fd, &id, &the_id, BPF_ANY); + } + } + + close(fd); + return the_id; +} + +static void destroy_idx_hash(struct pinned_filter_idx *pfi) +{ + int fd, nr; + struct perf_thread_map *threads; + + fd = get_pinned_fd("filters"); + bpf_map_delete_elem(fd, &pfi->hash_idx); + close(fd); + + if (pfi->event_id) + destroy_event_hash(pfi->event_id); + + threads = perf_evsel__threads(&pfi->evsel->core); + if (threads == NULL) + return; + + fd = get_pinned_fd("idx_hash"); + nr = perf_thread_map__nr(threads); + for (int i = 0; i < nr; i++) { + /* The target task might be dead already, just try the pid */ + struct idx_hash_key key = { + .evt_id = pfi->event_id, + .tgid = perf_thread_map__pid(threads, i), + }; + + bpf_map_delete_elem(fd, &key); + } + close(fd); +} + +/* Maintain a hashmap from (tgid, event-id) to filter index */ +static int create_idx_hash(struct evsel *evsel, struct perf_bpf_filter_entry *entry) { int filter_idx; int fd, nr, last; + u64 event_id = 0; + struct pinned_filter_idx *pfi = NULL; struct perf_thread_map *threads; fd = get_pinned_fd("filters"); if (fd < 0) { - pr_debug("cannot get fd for 'filters' map\n"); + pr_err("cannot get fd for 'filters' map\n"); return fd; } /* Find the first available entry in the filters map */ for (filter_idx = 0; filter_idx < MAX_FILTERS; filter_idx++) { - if (bpf_map_update_elem(fd, &filter_idx, entry, BPF_NOEXIST) == 0) { - pinned_filter_idx = filter_idx; + if (bpf_map_update_elem(fd, &filter_idx, entry, BPF_NOEXIST) == 0) break; - } } close(fd); @@ -201,22 +371,44 @@ static int update_pid_hash(struct evsel *evsel, struct perf_bpf_filter_entry *en return -EBUSY; } + pfi = zalloc(sizeof(*pfi)); + if (pfi == NULL) { + pr_err("Cannot save pinned filter index\n"); + goto err; + } + + pfi->evsel = evsel; + pfi->hash_idx = filter_idx; + + event_id = create_event_hash(evsel); + if (event_id == 0) { + pr_err("Cannot update the event hash\n"); + goto err; + } + + pfi->event_id = event_id; + threads = perf_evsel__threads(&evsel->core); if (threads == NULL) { pr_err("Cannot get the thread list of the event\n"); - return -EINVAL; + goto err; } /* save the index to a hash map */ - fd = get_pinned_fd("pid_hash"); - if (fd < 0) - return fd; + fd = get_pinned_fd("idx_hash"); + if (fd < 0) { + pr_err("cannot get fd for 'idx_hash' map\n"); + goto err; + } last = -1; nr = perf_thread_map__nr(threads); for (int i = 0; i < nr; i++) { int pid = perf_thread_map__pid(threads, i); int tgid; + struct idx_hash_key key = { + .evt_id = event_id, + }; /* it actually needs tgid, let's get tgid from /proc. 
*/ tgid = convert_to_tgid(pid); @@ -228,16 +420,25 @@ static int update_pid_hash(struct evsel *evsel, struct perf_bpf_filter_entry *en if (tgid == last) continue; last = tgid; + key.tgid = tgid; - if (bpf_map_update_elem(fd, &tgid, &filter_idx, BPF_ANY) < 0) { - pr_err("Failed to update the pid hash\n"); + if (bpf_map_update_elem(fd, &key, &filter_idx, BPF_ANY) < 0) { + pr_err("Failed to update the idx_hash\n"); close(fd); - return -1; + goto err; } - pr_debug("pid hash: %d -> %d\n", tgid, filter_idx); + pr_debug("bpf-filter: idx_hash (task=%d,%s) -> %d\n", + tgid, evsel__name(evsel), filter_idx); } + + list_add(&pfi->list, &pinned_filters); close(fd); - return 0; + return filter_idx; + +err: + destroy_idx_hash(pfi); + free(pfi); + return -1; } int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target) @@ -247,7 +448,7 @@ int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target) struct bpf_program *prog; struct bpf_link *link; struct perf_bpf_filter_entry *entry; - bool needs_pid_hash = !target__has_cpu(target) && !target->uid_str; + bool needs_idx_hash = !target__has_cpu(target) && !target->uid_str; entry = calloc(MAX_FILTERS, sizeof(*entry)); if (entry == NULL) @@ -259,11 +460,11 @@ int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target) goto err; } - if (needs_pid_hash && geteuid() != 0) { + if (needs_idx_hash && geteuid() != 0) { int zero = 0; /* The filters map is shared among other processes */ - ret = update_pid_hash(evsel, entry); + ret = create_idx_hash(evsel, entry); if (ret < 0) goto err; @@ -274,7 +475,7 @@ int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target) } /* Reset the lost count */ - bpf_map_update_elem(fd, &pinned_filter_idx, &zero, BPF_ANY); + bpf_map_update_elem(fd, &ret, &zero, BPF_ANY); close(fd); fd = get_pinned_fd("perf_sample_filter"); @@ -288,6 +489,7 @@ int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target) ret = ioctl(FD(evsel, x, y), PERF_EVENT_IOC_SET_BPF, fd); if (ret < 0) { pr_err("Failed to attach perf sample-filter\n"); + close(fd); goto err; } } @@ -332,6 +534,15 @@ int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target) err: free(entry); + if (!list_empty(&pinned_filters)) { + struct pinned_filter_idx *pfi, *tmp; + + list_for_each_entry_safe(pfi, tmp, &pinned_filters, list) { + destroy_idx_hash(pfi); + list_del(&pfi->list); + free(pfi); + } + } sample_filter_bpf__destroy(skel); return ret; } @@ -339,6 +550,7 @@ err: int perf_bpf_filter__destroy(struct evsel *evsel) { struct perf_bpf_filter_expr *expr, *tmp; + struct pinned_filter_idx *pfi, *pos; list_for_each_entry_safe(expr, tmp, &evsel->bpf_filters, list) { list_del(&expr->list); @@ -346,14 +558,11 @@ int perf_bpf_filter__destroy(struct evsel *evsel) } sample_filter_bpf__destroy(evsel->bpf_skel); - if (pinned_filter_idx >= 0) { - int fd = get_pinned_fd("filters"); - - bpf_map_delete_elem(fd, &pinned_filter_idx); - pinned_filter_idx = -1; - close(fd); + list_for_each_entry_safe(pfi, pos, &pinned_filters, list) { + destroy_idx_hash(pfi); + list_del(&pfi->list); + free(pfi); } - return 0; } @@ -364,10 +573,20 @@ u64 perf_bpf_filter__lost_count(struct evsel *evsel) if (list_empty(&evsel->bpf_filters)) return 0; - if (pinned_filter_idx >= 0) { + if (!list_empty(&pinned_filters)) { int fd = get_pinned_fd("dropped"); + struct pinned_filter_idx *pfi; + + if (fd < 0) + return 0; - bpf_map_lookup_elem(fd, &pinned_filter_idx, &count); + list_for_each_entry(pfi, &pinned_filters, list) { + if (pfi->evsel != evsel) 
+ continue; + + bpf_map_lookup_elem(fd, &pfi->hash_idx, &count); + break; + } close(fd); } else if (evsel->bpf_skel) { struct sample_filter_bpf *skel = evsel->bpf_skel; @@ -429,9 +648,10 @@ int perf_bpf_filter__pin(void) /* pinned program will use pid-hash */ bpf_map__set_max_entries(skel->maps.filters, MAX_FILTERS); - bpf_map__set_max_entries(skel->maps.pid_hash, MAX_PIDS); + bpf_map__set_max_entries(skel->maps.event_hash, MAX_EVT_HASH); + bpf_map__set_max_entries(skel->maps.idx_hash, MAX_IDX_HASH); bpf_map__set_max_entries(skel->maps.dropped, MAX_FILTERS); - skel->rodata->use_pid_hash = 1; + skel->rodata->use_idx_hash = 1; if (sample_filter_bpf__load(skel) < 0) { ret = -errno; @@ -484,8 +704,12 @@ int perf_bpf_filter__pin(void) pr_debug("chmod for filters failed\n"); ret = -errno; } - if (fchmodat(dir_fd, "pid_hash", 0666, 0) < 0) { - pr_debug("chmod for pid_hash failed\n"); + if (fchmodat(dir_fd, "event_hash", 0666, 0) < 0) { + pr_debug("chmod for event_hash failed\n"); + ret = -errno; + } + if (fchmodat(dir_fd, "idx_hash", 0666, 0) < 0) { + pr_debug("chmod for idx_hash failed\n"); ret = -errno; } if (fchmodat(dir_fd, "dropped", 0666, 0) < 0) { diff --git a/tools/perf/util/bpf_skel/sample-filter.h b/tools/perf/util/bpf_skel/sample-filter.h index e666bfd5fbdd..5f0c8e4e83d3 100644 --- a/tools/perf/util/bpf_skel/sample-filter.h +++ b/tools/perf/util/bpf_skel/sample-filter.h @@ -1,8 +1,9 @@ #ifndef PERF_UTIL_BPF_SKEL_SAMPLE_FILTER_H #define PERF_UTIL_BPF_SKEL_SAMPLE_FILTER_H -#define MAX_FILTERS 64 -#define MAX_PIDS (16 * 1024) +#define MAX_FILTERS 64 +#define MAX_IDX_HASH (16 * 1024) +#define MAX_EVT_HASH (1024 * 1024) /* supported filter operations */ enum perf_bpf_filter_op { @@ -62,4 +63,10 @@ struct perf_bpf_filter_entry { __u64 value; }; +struct idx_hash_key { + __u64 evt_id; + __u32 tgid; + __u32 reserved; +}; + #endif /* PERF_UTIL_BPF_SKEL_SAMPLE_FILTER_H */ diff --git a/tools/perf/util/bpf_skel/sample_filter.bpf.c b/tools/perf/util/bpf_skel/sample_filter.bpf.c index 4c75354b84fd..4872a16eedfd 100644 --- a/tools/perf/util/bpf_skel/sample_filter.bpf.c +++ b/tools/perf/util/bpf_skel/sample_filter.bpf.c @@ -15,13 +15,25 @@ struct filters { __uint(max_entries, 1); } filters SEC(".maps"); -/* tgid to filter index */ -struct pid_hash { +/* + * An evsel has multiple instances for each CPU or task but we need a single + * id to be used as a key for the idx_hash. This hashmap would translate the + * instance's ID to a representative ID. 
+ */ +struct event_hash { __uint(type, BPF_MAP_TYPE_HASH); - __type(key, int); + __type(key, __u64); + __type(value, __u64); + __uint(max_entries, 1); +} event_hash SEC(".maps"); + +/* tgid/evtid to filter index */ +struct idx_hash { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct idx_hash_key); __type(value, int); __uint(max_entries, 1); -} pid_hash SEC(".maps"); +} idx_hash SEC(".maps"); /* tgid to filter index */ struct lost_count { @@ -31,7 +43,7 @@ struct lost_count { __uint(max_entries, 1); } dropped SEC(".maps"); -volatile const int use_pid_hash; +volatile const int use_idx_hash; void *bpf_cast_to_kern_ctx(void *) __ksym; @@ -202,11 +214,25 @@ int perf_sample_filter(void *ctx) k = 0; - if (use_pid_hash) { - int tgid = bpf_get_current_pid_tgid() >> 32; + if (use_idx_hash) { + struct idx_hash_key key = { + .tgid = bpf_get_current_pid_tgid() >> 32, + }; + __u64 eid = kctx->event->id; + __u64 *key_id; int *idx; - idx = bpf_map_lookup_elem(&pid_hash, &tgid); + /* get primary_event_id */ + if (kctx->event->parent) + eid = kctx->event->parent->id; + + key_id = bpf_map_lookup_elem(&event_hash, &eid); + if (key_id == NULL) + goto drop; + + key.evt_id = *key_id; + + idx = bpf_map_lookup_elem(&idx_hash, &key); if (idx) k = *idx; else diff --git a/tools/perf/util/bpf_skel/vmlinux/vmlinux.h b/tools/perf/util/bpf_skel/vmlinux/vmlinux.h index d818e30c5457..4fa21468487e 100644 --- a/tools/perf/util/bpf_skel/vmlinux/vmlinux.h +++ b/tools/perf/util/bpf_skel/vmlinux/vmlinux.h @@ -175,6 +175,11 @@ struct perf_sample_data { u64 code_page_size; } __attribute__((__aligned__(64))) __attribute__((preserve_access_index)); +struct perf_event { + struct perf_event *parent; + u64 id; +} __attribute__((preserve_access_index)); + struct bpf_perf_event_data_kern { struct perf_sample_data *data; struct perf_event *event; -- cgit v1.2.3 From 1a5474a779798f6ba9d798469326ebfa6b4a8fae Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 20 Aug 2024 08:45:03 -0700 Subject: perf tools: Print lost samples due to BPF filter Print the actual dropped sample count in the event stat. 
$ sudo perf record -o- -e cycles --filter 'period < 10000' \ -e instructions --filter 'ip > 0x8000000000000000' perf test -w noploop | \ perf report --stat -i- [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.058 MB - ] Aggregated stats: TOTAL events: 469 MMAP events: 268 (57.1%) COMM events: 2 ( 0.4%) EXIT events: 1 ( 0.2%) SAMPLE events: 16 ( 3.4%) MMAP2 events: 22 ( 4.7%) LOST_SAMPLES events: 2 ( 0.4%) KSYMBOL events: 89 (19.0%) BPF_EVENT events: 39 ( 8.3%) ATTR events: 2 ( 0.4%) FINISHED_ROUND events: 1 ( 0.2%) ID_INDEX events: 1 ( 0.2%) THREAD_MAP events: 1 ( 0.2%) CPU_MAP events: 1 ( 0.2%) EVENT_UPDATE events: 2 ( 0.4%) TIME_CONV events: 1 ( 0.2%) FEATURE events: 20 ( 4.3%) FINISHED_INIT events: 1 ( 0.2%) cycles stats: SAMPLE events: 2 LOST_SAMPLES (BPF) events: 4010 instructions stats: SAMPLE events: 14 LOST_SAMPLES (BPF) events: 3990 Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: KP Singh Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Link: https://lore.kernel.org/r/20240820154504.128923-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 9 +++++++-- tools/perf/ui/stdio/hist.c | 4 ++-- tools/perf/util/events_stats.h | 15 ++++++++++++++- tools/perf/util/hist.c | 19 +++++++++++++++---- tools/perf/util/hist.h | 1 + tools/perf/util/machine.c | 5 +++-- tools/perf/util/session.c | 5 +++-- 7 files changed, 45 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 1643113616f4..c304d9b06190 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -795,8 +795,13 @@ static int count_lost_samples_event(const struct perf_tool *tool, evsel = evlist__id2evsel(rep->session->evlist, sample->id); if (evsel) { - hists__inc_nr_lost_samples(evsel__hists(evsel), - event->lost_samples.lost); + struct hists *hists = evsel__hists(evsel); + u32 count = event->lost_samples.lost; + + if (event->header.misc & PERF_RECORD_MISC_LOST_SAMPLES_BPF) + hists__inc_nr_dropped_samples(hists, count); + else + hists__inc_nr_lost_samples(hists, count); } return 0; } diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c index 9372e8904d22..74b2c619c56c 100644 --- a/tools/perf/ui/stdio/hist.c +++ b/tools/perf/ui/stdio/hist.c @@ -913,11 +913,11 @@ size_t events_stats__fprintf(struct events_stats *stats, FILE *fp) continue; if (i && total) { - ret += fprintf(fp, "%16s events: %10d (%4.1f%%)\n", + ret += fprintf(fp, "%20s events: %10d (%4.1f%%)\n", name, stats->nr_events[i], 100.0 * stats->nr_events[i] / total); } else { - ret += fprintf(fp, "%16s events: %10d\n", + ret += fprintf(fp, "%20s events: %10d\n", name, stats->nr_events[i]); } } diff --git a/tools/perf/util/events_stats.h b/tools/perf/util/events_stats.h index f43e5b1a366a..eabd7913c309 100644 --- a/tools/perf/util/events_stats.h +++ b/tools/perf/util/events_stats.h @@ -18,7 +18,18 @@ * PERF_RECORD_LOST_SAMPLES event. The number of lost-samples events is stored * in .nr_events[PERF_RECORD_LOST_SAMPLES] while total_lost_samples tells * exactly how many samples the kernel in fact dropped, i.e. it is the sum of - * all struct perf_record_lost_samples.lost fields reported. + * all struct perf_record_lost_samples.lost fields reported without setting the + * misc field in the header. + * + * The BPF program can discard samples according to the filter expressions given + * by the user. 
This number is kept in a BPF map and dumped at the end of perf + * record in a PERF_RECORD_LOST_SAMPLES event. To differentiate it from other + * lost samples, perf tools sets PERF_RECORD_MISC_LOST_SAMPLES_BPF flag in the + * header.misc field. The number of dropped-samples events is stored in + * .nr_events[PERF_RECORD_LOST_SAMPLES] while total_dropped_samples tells + * exactly how many samples the BPF program in fact dropped, i.e. it is the sum + * of all struct perf_record_lost_samples.lost fields reported with the misc + * field set in the header. * * The total_period is needed because by default auto-freq is used, so * multiplying nr_events[PERF_EVENT_SAMPLE] by a frequency isn't possible to get @@ -28,6 +39,7 @@ struct events_stats { u64 total_lost; u64 total_lost_samples; + u64 total_dropped_samples; u64 total_aux_lost; u64 total_aux_partial; u64 total_aux_collision; @@ -48,6 +60,7 @@ struct hists_stats { u32 nr_samples; u32 nr_non_filtered_samples; u32 nr_lost_samples; + u32 nr_dropped_samples; }; void events_stats__inc(struct events_stats *stats, u32 type); diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 0f9ce2ee2c31..dadce2889e52 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -2385,6 +2385,11 @@ void hists__inc_nr_lost_samples(struct hists *hists, u32 lost) hists->stats.nr_lost_samples += lost; } +void hists__inc_nr_dropped_samples(struct hists *hists, u32 lost) +{ + hists->stats.nr_dropped_samples += lost; +} + static struct hist_entry *hists__add_dummy_entry(struct hists *hists, struct hist_entry *pair) { @@ -2729,18 +2734,24 @@ size_t evlist__fprintf_nr_events(struct evlist *evlist, FILE *fp) evlist__for_each_entry(evlist, pos) { struct hists *hists = evsel__hists(pos); + u64 total_samples = hists->stats.nr_samples; + + total_samples += hists->stats.nr_lost_samples; + total_samples += hists->stats.nr_dropped_samples; - if (symbol_conf.skip_empty && !hists->stats.nr_samples && - !hists->stats.nr_lost_samples) + if (symbol_conf.skip_empty && total_samples == 0) continue; ret += fprintf(fp, "%s stats:\n", evsel__name(pos)); if (hists->stats.nr_samples) - ret += fprintf(fp, "%16s events: %10d\n", + ret += fprintf(fp, "%20s events: %10d\n", "SAMPLE", hists->stats.nr_samples); if (hists->stats.nr_lost_samples) - ret += fprintf(fp, "%16s events: %10d\n", + ret += fprintf(fp, "%20s events: %10d\n", "LOST_SAMPLES", hists->stats.nr_lost_samples); + if (hists->stats.nr_dropped_samples) + ret += fprintf(fp, "%20s events: %10d\n", + "LOST_SAMPLES (BPF)", hists->stats.nr_dropped_samples); } return ret; diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index deb1087c5948..7d7ae94b4b31 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -372,6 +372,7 @@ void hists__inc_stats(struct hists *hists, struct hist_entry *h); void hists__inc_nr_events(struct hists *hists); void hists__inc_nr_samples(struct hists *hists, bool filtered); void hists__inc_nr_lost_samples(struct hists *hists, u32 lost); +void hists__inc_nr_dropped_samples(struct hists *hists, u32 lost); size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows, int max_cols, float min_pcnt, FILE *fp, diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index cd79a830abae..5783b96fb988 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -642,8 +642,9 @@ int machine__process_lost_event(struct machine *machine __maybe_unused, int machine__process_lost_samples_event(struct machine *machine __maybe_unused, union perf_event 
*event, struct perf_sample *sample) { - dump_printf(": id:%" PRIu64 ": lost samples :%" PRI_lu64 "\n", - sample->id, event->lost_samples.lost); + dump_printf(": id:%" PRIu64 ": lost samples :%" PRI_lu64 "%s\n", + sample->id, event->lost_samples.lost, + event->header.misc & PERF_RECORD_MISC_LOST_SAMPLES_BPF ? " (BPF)" : ""); return 0; } diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index d2bd563119bc..774eb3382000 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1290,8 +1290,9 @@ static int machines__deliver_event(struct machines *machines, evlist->stats.total_lost += event->lost.lost; return tool->lost(tool, event, sample, machine); case PERF_RECORD_LOST_SAMPLES: - if (tool->lost_samples == perf_event__process_lost_samples && - !(event->header.misc & PERF_RECORD_MISC_LOST_SAMPLES_BPF)) + if (event->header.misc & PERF_RECORD_MISC_LOST_SAMPLES_BPF) + evlist->stats.total_dropped_samples += event->lost_samples.lost; + else if (tool->lost_samples == perf_event__process_lost_samples) evlist->stats.total_lost_samples += event->lost_samples.lost; return tool->lost_samples(tool, event, sample, machine); case PERF_RECORD_READ: -- cgit v1.2.3 From 150ca9ccc4e94eaa07b9d3e07f45fec842b68a78 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 20 Aug 2024 08:45:04 -0700 Subject: perf test: Update sample filtering tests with multiple events Add a 'Multiple bpf-filter' test for two or more events with filters. It uses task-clock and page-faults events with different filter expressions and checks the perf script output: $ sudo ./perf test filtering -vv 96: perf record sample filtering (by BPF) tests: --- start --- test child forked, pid 2804025 Checking BPF-filter privilege Basic bpf-filter test Basic bpf-filter test [Success] Failing bpf-filter test Error: task-clock event does not have PERF_SAMPLE_CPU Failing bpf-filter test [Success] Group bpf-filter test Error: task-clock event does not have PERF_SAMPLE_CPU Error: task-clock event does not have PERF_SAMPLE_CODE_PAGE_SIZE Group bpf-filter test [Success] Multiple bpf-filter test Multiple bpf-filter test [Success] ---- end(0) ---- 96: perf record sample filtering (by BPF) tests : Ok Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: KP Singh Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Link: https://lore.kernel.org/r/20240820154504.128923-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/record_bpf_filter.sh | 34 +++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'tools') diff --git a/tools/perf/tests/shell/record_bpf_filter.sh b/tools/perf/tests/shell/record_bpf_filter.sh index c5882d620db7..4df33c15bfa7 100755 --- a/tools/perf/tests/shell/record_bpf_filter.sh +++ b/tools/perf/tests/shell/record_bpf_filter.sh @@ -116,6 +116,36 @@ test_bpf_filter_group() { echo "Group bpf-filter test [Success]" } +test_bpf_filter_multi() { + echo "Multiple bpf-filter test" + + if ! perf record -e task-clock --filter 'period > 100000' \ + -e page-faults --filter 'ip < 0xffffffff00000000' \ + -o "${perfdata}" true 2> /dev/null + then + echo "Multiple bpf-filter test [Failed record]" + err=1 + return + fi + + if !
perf script -i "${perfdata}" -F period,event | grep task-clock | \ + awk '{ if (int($1) <= 100000) { print $0; exit(1); } }' + then + echo "Multiple bpf-filter test [Failed task-clock period]" + err=1 + return + fi + + if perf script -i "${perfdata}" -F event,ip | grep page-fault | \ + grep 'ffffffff[0-9a-f]*' + then + echo "Multiple bpf-filter test [Failed page-faults ip]" + err=1 + return + fi + + echo "Multiple bpf-filter test [Success]" +} test_bpf_filter_priv @@ -131,5 +161,9 @@ if [ $err = 0 ]; then test_bpf_filter_group fi +if [ $err = 0 ]; then + test_bpf_filter_multi +fi + cleanup exit $err -- cgit v1.2.3 From a68080e1a21b3183ee804ae05726f6103e322348 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 27 Aug 2024 15:57:33 -0300 Subject: perf test vfs_getname: Look for alternative line where to collect the pathname The getname_flags() routine changed recently and thus the place where we were getting the pathname is not probeable anymore, albeit still present, so use the next line for that, before: root@number:/home/acme/git/perf-tools-next# perf test vfs_getname 91: Add vfs_getname probe to get syscall args filenames : FAILED! 93: Use vfs_getname probe to get syscall args filenames : FAILED! 126: Check open filename arg using perf trace + vfs_getname : FAILED! root@number:/home/acme/git/perf-tools-next# Now tests 91 and 126 are passing; some more investigation is needed for test 93, which continues to fail. Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/lib/probe_vfs_getname.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/lib/probe_vfs_getname.sh b/tools/perf/tests/shell/lib/probe_vfs_getname.sh index bf4c1fb71c4b..b3802d9494b4 100644 --- a/tools/perf/tests/shell/lib/probe_vfs_getname.sh +++ b/tools/perf/tests/shell/lib/probe_vfs_getname.sh @@ -13,7 +13,12 @@ cleanup_probe_vfs_getname() { add_probe_vfs_getname() { add_probe_verbose=$1 if [ $had_vfs_getname -eq 1 ] ; then - line=$(perf probe -L getname_flags 2>&1 | grep -E 'result.*=.*filename;' | sed -r 's/[[:space:]]+([[:digit:]]+)[[:space:]]+result->uptr.*/\1/') + result_filename_re="[[:space:]]+([[:digit:]]+)[[:space:]]+result->uptr.*" + line=$(perf probe -L getname_flags 2>&1 | grep -E "$result_filename_re=" | sed -r "s/$result_filename_re=/\1/") + if [ -z "$line" ] ; then + result_aname_re="[[:space:]]+([[:digit:]]+)[[:space:]]+result->aname = NULL;" + line=$(perf probe -L getname_flags 2>&1 | grep -E "$result_aname_re=" | sed -r "s/$result_aname_re=/\1/") + fi perf probe -q "vfs_getname=getname_flags:${line} pathname=result->name:string" || \ perf probe $add_probe_verbose "vfs_getname=getname_flags:${line} pathname=filename:ustring" fi -- cgit v1.2.3 From 5a02447c8145b87373ada2040bbab5bc2a8f87be Mon Sep 17 00:00:00 2001 From: Michael Petlan Date: Tue, 2 Jul 2024 13:08:40 +0200 Subject: perf tests shell: Skip base_* dirs in test script search The test scripts in base_* directories currently have their own drivers that run them. Before this patch, the shell test-suite generator caused them to run twice. Fix that by skipping them in the generator. A cleaner solution (for the future) would be to use the directory structure idea (introduced by Carsten Haitzler in 7391db645938 ("perf test: Refactor shell tests allowing subdirs")) to generate test entries with subtests, like: $ perf test list [...]
97: perf probe shell tests 97:1: perf probe basic functionality 97:2: perf probe tests with arguments 97:3: perf probe invalid options handling [...] There are already a lot of shell test scripts and many more are about to come, so there is a need for some hierarchy. Signed-off-by: Michael Petlan Cc: Athira Rajeev Cc: Ian Rogers Cc: Masami Hiramatsu Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240702110849.31904-3-vmolnaro@redhat.com Signed-off-by: Veronika Molnarova Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/tests-scripts.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/perf/tests/tests-scripts.c b/tools/perf/tests/tests-scripts.c index 8dc1e398288c..ed114b044293 100644 --- a/tools/perf/tests/tests-scripts.c +++ b/tools/perf/tests/tests-scripts.c @@ -251,6 +251,8 @@ static void append_scripts_in_dir(int dir_fd, if (!S_ISDIR(st.st_mode)) continue; } + if (strncmp(ent->d_name, "base_", 5) == 0) + continue; /* Skip scripts that have a separate driver. */ fd = openat(dir_fd, ent->d_name, O_PATH); append_scripts_in_dir(fd, result, result_sz); } -- cgit v1.2.3 From a3a02a52bcfcbcc4a637d4b68bf1bc391c9fad02 Mon Sep 17 00:00:00 2001 From: Veronika Molnarova Date: Tue, 2 Jul 2024 13:08:41 +0200 Subject: perf testsuite: Merge settings files for shell tests Merge perf testsuite setting files into common settings to reduce duplicates and prevent errors. Signed-off-by: Michael Petlan Signed-off-by: Veronika Molnarova Cc: Athira Rajeev Cc: Ian Rogers Cc: Masami Hiramatsu Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240702110849.31904-4-vmolnaro@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/base_probe/settings.sh | 48 ---------------------- .../tests/shell/base_probe/test_adding_kernel.sh | 3 -- tools/perf/tests/shell/common/init.sh | 23 +++++++++++ tools/perf/tests/shell/common/settings.sh | 23 +++++++++++ 4 files changed, 46 insertions(+), 51 deletions(-) delete mode 100644 tools/perf/tests/shell/base_probe/settings.sh (limited to 'tools') diff --git a/tools/perf/tests/shell/base_probe/settings.sh b/tools/perf/tests/shell/base_probe/settings.sh deleted file mode 100644 index 123621c7f95e..000000000000 --- a/tools/perf/tests/shell/base_probe/settings.sh +++ /dev/null @@ -1,48 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# settings.sh of perf_probe test -# Author: Michael Petlan -# Author: Masami Hiramatsu -# - -export TEST_NAME="perf_probe" - -export MY_ARCH=`arch` - -if [ -n "$PERFSUITE_RUN_DIR" ]; then - # when $PERFSUITE_RUN_DIR is set to something, all the logs and temp files will be placed there - # --> the $PERFSUITE_RUN_DIR/perf_something/examples and $PERFSUITE_RUN_DIR/perf_something/logs - # dirs will be used for that - export PERFSUITE_RUN_DIR=`readlink -f $PERFSUITE_RUN_DIR` - export CURRENT_TEST_DIR="$PERFSUITE_RUN_DIR/$TEST_NAME" - export MAKE_TARGET_DIR="$CURRENT_TEST_DIR/examples" - test -d "$MAKE_TARGET_DIR" || mkdir -p "$MAKE_TARGET_DIR" - export LOGS_DIR="$PERFSUITE_RUN_DIR/$TEST_NAME/logs" - test -d "$LOGS_DIR" || mkdir -p "$LOGS_DIR" -else - # when $PERFSUITE_RUN_DIR is not set, logs will be placed here - export CURRENT_TEST_DIR="." - export LOGS_DIR="."
-fi - -check_kprobes_available() -{ - test -e /sys/kernel/debug/tracing/kprobe_events -} - -check_uprobes_available() -{ - test -e /sys/kernel/debug/tracing/uprobe_events -} - -clear_all_probes() -{ - echo 0 > /sys/kernel/debug/tracing/events/enable - check_kprobes_available && echo > /sys/kernel/debug/tracing/kprobe_events - check_uprobes_available && echo > /sys/kernel/debug/tracing/uprobe_events -} - -check_sdt_support() -{ - $CMD_PERF list sdt | grep sdt > /dev/null 2> /dev/null -} diff --git a/tools/perf/tests/shell/base_probe/test_adding_kernel.sh b/tools/perf/tests/shell/base_probe/test_adding_kernel.sh index 187dc8d4b163..d541ffd44a93 100755 --- a/tools/perf/tests/shell/base_probe/test_adding_kernel.sh +++ b/tools/perf/tests/shell/base_probe/test_adding_kernel.sh @@ -15,10 +15,7 @@ # include working environment . ../common/init.sh -. ./settings.sh -# shellcheck disable=SC2034 # the variable is later used after the working environment is included -THIS_TEST_NAME=`basename $0 .sh` TEST_RESULT=0 # shellcheck source=lib/probe_vfs_getname.sh diff --git a/tools/perf/tests/shell/common/init.sh b/tools/perf/tests/shell/common/init.sh index aadeaf782e03..b99fba50ba94 100644 --- a/tools/perf/tests/shell/common/init.sh +++ b/tools/perf/tests/shell/common/init.sh @@ -115,3 +115,26 @@ detect_amd() # 1 = is not AMD or unknown grep "vendor_id" < /proc/cpuinfo | grep -q "AMD" } + +# base probe utility +check_kprobes_available() +{ + test -e /sys/kernel/debug/tracing/kprobe_events +} + +check_uprobes_available() +{ + test -e /sys/kernel/debug/tracing/uprobe_events +} + +clear_all_probes() +{ + echo 0 > /sys/kernel/debug/tracing/events/enable + check_kprobes_available && echo > /sys/kernel/debug/tracing/kprobe_events + check_uprobes_available && echo > /sys/kernel/debug/tracing/uprobe_events +} + +check_sdt_support() +{ + $CMD_PERF list sdt | grep sdt > /dev/null 2> /dev/null +} diff --git a/tools/perf/tests/shell/common/settings.sh b/tools/perf/tests/shell/common/settings.sh index 361641dbaaad..dcdb7d49ac00 100644 --- a/tools/perf/tests/shell/common/settings.sh +++ b/tools/perf/tests/shell/common/settings.sh @@ -65,6 +65,29 @@ else export MEND="" fi +### general info +DIR_PATH=`dirname "$(readlink -e "$0")"` + +export TEST_NAME=`basename $DIR_PATH | sed 's/base/perf/'` +export MY_ARCH=`arch` + +# storing logs and temporary files variables +if [ -n "$PERFSUITE_RUN_DIR" ]; then + # when $PERFSUITE_RUN_DIR is set to something, all the logs and temp files will be placed there + # --> the $PERFSUITE_RUN_DIR/perf_something/examples and $PERFSUITE_RUN_DIR/perf_something/logs + # dirs will be used for that + export PERFSUITE_RUN_DIR=`readlink -f $PERFSUITE_RUN_DIR` + export CURRENT_TEST_DIR="$PERFSUITE_RUN_DIR/$TEST_NAME" + export MAKE_TARGET_DIR="$CURRENT_TEST_DIR/examples" + export LOGS_DIR="$CURRENT_TEST_DIR/logs" + test -d "$CURRENT_TEST_DIR" || mkdir -p "$CURRENT_TEST_DIR" + test -d "$LOGS_DIR" || mkdir -p "$LOGS_DIR" +else + # when $PERFSUITE_RUN_DIR is not set, logs will be placed here + export CURRENT_TEST_DIR="." + export LOGS_DIR="." +fi + #### test parametrization if [ ! -d ./common ]; then -- cgit v1.2.3 From 32ddd082dcac111908c5f11e0617cd0f8ec10350 Mon Sep 17 00:00:00 2001 From: Veronika Molnarova Date: Tue, 2 Jul 2024 13:08:42 +0200 Subject: perf testsuite: Fix shellcheck warnings Shellcheck is becoming a standard when building perf to prevent any unnecessary mistakes. Fix shellcheck warnings in perf testsuite. 
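For reference, the two warning classes that dominate the diff below can be sketched as follows (illustrative shell only, not part of the patch; SC2166 and SC2124 are the usual ShellCheck codes for these patterns):

  # SC2166: '-a'/'-o' inside [ ] are not well defined; use separate tests
  if [ -t 1 -o "$TESTLOG_FORCE_COLOR" = "yes" ]; then :; fi      # before
  if [ -t 1 ] || [ "$TESTLOG_FORCE_COLOR" = "yes" ]; then :; fi  # after

  # SC2124: assigning "$@" to a plain string variable; "$*" is the intended join
  TASK_COMMENT="$@"  # before
  TASK_COMMENT="$*"  # after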
Signed-off-by: Veronika Molnarova Cc: Athira Rajeev Cc: Ian Rogers Cc: Masami Hiramatsu Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240702110849.31904-5-vmolnaro@redhat.com Signed-off-by: Michael Petlan Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/common/init.sh | 8 ++++---- tools/perf/tests/shell/common/settings.sh | 9 +++++---- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/common/init.sh b/tools/perf/tests/shell/common/init.sh index b99fba50ba94..075f17623c8e 100644 --- a/tools/perf/tests/shell/common/init.sh +++ b/tools/perf/tests/shell/common/init.sh @@ -26,8 +26,8 @@ print_results() PERF_RETVAL="$1"; shift CHECK_RETVAL="$1"; shift FAILURE_REASON="" - TASK_COMMENT="$@" - if [ $PERF_RETVAL -eq 0 -a $CHECK_RETVAL -eq 0 ]; then + TASK_COMMENT="$*" + if [ $PERF_RETVAL -eq 0 ] && [ $CHECK_RETVAL -eq 0 ]; then _echo "$MPASS-- [ PASS ] --$MEND $TEST_NAME :: $THIS_TEST_NAME :: $TASK_COMMENT" return 0 else @@ -56,7 +56,7 @@ print_overall_results() print_testcase_skipped() { - TASK_COMMENT="$@" + TASK_COMMENT="$*" _echo "$MSKIP-- [ SKIP ] --$MEND $TEST_NAME :: $THIS_TEST_NAME :: $TASK_COMMENT :: testcase skipped" return 0 } @@ -69,7 +69,7 @@ print_overall_skipped() print_warning() { - WARN_COMMENT="$@" + WARN_COMMENT="$*" _echo "$MWARN-- [ WARN ] --$MEND $TEST_NAME :: $THIS_TEST_NAME :: $WARN_COMMENT" return 0 } diff --git a/tools/perf/tests/shell/common/settings.sh b/tools/perf/tests/shell/common/settings.sh index dcdb7d49ac00..2e50dd0bff9b 100644 --- a/tools/perf/tests/shell/common/settings.sh +++ b/tools/perf/tests/shell/common/settings.sh @@ -45,7 +45,7 @@ export TEST_IGNORE_MISSING_PMU=${TEST_IGNORE_MISSING_PMU:-n} export LC_ALL=C #### colors -if [ -t 1 -o "$TESTLOG_FORCE_COLOR" = "yes" ]; then +if [ -t 1 ] || [ "$TESTLOG_FORCE_COLOR" = "yes" ]; then export MPASS="\e[32m" export MALLPASS="\e[1;32m" export MFAIL="\e[31m" @@ -68,15 +68,15 @@ fi ### general info DIR_PATH=`dirname "$(readlink -e "$0")"` -export TEST_NAME=`basename $DIR_PATH | sed 's/base/perf/'` -export MY_ARCH=`arch` +TEST_NAME=`basename $DIR_PATH | sed 's/base/perf/'`; export TEST_NAME +MY_ARCH=`arch`; export MY_ARCH # storing logs and temporary files variables if [ -n "$PERFSUITE_RUN_DIR" ]; then # when $PERFSUITE_RUN_DIR is set to something, all the logs and temp files will be placed there # --> the $PERFSUITE_RUN_DIR/perf_something/examples and $PERFSUITE_RUN_DIR/perf_something/logs # dirs will be used for that - export PERFSUITE_RUN_DIR=`readlink -f $PERFSUITE_RUN_DIR` + PERFSUITE_RUN_DIR=`readlink -f $PERFSUITE_RUN_DIR`; export PERFSUITE_RUN_DIR export CURRENT_TEST_DIR="$PERFSUITE_RUN_DIR/$TEST_NAME" export MAKE_TARGET_DIR="$CURRENT_TEST_DIR/examples" export LOGS_DIR="$CURRENT_TEST_DIR/logs" @@ -93,6 +93,7 @@ fi if [ ! -d ./common ]; then # set parameters based on runmode if [ -f ../common/parametrization.$PERFTOOL_TESTSUITE_RUNMODE.sh ]; then + # shellcheck source=/dev/null . ../common/parametrization.$PERFTOOL_TESTSUITE_RUNMODE.sh fi # if some parameters haven't been set until now, set them to default -- cgit v1.2.3 From def5480d63c1e84779d36bc2fe78382e3c5e1b12 Mon Sep 17 00:00:00 2001 From: Veronika Molnarova Date: Tue, 2 Jul 2024 13:08:43 +0200 Subject: perf testsuite probe: Add test for blacklisted kprobes handling Test perf probe interface. Blacklisted functions should be rejected when there is an attempt to set a kprobe to them. 
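The check that the new script automates can be reproduced by hand roughly like this (a minimal sketch assuming debugfs is mounted at the usual location; the symbol picked depends on the running kernel):

  # take any symbol from the kprobe blacklist
  BLACKFUNC=$(head -n 1 /sys/kernel/debug/kprobes/blacklist | cut -f2)
  # perf probe is expected to refuse it, e.g. "... is blacklisted function, skip it."
  perf probe "$BLACKFUNC" && echo "unexpected: blacklisted probe was accepted"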
Signed-off-by: Veronika Molnarova Cc: Athira Rajeev Cc: Ian Rogers Cc: Masami Hiramatsu Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240702110849.31904-6-vmolnaro@redhat.com Signed-off-by: Michael Petlan Signed-off-by: Arnaldo Carvalho de Melo --- .../shell/base_probe/test_adding_blacklisted.sh | 67 ++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100755 tools/perf/tests/shell/base_probe/test_adding_blacklisted.sh (limited to 'tools') diff --git a/tools/perf/tests/shell/base_probe/test_adding_blacklisted.sh b/tools/perf/tests/shell/base_probe/test_adding_blacklisted.sh new file mode 100755 index 000000000000..b5dc10b2a738 --- /dev/null +++ b/tools/perf/tests/shell/base_probe/test_adding_blacklisted.sh @@ -0,0 +1,67 @@ +#!/bin/bash + +# SPDX-License-Identifier: GPL-2.0 + +# +# test_adding_blacklisted of perf_probe test +# Author: Masami Hiramatsu +# Author: Michael Petlan +# +# Description: +# +# Blacklisted functions should not be added successfully as probes, +# they must be skipped. +# + +# include working environment +. ../common/init.sh + +TEST_RESULT=0 + +# skip if not supported +BLACKFUNC=`head -n 1 /sys/kernel/debug/kprobes/blacklist 2> /dev/null | cut -f2` +if [ -z "$BLACKFUNC" ]; then + print_overall_skipped + exit 0 +fi + +# remove all previously added probes +clear_all_probes + + +### adding blacklisted function + +# functions from blacklist should be skipped by perf probe +! $CMD_PERF probe $BLACKFUNC > $LOGS_DIR/adding_blacklisted.log 2> $LOGS_DIR/adding_blacklisted.err +PERF_EXIT_CODE=$? + +REGEX_SCOPE_FAIL="Failed to find scope of probe point" +REGEX_SKIP_MESSAGE=" is blacklisted function, skip it\." +REGEX_NOT_FOUND_MESSAGE="Probe point \'$BLACKFUNC\' not found." +REGEX_ERROR_MESSAGE="Error: Failed to add events." +REGEX_INVALID_ARGUMENT="Failed to write event: Invalid argument" +REGEX_SYMBOL_FAIL="Failed to find symbol at $RE_ADDRESS" +REGEX_OUT_SECTION="$BLACKFUNC is out of \.\w+, skip it" +../common/check_all_lines_matched.pl "$REGEX_SKIP_MESSAGE" "$REGEX_NOT_FOUND_MESSAGE" "$REGEX_ERROR_MESSAGE" "$REGEX_SCOPE_FAIL" "$REGEX_INVALID_ARGUMENT" "$REGEX_SYMBOL_FAIL" "$REGEX_OUT_SECTION" < $LOGS_DIR/adding_blacklisted.err +CHECK_EXIT_CODE=$? + +print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "adding blacklisted function $BLACKFUNC" +(( TEST_RESULT += $? )) + + +### listing not-added probe + +# blacklisted probes should NOT appear in perf-list output +$CMD_PERF list probe:\* > $LOGS_DIR/adding_blacklisted_list.log +PERF_EXIT_CODE=$? + +../common/check_all_lines_matched.pl "$RE_LINE_EMPTY" "List of pre-defined events" "Metric Groups:" < $LOGS_DIR/adding_blacklisted_list.log +CHECK_EXIT_CODE=$? + +print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "listing blacklisted probe (should NOT be listed)" +(( TEST_RESULT += $? )) + + +# print overall results +print_overall_results "$TEST_RESULT" +exit $? -- cgit v1.2.3 From adc1dd00dbc192a8811fab482d0e26b2914ba3e3 Mon Sep 17 00:00:00 2001 From: Veronika Molnarova Date: Tue, 2 Jul 2024 13:08:44 +0200 Subject: perf testsuite probe: Add test for basic perf-probe options Test basic behavior of perf-probe subcommand. It is run as a part of perftool-testsuite_probe test case. 
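In essence, the script added below drives the basic perf-probe entry points; a condensed sketch of the commands it exercises (logging and result bookkeeping omitted):

  perf probe --help                  # help text should be shown and look reasonable
  perf probe                         # no arguments: a usage message is expected on stderr
  perf probe --quiet --add vfs_read  # --quiet must suppress all output
  perf probe --quiet --del vfs_read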
Signed-off-by: Veronika Molnarova Cc: Athira Rajeev Cc: Ian Rogers Cc: Masami Hiramatsu Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240702110849.31904-7-vmolnaro@redhat.com Signed-off-by: Michael Petlan Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/base_probe/test_basic.sh | 80 +++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100755 tools/perf/tests/shell/base_probe/test_basic.sh (limited to 'tools') diff --git a/tools/perf/tests/shell/base_probe/test_basic.sh b/tools/perf/tests/shell/base_probe/test_basic.sh new file mode 100755 index 000000000000..09669ec479f2 --- /dev/null +++ b/tools/perf/tests/shell/base_probe/test_basic.sh @@ -0,0 +1,80 @@ +#!/bin/bash + +# SPDX-License-Identifier: GPL-2.0 + +# +# test_basic of perf_probe test +# Author: Michael Petlan +# Author: Masami Hiramatsu +# +# Description: +# +# This test tests basic functionality of perf probe command. +# + +# include working environment +. ../common/init.sh + +TEST_RESULT=0 + +if ! check_kprobes_available; then + print_overall_skipped + exit 0 +fi + + +### help message + +if [ "$PARAM_GENERAL_HELP_TEXT_CHECK" = "y" ]; then + # test that a help message is shown and looks reasonable + $CMD_PERF probe --help > $LOGS_DIR/basic_helpmsg.log 2> $LOGS_DIR/basic_helpmsg.err + PERF_EXIT_CODE=$? + + ../common/check_all_patterns_found.pl "PERF-PROBE" "NAME" "SYNOPSIS" "DESCRIPTION" "OPTIONS" "PROBE\s+SYNTAX" "PROBE\s+ARGUMENT" "LINE\s+SYNTAX" < $LOGS_DIR/basic_helpmsg.log + CHECK_EXIT_CODE=$? + ../common/check_all_patterns_found.pl "LAZY\s+MATCHING" "FILTER\s+PATTERN" "EXAMPLES" "SEE\s+ALSO" < $LOGS_DIR/basic_helpmsg.log + (( CHECK_EXIT_CODE += $? )) + ../common/check_all_patterns_found.pl "vmlinux" "module=" "source=" "verbose" "quiet" "add=" "del=" "list.*EVENT" "line=" "vars=" "externs" < $LOGS_DIR/basic_helpmsg.log + (( CHECK_EXIT_CODE += $? )) + ../common/check_all_patterns_found.pl "no-inlines" "funcs.*FILTER" "filter=FILTER" "force" "dry-run" "max-probes" "exec=" "demangle-kernel" < $LOGS_DIR/basic_helpmsg.log + (( CHECK_EXIT_CODE += $? )) + ../common/check_no_patterns_found.pl "No manual entry for" < $LOGS_DIR/basic_helpmsg.err + (( CHECK_EXIT_CODE += $? )) + + print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "help message" + (( TEST_RESULT += $? )) +else + print_testcase_skipped "help message" +fi + + +### usage message + +# without any args perf-probe should print usage +$CMD_PERF probe 2> $LOGS_DIR/basic_usage.log > /dev/null + +../common/check_all_patterns_found.pl "[Uu]sage" "perf probe" "verbose" "quiet" "add" "del" "force" "line" "vars" "externs" "range" < $LOGS_DIR/basic_usage.log +CHECK_EXIT_CODE=$? + +print_results 0 $CHECK_EXIT_CODE "usage message" +(( TEST_RESULT += $? )) + + +### quiet switch + +# '--quiet' should mute all output +$CMD_PERF probe --quiet --add vfs_read > $LOGS_DIR/basic_quiet01.log 2> $LOGS_DIR/basic_quiet01.err +PERF_EXIT_CODE=$? +$CMD_PERF probe --quiet --del vfs_read > $LOGS_DIR/basic_quiet03.log 2> $LOGS_DIR/basic_quiet02.err +(( PERF_EXIT_CODE += $? )) + +test "`cat $LOGS_DIR/basic_quiet*log $LOGS_DIR/basic_quiet*err | wc -l`" -eq 0 +CHECK_EXIT_CODE=$? + +print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "quiet switch" +(( TEST_RESULT += $? )) + + +# print overall results +print_overall_results "$TEST_RESULT" +exit $? 
-- cgit v1.2.3 From 83b6815dbb57aca3cd9b20bdcae769d2a7c38d5f Mon Sep 17 00:00:00 2001 From: Veronika Molnarova Date: Tue, 2 Jul 2024 13:08:45 +0200 Subject: perf testsuite probe: Add test for invalid options Test if various incompatible options are correctly handled and rejected. It is run as a part of perftool-testsuite_probe test case. Signed-off-by: Veronika Molnarova Cc: Athira Rajeev Cc: Ian Rogers Cc: Masami Hiramatsu Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240702110849.31904-8-vmolnaro@redhat.com Signed-off-by: Michael Petlan Signed-off-by: Arnaldo Carvalho de Melo --- .../tests/shell/base_probe/test_invalid_options.sh | 79 ++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100755 tools/perf/tests/shell/base_probe/test_invalid_options.sh (limited to 'tools') diff --git a/tools/perf/tests/shell/base_probe/test_invalid_options.sh b/tools/perf/tests/shell/base_probe/test_invalid_options.sh new file mode 100755 index 000000000000..1fedfd8b0d0d --- /dev/null +++ b/tools/perf/tests/shell/base_probe/test_invalid_options.sh @@ -0,0 +1,79 @@ +#!/bin/bash + +# SPDX-License-Identifier: GPL-2.0 + +# +# test_invalid_options of perf_probe test +# Author: Masami Hiramatsu +# Author: Michael Petlan +# +# Description: +# +# This test checks whether the invalid and incompatible options are reported +# + +# include working environment +. ../common/init.sh + +TEST_RESULT=0 + +if ! check_kprobes_available; then + print_overall_skipped + exit 0 +fi + + +### missing argument + +# some options require an argument +for opt in '-a' '-d' '-L' '-V'; do + ! $CMD_PERF probe $opt 2> $LOGS_DIR/invalid_options_missing_argument$opt.err + PERF_EXIT_CODE=$? + + ../common/check_all_patterns_found.pl "Error: switch .* requires a value" < $LOGS_DIR/invalid_options_missing_argument$opt.err + CHECK_EXIT_CODE=$? + + print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "missing argument for $opt" + (( TEST_RESULT += $? )) +done + + +### unnecessary argument + +# some options may omit the argument +for opt in '-F' '-l'; do + $CMD_PERF probe $opt > /dev/null 2> $LOGS_DIR/invalid_options_unnecessary_argument$opt.err + PERF_EXIT_CODE=$? + + test ! -s $LOGS_DIR/invalid_options_unnecessary_argument$opt.err + CHECK_EXIT_CODE=$? + + print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "unnecessary argument for $opt" + (( TEST_RESULT += $? )) +done + + +### mutually exclusive options + +# some options are mutually exclusive +test -e $LOGS_DIR/invalid_options_mutually_exclusive.log && rm -f $LOGS_DIR/invalid_options_mutually_exclusive.log +for opt in '-a xxx -d xxx' '-a xxx -L foo' '-a xxx -V foo' '-a xxx -l' '-a xxx -F' \ + '-d xxx -L foo' '-d xxx -V foo' '-d xxx -l' '-d xxx -F' \ + '-L foo -V bar' '-L foo -l' '-L foo -F' '-V foo -l' '-V foo -F' '-l -F'; do + ! $CMD_PERF probe $opt > /dev/null 2> $LOGS_DIR/aux.log + PERF_EXIT_CODE=$? + + ../common/check_all_patterns_found.pl "Error: switch .+ cannot be used with switch .+" < $LOGS_DIR/aux.log + CHECK_EXIT_CODE=$? + + print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "mutually exclusive options :: $opt" + (( TEST_RESULT += $? )) + + # gather the logs + cat $LOGS_DIR/aux.log | grep "Error" >> $LOGS_DIR/invalid_options_mutually_exclusive.log +done + + +# print overall results
print_overall_results "$TEST_RESULT" +exit $?
-- cgit v1.2.3 From c0964af8162e7ce9a7b8bb896ece92534d5e7e64 Mon Sep 17 00:00:00 2001 From: Veronika Molnarova Date: Tue, 2 Jul 2024 13:08:46 +0200 Subject: perf testsuite probe: Add test for line semantics The perf-probe command uses specific semantics to describe probes. Test that patterns known to be valid and invalid are handled appropriately. This test is run as a part of perftool-testsuite_probe test case. Signed-off-by: Veronika Molnarova Cc: Athira Rajeev Cc: Ian Rogers Cc: Masami Hiramatsu Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240702110849.31904-9-vmolnaro@redhat.com Signed-off-by: Michael Petlan Signed-off-by: Arnaldo Carvalho de Melo --- .../tests/shell/base_probe/test_line_semantics.sh | 55 ++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100755 tools/perf/tests/shell/base_probe/test_line_semantics.sh (limited to 'tools') diff --git a/tools/perf/tests/shell/base_probe/test_line_semantics.sh b/tools/perf/tests/shell/base_probe/test_line_semantics.sh new file mode 100755 index 000000000000..d8f4bde0f585 --- /dev/null +++ b/tools/perf/tests/shell/base_probe/test_line_semantics.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +# SPDX-License-Identifier: GPL-2.0 + +# +# test_line_semantics of perf_probe test +# Author: Masami Hiramatsu +# Author: Michael Petlan +# +# Description: +# +# This test checks whether the semantic errors of line option's +# arguments are properly reported. +# + +# include working environment +. ../common/init.sh + +TEST_RESULT=0 + +if ! check_kprobes_available; then + print_overall_skipped + exit 0 +fi + + +### acceptable --line descriptions + +# testing acceptance of valid patterns for the '--line' option +VALID_PATTERNS="func func:10 func:0-10 func:2+10 func@source.c func@source.c:1 source.c:1 source.c:1+1 source.c:1-10" +for desc in $VALID_PATTERNS; do + ! ( $CMD_PERF probe --line $desc 2>&1 | grep -q "Semantic error" ) + CHECK_EXIT_CODE=$? + + print_results 0 $CHECK_EXIT_CODE "acceptable descriptions :: $desc" + (( TEST_RESULT += $? )) +done + + +### unacceptable --line descriptions + +# testing handling of invalid patterns for the '--line' option +INVALID_PATTERNS="func:foo func:1-foo func:1+foo func;lazy\*pattern" +for desc in $INVALID_PATTERNS; do + $CMD_PERF probe --line $desc 2>&1 | grep -q "Semantic error" + CHECK_EXIT_CODE=$? + + print_results 0 $CHECK_EXIT_CODE "unacceptable descriptions :: $desc" + (( TEST_RESULT += $? )) +done + + +# print overall results +print_overall_results "$TEST_RESULT" +exit $? -- cgit v1.2.3 From 13d58a6672d11200bd3c35604caa9e166468d61b Mon Sep 17 00:00:00 2001 From: Veronika Molnarova Date: Tue, 2 Jul 2024 13:08:47 +0200 Subject: perf testsuite: Add common output checking helper As a form of validation, it is a common practice to check whether the outputs of commands contain expected patterns or match a certain regular expression. This output checking helper is designed to allow checking stderr output of perf commands for unexpected messages, while ignoring messages that are known to be harmless, e.g.: "Lowering default frequency rate to \d+\." "\d+ out of order events recorded." etc.
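Typical use of the helper looks like this (a sketch with illustrative file names: the script takes the whitelist file as its only argument, reads the text to check on stdin, and exits non-zero when a line matches none of the whitelisted patterns):

  perf record -o perf.data -- true 2> stderr.log
  ./check_errors_whitelisted.pl stderr-whitelist.txt < stderr.log \
      || echo "unexpected messages on stderr"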
Signed-off-by: Veronika Molnarova Cc: Athira Rajeev Cc: Ian Rogers Cc: Masami Hiramatsu Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240702110849.31904-10-vmolnaro@redhat.com Signed-off-by: Michael Petlan Signed-off-by: Arnaldo Carvalho de Melo --- .../tests/shell/common/check_errors_whitelisted.pl | 51 ++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100755 tools/perf/tests/shell/common/check_errors_whitelisted.pl (limited to 'tools') diff --git a/tools/perf/tests/shell/common/check_errors_whitelisted.pl b/tools/perf/tests/shell/common/check_errors_whitelisted.pl new file mode 100755 index 000000000000..c57d355dd76e --- /dev/null +++ b/tools/perf/tests/shell/common/check_errors_whitelisted.pl @@ -0,0 +1,51 @@ +#!/usr/bin/perl +# SPDX-License-Identifier: GPL-2.0 + +$whitelist_file = shift; + +if (defined $whitelist_file) +{ + open (INFILE, $whitelist_file) or die "Checker error: Unable to open the whitelist file: $whitelist_file\n"; + @regexps = <INFILE>; + close INFILE or die "Checker error: Unable to close the whitelist file: $whitelist_file\n"; +} +else +{ + @regexps = (); +} + +$max_printed_lines = 20; +$max_printed_lines = $ENV{TESTLOG_ERR_MSG_MAX_LINES} if (defined $ENV{TESTLOG_ERR_MSG_MAX_LINES}); + +$quiet = 1; +$quiet = 0 if (defined $ENV{TESTLOG_VERBOSITY} && $ENV{TESTLOG_VERBOSITY} ge 2); + +$passed = 1; +$lines_printed = 0; + +while (<STDIN>) +{ + s/\n//; + + $line_matched = 0; + for $r (@regexps) + { + chomp $r; + if (/$r/) + { + $line_matched = 1; + last; + } + } + + unless ($line_matched) + { + if ($lines_printed++ < $max_printed_lines) + { + print "Line did not match any pattern: \"$_\"\n" unless $quiet; + } + $passed = 0; + } +} + +exit ($passed == 0); -- cgit v1.2.3 From 61f87151839b9767a06be93b99f1fc324147a0d3 Mon Sep 17 00:00:00 2001 From: Veronika Molnarova Date: Tue, 2 Jul 2024 13:08:48 +0200 Subject: perf testsuite report: Add test for perf-report basic functionality Test basic execution and some options of the perf-report subcommand, like show-nr-samples, header, showcpuutilization, pid and symbol filtering. Signed-off-by: Veronika Molnarova Cc: Athira Rajeev Cc: Ian Rogers Cc: Masami Hiramatsu Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240702110849.31904-11-vmolnaro@redhat.com Signed-off-by: Michael Petlan Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/base_report/setup.sh | 32 ++++ .../tests/shell/base_report/stderr-whitelist.txt | 5 + tools/perf/tests/shell/base_report/test_basic.sh | 190 +++++++++++++++++++++ tools/perf/tests/shell/common/settings.sh | 2 + 4 files changed, 229 insertions(+) create mode 100755 tools/perf/tests/shell/base_report/setup.sh create mode 100644 tools/perf/tests/shell/base_report/stderr-whitelist.txt create mode 100755 tools/perf/tests/shell/base_report/test_basic.sh (limited to 'tools') diff --git a/tools/perf/tests/shell/base_report/setup.sh b/tools/perf/tests/shell/base_report/setup.sh new file mode 100755 index 000000000000..4caa496660c6 --- /dev/null +++ b/tools/perf/tests/shell/base_report/setup.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# SPDX-License-Identifier: GPL-2.0 + +# +# setup.sh of perf report test +# Author: Michael Petlan +# +# Description: +# +# We need some sample data for perf-report testing +# +# + +# include working environment
. ../common/init.sh + +test -d "$HEADER_TAR_DIR" || mkdir -p "$HEADER_TAR_DIR" + +SW_EVENT="cpu-clock" + +$CMD_PERF record -asdg -e $SW_EVENT -o $CURRENT_TEST_DIR/perf.data -- $CMD_LONGER_SLEEP 2> $LOGS_DIR/setup.log +PERF_EXIT_CODE=$?
+ +../common/check_all_patterns_found.pl "$RE_LINE_RECORD1" "$RE_LINE_RECORD2" < $LOGS_DIR/setup.log +CHECK_EXIT_CODE=$? + +print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "prepare the perf.data file" +TEST_RESULT=$? + +print_overall_results $TEST_RESULT +exit $? diff --git a/tools/perf/tests/shell/base_report/stderr-whitelist.txt b/tools/perf/tests/shell/base_report/stderr-whitelist.txt new file mode 100644 index 000000000000..e3341401b47c --- /dev/null +++ b/tools/perf/tests/shell/base_report/stderr-whitelist.txt @@ -0,0 +1,5 @@ +no symbols found in .*, maybe install a debug package +was updated .*is prelink enabled.+ Restart the long running apps that use it +Warning: +\d+ out of order events recorded. +detected invalid bpf_prog_info diff --git a/tools/perf/tests/shell/base_report/test_basic.sh b/tools/perf/tests/shell/base_report/test_basic.sh new file mode 100755 index 000000000000..47677cbd4df3 --- /dev/null +++ b/tools/perf/tests/shell/base_report/test_basic.sh @@ -0,0 +1,190 @@ +#!/bin/bash + +# SPDX-License-Identifier: GPL-2.0 + +# +# test_basic of perf_report test +# Author: Michael Petlan +# +# Description: +# +# This test tests basic functionality of perf report command. +# +# + +# include working environment +. ../common/init.sh + +TEST_RESULT=0 + + +### help message + +if [ "$PARAM_GENERAL_HELP_TEXT_CHECK" = "y" ]; then + # test that a help message is shown and looks reasonable + $CMD_PERF report --help > $LOGS_DIR/basic_helpmsg.log 2> $LOGS_DIR/basic_helpmsg.err + PERF_EXIT_CODE=$? + + ../common/check_all_patterns_found.pl "PERF-REPORT" "NAME" "SYNOPSIS" "DESCRIPTION" "OPTIONS" "OVERHEAD\s+CALCULATION" "SEE ALSO" < $LOGS_DIR/basic_helpmsg.log + CHECK_EXIT_CODE=$? + ../common/check_all_patterns_found.pl "input" "verbose" "show-nr-samples" "show-cpu-utilization" "threads" "comms" "pid" "tid" "dsos" "symbols" "symbol-filter" < $LOGS_DIR/basic_helpmsg.log + (( CHECK_EXIT_CODE += $? )) + ../common/check_all_patterns_found.pl "hide-unresolved" "sort" "fields" "parent" "exclude-other" "column-widths" "field-separator" "dump-raw-trace" "children" < $LOGS_DIR/basic_helpmsg.log + (( CHECK_EXIT_CODE += $? )) + ../common/check_all_patterns_found.pl "call-graph" "max-stack" "inverted" "ignore-callees" "pretty" "stdio" "tui" "gtk" "vmlinux" "kallsyms" "modules" < $LOGS_DIR/basic_helpmsg.log + (( CHECK_EXIT_CODE += $? )) + ../common/check_all_patterns_found.pl "force" "symfs" "cpu" "disassembler-style" "source" "asm-raw" "show-total-period" "show-info" "branch-stack" "group" < $LOGS_DIR/basic_helpmsg.log + (( CHECK_EXIT_CODE += $? )) + ../common/check_all_patterns_found.pl "branch-history" "objdump" "demangle" "percent-limit" "percentage" "header" "itrace" "full-source-path" "show-ref-call-graph" < $LOGS_DIR/basic_helpmsg.log + (( CHECK_EXIT_CODE += $? )) + ../common/check_no_patterns_found.pl "No manual entry for" < $LOGS_DIR/basic_helpmsg.err + (( CHECK_EXIT_CODE += $? )) + + print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "help message" + (( TEST_RESULT += $? )) +else + print_testcase_skipped "help message" +fi + + +### basic execution + +# test that perf report is even working +$CMD_PERF report -i $CURRENT_TEST_DIR/perf.data --stdio > $LOGS_DIR/basic_basic.log 2> $LOGS_DIR/basic_basic.err +PERF_EXIT_CODE=$? 
+ +REGEX_LOST_SAMPLES_INFO="#\s*Total Lost Samples:\s+$RE_NUMBER" +REGEX_SAMPLES_INFO="#\s*Samples:\s+(?:$RE_NUMBER)\w?\s+of\s+event\s+'$RE_EVENT_ANY'" +REGEX_LINES_HEADER="#\s*Children\s+Self\s+Command\s+Shared Object\s+Symbol" +REGEX_LINES="\s*$RE_NUMBER%\s+$RE_NUMBER%\s+\S+\s+\[kernel\.(?:vmlinux)|(?:kallsyms)\]\s+\[[k\.]\]\s+\w+" +../common/check_all_patterns_found.pl "$REGEX_LOST_SAMPLES_INFO" "$REGEX_SAMPLES_INFO" "$REGEX_LINES_HEADER" "$REGEX_LINES" < $LOGS_DIR/basic_basic.log +CHECK_EXIT_CODE=$? +../common/check_errors_whitelisted.pl "stderr-whitelist.txt" < $LOGS_DIR/basic_basic.err +(( CHECK_EXIT_CODE += $? )) + +print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "basic execution" +(( TEST_RESULT += $? )) + + +### number of samples + +# '--show-nr-samples' should show number of samples for each symbol +$CMD_PERF report --stdio -i $CURRENT_TEST_DIR/perf.data --show-nr-samples > $LOGS_DIR/basic_nrsamples.log 2> $LOGS_DIR/basic_nrsamples.err +PERF_EXIT_CODE=$? + +REGEX_LINES_HEADER="#\s*Children\s+Self\s+Samples\s+Command\s+Shared Object\s+Symbol" +REGEX_LINES="\s*$RE_NUMBER%\s+$RE_NUMBER%\s+$RE_NUMBER\s+\S+\s+\[kernel\.(?:vmlinux)|(?:kallsyms)\]\s+\[[k\.]\]\s+\w+" +../common/check_all_patterns_found.pl "$REGEX_LINES_HEADER" "$REGEX_LINES" < $LOGS_DIR/basic_nrsamples.log +CHECK_EXIT_CODE=$? +../common/check_errors_whitelisted.pl "stderr-whitelist.txt" < $LOGS_DIR/basic_nrsamples.err +(( CHECK_EXIT_CODE += $? )) + +print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "number of samples" +(( TEST_RESULT += $? )) + + +### header + +# '--header' and '--header-only' should show perf report header +$CMD_PERF report -i $CURRENT_TEST_DIR/perf.data --stdio --header-only > $LOGS_DIR/basic_header.log +PERF_EXIT_CODE=$? + +REGEX_LINE_TIMESTAMP="#\s+captured on\s*:\s*$RE_DATE_TIME" +REGEX_LINE_HOSTNAME="#\s+hostname\s*:\s*$MY_HOSTNAME" +REGEX_LINE_KERNEL="#\s+os release\s*:\s*${MY_KERNEL_VERSION//+/\\+}" +REGEX_LINE_PERF="#\s+perf version\s*:\s*" +REGEX_LINE_ARCH="#\s+arch\s*:\s*$MY_ARCH" +REGEX_LINE_CPUS_ONLINE="#\s+nrcpus online\s*:\s*$MY_CPUS_ONLINE" +REGEX_LINE_CPUS_AVAIL="#\s+nrcpus avail\s*:\s*$MY_CPUS_AVAILABLE" +# disable precise check for "nrcpus avail" in BASIC runmode +test $PERFTOOL_TESTSUITE_RUNMODE -lt $RUNMODE_STANDARD && REGEX_LINE_CPUS_AVAIL="#\s+nrcpus avail\s*:\s*$RE_NUMBER" +../common/check_all_patterns_found.pl "$REGEX_LINE_TIMESTAMP" "$REGEX_LINE_HOSTNAME" "$REGEX_LINE_KERNEL" "$REGEX_LINE_PERF" "$REGEX_LINE_ARCH" "$REGEX_LINE_CPUS_ONLINE" "$REGEX_LINE_CPUS_AVAIL" < $LOGS_DIR/basic_header.log +CHECK_EXIT_CODE=$? + +print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "header" +(( TEST_RESULT += $? )) + +# '--header' and '--header-only' should use creation time +OLD_TIMESTAMP=`$CMD_PERF report --stdio --header-only -i $CURRENT_TEST_DIR/perf.data | grep "captured on"` +PERF_EXIT_CODE=$? + +( tar -C $CURRENT_TEST_DIR -c perf.data | xz > $CURRENT_TEST_DIR/perf.data.tar.xz ; xzcat $CURRENT_TEST_DIR/perf.data.tar.xz | tar x -C $HEADER_TAR_DIR ) +(( PERF_EXIT_CODE += $? )) + +NEW_TIMESTAMP=`$CMD_PERF report --stdio --header-only -i $HEADER_TAR_DIR/perf.data | grep "captured on"` +(( PERF_EXIT_CODE += $? )) + +test "$OLD_TIMESTAMP" = "$NEW_TIMESTAMP" +CHECK_EXIT_CODE=$? + +print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "header timestamp" +(( TEST_RESULT += $? 
)) + + +### show CPU utilization + +# '--showcpuutilization' should show percentage for both system and userspace mode +$CMD_PERF report -i $CURRENT_TEST_DIR/perf.data --stdio --showcpuutilization > $LOGS_DIR/basic_cpuut.log 2> $LOGS_DIR/basic_cpuut.err +PERF_EXIT_CODE=$? + +REGEX_LINES_HEADER="#\s*Children\s+Self\s+sys\s+usr\s+Command\s+Shared Object\s+Symbol" +REGEX_LINES="\s*$RE_NUMBER%\s+$RE_NUMBER%\s+$RE_NUMBER%\s+$RE_NUMBER%\s+\S+\s+\[kernel\.(?:vmlinux)|(?:kallsyms)\]\s+\[[k\.]\]\s+\w+" +../common/check_all_patterns_found.pl "$REGEX_LINES_HEADER" "$REGEX_LINES" < $LOGS_DIR/basic_cpuut.log +CHECK_EXIT_CODE=$? +../common/check_errors_whitelisted.pl "stderr-whitelist.txt" < $LOGS_DIR/basic_cpuut.err +(( CHECK_EXIT_CODE += $? )) + +print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "show CPU utilization" +(( TEST_RESULT += $? )) + + +### pid + +# '--pid=' should limit the output for a process with the given pid only +$CMD_PERF report --stdio -i $CURRENT_TEST_DIR/perf.data --pid=1 > $LOGS_DIR/basic_pid.log 2> $LOGS_DIR/basic_pid.err +PERF_EXIT_CODE=$? + +grep -P -v '^#' $LOGS_DIR/basic_pid.log | grep -P '\s+[\d\.]+%' | ../common/check_all_lines_matched.pl "systemd|init" +CHECK_EXIT_CODE=$? +../common/check_errors_whitelisted.pl "stderr-whitelist.txt" < $LOGS_DIR/basic_pid.err +(( CHECK_EXIT_CODE += $? )) + +print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "pid" +(( TEST_RESULT += $? )) + + +### non-existing symbol + +# '--symbols' should show only the given symbols +$CMD_PERF report --stdio -i $CURRENT_TEST_DIR/perf.data --symbols=dummynonexistingsymbol > $LOGS_DIR/basic_symbols.log 2> $LOGS_DIR/basic_symbols.err +PERF_EXIT_CODE=$? + +../common/check_all_lines_matched.pl "$RE_LINE_EMPTY" "$RE_LINE_COMMENT" < $LOGS_DIR/basic_symbols.log +CHECK_EXIT_CODE=$? +../common/check_errors_whitelisted.pl "stderr-whitelist.txt" < $LOGS_DIR/basic_symbols.err +(( CHECK_EXIT_CODE += $? )) + +print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "non-existing symbol" +(( TEST_RESULT += $? )) + + +### symbol filter + +# '--symbol-filter' should filter symbols based on substrings +$CMD_PERF report --stdio -i $CURRENT_TEST_DIR/perf.data --symbol-filter=map > $LOGS_DIR/basic_symbolfilter.log 2> $LOGS_DIR/basic_symbolfilter.err +PERF_EXIT_CODE=$? + +grep -P -v '^#' $LOGS_DIR/basic_symbolfilter.log | grep -P '\s+[\d\.]+%' | ../common/check_all_lines_matched.pl "\[[k\.]\]\s+.*map" +CHECK_EXIT_CODE=$? +../common/check_errors_whitelisted.pl "stderr-whitelist.txt" < $LOGS_DIR/basic_symbolfilter.err +(( CHECK_EXIT_CODE += $? )) + +print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "symbol filter" +(( TEST_RESULT += $? )) + + +# TODO: $CMD_PERF report -n --showcpuutilization -TUxDg 2> 01.log + +# print overall results +print_overall_results "$TEST_RESULT" +exit $? diff --git a/tools/perf/tests/shell/common/settings.sh b/tools/perf/tests/shell/common/settings.sh index 2e50dd0bff9b..cba1b338f96f 100644 --- a/tools/perf/tests/shell/common/settings.sh +++ b/tools/perf/tests/shell/common/settings.sh @@ -80,12 +80,14 @@ if [ -n "$PERFSUITE_RUN_DIR" ]; then export CURRENT_TEST_DIR="$PERFSUITE_RUN_DIR/$TEST_NAME" export MAKE_TARGET_DIR="$CURRENT_TEST_DIR/examples" export LOGS_DIR="$CURRENT_TEST_DIR/logs" + export HEADER_TAR_DIR="$CURRENT_TEST_DIR/header_tar" test -d "$CURRENT_TEST_DIR" || mkdir -p "$CURRENT_TEST_DIR" test -d "$LOGS_DIR" || mkdir -p "$LOGS_DIR" else # when $PERFSUITE_RUN_DIR is not set, logs will be placed here export CURRENT_TEST_DIR="." export LOGS_DIR="." 
+ export HEADER_TAR_DIR="./header_tar" fi -- cgit v1.2.3 From e37cb2a6beed817df510832931bfdaa6bb8a0447 Mon Sep 17 00:00:00 2001 From: Veronika Molnarova Date: Tue, 2 Jul 2024 13:08:49 +0200 Subject: perf testsuite report: Add test case for perf report Add a new 'perf report' test case that acts as an entry element in 'perf test list'. Runs multiple subtests from directory "base_report", which can be expanded without further editing. Signed-off-by: Veronika Molnarova Cc: Athira Rajeev Cc: Ian Rogers Cc: Masami Hiramatsu Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240702110849.31904-12-vmolnaro@redhat.com Signed-off-by: Michael Petlan Signed-off-by: Arnaldo Carvalho de Melo --- .../perf/tests/shell/perftool-testsuite_report.sh | 23 ++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100755 tools/perf/tests/shell/perftool-testsuite_report.sh (limited to 'tools') diff --git a/tools/perf/tests/shell/perftool-testsuite_report.sh b/tools/perf/tests/shell/perftool-testsuite_report.sh new file mode 100755 index 000000000000..973012ce92a7 --- /dev/null +++ b/tools/perf/tests/shell/perftool-testsuite_report.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# perftool-testsuite_report +# SPDX-License-Identifier: GPL-2.0 + +test -d "$(dirname "$0")/base_report" || exit 2 +cd "$(dirname "$0")/base_report" || exit 2 +status=0 + +PERFSUITE_RUN_DIR=$(mktemp -d /tmp/"$(basename "$0" .sh)".XXX) +export PERFSUITE_RUN_DIR + +for testcase in setup.sh test_*; do # skip setup.sh if not present or not executable + test -x "$testcase" || continue + ./"$testcase" + (( status += $? )) +done + +if ! [ "$PERFTEST_KEEP_LOGS" = "y" ]; then + rm -rf "$PERFSUITE_RUN_DIR" +fi + +test $status -ne 0 && exit 1 +exit 0 -- cgit v1.2.3 From 097fe67df1aa9cc79092b15c68c9135569a679d6 Mon Sep 17 00:00:00 2001 From: Michael Petlan Date: Tue, 2 Jul 2024 13:08:50 +0200 Subject: perf testsuite: Install perf-report tests in the 'make install-tests -C tools/perf' target Signed-off-by: Michael Petlan Cc: Athira Rajeev Cc: Ian Rogers Cc: Masami Hiramatsu Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240702110849.31904-13-vmolnaro@redhat.com Signed-off-by: Veronika Molnarova Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.perf | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 9731f5e84131..9dd2e8d3f3c9 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -1142,6 +1142,8 @@ install-tests: all install-gtk $(INSTALL) tests/shell/common/*.pl '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/common'; \ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/base_probe'; \ $(INSTALL) tests/shell/base_probe/*.sh '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/base_probe'; \ + $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/base_report'; \ + $(INSTALL) tests/shell/base_report/*.sh '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/base_report'; \ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/coresight' ; \ $(INSTALL) tests/shell/coresight/*.sh '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/coresight' $(Q)$(MAKE) -C tests/shell/coresight install-tests -- cgit v1.2.3 From 8b48f8ba16b06024920c8ff349fed769118777cc Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Aug 2024 14:27:57 -0700 Subject: perf report: Name events in stats for pipe mode In stats mode, PERF_RECORD_EVENT_UPDATE isn't being handled, meaning the evsels aren't named when handling pipe mode output.
Before: $ perf record -e inst_retired.any -a -o - sleep 0.1|perf report --stats -i - ... Aggregated stats: TOTAL events: 23358 COMM events: 2608 (11.2%) EXIT events: 1 ( 0.0%) FORK events: 2607 (11.2%) SAMPLE events: 174 ( 0.7%) MMAP2 events: 17936 (76.8%) ATTR events: 2 ( 0.0%) FINISHED_ROUND events: 2 ( 0.0%) ID_INDEX events: 1 ( 0.0%) THREAD_MAP events: 1 ( 0.0%) CPU_MAP events: 1 ( 0.0%) EVENT_UPDATE events: 3 ( 0.0%) TIME_CONV events: 1 ( 0.0%) FEATURE events: 20 ( 0.1%) FINISHED_INIT events: 1 ( 0.0%) raw 0xc0 stats: SAMPLE events: 174 After: $ perf record -e inst_retired.any -a -o - sleep 0.1|perf report --stats -i - ... Aggregated stats: TOTAL events: 23742 COMM events: 2620 (11.0%) EXIT events: 2 ( 0.0%) FORK events: 2619 (11.0%) SAMPLE events: 165 ( 0.7%) MMAP2 events: 18304 (77.1%) ATTR events: 2 ( 0.0%) FINISHED_ROUND events: 2 ( 0.0%) ID_INDEX events: 1 ( 0.0%) THREAD_MAP events: 1 ( 0.0%) CPU_MAP events: 1 ( 0.0%) EVENT_UPDATE events: 3 ( 0.0%) TIME_CONV events: 1 ( 0.0%) FEATURE events: 20 ( 0.1%) FINISHED_INIT events: 1 ( 0.0%) inst_retired.any stats: SAMPLE events: 165 This makes the pipe output match the regular output. Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240827212757.1469340-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index c304d9b06190..5dc17ffee27a 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -816,6 +816,7 @@ static void stats_setup(struct report *rep) rep->tool.attr = process_attr; rep->tool.sample = count_sample_event; rep->tool.lost_samples = count_lost_samples_event; + rep->tool.event_update = perf_event__process_event_update; rep->tool.no_warn = true; } -- cgit v1.2.3 From 4451dae46992131ef5c42147444c183c364b1149 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Thu, 15 Aug 2024 09:36:17 +0800 Subject: perf evlist: Introduce method to find if there is a bpf-output event We'll use it in the next patch to decide how to set up the ring buffer.
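As a hedged sketch of what the per-evsel check amounts to (simplified from perf's evsel__is_bpf_output(); the helper name is_bpf_output here is illustrative), a bpf-output event is the PERF_COUNT_SW_BPF_OUTPUT software event that BPF programs write into with bpf_perf_event_output():

	/* Sketch: identify the software bpf-output event on an evsel. */
	static bool is_bpf_output(struct evsel *evsel)
	{
		return evsel->core.attr.type == PERF_TYPE_SOFTWARE &&
		       evsel->core.attr.config == PERF_COUNT_SW_BPF_OUTPUT;
	}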
Signed-off-by: Howard Chu Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240815013626.935097-2-howardchu95@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 12 ++++++++++++ tools/perf/util/evlist.h | 1 + 2 files changed, 13 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index c0379fa1ccfe..ef58a7764318 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -2627,3 +2627,15 @@ void evlist__uniquify_name(struct evlist *evlist) } } } + +bool evlist__has_bpf_output(struct evlist *evlist) +{ + struct evsel *evsel; + + evlist__for_each_entry(evlist, evsel) { + if (evsel__is_bpf_output(evsel)) + return true; + } + + return false; +} diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index b46f1a320783..bcc1c6984bb5 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -447,5 +447,6 @@ int evlist__scnprintf_evsels(struct evlist *evlist, size_t size, char *bf); void evlist__check_mem_load_aux(struct evlist *evlist); void evlist__warn_user_requested_cpus(struct evlist *evlist, const char *cpu_list); void evlist__uniquify_name(struct evlist *evlist); +bool evlist__has_bpf_output(struct evlist *evlist); #endif /* __PERF_EVLIST_H */ -- cgit v1.2.3 From 8df1d8c6cbd6825b3784068e7c2b37fa8a8a43f0 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Thu, 15 Aug 2024 09:36:17 +0800 Subject: perf trace: Fix perf trace -p <pid> 'perf trace -p <pid>' works on a syscall that is unaugmented, but doesn't work on a syscall that's augmented (when it calls perf_event_output() in BPF). Let's take open() as an example. open() is augmented in perf trace. Before: $ perf trace -e open -p 3792392 ? ( ): ... [continued]: open()) = -1 ENOENT (No such file or directory) ? ( ): ... [continued]: open()) = -1 ENOENT (No such file or directory) We can see there's no output. After: $ perf trace -e open -p 3792392 0.000 ( 0.123 ms): a.out/3792392 open(filename: "DINGZHEN", flags: WRONLY) = -1 ENOENT (No such file or directory) 1000.398 ( 0.116 ms): a.out/3792392 open(filename: "DINGZHEN", flags: WRONLY) = -1 ENOENT (No such file or directory) Reason: bpf_perf_event_output() will fail when you specify a pid in 'perf trace' (EOPNOTSUPP). When using 'perf trace -p 114', before perf_event_open(), we'll have PID = 114, and CPU = -1. This is bad for the bpf-output event, because the ring buffer won't accept output from BPF's perf_event_output(), making it fail. I'm still trying to find out why. If we open bpf-output for every cpu, instead of setting it to -1, like this: PID = <pid>, CPU = 0 PID = <pid>, CPU = 1 PID = <pid>, CPU = 2 PID = <pid>, CPU = 3 Everything works.
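Why the CPU dimension matters, as a hedged sketch of the BPF side (the map name __augmented_syscalls__ follows perf's syscall augmenter; the payload variable is illustrative): bpf_perf_event_output() with BPF_F_CURRENT_CPU writes to the perf-event-array slot of the CPU the program happens to run on, and the kernel rejects the write with -EOPNOTSUPP when the event in that slot is not running on the current CPU, as with a per-task event opened with cpu == -1:

	/* Illustrative BPF call: needs an event open on the current CPU. */
	bpf_perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU,
			      &payload, sizeof(payload));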
You can test it with this script (open.c): #include <unistd.h> #include <sys/syscall.h> int main() { int i1 = 1, i2 = 2, i3 = 3, i4 = 4; char s1[] = "DINGZHEN", s2[] = "XUEBAO"; while (1) { syscall(SYS_open, s1, i1, i2); sleep(1); } return 0; } save, compile: make open perf trace: perf trace -e open Signed-off-by: Howard Chu Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240815013626.935097-2-howardchu95@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index ef58a7764318..f14b7e6ff1dc 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1067,7 +1067,7 @@ int evlist__create_maps(struct evlist *evlist, struct target *target) if (!threads) return -1; - if (target__uses_dummy_map(target)) + if (target__uses_dummy_map(target) && !evlist__has_bpf_output(evlist)) cpus = perf_cpu_map__new_any_cpu(); else cpus = perf_cpu_map__new(target->cpu_list); -- cgit v1.2.3 From 7bedcbaefdf5d4f71302cb5428eb72ada4f380e9 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 22 Aug 2024 15:10:27 -0300 Subject: perf trace: Pass the richer 'struct syscall_arg' pointer to trace__btf_scnprintf() We'll need it later in the current patch series, and we can get the syscall_arg_fmt from syscall_arg->fmt. Based-on-a-patch-by: Howard Chu Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/Zsd8vqCrTh5h69rp@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index d6ca541fdc78..53690bbb143e 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -989,9 +989,11 @@ static size_t btf_enum_scnprintf(const struct btf_type *type, struct btf *btf, c return 0; } -static size_t trace__btf_scnprintf(struct trace *trace, struct syscall_arg_fmt *arg_fmt, char *bf, + static size_t trace__btf_scnprintf(struct trace *trace, struct syscall_arg *arg, char *bf, size_t size, int val, char *type) { + struct syscall_arg_fmt *arg_fmt = arg->fmt; + if (trace->btf == NULL) return 0; @@ -1011,7 +1013,7 @@ static size_t trace__btf_scnprintf(struct trace * } #else // HAVE_LIBBPF_SUPPORT -static size_t trace__btf_scnprintf(struct trace *trace __maybe_unused, struct syscall_arg_fmt *arg_fmt __maybe_unused, +static size_t trace__btf_scnprintf(struct trace *trace __maybe_unused, struct syscall_arg *arg __maybe_unused, char *bf __maybe_unused, size_t size __maybe_unused, int val __maybe_unused, char *type __maybe_unused) { @@ -2260,7 +2262,7 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size, if (trace->show_arg_names) printed += scnprintf(bf + printed, size - printed, "%s: ", field->name); - btf_printed = trace__btf_scnprintf(trace, &sc->arg_fmt[arg.idx], bf + printed, + btf_printed = trace__btf_scnprintf(trace, &arg, bf + printed, size - printed, val, field->type); if (btf_printed) { printed += btf_printed; @@ -2963,7 +2965,7 @@ static size_t trace__fprintf_tp_fields(struct trace *trace, struct evsel *evsel, if (trace->show_arg_names) printed += scnprintf(bf + printed, size - printed, "%s: ", field->name); - btf_printed = trace__btf_scnprintf(trace, arg, bf + 
printed, size - printed, val, field->type); + btf_printed = trace__btf_scnprintf(trace, &syscall_arg, bf + printed, size - printed, val, field->type); if (btf_printed) { printed += btf_printed; continue; -- cgit v1.2.3 From c5d50457a8fc2695ca5a921f3a4a402343cf5313 Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Tue, 27 Aug 2024 11:02:04 +0530 Subject: perf vendor events power10: Update JSON/events Update JSON/events for power10 platform with additional events. Reviewed-by: Ian Rogers Signed-off-by: Kajol Jain Cc: Athira Rajeev Cc: Disha Goel Cc: Hari Bathini Cc: Madhavan Srinivasan Cc: Namhyung Kim Cc: linuxppc-dev@lists.ozlabs.org Link: https://lore.kernel.org/r/20240827053206.538814-1-kjain@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/powerpc/power10/datasource.json | 25 ++++++++++++++++++++++ .../pmu-events/arch/powerpc/power10/frontend.json | 10 +++++++++ .../perf/pmu-events/arch/powerpc/power10/pmc.json | 5 +++++ 3 files changed, 40 insertions(+) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/powerpc/power10/datasource.json b/tools/perf/pmu-events/arch/powerpc/power10/datasource.json index 0eeaaf1a95b8..283284745d9c 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/datasource.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/datasource.json @@ -14,6 +14,31 @@ "EventName": "PM_DATA_FROM_MEMORY", "BriefDescription": "The processor's data cache was reloaded from local, remote, or distant memory due to a demand miss." }, + { + "EventCode": "0x0000004080", + "EventName": "PM_INST_FROM_L1", + "BriefDescription": "An instruction fetch hit in the L1. Each fetch group contains 8 instructions. The same line can hit 4 times if 32 sequential instructions are fetched." + }, + { + "EventCode": "0x000000026080", + "EventName": "PM_L2_LD_MISS", + "BriefDescription": "All successful D-Side Load dispatches for this thread that missed in the L2. Since the event happens in a 2:1 clock domain and is time-sliced across all 4 threads, the event count should be multiplied by 2." + }, + { + "EventCode": "0x000000026880", + "EventName": "PM_L2_ST_MISS", + "BriefDescription": "All successful D-Side Store dispatches for this thread that missed in the L2. Since the event happens in a 2:1 clock domain and is time-sliced across all 4 threads, the event count should be multiplied by 2." + }, + { + "EventCode": "0x010000046880", + "EventName": "PM_L2_ST_HIT", + "BriefDescription": "All successful D-side store dispatches for this thread that were L2 hits. Since the event happens in a 2:1 clock domain and is time-sliced across all 4 threads, the event count should be multiplied by 2." + }, + { + "EventCode": "0x000000036880", + "EventName": "PM_L2_INST_MISS", + "BriefDescription": "All successful instruction (demand and prefetch) dispatches for this thread that missed in the L2. Since the event happens in a 2:1 clock domain and is time-sliced across all 4 threads, the event count should be multiplied by 2." + }, { "EventCode": "0x000300000000C040", "EventName": "PM_INST_FROM_L2", diff --git a/tools/perf/pmu-events/arch/powerpc/power10/frontend.json b/tools/perf/pmu-events/arch/powerpc/power10/frontend.json index 53660c279286..456971f60814 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/frontend.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/frontend.json @@ -93,5 +93,15 @@ "EventCode": "0x400FC", "EventName": "PM_ITLB_MISS", "BriefDescription": "Instruction TLB reload (after a miss), all page sizes. Includes only demand misses." 
+ }, + { + "EventCode": "0x00000040B8", + "EventName": "PM_PRED_BR_TKN_COND_DIR", + "BriefDescription": "A conditional branch finished with correctly predicted direction. Resolved taken." + }, + { + "EventCode": "0x00000048B8", + "EventName": "PM_PRED_BR_NTKN_COND_DIR", + "BriefDescription": "A conditional branch finished with correctly predicted direction. Resolved not taken." } ] diff --git a/tools/perf/pmu-events/arch/powerpc/power10/pmc.json b/tools/perf/pmu-events/arch/powerpc/power10/pmc.json index 0e0253d0e757..04732698d9b2 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/pmc.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/pmc.json @@ -104,6 +104,11 @@ "EventName": "PM_RUN_CYC", "BriefDescription": "Processor cycles gated by the run latch." }, + { + "EventCode": "0x200F8", + "EventName": "PM_EXT_INT", + "BriefDescription": "Cycles an external interrupt was active." + }, { "EventCode": "0x30010", "EventName": "PM_PMC2_OVERFLOW", -- cgit v1.2.3 From 0edee819712ed00ce199f94108a8d588970eea80 Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Tue, 27 Aug 2024 11:02:05 +0530 Subject: perf vendor events power10: Move the JSON/events Move some of the JSON/events from others.json to more appropriate JSON files for power10 platform. Reviewed-by: Ian Rogers Signed-off-by: Kajol Jain Cc: Athira Rajeev Cc: Disha Goel Cc: Hari Bathini Cc: Ian Rogers Cc: Madhavan Srinivasan Cc: Namhyung Kim Cc: linuxppc-dev@lists.ozlabs.org Link: https://lore.kernel.org/r/20240827053206.538814-2-kjain@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- .../pmu-events/arch/powerpc/power10/cache.json | 15 +++ .../arch/powerpc/power10/datasource.json | 15 +++ .../pmu-events/arch/powerpc/power10/frontend.json | 10 ++ .../pmu-events/arch/powerpc/power10/locks.json | 10 ++ .../pmu-events/arch/powerpc/power10/memory.json | 30 +++++ .../pmu-events/arch/powerpc/power10/others.json | 130 --------------------- .../pmu-events/arch/powerpc/power10/pipeline.json | 45 +++++++ .../perf/pmu-events/arch/powerpc/power10/pmc.json | 5 + 8 files changed, 130 insertions(+), 130 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/powerpc/power10/cache.json b/tools/perf/pmu-events/arch/powerpc/power10/cache.json index 839ae26945fb..9814a59fce31 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/cache.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/cache.json @@ -1,4 +1,19 @@ [ + { + "EventCode": "0x1002C", + "EventName": "PM_LD_PREFETCH_CACHE_LINE_MISS", + "BriefDescription": "The L1 cache was reloaded with a line that fulfills a prefetch request." + }, + { + "EventCode": "0x200FD", + "EventName": "PM_L1_ICACHE_MISS", + "BriefDescription": "Demand instruction cache miss." + }, + { + "EventCode": "0x30068", + "EventName": "PM_L1_ICACHE_RELOADED_PREF", + "BriefDescription": "Counts all instruction cache prefetch reloads (includes demand turned into prefetch)." + }, { "EventCode": "0x300F4", "EventName": "PM_RUN_INST_CMPL_CONC", diff --git a/tools/perf/pmu-events/arch/powerpc/power10/datasource.json b/tools/perf/pmu-events/arch/powerpc/power10/datasource.json index 283284745d9c..a5d5be35b5e6 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/datasource.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/datasource.json @@ -1,4 +1,14 @@ [ + { + "EventCode": "0x1505E", + "EventName": "PM_LD_HIT_L1", + "BriefDescription": "Load finished without experiencing an L1 miss." 
+ }, + { + "EventCode": "0x100FC", + "EventName": "PM_LD_REF_L1", + "BriefDescription": "All L1 D cache load references counted at finish, gated by reject. In P9 and earlier this event counted only cacheable loads but in P10 both cacheable and non-cacheable loads are included." + }, { "EventCode": "0x200FE", "EventName": "PM_DATA_FROM_L2MISS", @@ -9,6 +19,11 @@ "EventName": "PM_DATA_FROM_L3MISS", "BriefDescription": "The processor's L1 data cache was reloaded from beyond the local core's L3 due to a demand miss." }, + { + "EventCode": "0x400F0", + "EventName": "PM_LD_DEMAND_MISS_L1_FIN", + "BriefDescription": "Load missed L1, counted at finish time." + }, { "EventCode": "0x400FE", "EventName": "PM_DATA_FROM_MEMORY", diff --git a/tools/perf/pmu-events/arch/powerpc/power10/frontend.json b/tools/perf/pmu-events/arch/powerpc/power10/frontend.json index 456971f60814..684374fe5699 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/frontend.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/frontend.json @@ -84,6 +84,11 @@ "EventName": "PM_VECTOR_ST_CMPL", "BriefDescription": "Vector store instruction completed." }, + { + "EventCode": "0x4D05E", + "EventName": "PM_BR_CMPL", + "BriefDescription": "A branch completed. All branches are included." + }, { "EventCode": "0x4E054", "EventName": "PM_DTLB_HIT_1G", @@ -94,6 +99,11 @@ "EventName": "PM_ITLB_MISS", "BriefDescription": "Instruction TLB reload (after a miss), all page sizes. Includes only demand misses." }, + { + "EventCode": "0x00000048B4", + "EventName": "PM_BR_TKN_UNCOND_FIN", + "BriefDescription": "An unconditional branch finished. All unconditional branches are taken." + }, { "EventCode": "0x00000040B8", "EventName": "PM_PRED_BR_TKN_COND_DIR", diff --git a/tools/perf/pmu-events/arch/powerpc/power10/locks.json b/tools/perf/pmu-events/arch/powerpc/power10/locks.json index b5a0d6521963..a8ea4d0def1a 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/locks.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/locks.json @@ -4,9 +4,19 @@ "EventName": "PM_STCX_FAIL_FIN", "BriefDescription": "Conditional store instruction (STCX) failed. LARX and STCX are instructions used to acquire a lock." }, + { + "EventCode": "0x2E014", + "EventName": "PM_STCX_FIN", + "BriefDescription": "Conditional store instruction (STCX) finished. LARX and STCX are instructions used to acquire a lock." + }, { "EventCode": "0x4E050", "EventName": "PM_STCX_PASS_FIN", "BriefDescription": "Conditional store instruction (STCX) passed. LARX and STCX are instructions used to acquire a lock." + }, + { + "EventCode": "0x000000C8B8", + "EventName": "PM_STCX_SUCCESS_CMPL", + "BriefDescription": "STCX instructions that completed successfully. Specifically, counts only when a pass status is returned from the nest." } ] diff --git a/tools/perf/pmu-events/arch/powerpc/power10/memory.json b/tools/perf/pmu-events/arch/powerpc/power10/memory.json index 885262957beb..0d7191b3f2c6 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/memory.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/memory.json @@ -69,6 +69,11 @@ "EventName": "PM_XFER_FROM_SRC_PMC3", "BriefDescription": "The processor's L1 data cache was reloaded from the source specified in MMCR3[30:42]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads." 
}, + { + "EventCode": "0x3F04A", + "EventName": "PM_LSU_ST5_FIN", + "BriefDescription": "LSU Finished an internal operation in ST2 port." + }, { "EventCode": "0x3C054", "EventName": "PM_DERAT_MISS_16M", @@ -108,5 +113,30 @@ "EventCode": "0x4C05A", "EventName": "PM_DTLB_MISS_1G", "BriefDescription": "Data TLB reload (after a miss) page size 1G. Implies radix translation was used. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches." + }, + { + "EventCode": "0x000000F880", + "EventName": "PM_SNOOP_TLBIE_CYC", + "BriefDescription": "Cycles in which TLBIE snoops are executed in the LSU." + }, + { + "EventCode": "0x000000F084", + "EventName": "PM_SNOOP_TLBIE_CACHE_WALK_CYC", + "BriefDescription": "TLBIE snoop cycles in which the data cache is being walked." + }, + { + "EventCode": "0x000000F884", + "EventName": "PM_SNOOP_TLBIE_WAIT_ST_CYC", + "BriefDescription": "TLBIE snoop cycles in which older stores are still draining." + }, + { + "EventCode": "0x000000F088", + "EventName": "PM_SNOOP_TLBIE_WAIT_LD_CYC", + "BriefDescription": "TLBIE snoop cycles in which older loads are still draining." + }, + { + "EventCode": "0x000000F08C", + "EventName": "PM_SNOOP_TLBIE_WAIT_MMU_CYC", + "BriefDescription": "TLBIE snoop cycles in which the Load-Store unit is waiting for the MMU to finish invalidation." } ] diff --git a/tools/perf/pmu-events/arch/powerpc/power10/others.json b/tools/perf/pmu-events/arch/powerpc/power10/others.json index 3789304cb363..1bf802076ee0 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/others.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/others.json @@ -1,89 +1,19 @@ [ - { - "EventCode": "0x1002C", - "EventName": "PM_LD_PREFETCH_CACHE_LINE_MISS", - "BriefDescription": "The L1 cache was reloaded with a line that fulfills a prefetch request." - }, - { - "EventCode": "0x1505E", - "EventName": "PM_LD_HIT_L1", - "BriefDescription": "Load finished without experiencing an L1 miss." - }, - { - "EventCode": "0x1F056", - "EventName": "PM_DISP_SS0_2_INSTR_CYC", - "BriefDescription": "Cycles in which Superslice 0 dispatches either 1 or 2 instructions." - }, - { - "EventCode": "0x1F05A", - "EventName": "PM_DISP_HELD_SYNC_CYC", - "BriefDescription": "Cycles dispatch is held because of a synchronizing instruction that requires the ICT to be empty before dispatch." - }, { "EventCode": "0x10066", "EventName": "PM_ADJUNCT_CYC", "BriefDescription": "Cycles in which the thread is in Adjunct state. MSR[S HV PR] bits = 011." }, - { - "EventCode": "0x100FC", - "EventName": "PM_LD_REF_L1", - "BriefDescription": "All L1 D cache load references counted at finish, gated by reject. In P9 and earlier this event counted only cacheable loads but in P10 both cacheable and non-cacheable loads are included." - }, { "EventCode": "0x2E010", "EventName": "PM_ADJUNCT_INST_CMPL", "BriefDescription": "PowerPC instruction completed while the thread was in Adjunct state." }, - { - "EventCode": "0x2E014", - "EventName": "PM_STCX_FIN", - "BriefDescription": "Conditional store instruction (STCX) finished. LARX and STCX are instructions used to acquire a lock." - }, - { - "EventCode": "0x2F054", - "EventName": "PM_DISP_SS1_2_INSTR_CYC", - "BriefDescription": "Cycles in which Superslice 1 dispatches either 1 or 2 instructions." - }, - { - "EventCode": "0x2F056", - "EventName": "PM_DISP_SS1_4_INSTR_CYC", - "BriefDescription": "Cycles in which Superslice 1 dispatches either 3 or 4 instructions." 
- }, { "EventCode": "0x200F2", "EventName": "PM_INST_DISP", "BriefDescription": "PowerPC instruction dispatched." }, - { - "EventCode": "0x200FD", - "EventName": "PM_L1_ICACHE_MISS", - "BriefDescription": "Demand instruction cache miss." - }, - { - "EventCode": "0x3F04A", - "EventName": "PM_LSU_ST5_FIN", - "BriefDescription": "LSU Finished an internal operation in ST2 port." - }, - { - "EventCode": "0x3405A", - "EventName": "PM_PRIVILEGED_INST_CMPL", - "BriefDescription": "PowerPC instruction completed while the thread was in Privileged state." - }, - { - "EventCode": "0x3F054", - "EventName": "PM_DISP_SS0_4_INSTR_CYC", - "BriefDescription": "Cycles in which Superslice 0 dispatches either 3 or 4 instructions." - }, - { - "EventCode": "0x3F056", - "EventName": "PM_DISP_SS0_8_INSTR_CYC", - "BriefDescription": "Cycles in which Superslice 0 dispatches either 5, 6, 7 or 8 instructions." - }, - { - "EventCode": "0x30068", - "EventName": "PM_L1_ICACHE_RELOADED_PREF", - "BriefDescription": "Counts all instruction cache prefetch reloads (includes demand turned into prefetch)." - }, { "EventCode": "0x300F6", "EventName": "PM_LD_DEMAND_MISS_L1", @@ -94,16 +24,6 @@ "EventName": "PM_L1_ICACHE_RELOADED_ALL", "BriefDescription": "Counts all instruction cache reloads includes demand, prefetch, prefetch turned into demand and demand turned into prefetch." }, - { - "EventCode": "0x4D05E", - "EventName": "PM_BR_CMPL", - "BriefDescription": "A branch completed. All branches are included." - }, - { - "EventCode": "0x400F0", - "EventName": "PM_LD_DEMAND_MISS_L1_FIN", - "BriefDescription": "Load missed L1, counted at finish time." - }, { "EventCode": "0x00000038BC", "EventName": "PM_ISYNC_CMPL", @@ -139,64 +59,14 @@ "EventName": "PM_ST1_UNALIGNED_FIN", "BriefDescription": "Store instructions in ST1 port that are either unaligned, or treated as unaligned and require an additional recycle through the pipeline. This typically adds about 10 cycles to the latency of the instruction. This only includes stores that cross the 128 byte boundary. Counted at finish time." }, - { - "EventCode": "0x000000C8B8", - "EventName": "PM_STCX_SUCCESS_CMPL", - "BriefDescription": "STCX instructions that completed successfully. Specifically, counts only when a pass status is returned from the nest." - }, { "EventCode": "0x000000D0B4", "EventName": "PM_DC_PREF_STRIDED_CONF", "BriefDescription": "A demand load referenced a line in an active strided prefetch stream. The stream could have been allocated through the hardware prefetch mechanism or through software." }, - { - "EventCode": "0x000000F880", - "EventName": "PM_SNOOP_TLBIE_CYC", - "BriefDescription": "Cycles in which TLBIE snoops are executed in the LSU." - }, - { - "EventCode": "0x000000F084", - "EventName": "PM_SNOOP_TLBIE_CACHE_WALK_CYC", - "BriefDescription": "TLBIE snoop cycles in which the data cache is being walked." - }, - { - "EventCode": "0x000000F884", - "EventName": "PM_SNOOP_TLBIE_WAIT_ST_CYC", - "BriefDescription": "TLBIE snoop cycles in which older stores are still draining." - }, - { - "EventCode": "0x000000F088", - "EventName": "PM_SNOOP_TLBIE_WAIT_LD_CYC", - "BriefDescription": "TLBIE snoop cycles in which older loads are still draining." - }, - { - "EventCode": "0x000000F08C", - "EventName": "PM_SNOOP_TLBIE_WAIT_MMU_CYC", - "BriefDescription": "TLBIE snoop cycles in which the Load-Store unit is waiting for the MMU to finish invalidation." 
- }, { "EventCode": "0x0000004884", "EventName": "PM_NO_FETCH_IBUF_FULL_CYC", "BriefDescription": "Cycles in which no instructions are fetched because there is no room in the instruction buffers." - }, - { - "EventCode": "0x00000048B4", - "EventName": "PM_BR_TKN_UNCOND_FIN", - "BriefDescription": "An unconditional branch finished. All unconditional branches are taken." - }, - { - "EventCode": "0x0B0000016080", - "EventName": "PM_L2_TLBIE_SLBIE_START", - "BriefDescription": "NCU Master received a TLBIE/SLBIEG/SLBIAG operation from the core. Event count should be multiplied by 2 since the data is coming from a 2:1 clock domain and the data is time sliced across all 4 threads." - }, - { - "EventCode": "0x0B0000016880", - "EventName": "PM_L2_TLBIE_SLBIE_DELAY", - "BriefDescription": "Cycles when a TLBIE/SLBIEG/SLBIAG command was held in a hottemp condition by the NCU Master. Multiply this count by 1000 to obtain the total number of cycles. This can be divided by PM_L2_TLBIE_SLBIE_SENT to obtain the average time a TLBIE/SLBIEG/SLBIAG command was held. Event count should be multiplied by 2 since the data is coming from a 2:1 clock domain and the data is time sliced across all 4 threads." - }, - { - "EventCode": "0x0B0000026880", - "EventName": "PM_L2_SNP_TLBIE_SLBIE_DELAY", - "BriefDescription": "Cycles when a TLBIE/SLBIEG/SLBIAG that targets this thread's LPAR was in flight while in a hottemp condition. Multiply this count by 1000 to obtain the total number of cycles. This can be divided by PM_L2_SNP_TLBIE_SLBIE_START to obtain the overall efficiency. Note: 'inflight' means SnpTLB has been sent to core(ie doesn't include when SnpTLB is in NCU waiting to be launched serially behind different SnpTLB). The NCU Snooper gets in a 'hottemp' delay window when it detects it is above its TLBIE/SLBIE threshold for process SnpTLBIE/SLBIE with this core. Event count should be multiplied by 2 since the data is coming from a 2:1 clock domain and the data is time sliced across all 4 threads." } ] diff --git a/tools/perf/pmu-events/arch/powerpc/power10/pipeline.json b/tools/perf/pmu-events/arch/powerpc/power10/pipeline.json index 21b23bb55d0d..940375d251cb 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/pipeline.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/pipeline.json @@ -94,11 +94,21 @@ "EventName": "PM_CMPL_STALL_LWSYNC", "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a lwsync waiting to complete." }, + { + "EventCode": "0x1F056", + "EventName": "PM_DISP_SS0_2_INSTR_CYC", + "BriefDescription": "Cycles in which Superslice 0 dispatches either 1 or 2 instructions." + }, { "EventCode": "0x1F058", "EventName": "PM_DISP_HELD_CYC", "BriefDescription": "Cycles dispatch is held." }, + { + "EventCode": "0x1F05A", + "EventName": "PM_DISP_HELD_SYNC_CYC", + "BriefDescription": "Cycles dispatch is held because of a synchronizing instruction that requires the ICT to be empty before dispatch." + }, { "EventCode": "0x10064", "EventName": "PM_DISP_STALL_IC_L2", @@ -229,6 +239,16 @@ "EventName": "PM_NTC_FIN", "BriefDescription": "Cycles in which the oldest instruction in the pipeline (NTC) finishes. Note that instructions can finish out of order, therefore not all the instructions that finish have a Next-to-complete status." }, + { + "EventCode": "0x2F054", + "EventName": "PM_DISP_SS1_2_INSTR_CYC", + "BriefDescription": "Cycles in which Superslice 1 dispatches either 1 or 2 instructions." 
+ }, + { + "EventCode": "0x2F056", + "EventName": "PM_DISP_SS1_4_INSTR_CYC", + "BriefDescription": "Cycles in which Superslice 1 dispatches either 3 or 4 instructions." + }, { "EventCode": "0x20066", "EventName": "PM_DISP_HELD_OTHER_CYC", @@ -329,6 +349,16 @@ "EventName": "PM_DISP_STALL_IC_L3", "BriefDescription": "Cycles when dispatch was stalled while the instruction was fetched from the local L3." }, + { + "EventCode": "0x3F054", + "EventName": "PM_DISP_SS0_4_INSTR_CYC", + "BriefDescription": "Cycles in which Superslice 0 dispatches either 3 or 4 instructions." + }, + { + "EventCode": "0x3F056", + "EventName": "PM_DISP_SS0_8_INSTR_CYC", + "BriefDescription": "Cycles in which Superslice 0 dispatches either 5, 6, 7 or 8 instructions." + }, { "EventCode": "0x30060", "EventName": "PM_DISP_HELD_XVFC_MAPPER_CYC", @@ -458,5 +488,20 @@ "EventCode": "0x400F8", "EventName": "PM_FLUSH", "BriefDescription": "Flush (any type)." + }, + { + "EventCode": "0x0B0000016080", + "EventName": "PM_L2_TLBIE_SLBIE_START", + "BriefDescription": "NCU Master received a TLBIE/SLBIEG/SLBIAG operation from the core. Event count should be multiplied by 2 since the data is coming from a 2:1 clock domain and the data is time sliced across all 4 threads." + }, + { + "EventCode": "0x0B0000016880", + "EventName": "PM_L2_TLBIE_SLBIE_DELAY", + "BriefDescription": "Cycles when a TLBIE/SLBIEG/SLBIAG command was held in a hottemp condition by the NCU Master. Multiply this count by 1000 to obtain the total number of cycles. This can be divided by PM_L2_TLBIE_SLBIE_SENT to obtain the average time a TLBIE/SLBIEG/SLBIAG command was held. Event count should be multiplied by 2 since the data is coming from a 2:1 clock domain and the data is time sliced across all 4 threads." + }, + { + "EventCode": "0x0B0000026880", + "EventName": "PM_L2_SNP_TLBIE_SLBIE_DELAY", + "BriefDescription": "Cycles when a TLBIE/SLBIEG/SLBIAG that targets this thread's LPAR was in flight while in a hottemp condition. Multiply this count by 1000 to obtain the total number of cycles. This can be divided by PM_L2_SNP_TLBIE_SLBIE_START to obtain the overall efficiency. Note: 'inflight' means SnpTLB has been sent to core(ie doesn't include when SnpTLB is in NCU waiting to be launched serially behind different SnpTLB). The NCU Snooper gets in a 'hottemp' delay window when it detects it is above its TLBIE/SLBIE threshold for process SnpTLBIE/SLBIE with this core. Event count should be multiplied by 2 since the data is coming from a 2:1 clock domain and the data is time sliced across all 4 threads." } ] diff --git a/tools/perf/pmu-events/arch/powerpc/power10/pmc.json b/tools/perf/pmu-events/arch/powerpc/power10/pmc.json index 04732698d9b2..6f5b0e8fde12 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/pmc.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/pmc.json @@ -129,6 +129,11 @@ "EventName": "PM_PMC6_OVERFLOW", "BriefDescription": "The event selected for PMC6 caused the event counter to overflow." }, + { + "EventCode": "0x3405A", + "EventName": "PM_PRIVILEGED_INST_CMPL", + "BriefDescription": "PowerPC instruction completed while the thread was in Privileged state." 
+ }, { "EventCode": "0x3006C", "EventName": "PM_RUN_CYC_SMT2_MODE", -- cgit v1.2.3 From adf50a6e66ae7527f9db4f5c9c11e6cc2a870d4f Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Tue, 27 Aug 2024 11:02:06 +0530 Subject: perf vendor events: Move PM_BR_MPRED_CMPL event for power10 platform Move PM_BR_MPRED_CMPL event from cache.json to frontend.json file for power10 platform Reviewed-by: Ian Rogers Signed-off-by: Kajol Jain Cc: Athira Rajeev Cc: Disha Goel Cc: Hari Bathini Cc: Ian Rogers Cc: Madhavan Srinivasan Cc: Namhyung Kim Cc: linuxppc-dev@lists.ozlabs.org Link: https://lore.kernel.org/r/20240827053206.538814-3-kjain@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/powerpc/power10/cache.json | 5 ----- tools/perf/pmu-events/arch/powerpc/power10/frontend.json | 5 +++++ 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/powerpc/power10/cache.json b/tools/perf/pmu-events/arch/powerpc/power10/cache.json index 9814a59fce31..b7e0be09ff57 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/cache.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/cache.json @@ -18,10 +18,5 @@ "EventCode": "0x300F4", "EventName": "PM_RUN_INST_CMPL_CONC", "BriefDescription": "PowerPC instruction completed by this thread when all threads in the core had the run-latch set." - }, - { - "EventCode": "0x400F6", - "EventName": "PM_BR_MPRED_CMPL", - "BriefDescription": "A mispredicted branch completed. Includes direction and target." } ] diff --git a/tools/perf/pmu-events/arch/powerpc/power10/frontend.json b/tools/perf/pmu-events/arch/powerpc/power10/frontend.json index 684374fe5699..b6998987ab75 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/frontend.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/frontend.json @@ -94,6 +94,11 @@ "EventName": "PM_DTLB_HIT_1G", "BriefDescription": "Data TLB hit (DERAT reload) page size 1G. Implies radix translation. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches." }, + { + "EventCode": "0x400F6", + "EventName": "PM_BR_MPRED_CMPL", + "BriefDescription": "A mispredicted branch completed. Includes direction and target." + }, { "EventCode": "0x400FC", "EventName": "PM_ITLB_MISS", -- cgit v1.2.3 From aea4d463459c95a1b19627d1aa9c21b691da843a Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Mon, 1 Jul 2024 19:57:35 +0200 Subject: perf vendor events arm64: Move Yitian 710 DDR PMU into T-Head directory The Yitian 710 is not a Freescale/NXP design and thus should be located in a separate T-Head vendor directory. 
Reviewed-by: Jing Zhang Signed-off-by: Lucas Stach Cc: Ingo Molnar Cc: James Clark Cc: John Garry Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Shuai Xue Cc: Will Deacon Cc: kernel@pengutronix.de Cc: linux-arm-kernel@lists.infradead.org Cc: patchwork-lst@pengutronix.de Link: https://lore.kernel.org/r/20240701175735.485655-1-l.stach@pengutronix.de Signed-off-by: Arnaldo Carvalho de Melo --- .../arm64/freescale/yitian710/sys/ali_drw.json | 373 --------------------- .../arm64/freescale/yitian710/sys/metrics.json | 20 -- .../arch/arm64/thead/yitian710/sys/ali_drw.json | 373 +++++++++++++++++++++ .../arch/arm64/thead/yitian710/sys/metrics.json | 20 ++ 4 files changed, 393 insertions(+), 393 deletions(-) delete mode 100644 tools/perf/pmu-events/arch/arm64/freescale/yitian710/sys/ali_drw.json delete mode 100644 tools/perf/pmu-events/arch/arm64/freescale/yitian710/sys/metrics.json create mode 100644 tools/perf/pmu-events/arch/arm64/thead/yitian710/sys/ali_drw.json create mode 100644 tools/perf/pmu-events/arch/arm64/thead/yitian710/sys/metrics.json (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/arm64/freescale/yitian710/sys/ali_drw.json b/tools/perf/pmu-events/arch/arm64/freescale/yitian710/sys/ali_drw.json deleted file mode 100644 index e21c469a8ef0..000000000000 --- a/tools/perf/pmu-events/arch/arm64/freescale/yitian710/sys/ali_drw.json +++ /dev/null @@ -1,373 +0,0 @@ -[ - { - "BriefDescription": "A Write or Read Op at HIF interface. The unit is 64B.", - "ConfigCode": "0x0", - "EventName": "hif_rd_or_wr", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A Write Op at HIF interface. The unit is 64B.", - "ConfigCode": "0x1", - "EventName": "hif_wr", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A Read Op at HIF interface. The unit is 64B.", - "ConfigCode": "0x2", - "EventName": "hif_rd", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A Read-Modify-Write Op at HIF interface. The unit is 64B.", - "ConfigCode": "0x3", - "EventName": "hif_rmw", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A high priority Read at HIF interface. 
The unit is 64B.", - "ConfigCode": "0x4", - "EventName": "hif_hi_pri_rd", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A write data cycle at DFI interface (to DRAM).", - "ConfigCode": "0x7", - "EventName": "dfi_wr_data_cycles", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A read data cycle at DFI interface (to DRAM).", - "ConfigCode": "0x8", - "EventName": "dfi_rd_data_cycles", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A high priority read becomes critical.", - "ConfigCode": "0x9", - "EventName": "hpr_xact_when_critical", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A low priority read becomes critical.", - "ConfigCode": "0xA", - "EventName": "lpr_xact_when_critical", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A write becomes critical.", - "ConfigCode": "0xB", - "EventName": "wr_xact_when_critical", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "An Activate(ACT) command to DRAM.", - "ConfigCode": "0xC", - "EventName": "op_is_activate", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A Read or Write CAS command to DRAM.", - "ConfigCode": "0xD", - "EventName": "op_is_rd_or_wr", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "An Activate(ACT) command for read to DRAM.", - "ConfigCode": "0xE", - "EventName": "op_is_rd_activate", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A Read CAS command to DRAM.", - "ConfigCode": "0xF", - "EventName": "op_is_rd", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A Write CAS command to DRAM.", - "ConfigCode": "0x10", - "EventName": "op_is_wr", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A Masked Write command to DRAM.", - "ConfigCode": "0x11", - "EventName": "op_is_mwr", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A Precharge(PRE) command to DRAM.", - "ConfigCode": "0x12", - "EventName": "op_is_precharge", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A Precharge(PRE) required by read or write.", - "ConfigCode": "0x13", - "EventName": "precharge_for_rdwr", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A Precharge(PRE) required by other conditions.", - "ConfigCode": "0x14", - "EventName": "precharge_for_other", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A read-write turnaround.", - "ConfigCode": "0x15", - "EventName": "rdwr_transitions", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A write combine(merge) in write data buffer.", - "ConfigCode": "0x16", - "EventName": "write_combine", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A Write-After-Read hazard.", - "ConfigCode": "0x17", - "EventName": "war_hazard", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A Read-After-Write hazard.", - "ConfigCode": "0x18", - "EventName": "raw_hazard", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A Write-After-Write hazard.", - "ConfigCode": "0x19", - "EventName": "waw_hazard", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "Rank0 enters self-refresh(SRE).", - "ConfigCode": "0x1A", - "EventName": 
"op_is_enter_selfref_rk0", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "Rank1 enters self-refresh(SRE).", - "ConfigCode": "0x1B", - "EventName": "op_is_enter_selfref_rk1", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "Rank2 enters self-refresh(SRE).", - "ConfigCode": "0x1C", - "EventName": "op_is_enter_selfref_rk2", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "Rank3 enters self-refresh(SRE).", - "ConfigCode": "0x1D", - "EventName": "op_is_enter_selfref_rk3", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "Rank0 enters power-down(PDE).", - "ConfigCode": "0x1E", - "EventName": "op_is_enter_powerdown_rk0", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "Rank1 enters power-down(PDE).", - "ConfigCode": "0x1F", - "EventName": "op_is_enter_powerdown_rk1", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "Rank2 enters power-down(PDE).", - "ConfigCode": "0x20", - "EventName": "op_is_enter_powerdown_rk2", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "Rank3 enters power-down(PDE).", - "ConfigCode": "0x21", - "EventName": "op_is_enter_powerdown_rk3", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A cycle that Rank0 stays in self-refresh mode.", - "ConfigCode": "0x26", - "EventName": "selfref_mode_rk0", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A cycle that Rank1 stays in self-refresh mode.", - "ConfigCode": "0x27", - "EventName": "selfref_mode_rk1", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A cycle that Rank2 stays in self-refresh mode.", - "ConfigCode": "0x28", - "EventName": "selfref_mode_rk2", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A cycle that Rank3 stays in self-refresh mode.", - "ConfigCode": "0x29", - "EventName": "selfref_mode_rk3", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "An auto-refresh(REF) command to DRAM.", - "ConfigCode": "0x2A", - "EventName": "op_is_refresh", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A critical auto-refresh(REF) command to DRAM.", - "ConfigCode": "0x2B", - "EventName": "op_is_crit_ref", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "An MRR or MRW command to DRAM.", - "ConfigCode": "0x2D", - "EventName": "op_is_load_mode", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A ZQCal command to DRAM.", - "ConfigCode": "0x2E", - "EventName": "op_is_zqcl", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "At least one entry in read queue reaches the visible window limit.", - "ConfigCode": "0x30", - "EventName": "visible_window_limit_reached_rd", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "At least one entry in write queue reaches the visible window limit.", - "ConfigCode": "0x31", - "EventName": "visible_window_limit_reached_wr", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A DQS Oscillator MPC command to DRAM.", - "ConfigCode": "0x34", - "EventName": "op_is_dqsosc_mpc", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A DQS Oscillator MRR command to DRAM.", - "ConfigCode": "0x35", - "EventName": "op_is_dqsosc_mrr", - "Unit": "ali_drw", - "Compat": 
"ali_drw_pmu" - }, - { - "BriefDescription": "A Temperature Compensated Refresh(TCR) MRR command to DRAM.", - "ConfigCode": "0x36", - "EventName": "op_is_tcr_mrr", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A ZQCal Start command to DRAM.", - "ConfigCode": "0x37", - "EventName": "op_is_zqstart", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A ZQCal Latch command to DRAM.", - "ConfigCode": "0x38", - "EventName": "op_is_zqlatch", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A packet at CHI TXREQ interface (request).", - "ConfigCode": "0x39", - "EventName": "chi_txreq", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A packet at CHI TXDAT interface (read data).", - "ConfigCode": "0x3A", - "EventName": "chi_txdat", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A packet at CHI RXDAT interface (write data).", - "ConfigCode": "0x3B", - "EventName": "chi_rxdat", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A packet at CHI RXRSP interface.", - "ConfigCode": "0x3C", - "EventName": "chi_rxrsp", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "A violation detected in TZC.", - "ConfigCode": "0x3D", - "EventName": "tsz_vio", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "BriefDescription": "The ddr cycles.", - "ConfigCode": "0x80", - "EventName": "ddr_cycles", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - } -] diff --git a/tools/perf/pmu-events/arch/arm64/freescale/yitian710/sys/metrics.json b/tools/perf/pmu-events/arch/arm64/freescale/yitian710/sys/metrics.json deleted file mode 100644 index bc865b374b6a..000000000000 --- a/tools/perf/pmu-events/arch/arm64/freescale/yitian710/sys/metrics.json +++ /dev/null @@ -1,20 +0,0 @@ -[ - { - "MetricName": "ddr_read_bandwidth.all", - "BriefDescription": "The ddr read bandwidth(MB/s).", - "MetricGroup": "ali_drw", - "MetricExpr": "hif_rd * 64 / 1e6 / duration_time", - "ScaleUnit": "1MB/s", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - }, - { - "MetricName": "ddr_write_bandwidth.all", - "BriefDescription": "The ddr write bandwidth(MB/s).", - "MetricGroup": "ali_drw", - "MetricExpr": "(hif_wr + hif_rmw) * 64 / 1e6 / duration_time", - "ScaleUnit": "1MB/s", - "Unit": "ali_drw", - "Compat": "ali_drw_pmu" - } -] diff --git a/tools/perf/pmu-events/arch/arm64/thead/yitian710/sys/ali_drw.json b/tools/perf/pmu-events/arch/arm64/thead/yitian710/sys/ali_drw.json new file mode 100644 index 000000000000..e21c469a8ef0 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/thead/yitian710/sys/ali_drw.json @@ -0,0 +1,373 @@ +[ + { + "BriefDescription": "A Write or Read Op at HIF interface. The unit is 64B.", + "ConfigCode": "0x0", + "EventName": "hif_rd_or_wr", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A Write Op at HIF interface. The unit is 64B.", + "ConfigCode": "0x1", + "EventName": "hif_wr", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A Read Op at HIF interface. The unit is 64B.", + "ConfigCode": "0x2", + "EventName": "hif_rd", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A Read-Modify-Write Op at HIF interface. The unit is 64B.", + "ConfigCode": "0x3", + "EventName": "hif_rmw", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A high priority Read at HIF interface. 
The unit is 64B.", + "ConfigCode": "0x4", + "EventName": "hif_hi_pri_rd", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A write data cycle at DFI interface (to DRAM).", + "ConfigCode": "0x7", + "EventName": "dfi_wr_data_cycles", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A read data cycle at DFI interface (to DRAM).", + "ConfigCode": "0x8", + "EventName": "dfi_rd_data_cycles", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A high priority read becomes critical.", + "ConfigCode": "0x9", + "EventName": "hpr_xact_when_critical", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A low priority read becomes critical.", + "ConfigCode": "0xA", + "EventName": "lpr_xact_when_critical", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A write becomes critical.", + "ConfigCode": "0xB", + "EventName": "wr_xact_when_critical", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "An Activate(ACT) command to DRAM.", + "ConfigCode": "0xC", + "EventName": "op_is_activate", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A Read or Write CAS command to DRAM.", + "ConfigCode": "0xD", + "EventName": "op_is_rd_or_wr", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "An Activate(ACT) command for read to DRAM.", + "ConfigCode": "0xE", + "EventName": "op_is_rd_activate", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A Read CAS command to DRAM.", + "ConfigCode": "0xF", + "EventName": "op_is_rd", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A Write CAS command to DRAM.", + "ConfigCode": "0x10", + "EventName": "op_is_wr", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A Masked Write command to DRAM.", + "ConfigCode": "0x11", + "EventName": "op_is_mwr", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A Precharge(PRE) command to DRAM.", + "ConfigCode": "0x12", + "EventName": "op_is_precharge", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A Precharge(PRE) required by read or write.", + "ConfigCode": "0x13", + "EventName": "precharge_for_rdwr", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A Precharge(PRE) required by other conditions.", + "ConfigCode": "0x14", + "EventName": "precharge_for_other", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A read-write turnaround.", + "ConfigCode": "0x15", + "EventName": "rdwr_transitions", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A write combine(merge) in write data buffer.", + "ConfigCode": "0x16", + "EventName": "write_combine", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A Write-After-Read hazard.", + "ConfigCode": "0x17", + "EventName": "war_hazard", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A Read-After-Write hazard.", + "ConfigCode": "0x18", + "EventName": "raw_hazard", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A Write-After-Write hazard.", + "ConfigCode": "0x19", + "EventName": "waw_hazard", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "Rank0 enters self-refresh(SRE).", + "ConfigCode": "0x1A", + "EventName": 
"op_is_enter_selfref_rk0", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "Rank1 enters self-refresh(SRE).", + "ConfigCode": "0x1B", + "EventName": "op_is_enter_selfref_rk1", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "Rank2 enters self-refresh(SRE).", + "ConfigCode": "0x1C", + "EventName": "op_is_enter_selfref_rk2", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "Rank3 enters self-refresh(SRE).", + "ConfigCode": "0x1D", + "EventName": "op_is_enter_selfref_rk3", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "Rank0 enters power-down(PDE).", + "ConfigCode": "0x1E", + "EventName": "op_is_enter_powerdown_rk0", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "Rank1 enters power-down(PDE).", + "ConfigCode": "0x1F", + "EventName": "op_is_enter_powerdown_rk1", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "Rank2 enters power-down(PDE).", + "ConfigCode": "0x20", + "EventName": "op_is_enter_powerdown_rk2", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "Rank3 enters power-down(PDE).", + "ConfigCode": "0x21", + "EventName": "op_is_enter_powerdown_rk3", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A cycle that Rank0 stays in self-refresh mode.", + "ConfigCode": "0x26", + "EventName": "selfref_mode_rk0", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A cycle that Rank1 stays in self-refresh mode.", + "ConfigCode": "0x27", + "EventName": "selfref_mode_rk1", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A cycle that Rank2 stays in self-refresh mode.", + "ConfigCode": "0x28", + "EventName": "selfref_mode_rk2", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A cycle that Rank3 stays in self-refresh mode.", + "ConfigCode": "0x29", + "EventName": "selfref_mode_rk3", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "An auto-refresh(REF) command to DRAM.", + "ConfigCode": "0x2A", + "EventName": "op_is_refresh", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A critical auto-refresh(REF) command to DRAM.", + "ConfigCode": "0x2B", + "EventName": "op_is_crit_ref", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "An MRR or MRW command to DRAM.", + "ConfigCode": "0x2D", + "EventName": "op_is_load_mode", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A ZQCal command to DRAM.", + "ConfigCode": "0x2E", + "EventName": "op_is_zqcl", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "At least one entry in read queue reaches the visible window limit.", + "ConfigCode": "0x30", + "EventName": "visible_window_limit_reached_rd", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "At least one entry in write queue reaches the visible window limit.", + "ConfigCode": "0x31", + "EventName": "visible_window_limit_reached_wr", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A DQS Oscillator MPC command to DRAM.", + "ConfigCode": "0x34", + "EventName": "op_is_dqsosc_mpc", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A DQS Oscillator MRR command to DRAM.", + "ConfigCode": "0x35", + "EventName": "op_is_dqsosc_mrr", + "Unit": "ali_drw", + "Compat": 
"ali_drw_pmu" + }, + { + "BriefDescription": "A Temperature Compensated Refresh(TCR) MRR command to DRAM.", + "ConfigCode": "0x36", + "EventName": "op_is_tcr_mrr", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A ZQCal Start command to DRAM.", + "ConfigCode": "0x37", + "EventName": "op_is_zqstart", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A ZQCal Latch command to DRAM.", + "ConfigCode": "0x38", + "EventName": "op_is_zqlatch", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A packet at CHI TXREQ interface (request).", + "ConfigCode": "0x39", + "EventName": "chi_txreq", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A packet at CHI TXDAT interface (read data).", + "ConfigCode": "0x3A", + "EventName": "chi_txdat", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A packet at CHI RXDAT interface (write data).", + "ConfigCode": "0x3B", + "EventName": "chi_rxdat", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A packet at CHI RXRSP interface.", + "ConfigCode": "0x3C", + "EventName": "chi_rxrsp", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "A violation detected in TZC.", + "ConfigCode": "0x3D", + "EventName": "tsz_vio", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "BriefDescription": "The ddr cycles.", + "ConfigCode": "0x80", + "EventName": "ddr_cycles", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/thead/yitian710/sys/metrics.json b/tools/perf/pmu-events/arch/arm64/thead/yitian710/sys/metrics.json new file mode 100644 index 000000000000..bc865b374b6a --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/thead/yitian710/sys/metrics.json @@ -0,0 +1,20 @@ +[ + { + "MetricName": "ddr_read_bandwidth.all", + "BriefDescription": "The ddr read bandwidth(MB/s).", + "MetricGroup": "ali_drw", + "MetricExpr": "hif_rd * 64 / 1e6 / duration_time", + "ScaleUnit": "1MB/s", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + }, + { + "MetricName": "ddr_write_bandwidth.all", + "BriefDescription": "The ddr write bandwidth(MB/s).", + "MetricGroup": "ali_drw", + "MetricExpr": "(hif_wr + hif_rmw) * 64 / 1e6 / duration_time", + "ScaleUnit": "1MB/s", + "Unit": "ali_drw", + "Compat": "ali_drw_pmu" + } +] -- cgit v1.2.3 From c87826ddcefaf0b1245d1d8c61040679ea0bd387 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Tue, 6 Aug 2024 21:41:22 +0100 Subject: perf auxtrace: Use evsel__is_aux_event() for checking AUX event Use evsel__is_aux_event() to decide if an event is a AUX event, this is a refactoring to replace comparing the PMU type. 
Reviewed-by: Adrian Hunter Signed-off-by: Leo Yan Cc: Ian Rogers Cc: James Clark Cc: Kan Liang Cc: Mike Leach Cc: Namhyung Kim Cc: Suzuki Poulouse Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240806204130.720977-2-leo.yan@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/auxtrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index cbb773ed6f1a..ca8682966fae 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -671,11 +671,11 @@ int auxtrace_record__read_finish(struct auxtrace_record *itr, int idx) { struct evsel *evsel; - if (!itr->evlist || !itr->pmu) + if (!itr->evlist) return -EINVAL; evlist__for_each_entry(itr->evlist, evsel) { - if (evsel->core.attr.type == itr->pmu->type) { + if (evsel__is_aux_event(evsel)) { if (evsel->disabled) return 0; return evlist__enable_event_idx(itr->evlist, evsel, idx); -- cgit v1.2.3 From d5726f1c8d421f000a9265601c28279e82d43db0 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Tue, 6 Aug 2024 21:41:23 +0100 Subject: perf auxtrace: Remove unused 'pmu' pointer from struct auxtrace_record The 'pmu' pointer in the auxtrace_record structure is no longer used after adding support for multiple AUX events; remove it. Reviewed-by: Adrian Hunter Signed-off-by: Leo Yan Cc: Ian Rogers Cc: James Clark Cc: Kan Liang Cc: Mike Leach Cc: Namhyung Kim Cc: Suzuki Poulouse Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240806204130.720977-3-leo.yan@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm/util/cs-etm.c | 1 - tools/perf/arch/arm64/util/arm-spe.c | 1 - tools/perf/arch/arm64/util/hisi-ptt.c | 1 - tools/perf/arch/x86/util/intel-bts.c | 1 - tools/perf/arch/x86/util/intel-pt.c | 1 - tools/perf/util/auxtrace.h | 1 - 6 files changed, 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index da6231367993..96aeb7cdbee1 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -888,7 +888,6 @@ struct auxtrace_record *cs_etm_record_init(int *err) } ptr->cs_etm_pmu = cs_etm_pmu; - ptr->itr.pmu = cs_etm_pmu; ptr->itr.parse_snapshot_options = cs_etm_parse_snapshot_options; ptr->itr.recording_options = cs_etm_recording_options; ptr->itr.info_priv_size = cs_etm_info_priv_size; diff --git a/tools/perf/arch/arm64/util/arm-spe.c b/tools/perf/arch/arm64/util/arm-spe.c index d59f6ca499f2..2be99fdf997d 100644 --- a/tools/perf/arch/arm64/util/arm-spe.c +++ b/tools/perf/arch/arm64/util/arm-spe.c @@ -514,7 +514,6 @@ struct auxtrace_record *arm_spe_recording_init(int *err, } sper->arm_spe_pmu = arm_spe_pmu; - sper->itr.pmu = arm_spe_pmu; sper->itr.snapshot_start = arm_spe_snapshot_start; sper->itr.snapshot_finish = arm_spe_snapshot_finish; sper->itr.find_snapshot = arm_spe_find_snapshot; diff --git a/tools/perf/arch/arm64/util/hisi-ptt.c b/tools/perf/arch/arm64/util/hisi-ptt.c index ba97c8a562a0..eac9739c87e6 100644 --- a/tools/perf/arch/arm64/util/hisi-ptt.c +++ b/tools/perf/arch/arm64/util/hisi-ptt.c @@ -174,7 +174,6 @@ struct auxtrace_record *hisi_ptt_recording_init(int *err, } pttr->hisi_ptt_pmu = hisi_ptt_pmu; - pttr->itr.pmu = hisi_ptt_pmu; pttr->itr.recording_options = hisi_ptt_recording_options; pttr->itr.info_priv_size = hisi_ptt_info_priv_size; pttr->itr.info_fill = hisi_ptt_info_fill; diff --git 
a/tools/perf/arch/x86/util/intel-bts.c b/tools/perf/arch/x86/util/intel-bts.c index 34696f3d3d5d..85c8186300c8 100644 --- a/tools/perf/arch/x86/util/intel-bts.c +++ b/tools/perf/arch/x86/util/intel-bts.c @@ -434,7 +434,6 @@ struct auxtrace_record *intel_bts_recording_init(int *err) } btsr->intel_bts_pmu = intel_bts_pmu; - btsr->itr.pmu = intel_bts_pmu; btsr->itr.recording_options = intel_bts_recording_options; btsr->itr.info_priv_size = intel_bts_info_priv_size; btsr->itr.info_fill = intel_bts_info_fill; diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index 4b710e875953..ea510a7486b1 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -1197,7 +1197,6 @@ struct auxtrace_record *intel_pt_recording_init(int *err) } ptr->intel_pt_pmu = intel_pt_pmu; - ptr->itr.pmu = intel_pt_pmu; ptr->itr.recording_options = intel_pt_recording_options; ptr->itr.info_priv_size = intel_pt_info_priv_size; ptr->itr.info_fill = intel_pt_info_fill; diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h index d405efcd8708..a1895a4f530b 100644 --- a/tools/perf/util/auxtrace.h +++ b/tools/perf/util/auxtrace.h @@ -411,7 +411,6 @@ struct auxtrace_record { int (*read_finish)(struct auxtrace_record *itr, int idx); unsigned int alignment; unsigned int default_aux_sample_size; - struct perf_pmu *pmu; struct evlist *evlist; }; -- cgit v1.2.3 From 6f87543c74ddc9649cfd9a6e6908821cdb4d397a Mon Sep 17 00:00:00 2001 From: James Clark Date: Fri, 9 Aug 2024 10:54:22 +0100 Subject: perf test trace_btf_enum: Fix shellcheck warning Shellcheck versions < v0.7.2 can't follow this path, so add a 'shellcheck source=' directive to fix the following warning: In tests/shell/trace_btf_enum.sh line 13: . "$(dirname $0)"/lib/probe.sh ^--------------------------^ SC1090: Can't follow non-constant source. Use a directive to specify location. Fixes: d66763fed30f0bd8 ("perf test trace_btf_enum: Add regression test for the BTF augmentation of enums in 'perf trace'") Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Howard Chu Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240809095426.3065163-1-james.clark@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/trace_btf_enum.sh | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/perf/tests/shell/trace_btf_enum.sh b/tools/perf/tests/shell/trace_btf_enum.sh index 7d407b52bea5..5a3b8a5a9b5c 100755 --- a/tools/perf/tests/shell/trace_btf_enum.sh +++ b/tools/perf/tests/shell/trace_btf_enum.sh @@ -10,6 +10,7 @@ non_syscall="timer:hrtimer_init,timer:hrtimer_start" TESTPROG="perf test -w landlock" +# shellcheck source=lib/probe.sh . "$(dirname $0)"/lib/probe.sh skip_if_no_perf_trace || exit 2 -- cgit v1.2.3 From 9af2efee41b27a0f386fb5aa95d8d0b4b5d9fede Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 26 Aug 2024 15:10:42 -0700 Subject: perf report: Fix segfault when 'sym' sort key is not used The fields in the hist_entry are filled on-demand, which means they only have meaningful values when the relevant sort keys are used. So if neither the 'dso' nor the 'sym' sort key is used, the map/symbols in the hist entry can be garbage, and the code shouldn't access them unconditionally. I got a segfault when I wanted to see cgroup profiles. $ sudo perf record -a --all-cgroups --synth=cgroup true $ sudo perf report -s cgroup Program received signal SIGSEGV, Segmentation fault. 
0x00005555557a8d90 in map__dso (map=0x0) at util/map.h:48 48 return RC_CHK_ACCESS(map)->dso; (gdb) bt #0 0x00005555557a8d90 in map__dso (map=0x0) at util/map.h:48 #1 0x00005555557aa39b in map__load (map=0x0) at util/map.c:344 #2 0x00005555557aa592 in map__find_symbol (map=0x0, addr=140736115941088) at util/map.c:385 #3 0x00005555557ef000 in hists__findnew_entry (hists=0x555556039d60, entry=0x7fffffffa4c0, al=0x7fffffffa8c0, sample_self=true) at util/hist.c:644 #4 0x00005555557ef61c in __hists__add_entry (hists=0x555556039d60, al=0x7fffffffa8c0, sym_parent=0x0, bi=0x0, mi=0x0, ki=0x0, block_info=0x0, sample=0x7fffffffaa90, sample_self=true, ops=0x0) at util/hist.c:761 #5 0x00005555557ef71f in hists__add_entry (hists=0x555556039d60, al=0x7fffffffa8c0, sym_parent=0x0, bi=0x0, mi=0x0, ki=0x0, sample=0x7fffffffaa90, sample_self=true) at util/hist.c:779 #6 0x00005555557f00fb in iter_add_single_normal_entry (iter=0x7fffffffa900, al=0x7fffffffa8c0) at util/hist.c:1015 #7 0x00005555557f09a7 in hist_entry_iter__add (iter=0x7fffffffa900, al=0x7fffffffa8c0, max_stack_depth=127, arg=0x7fffffffbce0) at util/hist.c:1260 #8 0x00005555555ba7ce in process_sample_event (tool=0x7fffffffbce0, event=0x7ffff7c14128, sample=0x7fffffffaa90, evsel=0x555556039ad0, machine=0x5555560388e8) at builtin-report.c:334 #9 0x00005555557b30c8 in evlist__deliver_sample (evlist=0x555556039010, tool=0x7fffffffbce0, event=0x7ffff7c14128, sample=0x7fffffffaa90, evsel=0x555556039ad0, machine=0x5555560388e8) at util/session.c:1232 #10 0x00005555557b32bc in machines__deliver_event (machines=0x5555560388e8, evlist=0x555556039010, event=0x7ffff7c14128, sample=0x7fffffffaa90, tool=0x7fffffffbce0, file_offset=110888, file_path=0x555556038ff0 "perf.data") at util/session.c:1271 #11 0x00005555557b3848 in perf_session__deliver_event (session=0x5555560386d0, event=0x7ffff7c14128, tool=0x7fffffffbce0, file_offset=110888, file_path=0x555556038ff0 "perf.data") at util/session.c:1354 #12 0x00005555557affaf in ordered_events__deliver_event (oe=0x555556038e60, event=0x555556135aa0) at util/session.c:132 #13 0x00005555557bb605 in do_flush (oe=0x555556038e60, show_progress=false) at util/ordered-events.c:245 #14 0x00005555557bb95c in __ordered_events__flush (oe=0x555556038e60, how=OE_FLUSH__ROUND, timestamp=0) at util/ordered-events.c:324 #15 0x00005555557bba46 in ordered_events__flush (oe=0x555556038e60, how=OE_FLUSH__ROUND) at util/ordered-events.c:342 #16 0x00005555557b1b3b in perf_event__process_finished_round (tool=0x7fffffffbce0, event=0x7ffff7c15bb8, oe=0x555556038e60) at util/session.c:780 #17 0x00005555557b3b27 in perf_session__process_user_event (session=0x5555560386d0, event=0x7ffff7c15bb8, file_offset=117688, file_path=0x555556038ff0 "perf.data") at util/session.c:1406 As you can see, entry->ms.map was NULL even though he->ms.map had a value. This is because the 'sym' sort key is not given, so the code cannot assume that he->ms.sym and entry->ms.sym are the same. I only checked the 'sym' sort key here as it implies 'dso' behavior (so the maps are the same).
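The guard used by the fix relies on hists__has(), which only tests whether a given sort key was requested for this hists instance; its shape is roughly the following (illustrative sketch, the real macro lives in util/hist.h and may differ in detail):

	/* Illustrative: expands to the requested-sort-key flag. */
	#define hists__has(__h, __f) ((__h)->hpp_list->__f)

	/* Compare/update the maps only when 'sym' actually filled them: */
	if (hists__has(hists, sym) && he->ms.map != entry->ms.map) {
		/* entry->ms.map is known to be valid here */
	}

so when 'sym' (and therefore 'dso') is not in the sort order, the potentially stale pointers are never dereferenced.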
Fixes: ac01c8c4246546fd ("perf hist: Update hist symbol when updating maps") Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Matt Fleming Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240826221045.1202305-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/hist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index dadce2889e52..f387e85a0087 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -638,7 +638,7 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists, * mis-adjust symbol addresses when computing * the history counter to increment. */ - if (he->ms.map != entry->ms.map) { + if (hists__has(hists, sym) && he->ms.map != entry->ms.map) { if (he->ms.sym) { u64 addr = he->ms.sym->start; he->ms.sym = map__find_symbol(entry->ms.map, addr); -- cgit v1.2.3 From 591156f25f6bc71b143d4a7c7cbe7cdb68dda94e Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 26 Aug 2024 15:10:43 -0700 Subject: perf bpf-filter: Add build dependency to header files The flex and bison files need to be recompiled when one of these header files is changed. * util/bpf-filter.h * util/bpf_skel/sample-filter.h Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240826221045.1202305-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/Build | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/Build b/tools/perf/util/Build index b87f918bdfe7..260cec2f6c0b 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -278,12 +278,12 @@ $(OUTPUT)util/pmu-bison.c $(OUTPUT)util/pmu-bison.h: util/pmu.y $(Q)$(call echo-cmd,bison)$(BISON) -v $< -d $(PARSER_DEBUG_BISON) $(BISON_FILE_PREFIX_MAP) \ -o $(OUTPUT)util/pmu-bison.c -p perf_pmu_ -$(OUTPUT)util/bpf-filter-flex.c $(OUTPUT)util/bpf-filter-flex.h: util/bpf-filter.l $(OUTPUT)util/bpf-filter-bison.c +$(OUTPUT)util/bpf-filter-flex.c $(OUTPUT)util/bpf-filter-flex.h: util/bpf-filter.l $(OUTPUT)util/bpf-filter-bison.c util/bpf-filter.h util/bpf_skel/sample-filter.h $(call rule_mkdir) $(Q)$(call echo-cmd,flex)$(FLEX) -o $(OUTPUT)util/bpf-filter-flex.c \ --header-file=$(OUTPUT)util/bpf-filter-flex.h $(PARSER_DEBUG_FLEX) $< -$(OUTPUT)util/bpf-filter-bison.c $(OUTPUT)util/bpf-filter-bison.h: util/bpf-filter.y +$(OUTPUT)util/bpf-filter-bison.c $(OUTPUT)util/bpf-filter-bison.h: util/bpf-filter.y util/bpf-filter.h util/bpf_skel/sample-filter.h $(call rule_mkdir) $(Q)$(call echo-cmd,bison)$(BISON) -v $< -d $(PARSER_DEBUG_BISON) $(BISON_FILE_PREFIX_MAP) \ -o $(OUTPUT)util/bpf-filter-bison.c -p perf_bpf_filter_ -- cgit v1.2.3 From 91e88437d5156b209b1d69b69b560f0a02b80712 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 26 Aug 2024 15:10:44 -0700 Subject: perf bpf-filter: Support filtering on cgroups The new cgroup filter can take either the '==' or the '!=' operator and a pathname for the target cgroup. $ perf record -a --all-cgroups -e cycles --filter 'cgroup == /abc/def' -- sleep 1 Users should pass the --all-cgroups option on the command line to enable cgroup filtering. Technically the option is not needed, as the BPF program can get the current task's cgroup info directly. But I want to follow the convention used for the other sample info.
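For reference, the pathname written in the filter has to be resolved to the kernel's 64-bit cgroup id before the BPF program can compare it against the sampled value. A standalone sketch of one way to do that resolution (the mount point and the use of name_to_handle_at() are illustrative assumptions; error handling trimmed):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	/*
	 * Resolve a cgroup-v2 path such as "/abc/def" to its cgroup id,
	 * assuming cgroup2 is mounted at /sys/fs/cgroup.
	 */
	static int cgroup_path_to_id(const char *path, uint64_t *id)
	{
		struct {
			struct file_handle fh;
			uint64_t cgid;	/* room for the 8-byte handle */
		} handle = { .fh.handle_bytes = sizeof(uint64_t) };
		char full[4096];
		int mount_id;

		snprintf(full, sizeof(full), "/sys/fs/cgroup%s", path);
		if (name_to_handle_at(AT_FDCWD, full, &handle.fh, &mount_id, 0) < 0)
			return -1;

		memcpy(id, handle.fh.f_handle, sizeof(*id));
		return 0;
	}

The parser change below does effectively this through cgroup__new() and read_cgroup_id(), storing the resulting id as the constant the BPF filter matches against.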
Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240826221045.1202305-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf-filter.c | 1 + tools/perf/util/bpf-filter.l | 28 +++++++++++++++++++++++----- tools/perf/util/bpf-filter.y | 28 ++++++++++++++++++++++++++-- tools/perf/util/bpf_skel/sample-filter.h | 2 +- tools/perf/util/bpf_skel/sample_filter.bpf.c | 4 +++- tools/perf/util/bpf_skel/vmlinux/vmlinux.h | 1 + 6 files changed, 55 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf-filter.c b/tools/perf/util/bpf-filter.c index 0a1832564dd2..e87b6789eb9e 100644 --- a/tools/perf/util/bpf-filter.c +++ b/tools/perf/util/bpf-filter.c @@ -100,6 +100,7 @@ static const struct perf_sample_info { PERF_SAMPLE_TYPE(TRANSACTION, "--transaction"), PERF_SAMPLE_TYPE(CODE_PAGE_SIZE, "--code-page-size"), PERF_SAMPLE_TYPE(DATA_PAGE_SIZE, "--data-page-size"), + PERF_SAMPLE_TYPE(CGROUP, "--all-cgroups"), }; static int get_pinned_fd(const char *name); diff --git a/tools/perf/util/bpf-filter.l b/tools/perf/util/bpf-filter.l index 2a7c839f3fae..f313404f95a9 100644 --- a/tools/perf/util/bpf-filter.l +++ b/tools/perf/util/bpf-filter.l @@ -9,8 +9,11 @@ #include "bpf-filter.h" #include "bpf-filter-bison.h" +extern int perf_bpf_filter_needs_path; + static int sample(enum perf_bpf_filter_term term) { + perf_bpf_filter_needs_path = 0; perf_bpf_filter_lval.sample.term = term; perf_bpf_filter_lval.sample.part = 0; return BFT_SAMPLE; @@ -18,11 +21,20 @@ static int sample(enum perf_bpf_filter_term term) static int sample_part(enum perf_bpf_filter_term term, int part) { + perf_bpf_filter_needs_path = 0; perf_bpf_filter_lval.sample.term = term; perf_bpf_filter_lval.sample.part = part; return BFT_SAMPLE; } +static int sample_path(enum perf_bpf_filter_term term) +{ + perf_bpf_filter_needs_path = 1; + perf_bpf_filter_lval.sample.term = term; + perf_bpf_filter_lval.sample.part = 0; + return BFT_SAMPLE_PATH; +} + static int operator(enum perf_bpf_filter_op op) { perf_bpf_filter_lval.op = op; @@ -48,10 +60,15 @@ static int constant(int val) return BFT_NUM; } -static int error(const char *str) +static int path_or_error(void) { - printf("perf_bpf_filter: Unexpected filter %s: %s\n", str, perf_bpf_filter_text); - return BFT_ERROR; + if (!perf_bpf_filter_needs_path) { + printf("perf_bpf_filter: Error: Unexpected item: %s\n", + perf_bpf_filter_text); + return BFT_ERROR; + } + perf_bpf_filter_lval.path = perf_bpf_filter_text; + return BFT_PATH; } %} @@ -59,6 +76,7 @@ static int error(const char *str) num_dec [0-9]+ num_hex 0[Xx][0-9a-fA-F]+ space [ \t]+ +path [^ \t\n]+ ident [_a-zA-Z][_a-zA-Z0-9]+ %% @@ -97,6 +115,7 @@ mem_blk { return sample_part(PBF_TERM_DATA_SRC, 7); } mem_hops { return sample_part(PBF_TERM_DATA_SRC, 8); } uid { return sample(PBF_TERM_UID); } gid { return sample(PBF_TERM_GID); } +cgroup { return sample_path(PBF_TERM_CGROUP); } "==" { return operator(PBF_OP_EQ); } "!=" { return operator(PBF_OP_NEQ); } @@ -155,7 +174,6 @@ hops3 { return constant(PERF_MEM_HOPS_3); } "," { return ','; } "||" { return BFT_LOGICAL_OR; } -{ident} { return error("ident"); } -. 
{ return error("input"); } +{path} { return path_or_error(); } %% diff --git a/tools/perf/util/bpf-filter.y b/tools/perf/util/bpf-filter.y index 0c56fccb8874..5a79a8e7a45b 100644 --- a/tools/perf/util/bpf-filter.y +++ b/tools/perf/util/bpf-filter.y @@ -12,9 +12,13 @@ #include #include #include "bpf-filter.h" +#include "cgroup.h" int perf_bpf_filter_lex(void); +/* To indicate if the current term needs a pathname or not */ +int perf_bpf_filter_needs_path; + static void perf_bpf_filter_error(struct list_head *expr __maybe_unused, char const *msg) { @@ -26,6 +30,7 @@ static void perf_bpf_filter_error(struct list_head *expr __maybe_unused, %union { unsigned long num; + char *path; struct { enum perf_bpf_filter_term term; int part; @@ -34,12 +39,13 @@ static void perf_bpf_filter_error(struct list_head *expr __maybe_unused, struct perf_bpf_filter_expr *expr; } -%token BFT_SAMPLE BFT_OP BFT_ERROR BFT_NUM BFT_LOGICAL_OR +%token BFT_SAMPLE BFT_SAMPLE_PATH BFT_OP BFT_ERROR BFT_NUM BFT_LOGICAL_OR BFT_PATH %type filter_term filter_expr %destructor { free ($$); } -%type BFT_SAMPLE +%type BFT_SAMPLE BFT_SAMPLE_PATH %type BFT_OP %type BFT_NUM +%type BFT_PATH %% @@ -81,5 +87,23 @@ BFT_SAMPLE BFT_OP BFT_NUM { $$ = perf_bpf_filter_expr__new($1.term, $1.part, $2, $3); } +| +BFT_SAMPLE_PATH BFT_OP BFT_PATH +{ + struct cgroup *cgrp; + unsigned long cgroup_id = 0; + + if ($2 != PBF_OP_EQ && $2 != PBF_OP_NEQ) { + printf("perf_bpf_filter: cgroup accepts '==' or '!=' only\n"); + YYERROR; + } + + cgrp = cgroup__new($3, /*do_open=*/false); + if (cgrp && read_cgroup_id(cgrp) == 0) + cgroup_id = cgrp->id; + + $$ = perf_bpf_filter_expr__new($1.term, $1.part, $2, cgroup_id); + cgroup__put(cgrp); +} %% diff --git a/tools/perf/util/bpf_skel/sample-filter.h b/tools/perf/util/bpf_skel/sample-filter.h index 5f0c8e4e83d3..683fec85e71e 100644 --- a/tools/perf/util/bpf_skel/sample-filter.h +++ b/tools/perf/util/bpf_skel/sample-filter.h @@ -45,7 +45,7 @@ enum perf_bpf_filter_term { __PBF_UNUSED_TERM18 = PBF_TERM_SAMPLE_START + 18, /* SAMPLE_REGS_INTR = 1U << 18 */ PBF_TERM_PHYS_ADDR = PBF_TERM_SAMPLE_START + 19, /* SAMPLE_PHYS_ADDR = 1U << 19 */ __PBF_UNUSED_TERM20 = PBF_TERM_SAMPLE_START + 20, /* SAMPLE_AUX = 1U << 20 */ - __PBF_UNUSED_TERM21 = PBF_TERM_SAMPLE_START + 21, /* SAMPLE_CGROUP = 1U << 21 */ + PBF_TERM_CGROUP = PBF_TERM_SAMPLE_START + 21, /* SAMPLE_CGROUP = 1U << 21 */ PBF_TERM_DATA_PAGE_SIZE = PBF_TERM_SAMPLE_START + 22, /* SAMPLE_DATA_PAGE_SIZE = 1U << 22 */ PBF_TERM_CODE_PAGE_SIZE = PBF_TERM_SAMPLE_START + 23, /* SAMPLE_CODE_PAGE_SIZE = 1U << 23 */ PBF_TERM_WEIGHT_STRUCT = PBF_TERM_SAMPLE_START + 24, /* SAMPLE_WEIGHT_STRUCT = 1U << 24 */ diff --git a/tools/perf/util/bpf_skel/sample_filter.bpf.c b/tools/perf/util/bpf_skel/sample_filter.bpf.c index 4872a16eedfd..b195e6efeb8b 100644 --- a/tools/perf/util/bpf_skel/sample_filter.bpf.c +++ b/tools/perf/util/bpf_skel/sample_filter.bpf.c @@ -93,6 +93,7 @@ static inline __u64 perf_get_sample(struct bpf_perf_event_data_kern *kctx, BUILD_CHECK_SAMPLE(DATA_SRC); BUILD_CHECK_SAMPLE(TRANSACTION); BUILD_CHECK_SAMPLE(PHYS_ADDR); + BUILD_CHECK_SAMPLE(CGROUP); BUILD_CHECK_SAMPLE(DATA_PAGE_SIZE); BUILD_CHECK_SAMPLE(CODE_PAGE_SIZE); BUILD_CHECK_SAMPLE(WEIGHT_STRUCT); @@ -135,6 +136,8 @@ static inline __u64 perf_get_sample(struct bpf_perf_event_data_kern *kctx, return kctx->data->weight.full; case PBF_TERM_PHYS_ADDR: return kctx->data->phys_addr; + case PBF_TERM_CGROUP: + return kctx->data->cgroup; case PBF_TERM_CODE_PAGE_SIZE: return kctx->data->code_page_size; case 
PBF_TERM_DATA_PAGE_SIZE: @@ -183,7 +186,6 @@ static inline __u64 perf_get_sample(struct bpf_perf_event_data_kern *kctx, case __PBF_UNUSED_TERM16: case __PBF_UNUSED_TERM18: case __PBF_UNUSED_TERM20: - case __PBF_UNUSED_TERM21: default: break; } diff --git a/tools/perf/util/bpf_skel/vmlinux/vmlinux.h b/tools/perf/util/bpf_skel/vmlinux/vmlinux.h index 4fa21468487e..4dcad7b682bd 100644 --- a/tools/perf/util/bpf_skel/vmlinux/vmlinux.h +++ b/tools/perf/util/bpf_skel/vmlinux/vmlinux.h @@ -171,6 +171,7 @@ struct perf_sample_data { u32 cpu; } cpu_entry; u64 phys_addr; + u64 cgroup; u64 data_page_size; u64 code_page_size; } __attribute__((__aligned__(64))) __attribute__((preserve_access_index)); -- cgit v1.2.3 From d56a4d56a25c6aa76d816f1ee1888d38cf654004 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 26 Aug 2024 15:10:45 -0700 Subject: perf test: Add 'perf record cgroup' filtering test $ sudo ./perf test filtering -vv 96: perf record sample filtering (by BPF) tests: --- start --- test child forked, pid 2966908 Checking BPF-filter privilege Basic bpf-filter test Basic bpf-filter test [Success] Failing bpf-filter test Failing bpf-filter test [Success] Group bpf-filter test Group bpf-filter test [Success] Multiple bpf-filter test Multiple bpf-filter test [Success] Cgroup bpf-filter test Cgroup bpf-filter test [Success] ---- end(0) ---- 96: perf record sample filtering (by BPF) tests : Ok Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20240826221045.1202305-5-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/record_bpf_filter.sh | 39 ++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/record_bpf_filter.sh b/tools/perf/tests/shell/record_bpf_filter.sh index 4df33c15bfa7..1b58ccc1fd88 100755 --- a/tools/perf/tests/shell/record_bpf_filter.sh +++ b/tools/perf/tests/shell/record_bpf_filter.sh @@ -68,7 +68,7 @@ test_bpf_filter_fail() { # 'cpu' requires PERF_SAMPLE_CPU flag if ! perf record -e task-clock --filter 'cpu > 0' \ - -o /dev/null true 2>&1 | grep PERF_SAMPLE_CPU + -o /dev/null true 2>&1 | grep -q PERF_SAMPLE_CPU then echo "Failing bpf-filter test [Failed forbidden CPU]" err=1 @@ -98,7 +98,7 @@ test_bpf_filter_group() { fi if ! perf record -e task-clock --filter 'cpu > 0 || ip > 0' \ - -o /dev/null true 2>&1 | grep PERF_SAMPLE_CPU + -o /dev/null true 2>&1 | grep -q PERF_SAMPLE_CPU then echo "Group bpf-filter test [Failed forbidden CPU]" err=1 @@ -106,7 +106,7 @@ test_bpf_filter_group() { fi if ! perf record -e task-clock --filter 'period > 0 || code_pgsz > 4096' \ - -o /dev/null true 2>&1 | grep PERF_SAMPLE_CODE_PAGE_SIZE + -o /dev/null true 2>&1 | grep -q PERF_SAMPLE_CODE_PAGE_SIZE then echo "Group bpf-filter test [Failed forbidden CODE_PAGE_SIZE]" err=1 @@ -147,6 +147,35 @@ test_bpf_filter_multi() { echo "Multiple bpf-filter test [Success]" } +test_bpf_filter_cgroup() { + echo "Cgroup bpf-filter test" + + if ! perf record -e task-clock --filter 'cgroup == /' \ + -a --all-cgroups --synth=cgroup -o "${perfdata}" true 2> /dev/null + then + echo "Cgroup bpf-filter test [Skipped cgroup not supported]" + return + fi + + # 'cgroup' requires PERF_SAMPLE_CGROUP flag + if ! 
perf record -e task-clock --filter 'cgroup == /' \ + -o /dev/null true 2>&1 | grep -q PERF_SAMPLE_CGROUP + then + echo "Cgroup bpf-filter test [Failed CGROUP requires --all-cgroups]" + err=1 + return + fi + + if ! perf report -i "${perfdata}" -s cgroup -q | grep -q -F '100.00%' + then + echo "Cgroup bpf-filter test [Failed root cgroup does not have 100%]" + err=1 + return + fi + + echo "Cgroup bpf-filter test [Success]" +} + test_bpf_filter_priv if [ $err = 0 ]; then @@ -165,5 +194,9 @@ if [ $err = 0 ]; then test_bpf_filter_multi fi +if [ $err = 0 ]; then + test_bpf_filter_cgroup +fi + cleanup exit $err -- cgit v1.2.3 From 47b3b6435e4bfb61ae8ffc63a11bd3c310f69acf Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 28 Aug 2024 19:06:47 -0300 Subject: tools build: Remove leftover libcap tests that prevent fast path feature detection from working I noticed that the fast path feature detection was failing: $ cat /tmp/build/perf-tools-next/feature/test-all.make.output /usr/bin/ld: cannot find -lcap: No such file or directory collect2: error: ld returned 1 exit status $ The patch removing the dependency (Fixes tag below) didn't remove the detection of libcap, and as the fast path feature detection (test-all.c) still had -lcap in its Makefile list of libraries to link, it failed when libcap-devel is not available. Fix it by removing those leftover files. Fixes: e25ebda78e230283 ("perf cap: Tidy up and improve capability testing") Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/Zs-gjOGFWtAvIZit@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/build/feature/Makefile | 6 +----- tools/build/feature/test-libcap.c | 20 -------------------- 2 files changed, 1 insertion(+), 25 deletions(-) delete mode 100644 tools/build/feature/test-libcap.c (limited to 'tools') diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile index 12796808f07a..b873fc58804e 100644 --- a/tools/build/feature/Makefile +++ b/tools/build/feature/Makefile @@ -25,7 +25,6 @@ FILES= \ test-libbfd-liberty-z.bin \ test-cplus-demangle.bin \ test-cxa-demangle.bin \ - test-libcap.bin \ test-libelf.bin \ test-libelf-getphdrnum.bin \ test-libelf-gelf_getnote.bin \ @@ -112,7 +111,7 @@ all: $(FILES) __BUILD = $(CC) $(CFLAGS) -MD -Wall -Werror -o $@ $(patsubst %.bin,%.c,$(@F)) $(LDFLAGS) BUILD = $(__BUILD) > $(@:.bin=.make.output) 2>&1 BUILD_BFD = $(BUILD) -DPACKAGE='"perf"' -lbfd -ldl - BUILD_ALL = $(BUILD) -fstack-protector-all -O2 -D_FORTIFY_SOURCE=2 -ldw -lelf -lnuma -lelf -lslang $(FLAGS_PERL_EMBED) $(FLAGS_PYTHON_EMBED) -DPACKAGE='"perf"' -lbfd -ldl -lz -llzma -lzstd -lcap + BUILD_ALL = $(BUILD) -fstack-protector-all -O2 -D_FORTIFY_SOURCE=2 -ldw -lelf -lnuma -lelf -lslang $(FLAGS_PERL_EMBED) $(FLAGS_PYTHON_EMBED) -DPACKAGE='"perf"' -lbfd -ldl -lz -llzma -lzstd __BUILDXX = $(CXX) $(CXXFLAGS) -MD -Wall -Werror -o $@ $(patsubst %.bin,%.cpp,$(@F)) $(LDFLAGS) BUILDXX = $(__BUILDXX) > $(@:.bin=.make.output) 2>&1 @@ -140,9 +139,6 @@ $(OUTPUT)test-fortify-source.bin: $(OUTPUT)test-bionic.bin: $(BUILD) -$(OUTPUT)test-libcap.bin: - $(BUILD) -lcap - $(OUTPUT)test-libelf.bin: $(BUILD) -lelf diff --git a/tools/build/feature/test-libcap.c b/tools/build/feature/test-libcap.c deleted file mode 100644 index d2a2e152195f..000000000000 --- a/tools/build/feature/test-libcap.c +++ /dev/null @@ -1,20 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include - -int main(void) -{ - cap_flag_value_t val; - cap_t caps = cap_get_proc(); - - if (!caps) - 
return 1; - - if (cap_get_flag(caps, CAP_SYS_ADMIN, CAP_EFFECTIVE, &val) != 0) - return 1; - - if (cap_free(caps) != 0) - return 1; - - return 0; -} -- cgit v1.2.3 From 6db59c4935c9354f4c33fc63287d110317ca4d70 Mon Sep 17 00:00:00 2001 From: Juntong Deng Date: Wed, 28 Aug 2024 20:51:32 +0100 Subject: selftests/bpf: Add test for zero offset or non-zero offset pointers as KF_ACQUIRE kfunc arguments This patch adds test cases for zero offset (implicit cast) or non-zero offset pointers as KF_ACQUIRE kfunc arguments. Currently KF_ACQUIRE kfuncs should support passing in pointers like &sk->sk_write_queue (non-zero offset) or &sk->__sk_common (zero offset) and not be rejected by the verifier. Signed-off-by: Juntong Deng Link: https://lore.kernel.org/r/AM6PR03MB5848CB6F0D4D9068669A905B99952@AM6PR03MB5848.eurprd03.prod.outlook.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/bpf_testmod/bpf_testmod.c | 17 +++++++++++ .../selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h | 4 +++ .../selftests/bpf/prog_tests/nested_trust.c | 4 +++ tools/testing/selftests/bpf/progs/nested_acquire.c | 33 ++++++++++++++++++++++ 4 files changed, 58 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/nested_acquire.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index c04b7dec2ab9..8a71a91b752d 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -183,6 +183,20 @@ __bpf_kfunc void bpf_kfunc_dynptr_test(struct bpf_dynptr *ptr, { } +__bpf_kfunc struct sk_buff *bpf_kfunc_nested_acquire_nonzero_offset_test(struct sk_buff_head *ptr) +{ + return NULL; +} + +__bpf_kfunc struct sk_buff *bpf_kfunc_nested_acquire_zero_offset_test(struct sock_common *ptr) +{ + return NULL; +} + +__bpf_kfunc void bpf_kfunc_nested_release_test(struct sk_buff *ptr) +{ +} + __bpf_kfunc struct bpf_testmod_ctx * bpf_testmod_ctx_create(int *err) { @@ -541,6 +555,9 @@ BTF_ID_FLAGS(func, bpf_iter_testmod_seq_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_iter_testmod_seq_value) BTF_ID_FLAGS(func, bpf_kfunc_common_test) BTF_ID_FLAGS(func, bpf_kfunc_dynptr_test) +BTF_ID_FLAGS(func, bpf_kfunc_nested_acquire_nonzero_offset_test, KF_ACQUIRE) +BTF_ID_FLAGS(func, bpf_kfunc_nested_acquire_zero_offset_test, KF_ACQUIRE) +BTF_ID_FLAGS(func, bpf_kfunc_nested_release_test, KF_RELEASE) BTF_ID_FLAGS(func, bpf_testmod_ctx_create, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_testmod_ctx_release, KF_RELEASE) BTF_KFUNCS_END(bpf_testmod_common_kfunc_ids) diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h index e587a79f2239..c6c314965bb1 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h @@ -144,4 +144,8 @@ void bpf_kfunc_dynptr_test(struct bpf_dynptr *ptr, struct bpf_dynptr *ptr__nulla struct bpf_testmod_ctx *bpf_testmod_ctx_create(int *err) __ksym; void bpf_testmod_ctx_release(struct bpf_testmod_ctx *ctx) __ksym; +struct sk_buff *bpf_kfunc_nested_acquire_nonzero_offset_test(struct sk_buff_head *ptr) __ksym; +struct sk_buff *bpf_kfunc_nested_acquire_zero_offset_test(struct sock_common *ptr) __ksym; +void bpf_kfunc_nested_release_test(struct sk_buff *ptr) __ksym; + #endif /* _BPF_TESTMOD_KFUNC_H */ diff --git a/tools/testing/selftests/bpf/prog_tests/nested_trust.c 
b/tools/testing/selftests/bpf/prog_tests/nested_trust.c index 39886f58924e..54a112ad5f9c 100644 --- a/tools/testing/selftests/bpf/prog_tests/nested_trust.c +++ b/tools/testing/selftests/bpf/prog_tests/nested_trust.c @@ -4,9 +4,13 @@ #include #include "nested_trust_failure.skel.h" #include "nested_trust_success.skel.h" +#include "nested_acquire.skel.h" void test_nested_trust(void) { RUN_TESTS(nested_trust_success); RUN_TESTS(nested_trust_failure); + + if (env.has_testmod) + RUN_TESTS(nested_acquire); } diff --git a/tools/testing/selftests/bpf/progs/nested_acquire.c b/tools/testing/selftests/bpf/progs/nested_acquire.c new file mode 100644 index 000000000000..8e521a21d995 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/nested_acquire.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include "bpf_misc.h" +#include "../bpf_testmod/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +SEC("tp_btf/tcp_probe") +__success +int BPF_PROG(test_nested_acquire_nonzero, struct sock *sk, struct sk_buff *skb) +{ + struct sk_buff *ptr; + + ptr = bpf_kfunc_nested_acquire_nonzero_offset_test(&sk->sk_write_queue); + + bpf_kfunc_nested_release_test(ptr); + return 0; +} + +SEC("tp_btf/tcp_probe") +__success +int BPF_PROG(test_nested_acquire_zero, struct sock *sk, struct sk_buff *skb) +{ + struct sk_buff *ptr; + + ptr = bpf_kfunc_nested_acquire_zero_offset_test(&sk->__sk_common); + + bpf_kfunc_nested_release_test(ptr); + return 0; +} -- cgit v1.2.3 From 4641c7ea88d1029500ff64c4d0a1df0584b1bfcc Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Thu, 29 Aug 2024 00:46:22 +0000 Subject: KVM: arm64: selftests: Cope with lack of GICv3 in set_id_regs Broonie reports that the set_id_regs test is failing as of commit 5cb57a1aff75 ("KVM: arm64: Zero ID_AA64PFR0_EL1.GIC when no GICv3 is presented to the guest"). The test does not anticipate the 'late' ID register fixup where KVM clobbers the GIC field in absence of GICv3. While the field technically has FTR_LOWER_SAFE behavior, fix the issue by setting it to an exact value of 0, matching the effect of the 'late' fixup. Reported-by: Mark Brown Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20240829004622.3058639-1-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/aarch64/set_id_regs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/set_id_regs.c b/tools/testing/selftests/kvm/aarch64/set_id_regs.c index d20981663831..2a3fe7914b72 100644 --- a/tools/testing/selftests/kvm/aarch64/set_id_regs.c +++ b/tools/testing/selftests/kvm/aarch64/set_id_regs.c @@ -126,6 +126,7 @@ static const struct reg_ftr_bits ftr_id_aa64pfr0_el1[] = { REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, CSV2, 0), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, DIT, 0), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, SEL2, 0), + REG_FTR_BITS(FTR_EXACT, ID_AA64PFR0_EL1, GIC, 0), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, EL3, 0), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, EL2, 0), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, EL1, 0), -- cgit v1.2.3 From 0a8b08c554dabea952a75363c89050b1fbcbfffb Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 27 Aug 2024 11:00:12 +0200 Subject: selftests: netfilter: nft_queue.sh: reduce test file size for debug build The sctp selftest is very slow on debug kernels. 
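The change below keys the transfer size off the harness's slow-machine hint, shrinking the input file from 200MB to 25MB. A hypothetical invocation on such a machine (the variable is normally exported by the kselftest runner, shown here only to illustrate the knob):

	# Run the test with the reduced 25MB input instead of the 200MB default:
	KSFT_MACHINE_SLOW=yes ./tools/testing/selftests/net/netfilter/nft_queue.sh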
Reported-by: Jakub Kicinski Closes: https://lore.kernel.org/netdev/20240826192500.32efa22c@kernel.org/ Fixes: 4e97d521c2be ("selftests: netfilter: nft_queue.sh: sctp coverage") Signed-off-by: Florian Westphal Acked-by: Pablo Neira Ayuso Link: https://patch.msgid.link/20240827090023.8917-1-fw@strlen.de Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/netfilter/nft_queue.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/nft_queue.sh b/tools/testing/selftests/net/netfilter/nft_queue.sh index f3bdeb1271eb..9e5f423bff09 100755 --- a/tools/testing/selftests/net/netfilter/nft_queue.sh +++ b/tools/testing/selftests/net/netfilter/nft_queue.sh @@ -39,7 +39,9 @@ TMPFILE2=$(mktemp) TMPFILE3=$(mktemp) TMPINPUT=$(mktemp) -dd conv=sparse status=none if=/dev/zero bs=1M count=200 of="$TMPINPUT" +COUNT=200 +[ "$KSFT_MACHINE_SLOW" = "yes" ] && COUNT=25 +dd conv=sparse status=none if=/dev/zero bs=1M count=$COUNT of="$TMPINPUT" if ! ip link add veth0 netns "$nsrouter" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1; then echo "SKIP: No virtual ethernet pair device support in kernel" -- cgit v1.2.3 From 375e9bde9fc0fdb9e8d2339b06be5c2c2a5cb701 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 27 Aug 2024 17:19:30 +0900 Subject: selftests/hid: extract the utility part of hid_bpf.c into its own header When adding new tests programs, we need the same mechanics to create new virtual devices, and read from their matching hidraw node. Extract the common part into its own header so we can easily add new tests C-files. Link: https://patch.msgid.link/20240827-hidraw-revoke-v5-2-d004a7451aea@kernel.org Signed-off-by: Benjamin Tissoires --- tools/testing/selftests/hid/hid_bpf.c | 437 +------------------------------ tools/testing/selftests/hid/hid_common.h | 436 ++++++++++++++++++++++++++++++ 2 files changed, 438 insertions(+), 435 deletions(-) create mode 100644 tools/testing/selftests/hid/hid_common.h (limited to 'tools') diff --git a/tools/testing/selftests/hid/hid_bpf.c b/tools/testing/selftests/hid/hid_bpf.c index 75b7b4ef6cfa..d10cf6883683 100644 --- a/tools/testing/selftests/hid/hid_bpf.c +++ b/tools/testing/selftests/hid/hid_bpf.c @@ -1,93 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2022 Red Hat */ +/* Copyright (c) 2022-2024 Red Hat */ #include "hid.skel.h" - -#include "../kselftest_harness.h" - +#include "hid_common.h" #include -#include -#include -#include -#include -#include -#include -#include -#include - -#define SHOW_UHID_DEBUG 0 - -#define min(a, b) \ - ({ __typeof__(a) _a = (a); \ - __typeof__(b) _b = (b); \ - _a < _b ? 
_a : _b; }) - -static unsigned char rdesc[] = { - 0x06, 0x00, 0xff, /* Usage Page (Vendor Defined Page 1) */ - 0x09, 0x21, /* Usage (Vendor Usage 0x21) */ - 0xa1, 0x01, /* COLLECTION (Application) */ - 0x09, 0x01, /* Usage (Vendor Usage 0x01) */ - 0xa1, 0x00, /* COLLECTION (Physical) */ - 0x85, 0x02, /* REPORT_ID (2) */ - 0x19, 0x01, /* USAGE_MINIMUM (1) */ - 0x29, 0x08, /* USAGE_MAXIMUM (3) */ - 0x15, 0x00, /* LOGICAL_MINIMUM (0) */ - 0x25, 0xff, /* LOGICAL_MAXIMUM (255) */ - 0x95, 0x08, /* REPORT_COUNT (8) */ - 0x75, 0x08, /* REPORT_SIZE (8) */ - 0x81, 0x02, /* INPUT (Data,Var,Abs) */ - 0xc0, /* END_COLLECTION */ - 0x09, 0x01, /* Usage (Vendor Usage 0x01) */ - 0xa1, 0x00, /* COLLECTION (Physical) */ - 0x85, 0x01, /* REPORT_ID (1) */ - 0x06, 0x00, 0xff, /* Usage Page (Vendor Defined Page 1) */ - 0x19, 0x01, /* USAGE_MINIMUM (1) */ - 0x29, 0x03, /* USAGE_MAXIMUM (3) */ - 0x15, 0x00, /* LOGICAL_MINIMUM (0) */ - 0x25, 0x01, /* LOGICAL_MAXIMUM (1) */ - 0x95, 0x03, /* REPORT_COUNT (3) */ - 0x75, 0x01, /* REPORT_SIZE (1) */ - 0x81, 0x02, /* INPUT (Data,Var,Abs) */ - 0x95, 0x01, /* REPORT_COUNT (1) */ - 0x75, 0x05, /* REPORT_SIZE (5) */ - 0x81, 0x01, /* INPUT (Cnst,Var,Abs) */ - 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */ - 0x09, 0x30, /* USAGE (X) */ - 0x09, 0x31, /* USAGE (Y) */ - 0x15, 0x81, /* LOGICAL_MINIMUM (-127) */ - 0x25, 0x7f, /* LOGICAL_MAXIMUM (127) */ - 0x75, 0x10, /* REPORT_SIZE (16) */ - 0x95, 0x02, /* REPORT_COUNT (2) */ - 0x81, 0x06, /* INPUT (Data,Var,Rel) */ - - 0x06, 0x00, 0xff, /* Usage Page (Vendor Defined Page 1) */ - 0x19, 0x01, /* USAGE_MINIMUM (1) */ - 0x29, 0x03, /* USAGE_MAXIMUM (3) */ - 0x15, 0x00, /* LOGICAL_MINIMUM (0) */ - 0x25, 0x01, /* LOGICAL_MAXIMUM (1) */ - 0x95, 0x03, /* REPORT_COUNT (3) */ - 0x75, 0x01, /* REPORT_SIZE (1) */ - 0x91, 0x02, /* Output (Data,Var,Abs) */ - 0x95, 0x01, /* REPORT_COUNT (1) */ - 0x75, 0x05, /* REPORT_SIZE (5) */ - 0x91, 0x01, /* Output (Cnst,Var,Abs) */ - - 0x06, 0x00, 0xff, /* Usage Page (Vendor Defined Page 1) */ - 0x19, 0x06, /* USAGE_MINIMUM (6) */ - 0x29, 0x08, /* USAGE_MAXIMUM (8) */ - 0x15, 0x00, /* LOGICAL_MINIMUM (0) */ - 0x25, 0x01, /* LOGICAL_MAXIMUM (1) */ - 0x95, 0x03, /* REPORT_COUNT (3) */ - 0x75, 0x01, /* REPORT_SIZE (1) */ - 0xb1, 0x02, /* Feature (Data,Var,Abs) */ - 0x95, 0x01, /* REPORT_COUNT (1) */ - 0x75, 0x05, /* REPORT_SIZE (5) */ - 0x91, 0x01, /* Output (Cnst,Var,Abs) */ - - 0xc0, /* END_COLLECTION */ - 0xc0, /* END_COLLECTION */ -}; - -static __u8 feature_data[] = { 1, 2 }; struct attach_prog_args { int prog_fd; @@ -105,354 +20,6 @@ struct hid_hw_request_syscall_args { __u8 request_type; }; -#define ASSERT_OK(data) ASSERT_FALSE(data) -#define ASSERT_OK_PTR(ptr) ASSERT_NE(NULL, ptr) - -#define UHID_LOG(fmt, ...) 
do { \ - if (SHOW_UHID_DEBUG) \ - TH_LOG(fmt, ##__VA_ARGS__); \ -} while (0) - -static pthread_mutex_t uhid_started_mtx = PTHREAD_MUTEX_INITIALIZER; -static pthread_cond_t uhid_started = PTHREAD_COND_INITIALIZER; - -static pthread_mutex_t uhid_output_mtx = PTHREAD_MUTEX_INITIALIZER; -static pthread_cond_t uhid_output_cond = PTHREAD_COND_INITIALIZER; -static unsigned char output_report[10]; - -/* no need to protect uhid_stopped, only one thread accesses it */ -static bool uhid_stopped; - -static int uhid_write(struct __test_metadata *_metadata, int fd, const struct uhid_event *ev) -{ - ssize_t ret; - - ret = write(fd, ev, sizeof(*ev)); - if (ret < 0) { - TH_LOG("Cannot write to uhid: %m"); - return -errno; - } else if (ret != sizeof(*ev)) { - TH_LOG("Wrong size written to uhid: %zd != %zu", - ret, sizeof(ev)); - return -EFAULT; - } else { - return 0; - } -} - -static int uhid_create(struct __test_metadata *_metadata, int fd, int rand_nb) -{ - struct uhid_event ev; - char buf[25]; - - sprintf(buf, "test-uhid-device-%d", rand_nb); - - memset(&ev, 0, sizeof(ev)); - ev.type = UHID_CREATE; - strcpy((char *)ev.u.create.name, buf); - ev.u.create.rd_data = rdesc; - ev.u.create.rd_size = sizeof(rdesc); - ev.u.create.bus = BUS_USB; - ev.u.create.vendor = 0x0001; - ev.u.create.product = 0x0a37; - ev.u.create.version = 0; - ev.u.create.country = 0; - - sprintf(buf, "%d", rand_nb); - strcpy((char *)ev.u.create.phys, buf); - - return uhid_write(_metadata, fd, &ev); -} - -static void uhid_destroy(struct __test_metadata *_metadata, int fd) -{ - struct uhid_event ev; - - memset(&ev, 0, sizeof(ev)); - ev.type = UHID_DESTROY; - - uhid_write(_metadata, fd, &ev); -} - -static int uhid_event(struct __test_metadata *_metadata, int fd) -{ - struct uhid_event ev, answer; - ssize_t ret; - - memset(&ev, 0, sizeof(ev)); - ret = read(fd, &ev, sizeof(ev)); - if (ret == 0) { - UHID_LOG("Read HUP on uhid-cdev"); - return -EFAULT; - } else if (ret < 0) { - UHID_LOG("Cannot read uhid-cdev: %m"); - return -errno; - } else if (ret != sizeof(ev)) { - UHID_LOG("Invalid size read from uhid-dev: %zd != %zu", - ret, sizeof(ev)); - return -EFAULT; - } - - switch (ev.type) { - case UHID_START: - pthread_mutex_lock(&uhid_started_mtx); - pthread_cond_signal(&uhid_started); - pthread_mutex_unlock(&uhid_started_mtx); - - UHID_LOG("UHID_START from uhid-dev"); - break; - case UHID_STOP: - uhid_stopped = true; - - UHID_LOG("UHID_STOP from uhid-dev"); - break; - case UHID_OPEN: - UHID_LOG("UHID_OPEN from uhid-dev"); - break; - case UHID_CLOSE: - UHID_LOG("UHID_CLOSE from uhid-dev"); - break; - case UHID_OUTPUT: - UHID_LOG("UHID_OUTPUT from uhid-dev"); - - pthread_mutex_lock(&uhid_output_mtx); - memcpy(output_report, - ev.u.output.data, - min(ev.u.output.size, sizeof(output_report))); - pthread_cond_signal(&uhid_output_cond); - pthread_mutex_unlock(&uhid_output_mtx); - break; - case UHID_GET_REPORT: - UHID_LOG("UHID_GET_REPORT from uhid-dev"); - - answer.type = UHID_GET_REPORT_REPLY; - answer.u.get_report_reply.id = ev.u.get_report.id; - answer.u.get_report_reply.err = ev.u.get_report.rnum == 1 ? 
0 : -EIO; - answer.u.get_report_reply.size = sizeof(feature_data); - memcpy(answer.u.get_report_reply.data, feature_data, sizeof(feature_data)); - - uhid_write(_metadata, fd, &answer); - - break; - case UHID_SET_REPORT: - UHID_LOG("UHID_SET_REPORT from uhid-dev"); - break; - default: - TH_LOG("Invalid event from uhid-dev: %u", ev.type); - } - - return 0; -} - -struct uhid_thread_args { - int fd; - struct __test_metadata *_metadata; -}; -static void *uhid_read_events_thread(void *arg) -{ - struct uhid_thread_args *args = (struct uhid_thread_args *)arg; - struct __test_metadata *_metadata = args->_metadata; - struct pollfd pfds[1]; - int fd = args->fd; - int ret = 0; - - pfds[0].fd = fd; - pfds[0].events = POLLIN; - - uhid_stopped = false; - - while (!uhid_stopped) { - ret = poll(pfds, 1, 100); - if (ret < 0) { - TH_LOG("Cannot poll for fds: %m"); - break; - } - if (pfds[0].revents & POLLIN) { - ret = uhid_event(_metadata, fd); - if (ret) - break; - } - } - - return (void *)(long)ret; -} - -static int uhid_start_listener(struct __test_metadata *_metadata, pthread_t *tid, int uhid_fd) -{ - struct uhid_thread_args args = { - .fd = uhid_fd, - ._metadata = _metadata, - }; - int err; - - pthread_mutex_lock(&uhid_started_mtx); - err = pthread_create(tid, NULL, uhid_read_events_thread, (void *)&args); - ASSERT_EQ(0, err) { - TH_LOG("Could not start the uhid thread: %d", err); - pthread_mutex_unlock(&uhid_started_mtx); - close(uhid_fd); - return -EIO; - } - pthread_cond_wait(&uhid_started, &uhid_started_mtx); - pthread_mutex_unlock(&uhid_started_mtx); - - return 0; -} - -static int uhid_send_event(struct __test_metadata *_metadata, int fd, __u8 *buf, size_t size) -{ - struct uhid_event ev; - - if (size > sizeof(ev.u.input.data)) - return -E2BIG; - - memset(&ev, 0, sizeof(ev)); - ev.type = UHID_INPUT2; - ev.u.input2.size = size; - - memcpy(ev.u.input2.data, buf, size); - - return uhid_write(_metadata, fd, &ev); -} - -static int setup_uhid(struct __test_metadata *_metadata, int rand_nb) -{ - int fd; - const char *path = "/dev/uhid"; - int ret; - - fd = open(path, O_RDWR | O_CLOEXEC); - ASSERT_GE(fd, 0) TH_LOG("open uhid-cdev failed; %d", fd); - - ret = uhid_create(_metadata, fd, rand_nb); - ASSERT_EQ(0, ret) { - TH_LOG("create uhid device failed: %d", ret); - close(fd); - } - - return fd; -} - -static bool match_sysfs_device(int dev_id, const char *workdir, struct dirent *dir) -{ - const char *target = "0003:0001:0A37.*"; - char phys[512]; - char uevent[1024]; - char temp[512]; - int fd, nread; - bool found = false; - - if (fnmatch(target, dir->d_name, 0)) - return false; - - /* we found the correct VID/PID, now check for phys */ - sprintf(uevent, "%s/%s/uevent", workdir, dir->d_name); - - fd = open(uevent, O_RDONLY | O_NONBLOCK); - if (fd < 0) - return false; - - sprintf(phys, "PHYS=%d", dev_id); - - nread = read(fd, temp, ARRAY_SIZE(temp)); - if (nread > 0 && (strstr(temp, phys)) != NULL) - found = true; - - close(fd); - - return found; -} - -static int get_hid_id(int dev_id) -{ - const char *workdir = "/sys/devices/virtual/misc/uhid"; - const char *str_id; - DIR *d; - struct dirent *dir; - int found = -1, attempts = 3; - - /* it would be nice to be able to use nftw, but the no_alu32 target doesn't support it */ - - while (found < 0 && attempts > 0) { - attempts--; - d = opendir(workdir); - if (d) { - while ((dir = readdir(d)) != NULL) { - if (!match_sysfs_device(dev_id, workdir, dir)) - continue; - - str_id = dir->d_name + sizeof("0003:0001:0A37."); - found = (int)strtol(str_id, NULL, 16); - - 
break; - } - closedir(d); - } - if (found < 0) - usleep(100000); - } - - return found; -} - -static int get_hidraw(int dev_id) -{ - const char *workdir = "/sys/devices/virtual/misc/uhid"; - char sysfs[1024]; - DIR *d, *subd; - struct dirent *dir, *subdir; - int i, found = -1; - - /* retry 5 times in case the system is loaded */ - for (i = 5; i > 0; i--) { - usleep(10); - d = opendir(workdir); - - if (!d) - continue; - - while ((dir = readdir(d)) != NULL) { - if (!match_sysfs_device(dev_id, workdir, dir)) - continue; - - sprintf(sysfs, "%s/%s/hidraw", workdir, dir->d_name); - - subd = opendir(sysfs); - if (!subd) - continue; - - while ((subdir = readdir(subd)) != NULL) { - if (fnmatch("hidraw*", subdir->d_name, 0)) - continue; - - found = atoi(subdir->d_name + strlen("hidraw")); - } - - closedir(subd); - - if (found > 0) - break; - } - closedir(d); - } - - return found; -} - -static int open_hidraw(int dev_id) -{ - int hidraw_number; - char hidraw_path[64] = { 0 }; - - hidraw_number = get_hidraw(dev_id); - if (hidraw_number < 0) - return hidraw_number; - - /* open hidraw node to check the other side of the pipe */ - sprintf(hidraw_path, "/dev/hidraw%d", hidraw_number); - return open(hidraw_path, O_RDWR | O_NONBLOCK); -} - FIXTURE(hid_bpf) { int dev_id; int uhid_fd; diff --git a/tools/testing/selftests/hid/hid_common.h b/tools/testing/selftests/hid/hid_common.h new file mode 100644 index 000000000000..f151f151a1ed --- /dev/null +++ b/tools/testing/selftests/hid/hid_common.h @@ -0,0 +1,436 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2022-2024 Red Hat */ + +#include "../kselftest_harness.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#define SHOW_UHID_DEBUG 0 + +#define min(a, b) \ + ({ __typeof__(a) _a = (a); \ + __typeof__(b) _b = (b); \ + _a < _b ? 
_a : _b; }) + +static unsigned char rdesc[] = { + 0x06, 0x00, 0xff, /* Usage Page (Vendor Defined Page 1) */ + 0x09, 0x21, /* Usage (Vendor Usage 0x21) */ + 0xa1, 0x01, /* COLLECTION (Application) */ + 0x09, 0x01, /* Usage (Vendor Usage 0x01) */ + 0xa1, 0x00, /* COLLECTION (Physical) */ + 0x85, 0x02, /* REPORT_ID (2) */ + 0x19, 0x01, /* USAGE_MINIMUM (1) */ + 0x29, 0x08, /* USAGE_MAXIMUM (3) */ + 0x15, 0x00, /* LOGICAL_MINIMUM (0) */ + 0x25, 0xff, /* LOGICAL_MAXIMUM (255) */ + 0x95, 0x08, /* REPORT_COUNT (8) */ + 0x75, 0x08, /* REPORT_SIZE (8) */ + 0x81, 0x02, /* INPUT (Data,Var,Abs) */ + 0xc0, /* END_COLLECTION */ + 0x09, 0x01, /* Usage (Vendor Usage 0x01) */ + 0xa1, 0x00, /* COLLECTION (Physical) */ + 0x85, 0x01, /* REPORT_ID (1) */ + 0x06, 0x00, 0xff, /* Usage Page (Vendor Defined Page 1) */ + 0x19, 0x01, /* USAGE_MINIMUM (1) */ + 0x29, 0x03, /* USAGE_MAXIMUM (3) */ + 0x15, 0x00, /* LOGICAL_MINIMUM (0) */ + 0x25, 0x01, /* LOGICAL_MAXIMUM (1) */ + 0x95, 0x03, /* REPORT_COUNT (3) */ + 0x75, 0x01, /* REPORT_SIZE (1) */ + 0x81, 0x02, /* INPUT (Data,Var,Abs) */ + 0x95, 0x01, /* REPORT_COUNT (1) */ + 0x75, 0x05, /* REPORT_SIZE (5) */ + 0x81, 0x01, /* INPUT (Cnst,Var,Abs) */ + 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */ + 0x09, 0x30, /* USAGE (X) */ + 0x09, 0x31, /* USAGE (Y) */ + 0x15, 0x81, /* LOGICAL_MINIMUM (-127) */ + 0x25, 0x7f, /* LOGICAL_MAXIMUM (127) */ + 0x75, 0x10, /* REPORT_SIZE (16) */ + 0x95, 0x02, /* REPORT_COUNT (2) */ + 0x81, 0x06, /* INPUT (Data,Var,Rel) */ + + 0x06, 0x00, 0xff, /* Usage Page (Vendor Defined Page 1) */ + 0x19, 0x01, /* USAGE_MINIMUM (1) */ + 0x29, 0x03, /* USAGE_MAXIMUM (3) */ + 0x15, 0x00, /* LOGICAL_MINIMUM (0) */ + 0x25, 0x01, /* LOGICAL_MAXIMUM (1) */ + 0x95, 0x03, /* REPORT_COUNT (3) */ + 0x75, 0x01, /* REPORT_SIZE (1) */ + 0x91, 0x02, /* Output (Data,Var,Abs) */ + 0x95, 0x01, /* REPORT_COUNT (1) */ + 0x75, 0x05, /* REPORT_SIZE (5) */ + 0x91, 0x01, /* Output (Cnst,Var,Abs) */ + + 0x06, 0x00, 0xff, /* Usage Page (Vendor Defined Page 1) */ + 0x19, 0x06, /* USAGE_MINIMUM (6) */ + 0x29, 0x08, /* USAGE_MAXIMUM (8) */ + 0x15, 0x00, /* LOGICAL_MINIMUM (0) */ + 0x25, 0x01, /* LOGICAL_MAXIMUM (1) */ + 0x95, 0x03, /* REPORT_COUNT (3) */ + 0x75, 0x01, /* REPORT_SIZE (1) */ + 0xb1, 0x02, /* Feature (Data,Var,Abs) */ + 0x95, 0x01, /* REPORT_COUNT (1) */ + 0x75, 0x05, /* REPORT_SIZE (5) */ + 0x91, 0x01, /* Output (Cnst,Var,Abs) */ + + 0xc0, /* END_COLLECTION */ + 0xc0, /* END_COLLECTION */ +}; + +static __u8 feature_data[] = { 1, 2 }; + +#define ASSERT_OK(data) ASSERT_FALSE(data) +#define ASSERT_OK_PTR(ptr) ASSERT_NE(NULL, ptr) + +#define UHID_LOG(fmt, ...) 
do { \ + if (SHOW_UHID_DEBUG) \ + TH_LOG(fmt, ##__VA_ARGS__); \ +} while (0) + +static pthread_mutex_t uhid_started_mtx = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t uhid_started = PTHREAD_COND_INITIALIZER; + +static pthread_mutex_t uhid_output_mtx = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t uhid_output_cond = PTHREAD_COND_INITIALIZER; +static unsigned char output_report[10]; + +/* no need to protect uhid_stopped, only one thread accesses it */ +static bool uhid_stopped; + +static int uhid_write(struct __test_metadata *_metadata, int fd, const struct uhid_event *ev) +{ + ssize_t ret; + + ret = write(fd, ev, sizeof(*ev)); + if (ret < 0) { + TH_LOG("Cannot write to uhid: %m"); + return -errno; + } else if (ret != sizeof(*ev)) { + TH_LOG("Wrong size written to uhid: %zd != %zu", + ret, sizeof(ev)); + return -EFAULT; + } else { + return 0; + } +} + +static int uhid_create(struct __test_metadata *_metadata, int fd, int rand_nb) +{ + struct uhid_event ev; + char buf[25]; + + sprintf(buf, "test-uhid-device-%d", rand_nb); + + memset(&ev, 0, sizeof(ev)); + ev.type = UHID_CREATE; + strcpy((char *)ev.u.create.name, buf); + ev.u.create.rd_data = rdesc; + ev.u.create.rd_size = sizeof(rdesc); + ev.u.create.bus = BUS_USB; + ev.u.create.vendor = 0x0001; + ev.u.create.product = 0x0a37; + ev.u.create.version = 0; + ev.u.create.country = 0; + + sprintf(buf, "%d", rand_nb); + strcpy((char *)ev.u.create.phys, buf); + + return uhid_write(_metadata, fd, &ev); +} + +static void uhid_destroy(struct __test_metadata *_metadata, int fd) +{ + struct uhid_event ev; + + memset(&ev, 0, sizeof(ev)); + ev.type = UHID_DESTROY; + + uhid_write(_metadata, fd, &ev); +} + +static int uhid_event(struct __test_metadata *_metadata, int fd) +{ + struct uhid_event ev, answer; + ssize_t ret; + + memset(&ev, 0, sizeof(ev)); + ret = read(fd, &ev, sizeof(ev)); + if (ret == 0) { + UHID_LOG("Read HUP on uhid-cdev"); + return -EFAULT; + } else if (ret < 0) { + UHID_LOG("Cannot read uhid-cdev: %m"); + return -errno; + } else if (ret != sizeof(ev)) { + UHID_LOG("Invalid size read from uhid-dev: %zd != %zu", + ret, sizeof(ev)); + return -EFAULT; + } + + switch (ev.type) { + case UHID_START: + pthread_mutex_lock(&uhid_started_mtx); + pthread_cond_signal(&uhid_started); + pthread_mutex_unlock(&uhid_started_mtx); + + UHID_LOG("UHID_START from uhid-dev"); + break; + case UHID_STOP: + uhid_stopped = true; + + UHID_LOG("UHID_STOP from uhid-dev"); + break; + case UHID_OPEN: + UHID_LOG("UHID_OPEN from uhid-dev"); + break; + case UHID_CLOSE: + UHID_LOG("UHID_CLOSE from uhid-dev"); + break; + case UHID_OUTPUT: + UHID_LOG("UHID_OUTPUT from uhid-dev"); + + pthread_mutex_lock(&uhid_output_mtx); + memcpy(output_report, + ev.u.output.data, + min(ev.u.output.size, sizeof(output_report))); + pthread_cond_signal(&uhid_output_cond); + pthread_mutex_unlock(&uhid_output_mtx); + break; + case UHID_GET_REPORT: + UHID_LOG("UHID_GET_REPORT from uhid-dev"); + + answer.type = UHID_GET_REPORT_REPLY; + answer.u.get_report_reply.id = ev.u.get_report.id; + answer.u.get_report_reply.err = ev.u.get_report.rnum == 1 ? 
0 : -EIO; + answer.u.get_report_reply.size = sizeof(feature_data); + memcpy(answer.u.get_report_reply.data, feature_data, sizeof(feature_data)); + + uhid_write(_metadata, fd, &answer); + + break; + case UHID_SET_REPORT: + UHID_LOG("UHID_SET_REPORT from uhid-dev"); + break; + default: + TH_LOG("Invalid event from uhid-dev: %u", ev.type); + } + + return 0; +} + +struct uhid_thread_args { + int fd; + struct __test_metadata *_metadata; +}; +static void *uhid_read_events_thread(void *arg) +{ + struct uhid_thread_args *args = (struct uhid_thread_args *)arg; + struct __test_metadata *_metadata = args->_metadata; + struct pollfd pfds[1]; + int fd = args->fd; + int ret = 0; + + pfds[0].fd = fd; + pfds[0].events = POLLIN; + + uhid_stopped = false; + + while (!uhid_stopped) { + ret = poll(pfds, 1, 100); + if (ret < 0) { + TH_LOG("Cannot poll for fds: %m"); + break; + } + if (pfds[0].revents & POLLIN) { + ret = uhid_event(_metadata, fd); + if (ret) + break; + } + } + + return (void *)(long)ret; +} + +static int uhid_start_listener(struct __test_metadata *_metadata, pthread_t *tid, int uhid_fd) +{ + struct uhid_thread_args args = { + .fd = uhid_fd, + ._metadata = _metadata, + }; + int err; + + pthread_mutex_lock(&uhid_started_mtx); + err = pthread_create(tid, NULL, uhid_read_events_thread, (void *)&args); + ASSERT_EQ(0, err) { + TH_LOG("Could not start the uhid thread: %d", err); + pthread_mutex_unlock(&uhid_started_mtx); + close(uhid_fd); + return -EIO; + } + pthread_cond_wait(&uhid_started, &uhid_started_mtx); + pthread_mutex_unlock(&uhid_started_mtx); + + return 0; +} + +static int uhid_send_event(struct __test_metadata *_metadata, int fd, __u8 *buf, size_t size) +{ + struct uhid_event ev; + + if (size > sizeof(ev.u.input.data)) + return -E2BIG; + + memset(&ev, 0, sizeof(ev)); + ev.type = UHID_INPUT2; + ev.u.input2.size = size; + + memcpy(ev.u.input2.data, buf, size); + + return uhid_write(_metadata, fd, &ev); +} + +static int setup_uhid(struct __test_metadata *_metadata, int rand_nb) +{ + int fd; + const char *path = "/dev/uhid"; + int ret; + + fd = open(path, O_RDWR | O_CLOEXEC); + ASSERT_GE(fd, 0) TH_LOG("open uhid-cdev failed; %d", fd); + + ret = uhid_create(_metadata, fd, rand_nb); + ASSERT_EQ(0, ret) { + TH_LOG("create uhid device failed: %d", ret); + close(fd); + } + + return fd; +} + +static bool match_sysfs_device(int dev_id, const char *workdir, struct dirent *dir) +{ + const char *target = "0003:0001:0A37.*"; + char phys[512]; + char uevent[1024]; + char temp[512]; + int fd, nread; + bool found = false; + + if (fnmatch(target, dir->d_name, 0)) + return false; + + /* we found the correct VID/PID, now check for phys */ + sprintf(uevent, "%s/%s/uevent", workdir, dir->d_name); + + fd = open(uevent, O_RDONLY | O_NONBLOCK); + if (fd < 0) + return false; + + sprintf(phys, "PHYS=%d", dev_id); + + nread = read(fd, temp, ARRAY_SIZE(temp)); + if (nread > 0 && (strstr(temp, phys)) != NULL) + found = true; + + close(fd); + + return found; +} + +static int get_hid_id(int dev_id) +{ + const char *workdir = "/sys/devices/virtual/misc/uhid"; + const char *str_id; + DIR *d; + struct dirent *dir; + int found = -1, attempts = 3; + + /* it would be nice to be able to use nftw, but the no_alu32 target doesn't support it */ + + while (found < 0 && attempts > 0) { + attempts--; + d = opendir(workdir); + if (d) { + while ((dir = readdir(d)) != NULL) { + if (!match_sysfs_device(dev_id, workdir, dir)) + continue; + + str_id = dir->d_name + sizeof("0003:0001:0A37."); + found = (int)strtol(str_id, NULL, 16); + + 
break; + } + closedir(d); + } + if (found < 0) + usleep(100000); + } + + return found; +} + +static int get_hidraw(int dev_id) +{ + const char *workdir = "/sys/devices/virtual/misc/uhid"; + char sysfs[1024]; + DIR *d, *subd; + struct dirent *dir, *subdir; + int i, found = -1; + + /* retry 5 times in case the system is loaded */ + for (i = 5; i > 0; i--) { + usleep(10); + d = opendir(workdir); + + if (!d) + continue; + + while ((dir = readdir(d)) != NULL) { + if (!match_sysfs_device(dev_id, workdir, dir)) + continue; + + sprintf(sysfs, "%s/%s/hidraw", workdir, dir->d_name); + + subd = opendir(sysfs); + if (!subd) + continue; + + while ((subdir = readdir(subd)) != NULL) { + if (fnmatch("hidraw*", subdir->d_name, 0)) + continue; + + found = atoi(subdir->d_name + strlen("hidraw")); + } + + closedir(subd); + + if (found > 0) + break; + } + closedir(d); + } + + return found; +} + +static int open_hidraw(int dev_id) +{ + int hidraw_number; + char hidraw_path[64] = { 0 }; + + hidraw_number = get_hidraw(dev_id); + if (hidraw_number < 0) + return hidraw_number; + + /* open hidraw node to check the other side of the pipe */ + sprintf(hidraw_path, "/dev/hidraw%d", hidraw_number); + return open(hidraw_path, O_RDWR | O_NONBLOCK); +} -- cgit v1.2.3 From 8163892a629ca544af575ce54955bf275a3250cd Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 27 Aug 2024 17:19:31 +0900 Subject: selftests/hid: Add initial hidraw tests skeleton Largely inspired from hid_bpf.c for the fixture setup. Create a couple of tests for hidraw: - create a uhid device and check if the fixture is working properly - inject one uhid event and read it through hidraw These tests are not that useful for now, but will be once we start adding the ioctl and BPFs to revoke the hidraw node. Reviewed-by: Peter Hutterer Link: https://patch.msgid.link/20240827-hidraw-revoke-v5-3-d004a7451aea@kernel.org Signed-off-by: Benjamin Tissoires --- tools/testing/selftests/hid/.gitignore | 1 + tools/testing/selftests/hid/Makefile | 2 +- tools/testing/selftests/hid/hidraw.c | 90 ++++++++++++++++++++++++++++++++++ 3 files changed, 92 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/hid/hidraw.c (limited to 'tools') diff --git a/tools/testing/selftests/hid/.gitignore b/tools/testing/selftests/hid/.gitignore index 995af0670f69..746c62361f77 100644 --- a/tools/testing/selftests/hid/.gitignore +++ b/tools/testing/selftests/hid/.gitignore @@ -2,4 +2,5 @@ bpftool *.skel.h /tools hid_bpf +hidraw results diff --git a/tools/testing/selftests/hid/Makefile b/tools/testing/selftests/hid/Makefile index 2b5ea18bde38..72be55ac4bdf 100644 --- a/tools/testing/selftests/hid/Makefile +++ b/tools/testing/selftests/hid/Makefile @@ -32,7 +32,7 @@ CFLAGS += -Wno-unused-command-line-argument endif # Order correspond to 'make run_tests' order -TEST_GEN_PROGS = hid_bpf +TEST_GEN_PROGS = hid_bpf hidraw # Emit succinct information message describing current building step # $1 - generic step name (e.g., CC, LINK, etc); diff --git a/tools/testing/selftests/hid/hidraw.c b/tools/testing/selftests/hid/hidraw.c new file mode 100644 index 000000000000..37372709130c --- /dev/null +++ b/tools/testing/selftests/hid/hidraw.c @@ -0,0 +1,90 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022-2024 Red Hat */ + +#include "hid_common.h" + +FIXTURE(hidraw) { + int dev_id; + int uhid_fd; + int hidraw_fd; + int hid_id; + pthread_t tid; +}; +static void close_hidraw(FIXTURE_DATA(hidraw) * self) +{ + if (self->hidraw_fd) + close(self->hidraw_fd); + 
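+	/* reset the fd so a later teardown does not close it twice */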
self->hidraw_fd = 0;
+}
+
+FIXTURE_TEARDOWN(hidraw) {
+	void *uhid_err;
+
+	uhid_destroy(_metadata, self->uhid_fd);
+
+	close_hidraw(self);
+	pthread_join(self->tid, &uhid_err);
+}
+#define TEARDOWN_LOG(fmt, ...) do { \
+	TH_LOG(fmt, ##__VA_ARGS__); \
+	hidraw_teardown(_metadata, self, variant); \
+} while (0)
+
+FIXTURE_SETUP(hidraw)
+{
+	time_t t;
+	int err;
+
+	/* initialize random number generator */
+	srand((unsigned int)time(&t));
+
+	self->dev_id = rand() % 1024;
+
+	self->uhid_fd = setup_uhid(_metadata, self->dev_id);
+
+	/* locate the uevent file of the created device */
+	self->hid_id = get_hid_id(self->dev_id);
+	ASSERT_GT(self->hid_id, 0)
+		TEARDOWN_LOG("Could not locate uhid device id: %d", self->hid_id);
+
+	err = uhid_start_listener(_metadata, &self->tid, self->uhid_fd);
+	ASSERT_EQ(0, err) TEARDOWN_LOG("could not start udev listener: %d", err);
+
+	self->hidraw_fd = open_hidraw(self->dev_id);
+	ASSERT_GE(self->hidraw_fd, 0) TH_LOG("open_hidraw");
+}
+
+/*
+ * A simple test to see if the fixture is working fine.
+ * If this fails, none of the other tests will pass.
+ */
+TEST_F(hidraw, test_create_uhid)
+{
+}
+
+/*
+ * Inject one event in the uhid device,
+ * check that we get the same data through hidraw
+ */
+TEST_F(hidraw, raw_event)
+{
+	__u8 buf[10] = {0};
+	int err;
+
+	/* inject one event */
+	buf[0] = 1;
+	buf[1] = 42;
+	uhid_send_event(_metadata, self->uhid_fd, buf, 6);
+
+	/* read the data from hidraw */
+	memset(buf, 0, sizeof(buf));
+	err = read(self->hidraw_fd, buf, sizeof(buf));
+	ASSERT_EQ(err, 6) TH_LOG("read_hidraw");
+	ASSERT_EQ(buf[0], 1);
+	ASSERT_EQ(buf[1], 42);
+}
+
+int main(int argc, char **argv)
+{
+	return test_harness_run(argc, argv);
+}
-- cgit v1.2.3

From 321f7798cfb8d834ae0ed0d467c8bf46804243f9 Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires
Date: Tue, 27 Aug 2024 17:19:32 +0900
Subject: selftests/hid: Add HIDIOCREVOKE tests

Add 4 tests for the new revoke ioctl, for read/write/ioctl and poll.

Reviewed-by: Peter Hutterer
Link: https://patch.msgid.link/20240827-hidraw-revoke-v5-4-d004a7451aea@kernel.org
Signed-off-by: Benjamin Tissoires
---
 tools/testing/selftests/hid/hidraw.c | 147 +++++++++++++++++++++++++++++++++++
 1 file changed, 147 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/hid/hidraw.c b/tools/testing/selftests/hid/hidraw.c
index 37372709130c..f8b4f7ff292c 100644
--- a/tools/testing/selftests/hid/hidraw.c
+++ b/tools/testing/selftests/hid/hidraw.c
@@ -3,6 +3,11 @@

 #include "hid_common.h"

+/* for older kernels */
+#ifndef HIDIOCREVOKE
+#define HIDIOCREVOKE _IOW('H', 0x0D, int) /* Revoke device access */
+#endif /* HIDIOCREVOKE */
+
 FIXTURE(hidraw) {
 	int dev_id;
 	int uhid_fd;
@@ -84,6 +89,148 @@ TEST_F(hidraw, raw_event)
 	ASSERT_EQ(buf[1], 42);
 }

+/*
+ * After initial opening/checks of hidraw, revoke the hidraw
+ * node and check that we can not read any more data.
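+ * Once revoked, read() must fail with -ENODEV.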
+ */ +TEST_F(hidraw, raw_event_revoked) +{ + __u8 buf[10] = {0}; + int err; + + /* inject one event */ + buf[0] = 1; + buf[1] = 42; + uhid_send_event(_metadata, self->uhid_fd, buf, 6); + + /* read the data from hidraw */ + memset(buf, 0, sizeof(buf)); + err = read(self->hidraw_fd, buf, sizeof(buf)); + ASSERT_EQ(err, 6) TH_LOG("read_hidraw"); + ASSERT_EQ(buf[0], 1); + ASSERT_EQ(buf[1], 42); + + /* call the revoke ioctl */ + err = ioctl(self->hidraw_fd, HIDIOCREVOKE, NULL); + ASSERT_OK(err) TH_LOG("couldn't revoke the hidraw fd"); + + /* inject one other event */ + buf[0] = 1; + buf[1] = 43; + uhid_send_event(_metadata, self->uhid_fd, buf, 6); + + /* read the data from hidraw */ + memset(buf, 0, sizeof(buf)); + err = read(self->hidraw_fd, buf, sizeof(buf)); + ASSERT_EQ(err, -1) TH_LOG("read_hidraw"); + ASSERT_EQ(errno, ENODEV) TH_LOG("unexpected error code while reading the hidraw node: %d", + errno); +} + +/* + * Revoke the hidraw node and check that we can not do any ioctl. + */ +TEST_F(hidraw, ioctl_revoked) +{ + int err, desc_size = 0; + + /* call the revoke ioctl */ + err = ioctl(self->hidraw_fd, HIDIOCREVOKE, NULL); + ASSERT_OK(err) TH_LOG("couldn't revoke the hidraw fd"); + + /* do an ioctl */ + err = ioctl(self->hidraw_fd, HIDIOCGRDESCSIZE, &desc_size); + ASSERT_EQ(err, -1) TH_LOG("ioctl_hidraw"); + ASSERT_EQ(errno, ENODEV) TH_LOG("unexpected error code while doing an ioctl: %d", + errno); +} + +/* + * Setup polling of the fd, and check that revoke works properly. + */ +TEST_F(hidraw, poll_revoked) +{ + struct pollfd pfds[1]; + __u8 buf[10] = {0}; + int err, ready; + + /* setup polling */ + pfds[0].fd = self->hidraw_fd; + pfds[0].events = POLLIN; + + /* inject one event */ + buf[0] = 1; + buf[1] = 42; + uhid_send_event(_metadata, self->uhid_fd, buf, 6); + + while (true) { + ready = poll(pfds, 1, 5000); + ASSERT_EQ(ready, 1) TH_LOG("poll return value"); + + if (pfds[0].revents & POLLIN) { + memset(buf, 0, sizeof(buf)); + err = read(self->hidraw_fd, buf, sizeof(buf)); + ASSERT_EQ(err, 6) TH_LOG("read_hidraw"); + ASSERT_EQ(buf[0], 1); + ASSERT_EQ(buf[1], 42); + + /* call the revoke ioctl */ + err = ioctl(self->hidraw_fd, HIDIOCREVOKE, NULL); + ASSERT_OK(err) TH_LOG("couldn't revoke the hidraw fd"); + } else { + break; + } + } + + ASSERT_TRUE(pfds[0].revents & POLLHUP); +} + +/* + * After initial opening/checks of hidraw, revoke the hidraw + * node and check that we can not read any more data. 
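+ * (Here the write path is exercised: once revoked, write() must fail with -ENODEV too.)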
+ */ +TEST_F(hidraw, write_event_revoked) +{ + struct timespec time_to_wait; + __u8 buf[10] = {0}; + int err; + + /* inject one event from hidraw */ + buf[0] = 1; /* report ID */ + buf[1] = 2; + buf[2] = 42; + + pthread_mutex_lock(&uhid_output_mtx); + + memset(output_report, 0, sizeof(output_report)); + clock_gettime(CLOCK_REALTIME, &time_to_wait); + time_to_wait.tv_sec += 2; + + err = write(self->hidraw_fd, buf, 3); + ASSERT_EQ(err, 3) TH_LOG("unexpected error while writing to hidraw node: %d", err); + + err = pthread_cond_timedwait(&uhid_output_cond, &uhid_output_mtx, &time_to_wait); + ASSERT_OK(err) TH_LOG("error while calling waiting for the condition"); + + ASSERT_EQ(output_report[0], 1); + ASSERT_EQ(output_report[1], 2); + ASSERT_EQ(output_report[2], 42); + + /* call the revoke ioctl */ + err = ioctl(self->hidraw_fd, HIDIOCREVOKE, NULL); + ASSERT_OK(err) TH_LOG("couldn't revoke the hidraw fd"); + + /* inject one other event */ + buf[0] = 1; + buf[1] = 43; + err = write(self->hidraw_fd, buf, 3); + ASSERT_LT(err, 0) TH_LOG("unexpected success while writing to hidraw node: %d", err); + ASSERT_EQ(errno, ENODEV) TH_LOG("unexpected error code while writing to hidraw node: %d", + errno); + + pthread_mutex_unlock(&uhid_output_mtx); +} + int main(int argc, char **argv) { return test_harness_run(argc, argv); -- cgit v1.2.3 From 5f94b08c001290acda94d9d8868075590931c198 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 28 Aug 2024 08:14:26 +0200 Subject: selftests: mptcp: join: check removing ID 0 endpoint Removing the endpoint linked to the initial subflow should trigger a RM_ADDR for the right ID, and the removal of the subflow. That's what is now being verified in the "delete and re-add" test. Note that removing the initial subflow will not decrement the 'subflows' counters, which corresponds to the *additional* subflows. On the other hand, when the same endpoint is re-added, it will increment this counter, as it will be seen as an additional subflow this time. The 'Fixes' tag here below is the same as the one from the previous commit: this patch here is not fixing anything wrong in the selftests, but it validates the previous fix for an issue introduced by this commit ID. 
Fixes: 3ad14f54bd74 ("mptcp: more accurate MPC endpoint tracking") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 264040a760c6..8b4529ff15e5 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3572,8 +3572,9 @@ endpoint_tests() if reset_with_tcp_filter "delete and re-add" ns2 10.0.3.2 REJECT OUTPUT && mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then - pm_nl_set_limits $ns1 0 2 - pm_nl_set_limits $ns2 0 2 + pm_nl_set_limits $ns1 0 3 + pm_nl_set_limits $ns2 0 3 + pm_nl_add_endpoint $ns2 10.0.1.2 id 1 dev ns2eth1 flags subflow pm_nl_add_endpoint $ns2 10.0.2.2 id 2 dev ns2eth2 flags subflow test_linkfail=4 speed=20 \ run_tests $ns1 $ns2 10.0.1.1 & @@ -3582,17 +3583,17 @@ endpoint_tests() wait_mpj $ns2 pm_nl_check_endpoint "creation" \ $ns2 10.0.2.2 id 2 flags subflow dev ns2eth2 - chk_subflow_nr "before delete" 2 + chk_subflow_nr "before delete id 2" 2 chk_mptcp_info subflows 1 subflows 1 pm_nl_del_endpoint $ns2 2 10.0.2.2 sleep 0.5 - chk_subflow_nr "after delete" 1 + chk_subflow_nr "after delete id 2" 1 chk_mptcp_info subflows 0 subflows 0 pm_nl_add_endpoint $ns2 10.0.2.2 id 2 dev ns2eth2 flags subflow wait_mpj $ns2 - chk_subflow_nr "after re-add" 2 + chk_subflow_nr "after re-add id 2" 2 chk_mptcp_info subflows 1 subflows 1 pm_nl_add_endpoint $ns2 10.0.3.2 id 3 flags subflow @@ -3607,10 +3608,20 @@ endpoint_tests() chk_subflow_nr "after no reject" 3 chk_mptcp_info subflows 2 subflows 2 + pm_nl_del_endpoint $ns2 1 10.0.1.2 + sleep 0.5 + chk_subflow_nr "after delete id 0" 2 + chk_mptcp_info subflows 2 subflows 2 # only decr for additional sf + + pm_nl_add_endpoint $ns2 10.0.1.2 id 1 dev ns2eth1 flags subflow + wait_mpj $ns2 + chk_subflow_nr "after re-add id 0" 3 + chk_mptcp_info subflows 3 subflows 3 + mptcp_lib_kill_wait $tests_pid - chk_join_nr 3 3 3 - chk_rm_nr 1 1 + chk_join_nr 4 4 4 + chk_rm_nr 2 2 fi # remove and re-add -- cgit v1.2.3 From 1c2326fcae4f0c5de8ad0d734ced43a8e5f17dac Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 28 Aug 2024 08:14:30 +0200 Subject: selftests: mptcp: join: check re-adding init endp with != id The initial subflow has a special local ID: 0. It is specific per connection. When a global endpoint is deleted and re-added later, it can have a different ID, but the kernel should still use the ID 0 if it corresponds to the initial address. This test validates this behaviour: the endpoint linked to the initial subflow is removed, and re-added with a different ID. Note that removing the initial subflow will not decrement the 'subflows' counters, which corresponds to the *additional* subflows. On the other hand, when the same endpoint is re-added, it will increment this counter, as it will be seen as an additional subflow this time. The 'Fixes' tag here below is the same as the one from the previous commit: this patch here is not fixing anything wrong in the selftests, but it validates the previous fix for an issue introduced by this commit ID. 
Fixes: 3ad14f54bd74 ("mptcp: more accurate MPC endpoint tracking") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 8b4529ff15e5..75458ade32c7 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3627,11 +3627,12 @@ endpoint_tests() # remove and re-add if reset "delete re-add signal" && mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then - pm_nl_set_limits $ns1 0 2 - pm_nl_set_limits $ns2 2 2 + pm_nl_set_limits $ns1 0 3 + pm_nl_set_limits $ns2 3 3 pm_nl_add_endpoint $ns1 10.0.2.1 id 1 flags signal # broadcast IP: no packet for this address will be received on ns1 pm_nl_add_endpoint $ns1 224.0.0.1 id 2 flags signal + pm_nl_add_endpoint $ns1 10.0.1.1 id 42 flags signal test_linkfail=4 speed=20 \ run_tests $ns1 $ns2 10.0.1.1 & local tests_pid=$! @@ -3653,11 +3654,21 @@ endpoint_tests() wait_mpj $ns2 chk_subflow_nr "after re-add" 3 chk_mptcp_info subflows 2 subflows 2 + + pm_nl_del_endpoint $ns1 42 10.0.1.1 + sleep 0.5 + chk_subflow_nr "after delete ID 0" 2 + chk_mptcp_info subflows 2 subflows 2 + + pm_nl_add_endpoint $ns1 10.0.1.1 id 99 flags signal + wait_mpj $ns2 + chk_subflow_nr "after re-add" 3 + chk_mptcp_info subflows 3 subflows 3 mptcp_lib_kill_wait $tests_pid - chk_join_nr 3 3 3 - chk_add_nr 4 4 - chk_rm_nr 2 1 invert + chk_join_nr 4 4 4 + chk_add_nr 5 5 + chk_rm_nr 3 2 invert fi # flush and re-add -- cgit v1.2.3 From 76a2d8394cc183df872adf04bf636eaf42746449 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 28 Aug 2024 08:14:31 +0200 Subject: selftests: mptcp: join: no extra msg if no counter The checksum and fail counters might not be available. Then no need to display an extra message with missing info. While at it, fix the indentation around, which is wrong since the same commit. 
Fixes: 47867f0a7e83 ("selftests: mptcp: join: skip check if MIB counter not supported") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 75458ade32c7..a10714b6952f 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -1112,26 +1112,26 @@ chk_csum_nr() print_check "sum" count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtDataCsumErr") - if [ "$count" != "$csum_ns1" ]; then + if [ -n "$count" ] && [ "$count" != "$csum_ns1" ]; then extra_msg+=" ns1=$count" fi if [ -z "$count" ]; then print_skip elif { [ "$count" != $csum_ns1 ] && [ $allow_multi_errors_ns1 -eq 0 ]; } || - { [ "$count" -lt $csum_ns1 ] && [ $allow_multi_errors_ns1 -eq 1 ]; }; then + { [ "$count" -lt $csum_ns1 ] && [ $allow_multi_errors_ns1 -eq 1 ]; }; then fail_test "got $count data checksum error[s] expected $csum_ns1" else print_ok fi print_check "csum" count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtDataCsumErr") - if [ "$count" != "$csum_ns2" ]; then + if [ -n "$count" ] && [ "$count" != "$csum_ns2" ]; then extra_msg+=" ns2=$count" fi if [ -z "$count" ]; then print_skip elif { [ "$count" != $csum_ns2 ] && [ $allow_multi_errors_ns2 -eq 0 ]; } || - { [ "$count" -lt $csum_ns2 ] && [ $allow_multi_errors_ns2 -eq 1 ]; }; then + { [ "$count" -lt $csum_ns2 ] && [ $allow_multi_errors_ns2 -eq 1 ]; }; then fail_test "got $count data checksum error[s] expected $csum_ns2" else print_ok @@ -1169,13 +1169,13 @@ chk_fail_nr() print_check "ftx" count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMPFailTx") - if [ "$count" != "$fail_tx" ]; then + if [ -n "$count" ] && [ "$count" != "$fail_tx" ]; then extra_msg+=",tx=$count" fi if [ -z "$count" ]; then print_skip elif { [ "$count" != "$fail_tx" ] && [ $allow_tx_lost -eq 0 ]; } || - { [ "$count" -gt "$fail_tx" ] && [ $allow_tx_lost -eq 1 ]; }; then + { [ "$count" -gt "$fail_tx" ] && [ $allow_tx_lost -eq 1 ]; }; then fail_test "got $count MP_FAIL[s] TX expected $fail_tx" else print_ok @@ -1183,13 +1183,13 @@ chk_fail_nr() print_check "failrx" count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtMPFailRx") - if [ "$count" != "$fail_rx" ]; then + if [ -n "$count" ] && [ "$count" != "$fail_rx" ]; then extra_msg+=",rx=$count" fi if [ -z "$count" ]; then print_skip elif { [ "$count" != "$fail_rx" ] && [ $allow_rx_lost -eq 0 ]; } || - { [ "$count" -gt "$fail_rx" ] && [ $allow_rx_lost -eq 1 ]; }; then + { [ "$count" -gt "$fail_rx" ] && [ $allow_rx_lost -eq 1 ]; }; then fail_test "got $count MP_FAIL[s] RX expected $fail_rx" else print_ok -- cgit v1.2.3 From d397d7246c11ca36c33c932bc36d38e3a79e9aa0 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 28 Aug 2024 08:14:34 +0200 Subject: selftests: mptcp: join: check re-re-adding ID 0 endp This test extends "delete and re-add" to validate the previous commit: when the endpoint linked to the initial subflow (ID 0) is re-added multiple times, it was no longer being used, because the internal linked counters are not decremented for this special endpoint: it is not an additional endpoint. Here, the "del/add id 0" steps are done 3 times to unsure this case is validated. 
The 'Fixes' tag here below is the same as the one from the previous commit: this patch here is not fixing anything wrong in the selftests, but it validates the previous fix for an issue introduced by this commit ID. Fixes: 3ad14f54bd74 ("mptcp: more accurate MPC endpoint tracking") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 27 ++++++++++++++----------- 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index a10714b6952f..965b614e4b16 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3576,7 +3576,7 @@ endpoint_tests() pm_nl_set_limits $ns2 0 3 pm_nl_add_endpoint $ns2 10.0.1.2 id 1 dev ns2eth1 flags subflow pm_nl_add_endpoint $ns2 10.0.2.2 id 2 dev ns2eth2 flags subflow - test_linkfail=4 speed=20 \ + test_linkfail=4 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & local tests_pid=$! @@ -3608,20 +3608,23 @@ endpoint_tests() chk_subflow_nr "after no reject" 3 chk_mptcp_info subflows 2 subflows 2 - pm_nl_del_endpoint $ns2 1 10.0.1.2 - sleep 0.5 - chk_subflow_nr "after delete id 0" 2 - chk_mptcp_info subflows 2 subflows 2 # only decr for additional sf - - pm_nl_add_endpoint $ns2 10.0.1.2 id 1 dev ns2eth1 flags subflow - wait_mpj $ns2 - chk_subflow_nr "after re-add id 0" 3 - chk_mptcp_info subflows 3 subflows 3 + local i + for i in $(seq 3); do + pm_nl_del_endpoint $ns2 1 10.0.1.2 + sleep 0.5 + chk_subflow_nr "after delete id 0 ($i)" 2 + chk_mptcp_info subflows 2 subflows 2 # only decr for additional sf + + pm_nl_add_endpoint $ns2 10.0.1.2 id 1 dev ns2eth1 flags subflow + wait_mpj $ns2 + chk_subflow_nr "after re-add id 0 ($i)" 3 + chk_mptcp_info subflows 3 subflows 3 + done mptcp_lib_kill_wait $tests_pid - chk_join_nr 4 4 4 - chk_rm_nr 2 2 + chk_join_nr 6 6 6 + chk_rm_nr 4 4 fi # remove and re-add -- cgit v1.2.3 From 20ccc7c5f7a3aa48092441a4b182f9f40418392e Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 28 Aug 2024 08:14:36 +0200 Subject: selftests: mptcp: join: validate event numbers This test extends "delete and re-add" and "delete re-add signal" to validate the previous commit: the number of MPTCP events are checked to make sure there are no duplicated or unexpected ones. A new helper has been introduced to easily check these events. The missing events have been added to the lib. The 'Fixes' tag here below is the same as the one from the previous commit: this patch here is not fixing anything wrong in the selftests, but it validates the previous fix for an issue introduced by this commit ID. 
Fixes: b911c97c7dc7 ("mptcp: add netlink event support") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 74 ++++++++++++++++++++++++- tools/testing/selftests/net/mptcp/mptcp_lib.sh | 4 ++ 2 files changed, 75 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 965b614e4b16..a8ea0fe200fb 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -420,12 +420,17 @@ reset_with_fail() fi } +start_events() +{ + mptcp_lib_events "${ns1}" "${evts_ns1}" evts_ns1_pid + mptcp_lib_events "${ns2}" "${evts_ns2}" evts_ns2_pid +} + reset_with_events() { reset "${1}" || return 1 - mptcp_lib_events "${ns1}" "${evts_ns1}" evts_ns1_pid - mptcp_lib_events "${ns2}" "${evts_ns2}" evts_ns2_pid + start_events } reset_with_tcp_filter() @@ -3333,6 +3338,36 @@ userspace_pm_chk_get_addr() fi } +# $1: ns ; $2: event type ; $3: count +chk_evt_nr() +{ + local ns=${1} + local evt_name="${2}" + local exp="${3}" + + local evts="${evts_ns1}" + local evt="${!evt_name}" + local count + + evt_name="${evt_name:16}" # without MPTCP_LIB_EVENT_ + [ "${ns}" == "ns2" ] && evts="${evts_ns2}" + + print_check "event ${ns} ${evt_name} (${exp})" + + if [[ "${evt_name}" = "LISTENER_"* ]] && + ! mptcp_lib_kallsyms_has "mptcp_event_pm_listener$"; then + print_skip "event not supported" + return + fi + + count=$(grep -cw "type:${evt}" "${evts}") + if [ "${count}" != "${exp}" ]; then + fail_test "got ${count} events, expected ${exp}" + else + print_ok + fi +} + userspace_tests() { # userspace pm type prevents add_addr @@ -3572,6 +3607,7 @@ endpoint_tests() if reset_with_tcp_filter "delete and re-add" ns2 10.0.3.2 REJECT OUTPUT && mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then + start_events pm_nl_set_limits $ns1 0 3 pm_nl_set_limits $ns2 0 3 pm_nl_add_endpoint $ns2 10.0.1.2 id 1 dev ns2eth1 flags subflow @@ -3623,12 +3659,28 @@ endpoint_tests() mptcp_lib_kill_wait $tests_pid + kill_events_pids + chk_evt_nr ns1 MPTCP_LIB_EVENT_LISTENER_CREATED 1 + chk_evt_nr ns1 MPTCP_LIB_EVENT_CREATED 1 + chk_evt_nr ns1 MPTCP_LIB_EVENT_ESTABLISHED 1 + chk_evt_nr ns1 MPTCP_LIB_EVENT_ANNOUNCED 0 + chk_evt_nr ns1 MPTCP_LIB_EVENT_REMOVED 4 + chk_evt_nr ns1 MPTCP_LIB_EVENT_SUB_ESTABLISHED 6 + chk_evt_nr ns1 MPTCP_LIB_EVENT_SUB_CLOSED 4 + + chk_evt_nr ns2 MPTCP_LIB_EVENT_CREATED 1 + chk_evt_nr ns2 MPTCP_LIB_EVENT_ESTABLISHED 1 + chk_evt_nr ns2 MPTCP_LIB_EVENT_ANNOUNCED 0 + chk_evt_nr ns2 MPTCP_LIB_EVENT_REMOVED 0 + chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_ESTABLISHED 6 + chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_CLOSED 5 # one has been closed before estab + chk_join_nr 6 6 6 chk_rm_nr 4 4 fi # remove and re-add - if reset "delete re-add signal" && + if reset_with_events "delete re-add signal" && mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then pm_nl_set_limits $ns1 0 3 pm_nl_set_limits $ns2 3 3 @@ -3669,6 +3721,22 @@ endpoint_tests() chk_mptcp_info subflows 3 subflows 3 mptcp_lib_kill_wait $tests_pid + kill_events_pids + chk_evt_nr ns1 MPTCP_LIB_EVENT_LISTENER_CREATED 1 + chk_evt_nr ns1 MPTCP_LIB_EVENT_CREATED 1 + chk_evt_nr ns1 MPTCP_LIB_EVENT_ESTABLISHED 1 + chk_evt_nr ns1 MPTCP_LIB_EVENT_ANNOUNCED 0 + chk_evt_nr ns1 MPTCP_LIB_EVENT_REMOVED 0 + chk_evt_nr ns1 MPTCP_LIB_EVENT_SUB_ESTABLISHED 4 + chk_evt_nr ns1 MPTCP_LIB_EVENT_SUB_CLOSED 2 + + chk_evt_nr ns2 
MPTCP_LIB_EVENT_CREATED 1 + chk_evt_nr ns2 MPTCP_LIB_EVENT_ESTABLISHED 1 + chk_evt_nr ns2 MPTCP_LIB_EVENT_ANNOUNCED 5 + chk_evt_nr ns2 MPTCP_LIB_EVENT_REMOVED 3 + chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_ESTABLISHED 4 + chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_CLOSED 2 + chk_join_nr 4 4 4 chk_add_nr 5 5 chk_rm_nr 3 2 invert diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh index 438280e68434..4578a331041e 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -12,10 +12,14 @@ readonly KSFT_SKIP=4 readonly KSFT_TEST="${MPTCP_LIB_KSFT_TEST:-$(basename "${0}" .sh)}" # These variables are used in some selftests, read-only +declare -rx MPTCP_LIB_EVENT_CREATED=1 # MPTCP_EVENT_CREATED +declare -rx MPTCP_LIB_EVENT_ESTABLISHED=2 # MPTCP_EVENT_ESTABLISHED +declare -rx MPTCP_LIB_EVENT_CLOSED=3 # MPTCP_EVENT_CLOSED declare -rx MPTCP_LIB_EVENT_ANNOUNCED=6 # MPTCP_EVENT_ANNOUNCED declare -rx MPTCP_LIB_EVENT_REMOVED=7 # MPTCP_EVENT_REMOVED declare -rx MPTCP_LIB_EVENT_SUB_ESTABLISHED=10 # MPTCP_EVENT_SUB_ESTABLISHED declare -rx MPTCP_LIB_EVENT_SUB_CLOSED=11 # MPTCP_EVENT_SUB_CLOSED +declare -rx MPTCP_LIB_EVENT_SUB_PRIORITY=13 # MPTCP_EVENT_SUB_PRIORITY declare -rx MPTCP_LIB_EVENT_LISTENER_CREATED=15 # MPTCP_EVENT_LISTENER_CREATED declare -rx MPTCP_LIB_EVENT_LISTENER_CLOSED=16 # MPTCP_EVENT_LISTENER_CLOSED -- cgit v1.2.3 From f18fa2abf81099d822d842a107f8c9889c86043c Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 28 Aug 2024 08:14:38 +0200 Subject: selftests: mptcp: join: check re-re-adding ID 0 signal This test extends "delete re-add signal" to validate the previous commit: when the 'signal' endpoint linked to the initial subflow (ID 0) is re-added multiple times, it will re-send the ADD_ADDR with id 0. The client should still be able to re-create this subflow, even if the add_addr_accepted limit has been reached as this special address is not considered as a new address. The 'Fixes' tag here below is the same as the one from the previous commit: this patch here is not fixing anything wrong in the selftests, but it validates the previous fix for an issue introduced by this commit ID. Fixes: d0876b2284cf ("mptcp: add the incoming RM_ADDR support") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 32 ++++++++++++++++--------- 1 file changed, 21 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index a8ea0fe200fb..a4762c49a878 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3688,7 +3688,7 @@ endpoint_tests() # broadcast IP: no packet for this address will be received on ns1 pm_nl_add_endpoint $ns1 224.0.0.1 id 2 flags signal pm_nl_add_endpoint $ns1 10.0.1.1 id 42 flags signal - test_linkfail=4 speed=20 \ + test_linkfail=4 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & local tests_pid=$! 
@@ -3717,7 +3717,17 @@ endpoint_tests() pm_nl_add_endpoint $ns1 10.0.1.1 id 99 flags signal wait_mpj $ns2 - chk_subflow_nr "after re-add" 3 + chk_subflow_nr "after re-add ID 0" 3 + chk_mptcp_info subflows 3 subflows 3 + + pm_nl_del_endpoint $ns1 99 10.0.1.1 + sleep 0.5 + chk_subflow_nr "after re-delete ID 0" 2 + chk_mptcp_info subflows 2 subflows 2 + + pm_nl_add_endpoint $ns1 10.0.1.1 id 88 flags signal + wait_mpj $ns2 + chk_subflow_nr "after re-re-add ID 0" 3 chk_mptcp_info subflows 3 subflows 3 mptcp_lib_kill_wait $tests_pid @@ -3727,19 +3737,19 @@ endpoint_tests() chk_evt_nr ns1 MPTCP_LIB_EVENT_ESTABLISHED 1 chk_evt_nr ns1 MPTCP_LIB_EVENT_ANNOUNCED 0 chk_evt_nr ns1 MPTCP_LIB_EVENT_REMOVED 0 - chk_evt_nr ns1 MPTCP_LIB_EVENT_SUB_ESTABLISHED 4 - chk_evt_nr ns1 MPTCP_LIB_EVENT_SUB_CLOSED 2 + chk_evt_nr ns1 MPTCP_LIB_EVENT_SUB_ESTABLISHED 5 + chk_evt_nr ns1 MPTCP_LIB_EVENT_SUB_CLOSED 3 chk_evt_nr ns2 MPTCP_LIB_EVENT_CREATED 1 chk_evt_nr ns2 MPTCP_LIB_EVENT_ESTABLISHED 1 - chk_evt_nr ns2 MPTCP_LIB_EVENT_ANNOUNCED 5 - chk_evt_nr ns2 MPTCP_LIB_EVENT_REMOVED 3 - chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_ESTABLISHED 4 - chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_CLOSED 2 + chk_evt_nr ns2 MPTCP_LIB_EVENT_ANNOUNCED 6 + chk_evt_nr ns2 MPTCP_LIB_EVENT_REMOVED 4 + chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_ESTABLISHED 5 + chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_CLOSED 3 - chk_join_nr 4 4 4 - chk_add_nr 5 5 - chk_rm_nr 3 2 invert + chk_join_nr 5 5 5 + chk_add_nr 6 6 + chk_rm_nr 4 3 invert fi # flush and re-add -- cgit v1.2.3 From 83dae72ac0382693540a055ec6210dd3691a8df6 Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Mon, 26 Aug 2024 09:36:46 -0700 Subject: riscv: selftests: Remove mmap hint address checks The mmap behavior that restricts the addresses returned by mmap caused unexpected behavior, so get rid of the test cases that check that behavior. 
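For reference, the address restriction those removed cases asserted can be sketched stand-alone as follows. This is only an illustration (the hint value is made up and the helper mirrors the removed get_max_value()), not code from this patch:

	#include <stdio.h>
	#include <sys/mman.h>

	/* all-ones mask up to the hint's most significant bit */
	static unsigned long max_value(unsigned long hint)
	{
		unsigned long msb = 1UL << (8 * sizeof(hint) - 1 - __builtin_clzl(hint));

		return msb + (msb - 1);
	}

	int main(void)
	{
		unsigned long hint = 0x3ffff000UL;	/* arbitrary example hint */
		void *addr = mmap((void *)hint, 4096, PROT_READ | PROT_WRITE,
				  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (addr == MAP_FAILED)
			return 1;

		/* the removed tests asserted addr <= max_value(hint); that no longer has to hold */
		printf("bound %#lx, got %p\n", max_value(hint), addr);
		munmap(addr, 4096);
		return 0;
	}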
Signed-off-by: Charlie Jenkins Fixes: 73d05262a2ca ("selftests: riscv: Generalize mm selftests") Link: https://lore.kernel.org/r/20240826-riscv_mmap-v1-2-cd8962afe47f@rivosinc.com Signed-off-by: Palmer Dabbelt --- tools/testing/selftests/riscv/mm/mmap_bottomup.c | 2 - tools/testing/selftests/riscv/mm/mmap_default.c | 2 - tools/testing/selftests/riscv/mm/mmap_test.h | 67 ------------------------ 3 files changed, 71 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/riscv/mm/mmap_bottomup.c b/tools/testing/selftests/riscv/mm/mmap_bottomup.c index 7f7d3eb8b9c9..f9ccae50349b 100644 --- a/tools/testing/selftests/riscv/mm/mmap_bottomup.c +++ b/tools/testing/selftests/riscv/mm/mmap_bottomup.c @@ -7,8 +7,6 @@ TEST(infinite_rlimit) { EXPECT_EQ(BOTTOM_UP, memory_layout()); - - TEST_MMAPS; } TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/riscv/mm/mmap_default.c b/tools/testing/selftests/riscv/mm/mmap_default.c index 2ba3ec990006..3f53b6ecc326 100644 --- a/tools/testing/selftests/riscv/mm/mmap_default.c +++ b/tools/testing/selftests/riscv/mm/mmap_default.c @@ -7,8 +7,6 @@ TEST(default_rlimit) { EXPECT_EQ(TOP_DOWN, memory_layout()); - - TEST_MMAPS; } TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/riscv/mm/mmap_test.h b/tools/testing/selftests/riscv/mm/mmap_test.h index 3b29ca3bb3d4..75918d15919f 100644 --- a/tools/testing/selftests/riscv/mm/mmap_test.h +++ b/tools/testing/selftests/riscv/mm/mmap_test.h @@ -10,76 +10,9 @@ #define TOP_DOWN 0 #define BOTTOM_UP 1 -#if __riscv_xlen == 64 -uint64_t random_addresses[] = { - 0x19764f0d73b3a9f0, 0x016049584cecef59, 0x3580bdd3562f4acd, - 0x1164219f20b17da0, 0x07d97fcb40ff2373, 0x76ec528921272ee7, - 0x4dd48c38a3de3f70, 0x2e11415055f6997d, 0x14b43334ac476c02, - 0x375a60795aff19f6, 0x47f3051725b8ee1a, 0x4e697cf240494a9f, - 0x456b59b5c2f9e9d1, 0x101724379d63cb96, 0x7fe9ad31619528c1, - 0x2f417247c495c2ea, 0x329a5a5b82943a5e, 0x06d7a9d6adcd3827, - 0x327b0b9ee37f62d5, 0x17c7b1851dfd9b76, 0x006ebb6456ec2cd9, - 0x00836cd14146a134, 0x00e5c4dcde7126db, 0x004c29feadf75753, - 0x00d8b20149ed930c, 0x00d71574c269387a, 0x0006ebe4a82acb7a, - 0x0016135df51f471b, 0x00758bdb55455160, 0x00d0bdd949b13b32, - 0x00ecea01e7c5f54b, 0x00e37b071b9948b1, 0x0011fdd00ff57ab3, - 0x00e407294b52f5ea, 0x00567748c200ed20, 0x000d073084651046, - 0x00ac896f4365463c, 0x00eb0d49a0b26216, 0x0066a2564a982a31, - 0x002e0d20237784ae, 0x0000554ff8a77a76, 0x00006ce07a54c012, - 0x000009570516d799, 0x00000954ca15b84d, 0x0000684f0d453379, - 0x00002ae5816302b5, 0x0000042403fb54bf, 0x00004bad7392bf30, - 0x00003e73bfa4b5e3, 0x00005442c29978e0, 0x00002803f11286b6, - 0x000073875d745fc6, 0x00007cede9cb8240, 0x000027df84cc6a4f, - 0x00006d7e0e74242a, 0x00004afd0b836e02, 0x000047d0e837cd82, - 0x00003b42405efeda, 0x00001531bafa4c95, 0x00007172cae34ac4, -}; -#else -uint32_t random_addresses[] = { - 0x8dc302e0, 0x929ab1e0, 0xb47683ba, 0xea519c73, 0xa19f1c90, 0xc49ba213, - 0x8f57c625, 0xadfe5137, 0x874d4d95, 0xaa20f09d, 0xcf21ebfc, 0xda7737f1, - 0xcedf392a, 0x83026c14, 0xccedca52, 0xc6ccf826, 0xe0cd9415, 0x997472ca, - 0xa21a44c1, 0xe82196f5, 0xa23fd66b, 0xc28d5590, 0xd009cdce, 0xcf0be646, - 0x8fc8c7ff, 0xe2a85984, 0xa3d3236b, 0x89a0619d, 0xc03db924, 0xb5d4cc1b, - 0xb96ee04c, 0xd191da48, 0xb432a000, 0xaa2bebbc, 0xa2fcb289, 0xb0cca89b, - 0xb0c18d6a, 0x88f58deb, 0xa4d42d1c, 0xe4d74e86, 0x99902b09, 0x8f786d31, - 0xbec5e381, 0x9a727e65, 0xa9a65040, 0xa880d789, 0x8f1b335e, 0xfc821c1e, - 0x97e34be4, 0xbbef84ed, 0xf447d197, 0xfd7ceee2, 0xe632348d, 0xee4590f4, - 0x958992a5, 0xd57e05d6, 0xfd240970, 
0xc5b0dcff, 0xd96da2c2, 0xa7ae041d, -}; -#endif - -// Only works on 64 bit -#if __riscv_xlen == 64 #define PROT (PROT_READ | PROT_WRITE) #define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS) -/* mmap must return a value that doesn't use more bits than the hint address. */ -static inline unsigned long get_max_value(unsigned long input) -{ - unsigned long max_bit = (1UL << (((sizeof(unsigned long) * 8) - 1 - - __builtin_clzl(input)))); - - return max_bit + (max_bit - 1); -} - -#define TEST_MMAPS \ - ({ \ - void *mmap_addr; \ - for (int i = 0; i < ARRAY_SIZE(random_addresses); i++) { \ - mmap_addr = mmap((void *)random_addresses[i], \ - 5 * sizeof(int), PROT, FLAGS, 0, 0); \ - EXPECT_NE(MAP_FAILED, mmap_addr); \ - EXPECT_GE((void *)get_max_value(random_addresses[i]), \ - mmap_addr); \ - mmap_addr = mmap((void *)random_addresses[i], \ - 5 * sizeof(int), PROT, FLAGS, 0, 0); \ - EXPECT_NE(MAP_FAILED, mmap_addr); \ - EXPECT_GE((void *)get_max_value(random_addresses[i]), \ - mmap_addr); \ - } \ - }) -#endif /* __riscv_xlen == 64 */ - static inline int memory_layout(void) { void *value1 = mmap(NULL, sizeof(int), PROT, FLAGS, 0, 0); -- cgit v1.2.3 From 0fd77ae4a3c94eca3f07e01083680ad0056df628 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 29 Aug 2024 11:46:40 -0300 Subject: Revert "tools build: Remove leftover libcap tests that prevents fast path feature detection from working" Ian pointed out that the libcap feature test is also used by bpftool, so we can't remove it just because perf stopped using it, revert the removal of the feature test. Since both perf and libcap uses the fast path feature detection (tools/build/feature/test-all.c), probably the best thing is to keep libcap-devel when building perf even it not being used there. This reverts commit 47b3b6435e4bfb61ae8ffc63a11bd3c310f69acf. 
Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo --- tools/build/feature/Makefile | 6 +++++- tools/build/feature/test-libcap.c | 20 ++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 tools/build/feature/test-libcap.c (limited to 'tools') diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile index b873fc58804e..12796808f07a 100644 --- a/tools/build/feature/Makefile +++ b/tools/build/feature/Makefile @@ -25,6 +25,7 @@ FILES= \ test-libbfd-liberty-z.bin \ test-cplus-demangle.bin \ test-cxa-demangle.bin \ + test-libcap.bin \ test-libelf.bin \ test-libelf-getphdrnum.bin \ test-libelf-gelf_getnote.bin \ @@ -111,7 +112,7 @@ all: $(FILES) __BUILD = $(CC) $(CFLAGS) -MD -Wall -Werror -o $@ $(patsubst %.bin,%.c,$(@F)) $(LDFLAGS) BUILD = $(__BUILD) > $(@:.bin=.make.output) 2>&1 BUILD_BFD = $(BUILD) -DPACKAGE='"perf"' -lbfd -ldl - BUILD_ALL = $(BUILD) -fstack-protector-all -O2 -D_FORTIFY_SOURCE=2 -ldw -lelf -lnuma -lelf -lslang $(FLAGS_PERL_EMBED) $(FLAGS_PYTHON_EMBED) -DPACKAGE='"perf"' -lbfd -ldl -lz -llzma -lzstd + BUILD_ALL = $(BUILD) -fstack-protector-all -O2 -D_FORTIFY_SOURCE=2 -ldw -lelf -lnuma -lelf -lslang $(FLAGS_PERL_EMBED) $(FLAGS_PYTHON_EMBED) -DPACKAGE='"perf"' -lbfd -ldl -lz -llzma -lzstd -lcap __BUILDXX = $(CXX) $(CXXFLAGS) -MD -Wall -Werror -o $@ $(patsubst %.bin,%.cpp,$(@F)) $(LDFLAGS) BUILDXX = $(__BUILDXX) > $(@:.bin=.make.output) 2>&1 @@ -139,6 +140,9 @@ $(OUTPUT)test-fortify-source.bin: $(OUTPUT)test-bionic.bin: $(BUILD) +$(OUTPUT)test-libcap.bin: + $(BUILD) -lcap + $(OUTPUT)test-libelf.bin: $(BUILD) -lelf diff --git a/tools/build/feature/test-libcap.c b/tools/build/feature/test-libcap.c new file mode 100644 index 000000000000..d2a2e152195f --- /dev/null +++ b/tools/build/feature/test-libcap.c @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +int main(void) +{ + cap_flag_value_t val; + cap_t caps = cap_get_proc(); + + if (!caps) + return 1; + + if (cap_get_flag(caps, CAP_SYS_ADMIN, CAP_EFFECTIVE, &val) != 0) + return 1; + + if (cap_free(caps) != 0) + return 1; + + return 0; +} -- cgit v1.2.3 From b6aa0de9a53a231eb068ce1e62b5e7ec9e30e627 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 22 Jul 2024 11:11:43 +0100 Subject: perf cs-etm: Create decoders after both AUX and HW_ID search passes Both of these passes gather information about how to create the decoders. AUX records determine formatted/unformatted, and the HW_IDs determine the traceID/metadata mappings. Therefore it makes sense to cache the information and wait until both passes are over until creating the decoders, rather than creating them at the first HW_ID found. This will allow a simplification of the creation process where cs_etm_queue->traceid_list will exclusively used to create the decoders, rather than the current two methods depending on whether the trace is formatted or not. Previously the sample CPU from the AUX record was used to initialize the decoder CPU, but actually sample CPU == AUX queue index in per-CPU mode, so saving the sample CPU isn't required. Similarly formatted/unformatted was used upfront to create the decoders, but now it's cached until later. 
Reviewed-by: Anshuman Khandual Reviewed-by: Mike Leach Signed-off-by: James Clark Signed-off-by: James Clark Tested-by: Ganapatrao Kulkarni Tested-by: Leo Yan Acked-by: Suzuki Poulouse Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Maxime Coquelin Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Will Deacon Link: https://lore.kernel.org/r/20240722101202.26915-2-james.clark@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cs-etm.c | 182 +++++++++++++++++++++++++++++------------------ 1 file changed, 113 insertions(+), 69 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 34552940ec1d..0915a4356fb3 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -97,12 +97,19 @@ struct cs_etm_traceid_queue { struct cs_etm_packet_queue packet_queue; }; +enum cs_etm_format { + UNSET, + FORMATTED, + UNFORMATTED +}; + struct cs_etm_queue { struct cs_etm_auxtrace *etm; struct cs_etm_decoder *decoder; struct auxtrace_buffer *buffer; unsigned int queue_nr; u8 pending_timestamp_chan_id; + enum cs_etm_format format; u64 offset; const unsigned char *buf; size_t buf_len, buf_used; @@ -696,7 +703,7 @@ static void cs_etm__set_trace_param_ete(struct cs_etm_trace_params *t_params, static int cs_etm__init_trace_params(struct cs_etm_trace_params *t_params, struct cs_etm_auxtrace *etm, - bool formatted, + enum cs_etm_format format, int sample_cpu, int decoders) { @@ -705,7 +712,7 @@ static int cs_etm__init_trace_params(struct cs_etm_trace_params *t_params, u64 architecture; for (t_idx = 0; t_idx < decoders; t_idx++) { - if (formatted) + if (format == FORMATTED) m_idx = t_idx; else { m_idx = get_cpu_data_idx(etm, sample_cpu); @@ -738,8 +745,7 @@ static int cs_etm__init_trace_params(struct cs_etm_trace_params *t_params, static int cs_etm__init_decoder_params(struct cs_etm_decoder_params *d_params, struct cs_etm_queue *etmq, - enum cs_etm_decoder_operation mode, - bool formatted) + enum cs_etm_decoder_operation mode) { int ret = -EINVAL; @@ -749,7 +755,7 @@ static int cs_etm__init_decoder_params(struct cs_etm_decoder_params *d_params, d_params->packet_printer = cs_etm__packet_dump; d_params->operation = mode; d_params->data = etmq; - d_params->formatted = formatted; + d_params->formatted = etmq->format == FORMATTED; d_params->fsyncs = false; d_params->hsyncs = false; d_params->frame_aligned = true; @@ -1041,81 +1047,34 @@ out: return ret; } -static struct cs_etm_queue *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm, - bool formatted, int sample_cpu) +static struct cs_etm_queue *cs_etm__alloc_queue(void) { - struct cs_etm_decoder_params d_params; - struct cs_etm_trace_params *t_params = NULL; - struct cs_etm_queue *etmq; - /* - * Each queue can only contain data from one CPU when unformatted, so only one decoder is - * needed. - */ - int decoders = formatted ? 
etm->num_cpu : 1; - - etmq = zalloc(sizeof(*etmq)); + struct cs_etm_queue *etmq = zalloc(sizeof(*etmq)); if (!etmq) return NULL; etmq->traceid_queues_list = intlist__new(NULL); if (!etmq->traceid_queues_list) - goto out_free; - - /* Use metadata to fill in trace parameters for trace decoder */ - t_params = zalloc(sizeof(*t_params) * decoders); - - if (!t_params) - goto out_free; - - if (cs_etm__init_trace_params(t_params, etm, formatted, sample_cpu, decoders)) - goto out_free; - - /* Set decoder parameters to decode trace packets */ - if (cs_etm__init_decoder_params(&d_params, etmq, - dump_trace ? CS_ETM_OPERATION_PRINT : - CS_ETM_OPERATION_DECODE, - formatted)) - goto out_free; - - etmq->decoder = cs_etm_decoder__new(decoders, &d_params, - t_params); - - if (!etmq->decoder) - goto out_free; - - /* - * Register a function to handle all memory accesses required by - * the trace decoder library. - */ - if (cs_etm_decoder__add_mem_access_cb(etmq->decoder, - 0x0L, ((u64) -1L), - cs_etm__mem_access)) - goto out_free_decoder; + free(etmq); - zfree(&t_params); return etmq; - -out_free_decoder: - cs_etm_decoder__free(etmq->decoder); -out_free: - intlist__delete(etmq->traceid_queues_list); - free(etmq); - - return NULL; } static int cs_etm__setup_queue(struct cs_etm_auxtrace *etm, struct auxtrace_queue *queue, - unsigned int queue_nr, - bool formatted, - int sample_cpu) + unsigned int queue_nr, enum cs_etm_format format) { struct cs_etm_queue *etmq = queue->priv; + if (etmq && format != etmq->format) { + pr_err("CS_ETM: mixed formatted and unformatted trace not supported\n"); + return -EINVAL; + } + if (list_empty(&queue->head) || etmq) return 0; - etmq = cs_etm__alloc_queue(etm, formatted, sample_cpu); + etmq = cs_etm__alloc_queue(); if (!etmq) return -ENOMEM; @@ -1123,7 +1082,9 @@ static int cs_etm__setup_queue(struct cs_etm_auxtrace *etm, queue->priv = etmq; etmq->etm = etm; etmq->queue_nr = queue_nr; + queue->cpu = queue_nr; /* Placeholder, may be reset to -1 in per-thread mode */ etmq->offset = 0; + etmq->format = format; return 0; } @@ -2818,7 +2779,7 @@ static int cs_etm__process_auxtrace_event(struct perf_session *session, * formatted in piped mode (true). */ err = cs_etm__setup_queue(etm, &etm->queues.queue_array[idx], - idx, true, -1); + idx, FORMATTED); if (err) return err; @@ -2939,7 +2900,7 @@ static int cs_etm__queue_aux_fragment(struct perf_session *session, off_t file_o union perf_event auxtrace_fragment; __u64 aux_offset, aux_size; __u32 idx; - bool formatted; + enum cs_etm_format format; struct cs_etm_auxtrace *etm = container_of(session->auxtrace, struct cs_etm_auxtrace, @@ -3022,9 +2983,10 @@ static int cs_etm__queue_aux_fragment(struct perf_session *session, off_t file_o return err; idx = auxtrace_event->idx; - formatted = !(aux_event->flags & PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW); - return cs_etm__setup_queue(etm, &etm->queues.queue_array[idx], - idx, formatted, sample->cpu); + format = (aux_event->flags & PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW) ? + UNFORMATTED : FORMATTED; + + return cs_etm__setup_queue(etm, &etm->queues.queue_array[idx], idx, format); } /* Wasn't inside this buffer, but there were no parse errors. 1 == 'not found' */ @@ -3208,6 +3170,84 @@ static int cs_etm__clear_unused_trace_ids_metadata(int num_cpu, u64 **metadata) return 0; } +/* + * Use the data gathered by the peeks for HW_ID (trace ID mappings) and AUX + * (formatted or not) packets to create the decoders. 
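+ * This runs once per non-empty queue, after both search passes are done.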
+ */ +static int cs_etm__create_queue_decoders(struct cs_etm_queue *etmq) +{ + struct cs_etm_decoder_params d_params; + + /* + * Each queue can only contain data from one CPU when unformatted, so only one decoder is + * needed. + */ + int decoders = etmq->format == FORMATTED ? etmq->etm->num_cpu : 1; + + /* Use metadata to fill in trace parameters for trace decoder */ + struct cs_etm_trace_params *t_params = zalloc(sizeof(*t_params) * decoders); + + if (!t_params) + goto out_free; + + if (cs_etm__init_trace_params(t_params, etmq->etm, etmq->format, + etmq->queue_nr, decoders)) + goto out_free; + + /* Set decoder parameters to decode trace packets */ + if (cs_etm__init_decoder_params(&d_params, etmq, + dump_trace ? CS_ETM_OPERATION_PRINT : + CS_ETM_OPERATION_DECODE)) + goto out_free; + + etmq->decoder = cs_etm_decoder__new(decoders, &d_params, + t_params); + + if (!etmq->decoder) + goto out_free; + + /* + * Register a function to handle all memory accesses required by + * the trace decoder library. + */ + if (cs_etm_decoder__add_mem_access_cb(etmq->decoder, + 0x0L, ((u64) -1L), + cs_etm__mem_access)) + goto out_free_decoder; + + zfree(&t_params); + return 0; + +out_free_decoder: + cs_etm_decoder__free(etmq->decoder); +out_free: + zfree(&t_params); + return -EINVAL; +} + +static int cs_etm__create_decoders(struct cs_etm_auxtrace *etm) +{ + struct auxtrace_queues *queues = &etm->queues; + + for (unsigned int i = 0; i < queues->nr_queues; i++) { + bool empty = list_empty(&queues->queue_array[i].head); + struct cs_etm_queue *etmq = queues->queue_array[i].priv; + int ret; + + /* + * Don't create decoders for empty queues, mainly because + * etmq->format is unknown for empty queues. + */ + if (empty) + continue; + + ret = cs_etm__create_queue_decoders(etmq); + if (ret) + return ret; + } + return 0; +} + int cs_etm__process_auxtrace_info_full(union perf_event *event, struct perf_session *session) { @@ -3371,6 +3411,10 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, if (err) goto err_free_queues; + err = cs_etm__queue_aux_records(session); + if (err) + goto err_free_queues; + /* * Map Trace ID values to CPU metadata. * @@ -3393,7 +3437,7 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, * flags if present. */ - /* first scan for AUX_OUTPUT_HW_ID records to map trace ID values to CPU metadata */ + /* Scan for AUX_OUTPUT_HW_ID records to map trace ID values to CPU metadata */ aux_hw_id_found = 0; err = perf_session__peek_events(session, session->header.data_offset, session->header.data_size, @@ -3411,7 +3455,7 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, if (err) goto err_free_queues; - err = cs_etm__queue_aux_records(session); + err = cs_etm__create_decoders(etm); if (err) goto err_free_queues; -- cgit v1.2.3 From 57880a7966be510cdaa08ab76b9d63e3df786bf0 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 22 Jul 2024 11:11:44 +0100 Subject: perf: cs-etm: Allocate queues for all CPUs Make cs_etm__setup_queue() setup a queue even if it's empty, and pre-allocate queues based on the max CPU that was recorded. In per-CPU mode aux queues are indexed based on CPU ID even if all CPUs aren't recorded, sparse queue arrays aren't used. This will allow HW_IDs to be saved even if no aux data was received in that queue without having to call cs_etm__setup_queue() from two different places. 
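Roughly, the dense indexing described above amounts to the following stand-alone sketch (illustrative only; the struct and function names here are invented, not the real auxtrace types):

	#include <stdlib.h>

	struct queue { int cpu; };	/* stand-in for the real auxtrace queue */

	/* index == CPU id; sized by the max CPU seen, never sparse */
	static struct queue *alloc_queues(int max_cpu, unsigned int *nr_queues)
	{
		*nr_queues = max_cpu + 1;
		return calloc(*nr_queues, sizeof(struct queue));
	}

	int main(void)
	{
		unsigned int nr;
		struct queue *q = alloc_queues(7, &nr);	/* max CPU 7 -> 8 queues */
		int ret = q ? 0 : 1;

		free(q);
		return ret;
	}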
Reviewed-by: Mike Leach Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Anshuman Khandual Cc: Ganapatrao Kulkarni Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Maxime Coquelin Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Link: https://lore.kernel.org/r/20240722101202.26915-3-james.clark@linaro.org Signed-off-by: James Clark Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cs-etm.c | 53 +++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 28 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 0915a4356fb3..03df8894a900 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -1062,16 +1062,11 @@ static struct cs_etm_queue *cs_etm__alloc_queue(void) static int cs_etm__setup_queue(struct cs_etm_auxtrace *etm, struct auxtrace_queue *queue, - unsigned int queue_nr, enum cs_etm_format format) + unsigned int queue_nr) { struct cs_etm_queue *etmq = queue->priv; - if (etmq && format != etmq->format) { - pr_err("CS_ETM: mixed formatted and unformatted trace not supported\n"); - return -EINVAL; - } - - if (list_empty(&queue->head) || etmq) + if (etmq) return 0; etmq = cs_etm__alloc_queue(); @@ -1084,7 +1079,6 @@ static int cs_etm__setup_queue(struct cs_etm_auxtrace *etm, etmq->queue_nr = queue_nr; queue->cpu = queue_nr; /* Placeholder, may be reset to -1 in per-thread mode */ etmq->offset = 0; - etmq->format = format; return 0; } @@ -2772,17 +2766,6 @@ static int cs_etm__process_auxtrace_event(struct perf_session *session, if (err) return err; - /* - * Knowing if the trace is formatted or not requires a lookup of - * the aux record so only works in non-piped mode where data is - * queued in cs_etm__queue_aux_records(). Always assume - * formatted in piped mode (true). - */ - err = cs_etm__setup_queue(etm, &etm->queues.queue_array[idx], - idx, FORMATTED); - if (err) - return err; - if (dump_trace) if (auxtrace_buffer__get_data(buffer, fd)) { cs_etm__dump_event(etm->queues.queue_array[idx].priv, buffer); @@ -2899,7 +2882,6 @@ static int cs_etm__queue_aux_fragment(struct perf_session *session, off_t file_o struct perf_record_auxtrace *auxtrace_event; union perf_event auxtrace_fragment; __u64 aux_offset, aux_size; - __u32 idx; enum cs_etm_format format; struct cs_etm_auxtrace *etm = container_of(session->auxtrace, @@ -2966,6 +2948,8 @@ static int cs_etm__queue_aux_fragment(struct perf_session *session, off_t file_o if (aux_offset >= auxtrace_event->offset && aux_offset + aux_size <= auxtrace_event->offset + auxtrace_event->size) { + struct cs_etm_queue *etmq = etm->queues.queue_array[auxtrace_event->idx].priv; + /* * If this AUX event was inside this buffer somewhere, create a new auxtrace event * based on the sizes of the aux event, and queue that fragment. @@ -2982,11 +2966,14 @@ static int cs_etm__queue_aux_fragment(struct perf_session *session, off_t file_o if (err) return err; - idx = auxtrace_event->idx; format = (aux_event->flags & PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW) ? UNFORMATTED : FORMATTED; - - return cs_etm__setup_queue(etm, &etm->queues.queue_array[idx], idx, format); + if (etmq->format != UNSET && format != etmq->format) { + pr_err("CS_ETM: mixed formatted and unformatted trace not supported\n"); + return -EINVAL; + } + etmq->format = format; + return 0; } /* Wasn't inside this buffer, but there were no parse errors. 
1 == 'not found' */ @@ -3238,6 +3225,7 @@ static int cs_etm__create_decoders(struct cs_etm_auxtrace *etm) * Don't create decoders for empty queues, mainly because * etmq->format is unknown for empty queues. */ + assert(empty == (etmq->format == UNSET)); if (empty) continue; @@ -3257,10 +3245,10 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, int event_header_size = sizeof(struct perf_event_header); int total_size = auxtrace_info->header.size; int priv_size = 0; - int num_cpu; + int num_cpu, max_cpu = 0; int err = 0; int aux_hw_id_found; - int i, j; + int i; u64 *ptr = NULL; u64 **metadata = NULL; @@ -3291,7 +3279,7 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, * required by the trace decoder to properly decode the trace due * to its highly compressed nature. */ - for (j = 0; j < num_cpu; j++) { + for (int j = 0; j < num_cpu; j++) { if (ptr[i] == __perf_cs_etmv3_magic) { metadata[j] = cs_etm__create_meta_blk(ptr, &i, @@ -3315,6 +3303,9 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, err = -ENOMEM; goto err_free_metadata; } + + if ((int) metadata[j][CS_ETM_CPU] > max_cpu) + max_cpu = metadata[j][CS_ETM_CPU]; } /* @@ -3344,10 +3335,16 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, */ etm->pid_fmt = cs_etm__init_pid_fmt(metadata[0]); - err = auxtrace_queues__init(&etm->queues); + err = auxtrace_queues__init_nr(&etm->queues, max_cpu + 1); if (err) goto err_free_etm; + for (unsigned int j = 0; j < etm->queues.nr_queues; ++j) { + err = cs_etm__setup_queue(etm, &etm->queues.queue_array[j], j); + if (err) + goto err_free_queues; + } + if (session->itrace_synth_opts->set) { etm->synth_opts = *session->itrace_synth_opts; } else { @@ -3469,7 +3466,7 @@ err_free_etm: zfree(&etm); err_free_metadata: /* No need to check @metadata[j], free(NULL) is supported */ - for (j = 0; j < num_cpu; j++) + for (int j = 0; j < num_cpu; j++) zfree(&metadata[j]); zfree(&metadata); err_free_traceid_list: -- cgit v1.2.3 From c634d6f4e12d00c954410ba11db45799a8c77b5b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 27 Aug 2024 13:37:21 -0700 Subject: libbpf: Fix bpf_object__open_skeleton()'s mishandling of options MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We do an ugly copying of options in bpf_object__open_skeleton() just to be able to set object name from skeleton's recorded name (while still allowing user to override it through opts->object_name). This is not just ugly, but it also is broken due to memcpy() that doesn't take into account potential skel_opts' and user-provided opts' sizes differences due to backward and forward compatibility. This leads to copying over extra bytes and then failing to validate options properly. It could, technically, lead also to SIGSEGV, if we are unlucky. So just get rid of that memory copy completely and instead pass default object name into bpf_object_open() directly, simplifying all this significantly. The rule now is that obj_name should be non-NULL for bpf_object_open() when called with in-memory buffer, so validate that explicitly as well. We adopt bpf_object__open_mem() to this as well and generate default name (based on buffer memory address and size) outside of bpf_object_open(). 
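To make the memcpy() hazard concrete, here is a stand-alone sketch (the struct names are invented for illustration; libbpf's real opts structs are larger):

	#include <stddef.h>
	#include <string.h>

	/* size-versioned opts: an old caller was compiled against v1,
	 * while the library now knows the bigger v2 */
	struct opts_v1 { size_t sz; const char *object_name; };
	struct opts_v2 { size_t sz; const char *object_name; long extra; };

	int main(void)
	{
		struct opts_v1 caller_opts = { .sz = sizeof(caller_opts) };
		struct opts_v2 lib_copy;

		/*
		 * The removed code did the moral equivalent of this: copying
		 * the library's idea of the size reads past the caller's
		 * smaller struct -- undefined behavior, and it can break
		 * later option validation.
		 */
		memcpy(&lib_copy, &caller_opts, sizeof(lib_copy));	/* BAD */

		return 0;
	}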
Fixes: d66562fba1ce ("libbpf: Add BPF object skeleton support") Reported-by: Daniel Müller Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Reviewed-by: Daniel Müller Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20240827203721.1145494-1-andrii@kernel.org --- tools/lib/bpf/libbpf.c | 52 ++++++++++++++++++-------------------------------- 1 file changed, 19 insertions(+), 33 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index e55353887439..d3a542649e6b 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -7905,16 +7905,19 @@ static int bpf_object_init_progs(struct bpf_object *obj, const struct bpf_object } static struct bpf_object *bpf_object_open(const char *path, const void *obj_buf, size_t obj_buf_sz, + const char *obj_name, const struct bpf_object_open_opts *opts) { - const char *obj_name, *kconfig, *btf_tmp_path, *token_path; + const char *kconfig, *btf_tmp_path, *token_path; struct bpf_object *obj; - char tmp_name[64]; int err; char *log_buf; size_t log_size; __u32 log_level; + if (obj_buf && !obj_name) + return ERR_PTR(-EINVAL); + if (elf_version(EV_CURRENT) == EV_NONE) { pr_warn("failed to init libelf for %s\n", path ? : "(mem buf)"); @@ -7924,16 +7927,12 @@ static struct bpf_object *bpf_object_open(const char *path, const void *obj_buf, if (!OPTS_VALID(opts, bpf_object_open_opts)) return ERR_PTR(-EINVAL); - obj_name = OPTS_GET(opts, object_name, NULL); + obj_name = OPTS_GET(opts, object_name, NULL) ?: obj_name; if (obj_buf) { - if (!obj_name) { - snprintf(tmp_name, sizeof(tmp_name), "%lx-%lx", - (unsigned long)obj_buf, - (unsigned long)obj_buf_sz); - obj_name = tmp_name; - } path = obj_name; pr_debug("loading object '%s' from buffer\n", obj_name); + } else { + pr_debug("loading object from %s\n", path); } log_buf = OPTS_GET(opts, kernel_log_buf, NULL); @@ -8017,9 +8016,7 @@ bpf_object__open_file(const char *path, const struct bpf_object_open_opts *opts) if (!path) return libbpf_err_ptr(-EINVAL); - pr_debug("loading %s\n", path); - - return libbpf_ptr(bpf_object_open(path, NULL, 0, opts)); + return libbpf_ptr(bpf_object_open(path, NULL, 0, NULL, opts)); } struct bpf_object *bpf_object__open(const char *path) @@ -8031,10 +8028,15 @@ struct bpf_object * bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz, const struct bpf_object_open_opts *opts) { + char tmp_name[64]; + if (!obj_buf || obj_buf_sz == 0) return libbpf_err_ptr(-EINVAL); - return libbpf_ptr(bpf_object_open(NULL, obj_buf, obj_buf_sz, opts)); + /* create a (quite useless) default "name" for this memory buffer object */ + snprintf(tmp_name, sizeof(tmp_name), "%lx-%zx", (unsigned long)obj_buf, obj_buf_sz); + + return libbpf_ptr(bpf_object_open(NULL, obj_buf, obj_buf_sz, tmp_name, opts)); } static int bpf_object_unload(struct bpf_object *obj) @@ -13761,29 +13763,13 @@ static int populate_skeleton_progs(const struct bpf_object *obj, int bpf_object__open_skeleton(struct bpf_object_skeleton *s, const struct bpf_object_open_opts *opts) { - DECLARE_LIBBPF_OPTS(bpf_object_open_opts, skel_opts, - .object_name = s->name, - ); struct bpf_object *obj; int err; - /* Attempt to preserve opts->object_name, unless overriden by user - * explicitly. Overwriting object name for skeletons is discouraged, - * as it breaks global data maps, because they contain object name - * prefix as their own map name prefix. When skeleton is generated, - * bpftool is making an assumption that this name will stay the same. 
- */ - if (opts) { - memcpy(&skel_opts, opts, sizeof(*opts)); - if (!opts->object_name) - skel_opts.object_name = s->name; - } - - obj = bpf_object__open_mem(s->data, s->data_sz, &skel_opts); - err = libbpf_get_error(obj); - if (err) { - pr_warn("failed to initialize skeleton BPF object '%s': %d\n", - s->name, err); + obj = bpf_object_open(NULL, s->data, s->data_sz, s->name, opts); + if (IS_ERR(obj)) { + err = PTR_ERR(obj); + pr_warn("failed to initialize skeleton BPF object '%s': %d\n", s->name, err); return libbpf_err(err); } -- cgit v1.2.3 From 77c123f53e97ad4bde0271eb671b71774a99ebf6 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 22 Jul 2024 11:11:45 +0100 Subject: perf: cs-etm: Move traceid_list to each queue The global list won't work for per-sink trace ID allocations, so put a list in each queue where the IDs will be unique to that queue. To keep the same behavior as before, for version 0 of the HW_ID packets, copy all the HW_ID mappings into all queues. This change doesn't affect the decoders, only trace ID lookups on the Perf side. The decoders are still created with global mappings which will be fixed in a later commit. Reviewed-by: Mike Leach Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Anshuman Khandual Cc: Ganapatrao Kulkarni Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Maxime Coquelin Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Link: https://lore.kernel.org/r/20240722101202.26915-4-james.clark@linaro.org Signed-off-by: James Clark Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cs-etm-decoder/cs-etm-decoder.c | 28 +-- tools/perf/util/cs-etm.c | 215 ++++++++++++++---- tools/perf/util/cs-etm.h | 2 +- 3 files changed, 147 insertions(+), 98 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c index e917985bbbe6..0c9c48cedbf1 100644 --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c @@ -388,7 +388,8 @@ cs_etm_decoder__reset_timestamp(struct cs_etm_packet_queue *packet_queue) } static ocsd_datapath_resp_t -cs_etm_decoder__buffer_packet(struct cs_etm_packet_queue *packet_queue, +cs_etm_decoder__buffer_packet(struct cs_etm_queue *etmq, + struct cs_etm_packet_queue *packet_queue, const u8 trace_chan_id, enum cs_etm_sample_type sample_type) { @@ -398,7 +399,7 @@ cs_etm_decoder__buffer_packet(struct cs_etm_packet_queue *packet_queue, if (packet_queue->packet_count >= CS_ETM_PACKET_MAX_BUFFER - 1) return OCSD_RESP_FATAL_SYS_ERR; - if (cs_etm__get_cpu(trace_chan_id, &cpu) < 0) + if (cs_etm__get_cpu(etmq, trace_chan_id, &cpu) < 0) return OCSD_RESP_FATAL_SYS_ERR; et = packet_queue->tail; @@ -436,7 +437,7 @@ cs_etm_decoder__buffer_range(struct cs_etm_queue *etmq, int ret = 0; struct cs_etm_packet *packet; - ret = cs_etm_decoder__buffer_packet(packet_queue, trace_chan_id, + ret = cs_etm_decoder__buffer_packet(etmq, packet_queue, trace_chan_id, CS_ETM_RANGE); if (ret != OCSD_RESP_CONT && ret != OCSD_RESP_WAIT) return ret; @@ -496,7 +497,8 @@ out: } static ocsd_datapath_resp_t -cs_etm_decoder__buffer_discontinuity(struct cs_etm_packet_queue *queue, +cs_etm_decoder__buffer_discontinuity(struct cs_etm_queue *etmq, + struct cs_etm_packet_queue *queue, const uint8_t trace_chan_id) { /* @@ -504,18 +506,19 @@ cs_etm_decoder__buffer_discontinuity(struct cs_etm_packet_queue *queue, * reset time
statistics. */ cs_etm_decoder__reset_timestamp(queue); - return cs_etm_decoder__buffer_packet(queue, trace_chan_id, + return cs_etm_decoder__buffer_packet(etmq, queue, trace_chan_id, CS_ETM_DISCONTINUITY); } static ocsd_datapath_resp_t -cs_etm_decoder__buffer_exception(struct cs_etm_packet_queue *queue, +cs_etm_decoder__buffer_exception(struct cs_etm_queue *etmq, + struct cs_etm_packet_queue *queue, const ocsd_generic_trace_elem *elem, const uint8_t trace_chan_id) { int ret = 0; struct cs_etm_packet *packet; - ret = cs_etm_decoder__buffer_packet(queue, trace_chan_id, + ret = cs_etm_decoder__buffer_packet(etmq, queue, trace_chan_id, CS_ETM_EXCEPTION); if (ret != OCSD_RESP_CONT && ret != OCSD_RESP_WAIT) return ret; @@ -527,10 +530,11 @@ cs_etm_decoder__buffer_exception(struct cs_etm_packet_queue *queue, } static ocsd_datapath_resp_t -cs_etm_decoder__buffer_exception_ret(struct cs_etm_packet_queue *queue, +cs_etm_decoder__buffer_exception_ret(struct cs_etm_queue *etmq, + struct cs_etm_packet_queue *queue, const uint8_t trace_chan_id) { - return cs_etm_decoder__buffer_packet(queue, trace_chan_id, + return cs_etm_decoder__buffer_packet(etmq, queue, trace_chan_id, CS_ETM_EXCEPTION_RET); } @@ -599,7 +603,7 @@ static ocsd_datapath_resp_t cs_etm_decoder__gen_trace_elem_printer( case OCSD_GEN_TRC_ELEM_EO_TRACE: case OCSD_GEN_TRC_ELEM_NO_SYNC: case OCSD_GEN_TRC_ELEM_TRACE_ON: - resp = cs_etm_decoder__buffer_discontinuity(packet_queue, + resp = cs_etm_decoder__buffer_discontinuity(etmq, packet_queue, trace_chan_id); break; case OCSD_GEN_TRC_ELEM_INSTR_RANGE: @@ -607,11 +611,11 @@ static ocsd_datapath_resp_t cs_etm_decoder__gen_trace_elem_printer( trace_chan_id); break; case OCSD_GEN_TRC_ELEM_EXCEPTION: - resp = cs_etm_decoder__buffer_exception(packet_queue, elem, + resp = cs_etm_decoder__buffer_exception(etmq, packet_queue, elem, trace_chan_id); break; case OCSD_GEN_TRC_ELEM_EXCEPTION_RET: - resp = cs_etm_decoder__buffer_exception_ret(packet_queue, + resp = cs_etm_decoder__buffer_exception_ret(etmq, packet_queue, trace_chan_id); break; case OCSD_GEN_TRC_ELEM_TIMESTAMP: diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 03df8894a900..0f56618fc7a8 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -116,16 +116,18 @@ struct cs_etm_queue { /* Conversion between traceID and index in traceid_queues array */ struct intlist *traceid_queues_list; struct cs_etm_traceid_queue **traceid_queues; + /* Conversion between traceID and metadata pointers */ + struct intlist *traceid_list; }; -/* RB tree for quick conversion between traceID and metadata pointers */ -static struct intlist *traceid_list; - static int cs_etm__process_timestamped_queues(struct cs_etm_auxtrace *etm); static int cs_etm__process_timeless_queues(struct cs_etm_auxtrace *etm, pid_t tid); static int cs_etm__get_data_block(struct cs_etm_queue *etmq); static int cs_etm__decode_data_block(struct cs_etm_queue *etmq); +static int cs_etm__metadata_get_trace_id(u8 *trace_chan_id, u64 *cpu_metadata); +static u64 *get_cpu_data(struct cs_etm_auxtrace *etm, int cpu); +static int cs_etm__metadata_set_trace_id(u8 trace_chan_id, u64 *cpu_metadata); /* PTMs ETMIDR [11:8] set to b0011 */ #define ETMIDR_PTM_VERSION 0x00000300 @@ -151,12 +153,12 @@ static u32 cs_etm__get_v7_protocol_version(u32 etmidr) return CS_ETM_PROTO_ETMV3; } -static int cs_etm__get_magic(u8 trace_chan_id, u64 *magic) +static int cs_etm__get_magic(struct cs_etm_queue *etmq, u8 trace_chan_id, u64 *magic) { struct int_node *inode; u64 *metadata; - inode = 
intlist__find(traceid_list, trace_chan_id); + inode = intlist__find(etmq->traceid_list, trace_chan_id); if (!inode) return -EINVAL; @@ -165,12 +167,12 @@ static int cs_etm__get_magic(u8 trace_chan_id, u64 *magic) return 0; } -int cs_etm__get_cpu(u8 trace_chan_id, int *cpu) +int cs_etm__get_cpu(struct cs_etm_queue *etmq, u8 trace_chan_id, int *cpu) { struct int_node *inode; u64 *metadata; - inode = intlist__find(traceid_list, trace_chan_id); + inode = intlist__find(etmq->traceid_list, trace_chan_id); if (!inode) return -EINVAL; @@ -222,30 +224,108 @@ enum cs_etm_pid_fmt cs_etm__get_pid_fmt(struct cs_etm_queue *etmq) return etmq->etm->pid_fmt; } -static int cs_etm__map_trace_id(u8 trace_chan_id, u64 *cpu_metadata) +static int cs_etm__insert_trace_id_node(struct cs_etm_queue *etmq, + u8 trace_chan_id, u64 *cpu_metadata) { - struct int_node *inode; - /* Get an RB node for this CPU */ - inode = intlist__findnew(traceid_list, trace_chan_id); + struct int_node *inode = intlist__findnew(etmq->traceid_list, trace_chan_id); /* Something went wrong, no need to continue */ if (!inode) return -ENOMEM; + /* Disallow re-mapping a different traceID to metadata pair. */ + if (inode->priv) { + u64 *curr_cpu_data = inode->priv; + u8 curr_chan_id; + int err; + + if (curr_cpu_data[CS_ETM_CPU] != cpu_metadata[CS_ETM_CPU]) { + pr_err("CS_ETM: map mismatch between HW_ID packet CPU and Trace ID\n"); + return -EINVAL; + } + + /* check that the mapped ID matches */ + err = cs_etm__metadata_get_trace_id(&curr_chan_id, curr_cpu_data); + if (err) + return err; + + if (curr_chan_id != trace_chan_id) { + pr_err("CS_ETM: mismatch between CPU trace ID and HW_ID packet ID\n"); + return -EINVAL; + } + + /* Skip re-adding the same mappings if everything matched */ + return 0; + } + + /* Not one we've seen before, associate the traceID with the metadata pointer */ + inode->priv = cpu_metadata; + + return 0; +} + +static struct cs_etm_queue *cs_etm__get_queue(struct cs_etm_auxtrace *etm, int cpu) +{ + if (etm->per_thread_decoding) + return etm->queues.queue_array[0].priv; + else + return etm->queues.queue_array[cpu].priv; +} + +static int cs_etm__map_trace_id_v0(struct cs_etm_auxtrace *etm, u8 trace_chan_id, + u64 *cpu_metadata) +{ + struct cs_etm_queue *etmq; + /* - * The node for that CPU should not be taken. - * Back out if that's the case. + * If the queue is unformatted then only save one mapping in the + * queue associated with that CPU so only one decoder is made. */ - if (inode->priv) - return -EINVAL; + etmq = cs_etm__get_queue(etm, cpu_metadata[CS_ETM_CPU]); + if (etmq->format == UNFORMATTED) + return cs_etm__insert_trace_id_node(etmq, trace_chan_id, + cpu_metadata); - /* All good, associate the traceID with the metadata pointer */ - inode->priv = cpu_metadata; + /* + * Otherwise, version 0 trace IDs are global so save them into every + * queue. 
+ */ + for (unsigned int i = 0; i < etm->queues.nr_queues; ++i) { + int ret; + + etmq = etm->queues.queue_array[i].priv; + ret = cs_etm__insert_trace_id_node(etmq, trace_chan_id, + cpu_metadata); + if (ret) + return ret; + } return 0; } +static int cs_etm__process_trace_id_v0(struct cs_etm_auxtrace *etm, int cpu, + u64 hw_id) +{ + int err; + u64 *cpu_data; + u8 trace_chan_id = FIELD_GET(CS_AUX_HW_ID_TRACE_ID_MASK, hw_id); + + cpu_data = get_cpu_data(etm, cpu); + if (cpu_data == NULL) + return -EINVAL; + + err = cs_etm__map_trace_id_v0(etm, trace_chan_id, cpu_data); + if (err) + return err; + + /* + * if we are picking up the association from the packet, need to plug + * the correct trace ID into the metadata for setting up decoders later. + */ + return cs_etm__metadata_set_trace_id(trace_chan_id, cpu_data); +} + static int cs_etm__metadata_get_trace_id(u8 *trace_chan_id, u64 *cpu_metadata) { u64 cs_etm_magic = cpu_metadata[CS_ETM_MAGIC]; @@ -329,17 +409,13 @@ static int cs_etm__process_aux_output_hw_id(struct perf_session *session, { struct cs_etm_auxtrace *etm; struct perf_sample sample; - struct int_node *inode; struct evsel *evsel; - u64 *cpu_data; u64 hw_id; int cpu, version, err; - u8 trace_chan_id, curr_chan_id; /* extract and parse the HW ID */ hw_id = event->aux_output_hw_id.hw_id; version = FIELD_GET(CS_AUX_HW_ID_VERSION_MASK, hw_id); - trace_chan_id = FIELD_GET(CS_AUX_HW_ID_TRACE_ID_MASK, hw_id); /* check that we can handle this version */ if (version > CS_AUX_HW_ID_CURR_VERSION) { @@ -367,43 +443,7 @@ static int cs_etm__process_aux_output_hw_id(struct perf_session *session, return -EINVAL; } - /* See if the ID is mapped to a CPU, and it matches the current CPU */ - inode = intlist__find(traceid_list, trace_chan_id); - if (inode) { - cpu_data = inode->priv; - if ((int)cpu_data[CS_ETM_CPU] != cpu) { - pr_err("CS_ETM: map mismatch between HW_ID packet CPU and Trace ID\n"); - return -EINVAL; - } - - /* check that the mapped ID matches */ - err = cs_etm__metadata_get_trace_id(&curr_chan_id, cpu_data); - if (err) - return err; - if (curr_chan_id != trace_chan_id) { - pr_err("CS_ETM: mismatch between CPU trace ID and HW_ID packet ID\n"); - return -EINVAL; - } - - /* mapped and matched - return OK */ - return 0; - } - - cpu_data = get_cpu_data(etm, cpu); - if (cpu_data == NULL) - return err; - - /* not one we've seen before - lets map it */ - err = cs_etm__map_trace_id(trace_chan_id, cpu_data); - if (err) - return err; - - /* - * if we are picking up the association from the packet, need to plug - * the correct trace ID into the metadata for setting up decoders later. 
- */ - err = cs_etm__metadata_set_trace_id(trace_chan_id, cpu_data); - return err; + return cs_etm__process_trace_id_v0(etm, cpu, hw_id); } void cs_etm__etmq_set_traceid_queue_timestamp(struct cs_etm_queue *etmq, @@ -856,6 +896,7 @@ static void cs_etm__free_traceid_queues(struct cs_etm_queue *etmq) static void cs_etm__free_queue(void *priv) { + struct int_node *inode, *tmp; struct cs_etm_queue *etmq = priv; if (!etmq) @@ -863,6 +904,14 @@ static void cs_etm__free_queue(void *priv) cs_etm_decoder__free(etmq->decoder); cs_etm__free_traceid_queues(etmq); + + /* First remove all traceID/metadata nodes for the RB tree */ + intlist__for_each_entry_safe(inode, tmp, etmq->traceid_list) + intlist__remove(etmq->traceid_list, inode); + + /* Then the RB tree itself */ + intlist__delete(etmq->traceid_list); + free(etmq); } @@ -885,19 +934,12 @@ static void cs_etm__free_events(struct perf_session *session) static void cs_etm__free(struct perf_session *session) { int i; - struct int_node *inode, *tmp; struct cs_etm_auxtrace *aux = container_of(session->auxtrace, struct cs_etm_auxtrace, auxtrace); cs_etm__free_events(session); session->auxtrace = NULL; - /* First remove all traceID/metadata nodes for the RB tree */ - intlist__for_each_entry_safe(inode, tmp, traceid_list) - intlist__remove(traceid_list, inode); - /* Then the RB tree itself */ - intlist__delete(traceid_list); - for (i = 0; i < aux->num_cpu; i++) zfree(&aux->metadata[i]); @@ -1055,9 +1097,24 @@ static struct cs_etm_queue *cs_etm__alloc_queue(void) etmq->traceid_queues_list = intlist__new(NULL); if (!etmq->traceid_queues_list) - free(etmq); + goto out_free; + + /* + * Create an RB tree for traceID-metadata tuple. Since the conversion + * has to be made for each packet that gets decoded, optimizing access + * in anything other than a sequential array is worth doing. + */ + etmq->traceid_list = intlist__new(NULL); + if (!etmq->traceid_list) + goto out_free; return etmq; + +out_free: + intlist__delete(etmq->traceid_queues_list); + free(etmq); + + return NULL; } static int cs_etm__setup_queue(struct cs_etm_auxtrace *etm, @@ -2182,7 +2239,7 @@ static int cs_etm__set_sample_flags(struct cs_etm_queue *etmq, PERF_IP_FLAG_TRACE_END; break; case CS_ETM_EXCEPTION: - ret = cs_etm__get_magic(packet->trace_chan_id, &magic); + ret = cs_etm__get_magic(etmq, packet->trace_chan_id, &magic); if (ret) return ret; @@ -3099,7 +3156,8 @@ static bool cs_etm__has_virtual_ts(u64 **metadata, int num_cpu) } /* map trace ids to correct metadata block, from information in metadata */ -static int cs_etm__map_trace_ids_metadata(int num_cpu, u64 **metadata) +static int cs_etm__map_trace_ids_metadata(struct cs_etm_auxtrace *etm, int num_cpu, + u64 **metadata) { u64 cs_etm_magic; u8 trace_chan_id; @@ -3121,7 +3179,7 @@ static int cs_etm__map_trace_ids_metadata(int num_cpu, u64 **metadata) /* unknown magic number */ return -EINVAL; } - err = cs_etm__map_trace_id(trace_chan_id, metadata[i]); + err = cs_etm__map_trace_id_v0(etm, trace_chan_id, metadata[i]); if (err) return err; } @@ -3252,23 +3310,12 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, u64 *ptr = NULL; u64 **metadata = NULL; - /* - * Create an RB tree for traceID-metadata tuple. Since the conversion - * has to be made for each packet that gets decoded, optimizing access - * in anything other than a sequential array is worth doing. 
- */ - traceid_list = intlist__new(NULL); - if (!traceid_list) - return -ENOMEM; - /* First the global part */ ptr = (u64 *) auxtrace_info->priv; num_cpu = ptr[CS_PMU_TYPE_CPUS] & 0xffffffff; metadata = zalloc(sizeof(*metadata) * num_cpu); - if (!metadata) { - err = -ENOMEM; - goto err_free_traceid_list; - } + if (!metadata) + return -ENOMEM; /* Start parsing after the common part of the header */ i = CS_HEADER_VERSION_MAX; @@ -3447,7 +3494,7 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, err = cs_etm__clear_unused_trace_ids_metadata(num_cpu, metadata); /* otherwise, this is a file with metadata values only, map from metadata */ else - err = cs_etm__map_trace_ids_metadata(num_cpu, metadata); + err = cs_etm__map_trace_ids_metadata(etm, num_cpu, metadata); if (err) goto err_free_queues; @@ -3469,7 +3516,5 @@ err_free_metadata: for (int j = 0; j < num_cpu; j++) zfree(&metadata[j]); zfree(&metadata); -err_free_traceid_list: - intlist__delete(traceid_list); return err; } diff --git a/tools/perf/util/cs-etm.h b/tools/perf/util/cs-etm.h index 4696267a32f0..f4f69f7cc0f3 100644 --- a/tools/perf/util/cs-etm.h +++ b/tools/perf/util/cs-etm.h @@ -252,7 +252,7 @@ enum cs_etm_pid_fmt { #ifdef HAVE_CSTRACE_SUPPORT #include -int cs_etm__get_cpu(u8 trace_chan_id, int *cpu); +int cs_etm__get_cpu(struct cs_etm_queue *etmq, u8 trace_chan_id, int *cpu); enum cs_etm_pid_fmt cs_etm__get_pid_fmt(struct cs_etm_queue *etmq); int cs_etm__etmq_set_tid_el(struct cs_etm_queue *etmq, pid_t tid, u8 trace_chan_id, ocsd_ex_level el); -- cgit v1.2.3 From 19c3e4db38c5bf30c7e7b53dad5a464d7031dec4 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 22 Jul 2024 11:11:46 +0100 Subject: perf: cs-etm: Create decoders based on the trace ID mappings Now that each queue has a unique set of trace ID mappings, use this list to create the decoders. In unformatted mode just add a single mapping so only one decoder is made. Previously each queue would have a decoder created for each traced CPU on the system but this won't work anymore because CPUs can have overlapping trace IDs. This also means that the CORESIGHT_TRACE_ID_UNUSED_FLAG isn't needed any more. If mappings aren't added then decoders aren't created, rather than needing a flag to suppress creation. 
Reviewed-by: Mike Leach Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Anshuman Khandual Cc: Ganapatrao Kulkarni Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Maxime Coquelin Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Link: https://lore.kernel.org/r/20240722101202.26915-5-james.clark@linaro.org Signed-off-by: James Clark Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm/util/cs-etm.c | 8 +- tools/perf/util/cs-etm-decoder/cs-etm-decoder.c | 4 - tools/perf/util/cs-etm.c | 155 ++++++++---------------- tools/perf/util/cs-etm.h | 10 -- 4 files changed, 55 insertions(+), 122 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index 96aeb7cdbee1..3bbc1e57d71d 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -654,8 +654,7 @@ static void cs_etm_save_etmv4_header(__u64 data[], struct auxtrace_record *itr, /* Get trace configuration register */ data[CS_ETMV4_TRCCONFIGR] = cs_etmv4_get_config(itr); /* traceID set to legacy version, in case new perf running on older system */ - data[CS_ETMV4_TRCTRACEIDR] = cs_etm_get_legacy_trace_id(cpu) | - CORESIGHT_TRACE_ID_UNUSED_FLAG; + data[CS_ETMV4_TRCTRACEIDR] = cs_etm_get_legacy_trace_id(cpu); /* Get read-only information from sysFS */ cs_etm_get_ro(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TRCIDR0], @@ -687,7 +686,7 @@ static void cs_etm_save_ete_header(__u64 data[], struct auxtrace_record *itr, st /* Get trace configuration register */ data[CS_ETE_TRCCONFIGR] = cs_etmv4_get_config(itr); /* traceID set to legacy version, in case new perf running on older system */ - data[CS_ETE_TRCTRACEIDR] = cs_etm_get_legacy_trace_id(cpu) | CORESIGHT_TRACE_ID_UNUSED_FLAG; + data[CS_ETE_TRCTRACEIDR] = cs_etm_get_legacy_trace_id(cpu); /* Get read-only information from sysFS */ cs_etm_get_ro(cs_etm_pmu, cpu, metadata_ete_ro[CS_ETE_TRCIDR0], &data[CS_ETE_TRCIDR0]); @@ -743,8 +742,7 @@ static void cs_etm_get_metadata(struct perf_cpu cpu, u32 *offset, /* Get configuration register */ info->priv[*offset + CS_ETM_ETMCR] = cs_etm_get_config(itr); /* traceID set to legacy value in case new perf running on old system */ - info->priv[*offset + CS_ETM_ETMTRACEIDR] = cs_etm_get_legacy_trace_id(cpu) | - CORESIGHT_TRACE_ID_UNUSED_FLAG; + info->priv[*offset + CS_ETM_ETMTRACEIDR] = cs_etm_get_legacy_trace_id(cpu); /* Get read-only information from sysFS */ cs_etm_get_ro(cs_etm_pmu, cpu, metadata_etmv3_ro[CS_ETM_ETMCCER], &info->priv[*offset + CS_ETM_ETMCCER]); diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c index 0c9c48cedbf1..d49c3e9c7c21 100644 --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c @@ -684,10 +684,6 @@ cs_etm_decoder__create_etm_decoder(struct cs_etm_decoder_params *d_params, return -1; } - /* if the CPU has no trace ID associated, no decoder needed */ - if (csid == CORESIGHT_TRACE_ID_UNUSED_VAL) - return 0; - if (d_params->operation == CS_ETM_OPERATION_DECODE) { if (ocsd_dt_create_decoder(decoder->dcd_tree, decoder->decoder_name, diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 0f56618fc7a8..01c60a660fb3 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -348,7 +348,6 @@ static int cs_etm__metadata_get_trace_id(u8 *trace_chan_id, u64 *cpu_metadata) /* * update 
metadata trace ID from the value found in the AUX_HW_INFO packet. - * This will also clear the CORESIGHT_TRACE_ID_UNUSED_FLAG flag if present. */ static int cs_etm__metadata_set_trace_id(u8 trace_chan_id, u64 *cpu_metadata) { @@ -700,80 +699,58 @@ static void cs_etm__packet_dump(const char *pkt_string) } static void cs_etm__set_trace_param_etmv3(struct cs_etm_trace_params *t_params, - struct cs_etm_auxtrace *etm, int t_idx, - int m_idx, u32 etmidr) + u64 *metadata, u32 etmidr) { - u64 **metadata = etm->metadata; - - t_params[t_idx].protocol = cs_etm__get_v7_protocol_version(etmidr); - t_params[t_idx].etmv3.reg_ctrl = metadata[m_idx][CS_ETM_ETMCR]; - t_params[t_idx].etmv3.reg_trc_id = metadata[m_idx][CS_ETM_ETMTRACEIDR]; + t_params->protocol = cs_etm__get_v7_protocol_version(etmidr); + t_params->etmv3.reg_ctrl = metadata[CS_ETM_ETMCR]; + t_params->etmv3.reg_trc_id = metadata[CS_ETM_ETMTRACEIDR]; } static void cs_etm__set_trace_param_etmv4(struct cs_etm_trace_params *t_params, - struct cs_etm_auxtrace *etm, int t_idx, - int m_idx) + u64 *metadata) { - u64 **metadata = etm->metadata; - - t_params[t_idx].protocol = CS_ETM_PROTO_ETMV4i; - t_params[t_idx].etmv4.reg_idr0 = metadata[m_idx][CS_ETMV4_TRCIDR0]; - t_params[t_idx].etmv4.reg_idr1 = metadata[m_idx][CS_ETMV4_TRCIDR1]; - t_params[t_idx].etmv4.reg_idr2 = metadata[m_idx][CS_ETMV4_TRCIDR2]; - t_params[t_idx].etmv4.reg_idr8 = metadata[m_idx][CS_ETMV4_TRCIDR8]; - t_params[t_idx].etmv4.reg_configr = metadata[m_idx][CS_ETMV4_TRCCONFIGR]; - t_params[t_idx].etmv4.reg_traceidr = metadata[m_idx][CS_ETMV4_TRCTRACEIDR]; + t_params->protocol = CS_ETM_PROTO_ETMV4i; + t_params->etmv4.reg_idr0 = metadata[CS_ETMV4_TRCIDR0]; + t_params->etmv4.reg_idr1 = metadata[CS_ETMV4_TRCIDR1]; + t_params->etmv4.reg_idr2 = metadata[CS_ETMV4_TRCIDR2]; + t_params->etmv4.reg_idr8 = metadata[CS_ETMV4_TRCIDR8]; + t_params->etmv4.reg_configr = metadata[CS_ETMV4_TRCCONFIGR]; + t_params->etmv4.reg_traceidr = metadata[CS_ETMV4_TRCTRACEIDR]; } static void cs_etm__set_trace_param_ete(struct cs_etm_trace_params *t_params, - struct cs_etm_auxtrace *etm, int t_idx, - int m_idx) + u64 *metadata) { - u64 **metadata = etm->metadata; - - t_params[t_idx].protocol = CS_ETM_PROTO_ETE; - t_params[t_idx].ete.reg_idr0 = metadata[m_idx][CS_ETE_TRCIDR0]; - t_params[t_idx].ete.reg_idr1 = metadata[m_idx][CS_ETE_TRCIDR1]; - t_params[t_idx].ete.reg_idr2 = metadata[m_idx][CS_ETE_TRCIDR2]; - t_params[t_idx].ete.reg_idr8 = metadata[m_idx][CS_ETE_TRCIDR8]; - t_params[t_idx].ete.reg_configr = metadata[m_idx][CS_ETE_TRCCONFIGR]; - t_params[t_idx].ete.reg_traceidr = metadata[m_idx][CS_ETE_TRCTRACEIDR]; - t_params[t_idx].ete.reg_devarch = metadata[m_idx][CS_ETE_TRCDEVARCH]; + t_params->protocol = CS_ETM_PROTO_ETE; + t_params->ete.reg_idr0 = metadata[CS_ETE_TRCIDR0]; + t_params->ete.reg_idr1 = metadata[CS_ETE_TRCIDR1]; + t_params->ete.reg_idr2 = metadata[CS_ETE_TRCIDR2]; + t_params->ete.reg_idr8 = metadata[CS_ETE_TRCIDR8]; + t_params->ete.reg_configr = metadata[CS_ETE_TRCCONFIGR]; + t_params->ete.reg_traceidr = metadata[CS_ETE_TRCTRACEIDR]; + t_params->ete.reg_devarch = metadata[CS_ETE_TRCDEVARCH]; } static int cs_etm__init_trace_params(struct cs_etm_trace_params *t_params, - struct cs_etm_auxtrace *etm, - enum cs_etm_format format, - int sample_cpu, - int decoders) -{ - int t_idx, m_idx; - u32 etmidr; - u64 architecture; - - for (t_idx = 0; t_idx < decoders; t_idx++) { - if (format == FORMATTED) - m_idx = t_idx; - else { - m_idx = get_cpu_data_idx(etm, sample_cpu); - if (m_idx == -1) { - pr_warning("CS_ETM: 
unknown CPU, falling back to first metadata\n"); - m_idx = 0; - } - } + struct cs_etm_queue *etmq) +{ + struct int_node *inode; - architecture = etm->metadata[m_idx][CS_ETM_MAGIC]; + intlist__for_each_entry(inode, etmq->traceid_list) { + u64 *metadata = inode->priv; + u64 architecture = metadata[CS_ETM_MAGIC]; + u32 etmidr; switch (architecture) { case __perf_cs_etmv3_magic: - etmidr = etm->metadata[m_idx][CS_ETM_ETMIDR]; - cs_etm__set_trace_param_etmv3(t_params, etm, t_idx, m_idx, etmidr); + etmidr = metadata[CS_ETM_ETMIDR]; + cs_etm__set_trace_param_etmv3(t_params++, metadata, etmidr); break; case __perf_cs_etmv4_magic: - cs_etm__set_trace_param_etmv4(t_params, etm, t_idx, m_idx); + cs_etm__set_trace_param_etmv4(t_params++, metadata); break; case __perf_cs_ete_magic: - cs_etm__set_trace_param_ete(t_params, etm, t_idx, m_idx); + cs_etm__set_trace_param_ete(t_params++, metadata); break; default: return -EINVAL; @@ -3186,35 +3163,6 @@ static int cs_etm__map_trace_ids_metadata(struct cs_etm_auxtrace *etm, int num_c return 0; } -/* - * If we found AUX_HW_ID packets, then set any metadata marked as unused to the - * unused value to reduce the number of unneeded decoders created. - */ -static int cs_etm__clear_unused_trace_ids_metadata(int num_cpu, u64 **metadata) -{ - u64 cs_etm_magic; - int i; - - for (i = 0; i < num_cpu; i++) { - cs_etm_magic = metadata[i][CS_ETM_MAGIC]; - switch (cs_etm_magic) { - case __perf_cs_etmv3_magic: - if (metadata[i][CS_ETM_ETMTRACEIDR] & CORESIGHT_TRACE_ID_UNUSED_FLAG) - metadata[i][CS_ETM_ETMTRACEIDR] = CORESIGHT_TRACE_ID_UNUSED_VAL; - break; - case __perf_cs_etmv4_magic: - case __perf_cs_ete_magic: - if (metadata[i][CS_ETMV4_TRCTRACEIDR] & CORESIGHT_TRACE_ID_UNUSED_FLAG) - metadata[i][CS_ETMV4_TRCTRACEIDR] = CORESIGHT_TRACE_ID_UNUSED_VAL; - break; - default: - /* unknown magic number */ - return -EINVAL; - } - } - return 0; -} - /* * Use the data gathered by the peeks for HW_ID (trace ID mappings) and AUX * (formatted or not) packets to create the decoders. @@ -3222,21 +3170,26 @@ static int cs_etm__clear_unused_trace_ids_metadata(int num_cpu, u64 **metadata) static int cs_etm__create_queue_decoders(struct cs_etm_queue *etmq) { struct cs_etm_decoder_params d_params; + struct cs_etm_trace_params *t_params; + int decoders = intlist__nr_entries(etmq->traceid_list); + + if (decoders == 0) + return 0; /* * Each queue can only contain data from one CPU when unformatted, so only one decoder is * needed. */ - int decoders = etmq->format == FORMATTED ? etmq->etm->num_cpu : 1; + if (etmq->format == UNFORMATTED) + assert(decoders == 1); /* Use metadata to fill in trace parameters for trace decoder */ - struct cs_etm_trace_params *t_params = zalloc(sizeof(*t_params) * decoders); + t_params = zalloc(sizeof(*t_params) * decoders); if (!t_params) goto out_free; - if (cs_etm__init_trace_params(t_params, etmq->etm, etmq->format, - etmq->queue_nr, decoders)) + if (cs_etm__init_trace_params(t_params, etmq)) goto out_free; /* Set decoder parameters to decode trace packets */ @@ -3462,9 +3415,9 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, /* * Map Trace ID values to CPU metadata. * - * Trace metadata will always contain Trace ID values from the legacy algorithm. If the - * files has been recorded by a "new" perf updated to handle AUX_HW_ID then the metadata - * ID value will also have the CORESIGHT_TRACE_ID_UNUSED_FLAG set. 
+ * Trace metadata will always contain Trace ID values from the legacy algorithm + * in case it's read by a version of Perf that doesn't know about HW_ID packets + * or the kernel doesn't emit them. * * The updated kernel drivers that use AUX_HW_ID to sent Trace IDs will attempt to use * the same IDs as the old algorithm as far as is possible, unless there are clashes @@ -3473,12 +3426,11 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, * * For a perf able to interpret AUX_HW_ID packets we first check for the presence of * those packets. If they are there then the values will be mapped and plugged into - * the metadata. We then set any remaining metadata values with the used flag to a - * value CORESIGHT_TRACE_ID_UNUSED_VAL - which indicates no decoder is required. + * the metadata and decoders are only created for each mapping received. * * If no AUX_HW_ID packets are present - which means a file recorded on an old kernel - * then we map Trace ID values to CPU directly from the metadata - clearing any unused - * flags if present. + * then we map Trace ID values to CPU directly from the metadata and create decoders + * for all mappings. */ /* Scan for AUX_OUTPUT_HW_ID records to map trace ID values to CPU metadata */ @@ -3489,15 +3441,12 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, if (err) goto err_free_queues; - /* if HW ID found then clear any unused metadata ID values */ - if (aux_hw_id_found) - err = cs_etm__clear_unused_trace_ids_metadata(num_cpu, metadata); - /* otherwise, this is a file with metadata values only, map from metadata */ - else + /* if no HW ID found this is a file with metadata values only, map from metadata */ + if (!aux_hw_id_found) { err = cs_etm__map_trace_ids_metadata(etm, num_cpu, metadata); - - if (err) - goto err_free_queues; + if (err) + goto err_free_queues; + } err = cs_etm__create_decoders(etm); if (err) diff --git a/tools/perf/util/cs-etm.h b/tools/perf/util/cs-etm.h index f4f69f7cc0f3..a8caeea720aa 100644 --- a/tools/perf/util/cs-etm.h +++ b/tools/perf/util/cs-etm.h @@ -230,16 +230,6 @@ struct cs_etm_packet_queue { /* CoreSight trace ID is currently the bottom 7 bits of the value */ #define CORESIGHT_TRACE_ID_VAL_MASK GENMASK(6, 0) -/* - * perf record will set the legacy meta data values as unused initially. - * This allows perf report to manage the decoders created when dynamic - * allocation in operation. - */ -#define CORESIGHT_TRACE_ID_UNUSED_FLAG BIT(31) - -/* Value to set for unused trace ID values */ -#define CORESIGHT_TRACE_ID_UNUSED_VAL 0x7F - int cs_etm__process_auxtrace_info(union perf_event *event, struct perf_session *session); void cs_etm_get_default_config(const struct perf_pmu *pmu, struct perf_event_attr *attr); -- cgit v1.2.3 From 940007cee539fd07c7c274030f7f7252f8c5a5d7 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 22 Jul 2024 11:11:47 +0100 Subject: perf: cs-etm: Only save valid trace IDs into files This isn't a bug because Perf always masks with CORESIGHT_TRACE_ID_VAL_MASK before using these values, but to avoid it looking like it could be, make an effort to not save bad values. 
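The arithmetic behind the wrap, as a small sketch; it assumes the legacy scheme is 0x10 + 2 * cpu and that IDs 0x70-0x7f are reserved (check coresight-pmu.h before relying on either):

  #include <stdio.h>

  #define LEGACY_TRACE_ID(cpu)    (0x10 + ((cpu) % 48) * 2)

  int main(void)
  {
          printf("cpu 47 -> 0x%x\n", LEGACY_TRACE_ID(47)); /* 0x6e, last valid */
          printf("cpu 48 -> 0x%x\n", LEGACY_TRACE_ID(48)); /* wraps to 0x10 */
          /* without the wrap, cpu 48 would yield 0x70: a reserved ID */
          return 0;
  }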
Reviewed-by: Mike Leach Signed-off-by: James Clark Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Anshuman Khandual Cc: Ganapatrao Kulkarni Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Maxime Coquelin Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Link: https://lore.kernel.org/r/20240722101202.26915-6-james.clark@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm/util/cs-etm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index 3bbc1e57d71d..ea891d12f8f4 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -643,7 +643,8 @@ static bool cs_etm_is_ete(struct perf_pmu *cs_etm_pmu, struct perf_cpu cpu) static __u64 cs_etm_get_legacy_trace_id(struct perf_cpu cpu) { - return CORESIGHT_LEGACY_CPU_TRACE_ID(cpu.cpu); + /* Wrap at 48 so that invalid trace IDs aren't saved into files. */ + return CORESIGHT_LEGACY_CPU_TRACE_ID(cpu.cpu % 48); } static void cs_etm_save_etmv4_header(__u64 data[], struct auxtrace_record *itr, struct perf_cpu cpu) -- cgit v1.2.3 From 1506af6db8c4abbe3d5dd573ec72b90f81abfcf7 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 22 Jul 2024 11:11:48 +0100 Subject: perf: cs-etm: Support version 0.1 of HW_ID packets v0.1 HW_ID packets have a new field that describes which sink each CPU writes to. Use the sink ID to link trace ID maps to each other so that mappings are shared wherever the sink is shared. Also update the error message to show that overlapping IDs aren't an error in per-thread mode, just not supported. In the future we can use the CPU ID from the AUX records, or watch for changing sink IDs on HW_ID packets to use the correct decoders. Reviewed-by: Mike Leach Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Anshuman Khandual Cc: Ganapatrao Kulkarni Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Maxime Coquelin Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Link: https://lore.kernel.org/r/20240722101202.26915-7-james.clark@linaro.org Signed-off-by: James Clark Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/linux/coresight-pmu.h | 17 ++++-- tools/perf/util/cs-etm.c | 100 ++++++++++++++++++++++++++++++++---- 2 files changed, 103 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/include/linux/coresight-pmu.h b/tools/include/linux/coresight-pmu.h index 51ac441a37c3..89b0ac0014b0 100644 --- a/tools/include/linux/coresight-pmu.h +++ b/tools/include/linux/coresight-pmu.h @@ -49,12 +49,21 @@ * Interpretation of the PERF_RECORD_AUX_OUTPUT_HW_ID payload. * Used to associate a CPU with the CoreSight Trace ID. * [07:00] - Trace ID - uses 8 bits to make value easy to read in file. - * [59:08] - Unused (SBZ) - * [63:60] - Version + * [39:08] - Sink ID - as reported in /sys/bus/event_source/devices/cs_etm/sinks/ + * Added in minor version 1. + * [55:40] - Unused (SBZ) + * [59:56] - Minor Version - previously existing fields are compatible with + * all minor versions. + * [63:60] - Major Version - previously existing fields mean different things + * in new major versions. 
*/ #define CS_AUX_HW_ID_TRACE_ID_MASK GENMASK_ULL(7, 0) -#define CS_AUX_HW_ID_VERSION_MASK GENMASK_ULL(63, 60) +#define CS_AUX_HW_ID_SINK_ID_MASK GENMASK_ULL(39, 8) -#define CS_AUX_HW_ID_CURR_VERSION 0 +#define CS_AUX_HW_ID_MINOR_VERSION_MASK GENMASK_ULL(59, 56) +#define CS_AUX_HW_ID_MAJOR_VERSION_MASK GENMASK_ULL(63, 60) + +#define CS_AUX_HW_ID_MAJOR_VERSION 0 +#define CS_AUX_HW_ID_MINOR_VERSION 1 #endif diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 01c60a660fb3..775d4f35c270 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -118,6 +118,12 @@ struct cs_etm_queue { struct cs_etm_traceid_queue **traceid_queues; /* Conversion between traceID and metadata pointers */ struct intlist *traceid_list; + /* + * Same as traceid_list, but traceid_list may be a reference to another + * queue's which has a matching sink ID. + */ + struct intlist *own_traceid_list; + u32 sink_id; }; static int cs_etm__process_timestamped_queues(struct cs_etm_auxtrace *etm); @@ -142,6 +148,7 @@ static int cs_etm__metadata_set_trace_id(u8 trace_chan_id, u64 *cpu_metadata); (queue_nr << 16 | trace_chan_id) #define TO_QUEUE_NR(cs_queue_nr) (cs_queue_nr >> 16) #define TO_TRACE_CHAN_ID(cs_queue_nr) (cs_queue_nr & 0x0000ffff) +#define SINK_UNSET ((u32) -1) static u32 cs_etm__get_v7_protocol_version(u32 etmidr) { @@ -241,7 +248,16 @@ static int cs_etm__insert_trace_id_node(struct cs_etm_queue *etmq, int err; if (curr_cpu_data[CS_ETM_CPU] != cpu_metadata[CS_ETM_CPU]) { - pr_err("CS_ETM: map mismatch between HW_ID packet CPU and Trace ID\n"); + /* + * With > CORESIGHT_TRACE_IDS_MAX ETMs, overlapping IDs + * are expected (but not supported) in per-thread mode, + * rather than signifying an error. + */ + if (etmq->etm->per_thread_decoding) + pr_err("CS_ETM: overlapping Trace IDs aren't currently supported in per-thread mode\n"); + else + pr_err("CS_ETM: map mismatch between HW_ID packet CPU and Trace ID\n"); + return -EINVAL; } @@ -326,6 +342,64 @@ static int cs_etm__process_trace_id_v0(struct cs_etm_auxtrace *etm, int cpu, return cs_etm__metadata_set_trace_id(trace_chan_id, cpu_data); } +static int cs_etm__process_trace_id_v0_1(struct cs_etm_auxtrace *etm, int cpu, + u64 hw_id) +{ + struct cs_etm_queue *etmq = cs_etm__get_queue(etm, cpu); + int ret; + u64 *cpu_data; + u32 sink_id = FIELD_GET(CS_AUX_HW_ID_SINK_ID_MASK, hw_id); + u8 trace_id = FIELD_GET(CS_AUX_HW_ID_TRACE_ID_MASK, hw_id); + + /* + * Check sink id hasn't changed in per-cpu mode. In per-thread mode, + * let it pass for now until an actual overlapping trace ID is hit. In + * most cases IDs won't overlap even if the sink changes. 
+ */ + if (!etmq->etm->per_thread_decoding && etmq->sink_id != SINK_UNSET && + etmq->sink_id != sink_id) { + pr_err("CS_ETM: mismatch between sink IDs\n"); + return -EINVAL; + } + + etmq->sink_id = sink_id; + + /* Find which other queues use this sink and link their ID maps */ + for (unsigned int i = 0; i < etm->queues.nr_queues; ++i) { + struct cs_etm_queue *other_etmq = etm->queues.queue_array[i].priv; + + /* Different sinks, skip */ + if (other_etmq->sink_id != etmq->sink_id) + continue; + + /* Already linked, skip */ + if (other_etmq->traceid_list == etmq->traceid_list) + continue; + + /* At the point of first linking, this one should be empty */ + if (!intlist__empty(etmq->traceid_list)) { + pr_err("CS_ETM: Can't link populated trace ID lists\n"); + return -EINVAL; + } + + etmq->own_traceid_list = NULL; + intlist__delete(etmq->traceid_list); + etmq->traceid_list = other_etmq->traceid_list; + break; + } + + cpu_data = get_cpu_data(etm, cpu); + ret = cs_etm__insert_trace_id_node(etmq, trace_id, cpu_data); + if (ret) + return ret; + + ret = cs_etm__metadata_set_trace_id(trace_id, cpu_data); + if (ret) + return ret; + + return 0; +} + static int cs_etm__metadata_get_trace_id(u8 *trace_chan_id, u64 *cpu_metadata) { u64 cs_etm_magic = cpu_metadata[CS_ETM_MAGIC]; @@ -414,10 +488,10 @@ static int cs_etm__process_aux_output_hw_id(struct perf_session *session, /* extract and parse the HW ID */ hw_id = event->aux_output_hw_id.hw_id; - version = FIELD_GET(CS_AUX_HW_ID_VERSION_MASK, hw_id); + version = FIELD_GET(CS_AUX_HW_ID_MAJOR_VERSION_MASK, hw_id); /* check that we can handle this version */ - if (version > CS_AUX_HW_ID_CURR_VERSION) { + if (version > CS_AUX_HW_ID_MAJOR_VERSION) { pr_err("CS ETM Trace: PERF_RECORD_AUX_OUTPUT_HW_ID version %d not supported. Please update Perf.\n", version); return -EINVAL; @@ -442,7 +516,10 @@ static int cs_etm__process_aux_output_hw_id(struct perf_session *session, return -EINVAL; } - return cs_etm__process_trace_id_v0(etm, cpu, hw_id); + if (FIELD_GET(CS_AUX_HW_ID_MINOR_VERSION_MASK, hw_id) == 0) + return cs_etm__process_trace_id_v0(etm, cpu, hw_id); + + return cs_etm__process_trace_id_v0_1(etm, cpu, hw_id); } void cs_etm__etmq_set_traceid_queue_timestamp(struct cs_etm_queue *etmq, @@ -882,12 +959,14 @@ static void cs_etm__free_queue(void *priv) cs_etm_decoder__free(etmq->decoder); cs_etm__free_traceid_queues(etmq); - /* First remove all traceID/metadata nodes for the RB tree */ - intlist__for_each_entry_safe(inode, tmp, etmq->traceid_list) - intlist__remove(etmq->traceid_list, inode); + if (etmq->own_traceid_list) { + /* First remove all traceID/metadata nodes for the RB tree */ + intlist__for_each_entry_safe(inode, tmp, etmq->own_traceid_list) + intlist__remove(etmq->own_traceid_list, inode); - /* Then the RB tree itself */ - intlist__delete(etmq->traceid_list); + /* Then the RB tree itself */ + intlist__delete(etmq->own_traceid_list); + } free(etmq); } @@ -1081,7 +1160,7 @@ static struct cs_etm_queue *cs_etm__alloc_queue(void) * has to be made for each packet that gets decoded, optimizing access * in anything other than a sequential array is worth doing. 
*/ - etmq->traceid_list = intlist__new(NULL); + etmq->traceid_list = etmq->own_traceid_list = intlist__new(NULL); if (!etmq->traceid_list) goto out_free; @@ -1113,6 +1192,7 @@ static int cs_etm__setup_queue(struct cs_etm_auxtrace *etm, etmq->queue_nr = queue_nr; queue->cpu = queue_nr; /* Placeholder, may be reset to -1 in per-thread mode */ etmq->offset = 0; + etmq->sink_id = SINK_UNSET; return 0; } -- cgit v1.2.3 From 022aa67b5ab9077d8f5bf02df5a7f0f2d4dfb909 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 22 Jul 2024 11:11:49 +0100 Subject: perf: cs-etm: Print queue number in raw trace dump Now that we have overlapping trace IDs it's also useful to know what the queue number is to be able to distinguish the source of the trace so print it inline. Hide it behind the -v option because it might not be obvious to users what the queue number is. Reviewed-by: Mike Leach Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Anshuman Khandual Cc: Ganapatrao Kulkarni Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Maxime Coquelin Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Link: https://lore.kernel.org/r/20240722101202.26915-8-james.clark@linaro.org Signed-off-by: James Clark Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cs-etm-decoder/cs-etm-decoder.c | 4 ++-- tools/perf/util/cs-etm-decoder/cs-etm-decoder.h | 2 +- tools/perf/util/cs-etm.c | 13 ++++++++++--- 3 files changed, 13 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c index d49c3e9c7c21..b78ef0262135 100644 --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c @@ -41,7 +41,7 @@ const u32 INSTR_PER_NS = 10; struct cs_etm_decoder { void *data; - void (*packet_printer)(const char *msg); + void (*packet_printer)(const char *msg, void *data); bool suppress_printing; dcd_tree_handle_t dcd_tree; cs_etm_mem_cb_type mem_access; @@ -202,7 +202,7 @@ static void cs_etm_decoder__print_str_cb(const void *p_context, const struct cs_etm_decoder *decoder = p_context; if (p_context && str_len && !decoder->suppress_printing) - decoder->packet_printer(msg); + decoder->packet_printer(msg, decoder->data); } static int diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h index 272c2efe78ee..12c782fa6db2 100644 --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h @@ -60,7 +60,7 @@ struct cs_etm_trace_params { struct cs_etm_decoder_params { int operation; - void (*packet_printer)(const char *msg); + void (*packet_printer)(const char *msg, void *data); cs_etm_mem_cb_type mem_acc_cb; bool formatted; bool fsyncs; diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 775d4f35c270..90f32f327b9b 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -762,15 +762,22 @@ static void cs_etm__packet_swap(struct cs_etm_auxtrace *etm, } } -static void cs_etm__packet_dump(const char *pkt_string) +static void cs_etm__packet_dump(const char *pkt_string, void *data) { const char *color = PERF_COLOR_BLUE; int len = strlen(pkt_string); + struct cs_etm_queue *etmq = data; + char queue_nr[64]; + + if (verbose) + snprintf(queue_nr, sizeof(queue_nr), "Qnr:%d; ", etmq->queue_nr); + else + queue_nr[0] = '\0'; if (len && 
(pkt_string[len-1] == '\n')) - color_fprintf(stdout, color, " %s", pkt_string); + color_fprintf(stdout, color, " %s%s", queue_nr, pkt_string); else - color_fprintf(stdout, color, " %s\n", pkt_string); + color_fprintf(stdout, color, " %s%s\n", queue_nr, pkt_string); fflush(stdout); } -- cgit v1.2.3 From d9c993100ef1906bd1f25bab789390828ef10a41 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 29 Aug 2024 08:01:48 -0700 Subject: perf session: Document 'struct perf_session' and constify its 'auxtrace' member perf_session is a central data structure to the tool so let's comment it. The auxtrace callbacks are never modified in session so constify. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Nick Terrell Cc: Peter Zijlstra Cc: Yanteng Si Cc: Yicong Yang Link: https://lore.kernel.org/r/20240829150154.37929-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/session.h | 48 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index 7c8dd6956330..e56518639711 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -26,26 +26,72 @@ struct decomp_data { struct zstd_data *zstd_decomp; }; +/** + * struct perf_session- A Perf session holds the main state when the program is + * working with live perf events or reading data from an input file. + * + * The rough organization of a perf_session is: + * ``` + * +--------------+ +-----------+ +------------+ + * | Session |1..* ----->| Machine |1..* ----->| Thread | + * +--------------+ +-----------+ +------------+ + * ``` + */ struct perf_session { + /** + * @header: The read version of a perf_file_header, or captures global + * information from a live session. + */ struct perf_header header; + /** @machines: Machines within the session a host and 0 or more guests. */ struct machines machines; + /** @evlist: List of evsels/events of the session. */ struct evlist *evlist; - struct auxtrace *auxtrace; + /** @auxtrace: callbacks to allow AUX area data decoding. */ + const struct auxtrace *auxtrace; + /** @itrace_synth_opts: AUX area tracing synthesis options. */ struct itrace_synth_opts *itrace_synth_opts; + /** @auxtrace_index: index of AUX area tracing events within a perf.data file. */ struct list_head auxtrace_index; #ifdef HAVE_LIBTRACEEVENT + /** @tevent: handles for libtraceevent and plugins. */ struct trace_event tevent; #endif + /** @time_conv: Holds contents of last PERF_RECORD_TIME_CONV event. */ struct perf_record_time_conv time_conv; + /** + * @repipe: When set causes certain reading (header and trace events) to + * also write events. The written file descriptor must be provided for + * the header but is implicitly stdout for trace events. + */ bool repipe; + /** + * @one_mmap: The reader will use a single mmap by default. There may be + * multiple data files in particular for aux events. If this is true + * then the single big mmap for the data file can be assumed. + */ bool one_mmap; + /** @one_mmap_addr: Address of initial perf data file reader mmap. */ void *one_mmap_addr; + /** @one_mmap_offset: File offset in perf.data file when mapped. */ u64 one_mmap_offset; + /** @ordered_events: Used to turn unordered events into ordered ones. */ struct ordered_events ordered_events; + /** @data: Optional perf data file being read from. 
*/ struct perf_data *data; + /** @tool: callbacks for event handling. */ const struct perf_tool *tool; + /** + * @bytes_transferred: Used by perf record to count written bytes before + * compression. + */ u64 bytes_transferred; + /** + * @bytes_compressed: Used by perf record to count written bytes after + * compression. + */ u64 bytes_compressed; + /** @zstd_data: Owner of global compression state, buffers, etc. */ struct zstd_data zstd_data; struct decomp_data decomp_data; struct decomp_data *active_decomp; -- cgit v1.2.3 From d71bbe799c0cda4a0581f98ea3e4d646a18e799d Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 29 Aug 2024 08:01:49 -0700 Subject: perf header: Add kerneldoc to 'struct perf_file_header' Some of the values are a little strange so add documentation to resolve ambiguity. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Nick Terrell Cc: Peter Zijlstra Cc: Yanteng Si Cc: Yicong Yang Link: https://lore.kernel.org/r/20240829150154.37929-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/header.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index 07ff647197ff..3285981948d7 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -61,14 +61,28 @@ struct perf_file_section { u64 size; }; +/** + * struct perf_file_header: Header representation on disk. + */ struct perf_file_header { + /** @magic: Holds "PERFILE2". */ u64 magic; + /** @size: Size of this header - sizeof(struct perf_file_header). */ u64 size; + /** + * @attr_size: Size of attrs entries - sizeof(struct perf_event_attr) + + * sizeof(struct perf_file_section). + */ u64 attr_size; + /** @attrs: Offset and size of file section holding attributes. */ struct perf_file_section attrs; + /** @data: Offset and size of file section holding regular event data. */ struct perf_file_section data; - /* event_types is ignored */ + /** @event_types: Ignored. */ struct perf_file_section event_types; + /** + * @adds_features: Bitmap of features. The features are immediately after the data section. + */ DECLARE_BITMAP(adds_features, HEADER_FEAT_BITS); }; -- cgit v1.2.3 From 10df481fda13d29c2d9893b7460b8a7095d0ccf5 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 29 Aug 2024 08:01:50 -0700 Subject: perf header: Fail read if header sections overlap Buggy perf.data files can have the attributes and data overlapping. For example, when processing pipe data the attributes aren't known and so file offset header calculations can consider them not present. Later this can cause the attributes to overwrite the data. This can be seen in: $ perf record -o - true > a.data [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.059 MB - ] $ perf inject -i a.data -o b.data $ perf report --stats -i b.data 0x68 [0]: failed to process type: 510379 [Invalid argument] Error: failed to process sample $ This change makes reading the corrupt file fail: $ perf report --stats -i b.data Perf file header corrupt: Attributes and data overlap incompatible file format (rerun with -v to learn more) $ Which is more informative. 
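The overlap test added below is the usual half-open interval check; a distilled sketch (illustrative, not the perf source):

  #include <stdbool.h>
  #include <stdint.h>

  /* two half-open ranges [off, off + size) overlap iff each one starts
   * before the other one ends */
  static bool ranges_overlap(uint64_t a_off, uint64_t a_size,
                             uint64_t b_off, uint64_t b_size)
  {
          return a_off < b_off + b_size && b_off < a_off + a_size;
  }

The patch spells this out as two cases depending on which section starts first, which amounts to the same test for the non-empty sections involved.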
Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Nick Terrell Cc: Peter Zijlstra Cc: Yanteng Si Cc: Yicong Yang Link: https://lore.kernel.org/r/20240829150154.37929-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/header.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 3309fe7f1d12..65c9086610cb 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -3986,6 +3986,24 @@ int perf_file_header__read(struct perf_file_header *header, adds_features)); } + if (header->size > header->attrs.offset) { + pr_err("Perf file header corrupt: header overlaps attrs\n"); + return -1; + } + + if (header->size > header->data.offset) { + pr_err("Perf file header corrupt: header overlaps data\n"); + return -1; + } + + if ((header->attrs.offset <= header->data.offset && + header->attrs.offset + header->attrs.size > header->data.offset) || + (header->attrs.offset > header->data.offset && + header->data.offset + header->data.size > header->attrs.offset)) { + pr_err("Perf file header corrupt: Attributes and data overlap\n"); + return -1; + } + if (header->size != sizeof(*header)) { /* Support the previous format */ if (header->size == offsetof(typeof(*header), adds_features)) -- cgit v1.2.3 From e9a7053da377e19ebc6f1afe55c7f75219aaef79 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 29 Aug 2024 08:01:51 -0700 Subject: perf header: Allow attributes to be written after data With a file, to write data an offset needs to be known. Typically data follows the event attributes in a file. However, if processing a pipe, the number of event attributes may not be known. It is convenient in that case to write the attributes after the data. Expand perf_session__do_write_header() to allow this when the data offset and size are known. This approach may be useful for more than just taking a pipe file to write into a data file; `perf inject --itrace` will reserve an additional 8kb for attributes, which would be unnecessary if the attributes were written after the data.
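A minimal sketch of the underlying pattern (invented on-disk layout, not perf's format): stream the records first, append the attributes wherever the stream ended, then seek back and rewrite the fixed-size header once every offset is known:

  #include <stdio.h>
  #include <stdint.h>

  struct hdr { uint64_t data_off, data_size, attr_off, attr_size; };

  static int write_file(FILE *f, const void *data, size_t dlen,
                        const void *attrs, size_t alen)
  {
          struct hdr h = { .data_off = sizeof(h), .data_size = dlen,
                           .attr_size = alen };

          if (fseek(f, sizeof(h), SEEK_SET))      /* reserve header space */
                  return -1;
          fwrite(data, 1, dlen, f);               /* data comes first */
          h.attr_off = ftell(f);                  /* attrs appended after */
          fwrite(attrs, 1, alen, f);
          if (fseek(f, 0, SEEK_SET))              /* back-patch the header */
                  return -1;
          fwrite(&h, 1, sizeof(h), f);
          return 0;
  }

Readers are unaffected because they locate each section through the header's offset/size pairs rather than by position in the file.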
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Nick Terrell Cc: Peter Zijlstra Cc: Yanteng Si Cc: Yicong Yang Link: https://lore.kernel.org/r/20240829150154.37929-6-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/header.c | 106 ++++++++++++++++++++++++++++++----------------- 1 file changed, 67 insertions(+), 39 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 65c9086610cb..4eb39463067e 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -3676,32 +3676,50 @@ int perf_header__write_pipe(int fd) static int perf_session__do_write_header(struct perf_session *session, struct evlist *evlist, int fd, bool at_exit, - struct feat_copier *fc) + struct feat_copier *fc, + bool write_attrs_after_data) { struct perf_file_header f_header; - struct perf_file_attr f_attr; struct perf_header *header = &session->header; struct evsel *evsel; struct feat_fd ff = { .fd = fd, }; - u64 attr_offset; + u64 attr_offset = sizeof(f_header), attr_size = 0; int err; - lseek(fd, sizeof(f_header), SEEK_SET); + if (write_attrs_after_data && at_exit) { + /* + * Write features at the end of the file first so that + * attributes may come after them. + */ + if (!header->data_offset && header->data_size) { + pr_err("File contains data but offset unknown\n"); + err = -1; + goto err_out; + } + header->feat_offset = header->data_offset + header->data_size; + err = perf_header__adds_write(header, evlist, fd, fc); + if (err < 0) + goto err_out; + attr_offset = lseek(fd, 0, SEEK_CUR); + } else { + lseek(fd, attr_offset, SEEK_SET); + } evlist__for_each_entry(session->evlist, evsel) { - evsel->id_offset = lseek(fd, 0, SEEK_CUR); - err = do_write(&ff, evsel->core.id, evsel->core.ids * sizeof(u64)); - if (err < 0) { - pr_debug("failed to write perf header\n"); - free(ff.buf); - return err; + evsel->id_offset = attr_offset; + /* Avoid writing at the end of the file until the session is exiting. */ + if (!write_attrs_after_data || at_exit) { + err = do_write(&ff, evsel->core.id, evsel->core.ids * sizeof(u64)); + if (err < 0) { + pr_debug("failed to write perf header\n"); + goto err_out; + } } + attr_offset += evsel->core.ids * sizeof(u64); } - attr_offset = lseek(ff.fd, 0, SEEK_CUR); - evlist__for_each_entry(evlist, evsel) { if (evsel->core.attr.size < sizeof(evsel->core.attr)) { /* @@ -3711,40 +3729,46 @@ static int perf_session__do_write_header(struct perf_session *session, */ evsel->core.attr.size = sizeof(evsel->core.attr); } - f_attr = (struct perf_file_attr){ - .attr = evsel->core.attr, - .ids = { - .offset = evsel->id_offset, - .size = evsel->core.ids * sizeof(u64), + /* Avoid writing at the end of the file until the session is exiting. 
*/ + if (!write_attrs_after_data || at_exit) { + struct perf_file_attr f_attr = { + .attr = evsel->core.attr, + .ids = { + .offset = evsel->id_offset, + .size = evsel->core.ids * sizeof(u64), + } + }; + err = do_write(&ff, &f_attr, sizeof(f_attr)); + if (err < 0) { + pr_debug("failed to write perf header attribute\n"); + goto err_out; } - }; - err = do_write(&ff, &f_attr, sizeof(f_attr)); - if (err < 0) { - pr_debug("failed to write perf header attribute\n"); - free(ff.buf); - return err; } + attr_size += sizeof(struct perf_file_attr); } - if (!header->data_offset) - header->data_offset = lseek(fd, 0, SEEK_CUR); + if (!header->data_offset) { + if (write_attrs_after_data) + header->data_offset = sizeof(f_header); + else + header->data_offset = attr_offset + attr_size; + } header->feat_offset = header->data_offset + header->data_size; - if (at_exit) { + if (!write_attrs_after_data && at_exit) { + /* Write features now feat_offset is known. */ err = perf_header__adds_write(header, evlist, fd, fc); - if (err < 0) { - free(ff.buf); - return err; - } + if (err < 0) + goto err_out; } f_header = (struct perf_file_header){ .magic = PERF_MAGIC, .size = sizeof(f_header), - .attr_size = sizeof(f_attr), + .attr_size = sizeof(struct perf_file_attr), .attrs = { .offset = attr_offset, - .size = evlist->core.nr_entries * sizeof(f_attr), + .size = attr_size, }, .data = { .offset = header->data_offset, @@ -3757,21 +3781,24 @@ static int perf_session__do_write_header(struct perf_session *session, lseek(fd, 0, SEEK_SET); err = do_write(&ff, &f_header, sizeof(f_header)); - free(ff.buf); if (err < 0) { pr_debug("failed to write perf header\n"); - return err; + goto err_out; + } else { + lseek(fd, 0, SEEK_END); + err = 0; } - lseek(fd, header->data_offset + header->data_size, SEEK_SET); - - return 0; +err_out: + free(ff.buf); + return err; } int perf_session__write_header(struct perf_session *session, struct evlist *evlist, int fd, bool at_exit) { - return perf_session__do_write_header(session, evlist, fd, at_exit, NULL); + return perf_session__do_write_header(session, evlist, fd, at_exit, /*fc=*/NULL, + /*write_attrs_after_data=*/false); } size_t perf_session__data_offset(const struct evlist *evlist) @@ -3793,7 +3820,8 @@ int perf_session__inject_header(struct perf_session *session, int fd, struct feat_copier *fc) { - return perf_session__do_write_header(session, evlist, fd, true, fc); + return perf_session__do_write_header(session, evlist, fd, true, fc, + /*write_attrs_after_data=*/false); } static int perf_header__getbuffer64(struct perf_header *header, -- cgit v1.2.3 From bd0b4836a2333d5c725397d458c8edfa66f1d9bb Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Tue, 27 Aug 2024 01:13:01 +0000 Subject: selftests/bpf: Make sure stashed kptr in local kptr is freed recursively When dropping a local kptr, any kptr stashed into it is supposed to be freed through bpf_obj_free_fields->__bpf_obj_drop_impl recursively. Add a test to make sure it happens. The test first stashes a referenced kptr to "struct task" into a local kptr and gets the reference count of the task. Then, it drops the local kptr and reads the reference count of the task again. Since bpf_obj_free_fields and __bpf_obj_drop_impl will go through the local kptr recursively during bpf_obj_drop, the dtor of the stashed task kptr should eventually be called. The second reference count should be one less than the first one. 
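Restating the property under test as a plain userspace refcount sketch (illustrative only; the kernel side uses bpf_obj_drop() and the task's rcu_users count):

  #include <assert.h>
  #include <stdlib.h>

  struct obj { int refs; };
  struct container { struct obj *stashed; };

  static void obj_get(struct obj *o) { o->refs++; }
  static void obj_put(struct obj *o) { if (--o->refs == 0) free(o); }

  static void container_drop(struct container *c)
  {
          if (c->stashed)
                  obj_put(c->stashed);    /* the recursive release under test */
          free(c);
  }

  int main(void)
  {
          struct obj *o = calloc(1, sizeof(*o));
          struct container *c = calloc(1, sizeof(*c));

          o->refs = 1;            /* base reference */
          obj_get(o);             /* stashing takes its own reference */
          c->stashed = o;
          container_drop(c);      /* must put the stashed reference */
          assert(o->refs == 1);   /* one less than after the stash */
          obj_put(o);
          return 0;
  }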
Signed-off-by: Amery Hung Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240827011301.608620-1-amery.hung@bytedance.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/task_kfunc_success.c | 30 +++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_success.c b/tools/testing/selftests/bpf/progs/task_kfunc_success.c index 3138bb689b0b..a55149015063 100644 --- a/tools/testing/selftests/bpf/progs/task_kfunc_success.c +++ b/tools/testing/selftests/bpf/progs/task_kfunc_success.c @@ -143,8 +143,9 @@ int BPF_PROG(test_task_acquire_leave_in_map, struct task_struct *task, u64 clone SEC("tp_btf/task_newtask") int BPF_PROG(test_task_xchg_release, struct task_struct *task, u64 clone_flags) { - struct task_struct *kptr; + struct task_struct *kptr, *acquired; struct __tasks_kfunc_map_value *v, *local; + int refcnt, refcnt_after_drop; long status; if (!is_test_kfunc_task()) @@ -190,7 +191,34 @@ int BPF_PROG(test_task_xchg_release, struct task_struct *task, u64 clone_flags) return 0; } + /* Stash a copy into local kptr and check if it is released recursively */ + acquired = bpf_task_acquire(kptr); + if (!acquired) { + err = 7; + bpf_obj_drop(local); + bpf_task_release(kptr); + return 0; + } + bpf_probe_read_kernel(&refcnt, sizeof(refcnt), &acquired->rcu_users); + + acquired = bpf_kptr_xchg(&local->task, acquired); + if (acquired) { + err = 8; + bpf_obj_drop(local); + bpf_task_release(kptr); + bpf_task_release(acquired); + return 0; + } + bpf_obj_drop(local); + + bpf_probe_read_kernel(&refcnt_after_drop, sizeof(refcnt_after_drop), &kptr->rcu_users); + if (refcnt != refcnt_after_drop + 1) { + err = 9; + bpf_task_release(kptr); + return 0; + } + bpf_task_release(kptr); return 0; -- cgit v1.2.3 From 3d8806f37d318b10ddf9fa686821f58bc6301c7b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 28 Aug 2024 10:36:09 -0700 Subject: tools: ynl: error check scanf() in a sample MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Someone reported on GitHub that the YNL NIPA test is failing when run locally. The test builds the tools, and it hits: netdev.c:82:9: warning: ignoring return value of ‘scanf’ declared with attribute ‘warn_unused_result’ [-Wunused-result] 82 | scanf("%d", &ifindex); I can't repro this on my setups but the error seems clear enough.
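For context, the pattern the fix below applies is the usual one for -Wunused-result: check scanf()'s conversion count instead of discarding it. A minimal standalone illustration (not part of the patch):

#include <stdio.h>

int main(void)
{
	int ifindex;

	/* scanf() returns the number of items converted; anything other
	 * than 1 here means the input could not be parsed as an int.
	 */
	if (scanf("%d", &ifindex) != 1) {
		fprintf(stderr, "Error: unable to parse input\n");
		return 1;
	}
	printf("ifindex: %d\n", ifindex);
	return 0;
}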
Link: https://github.com/linux-netdev/nipa/discussions/37 Reviewed-by: Simon Horman Reviewed-by: Nicolas Dichtel Link: https://patch.msgid.link/20240828173609.2951335-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/net/ynl/samples/netdev.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/net/ynl/samples/netdev.c b/tools/net/ynl/samples/netdev.c index 3e7b29bd55d5..22609d44c89a 100644 --- a/tools/net/ynl/samples/netdev.c +++ b/tools/net/ynl/samples/netdev.c @@ -79,7 +79,10 @@ int main(int argc, char **argv) goto err_close; printf("Select ifc ($ifindex; or 0 = dump; or -2 ntf check): "); - scanf("%d", &ifindex); + if (scanf("%d", &ifindex) != 1) { + fprintf(stderr, "Error: unable to parse input\n"); + goto err_destroy; + } if (ifindex > 0) { struct netdev_dev_get_req *req; @@ -119,6 +122,7 @@ int main(int argc, char **argv) err_close: fprintf(stderr, "YNL: %s\n", ys->err.msg); +err_destroy: ynl_sock_destroy(ys); return 2; } -- cgit v1.2.3 From b24d22ac74bfdde2e1146e1838abdc99131b1d19 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Wed, 28 Aug 2024 14:37:52 -0400 Subject: selftests: add selftest for tcp SO_PEEK_OFF support We add a selftest to check that the new feature added in commit 05ea491641d3 ("tcp: add support for SO_PEEK_OFF socket option") works correctly. Reviewed-by: Jason Xing Reviewed-by: Stefano Brivio Tested-by: Stefano Brivio Signed-off-by: Jon Maloy Link: https://patch.msgid.link/20240828183752.660267-3-jmaloy@redhat.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/Makefile | 1 + tools/testing/selftests/net/tcp_so_peek_off.c | 183 ++++++++++++++++++++++++++ 2 files changed, 184 insertions(+) create mode 100644 tools/testing/selftests/net/tcp_so_peek_off.c (limited to 'tools') diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 8eaffd7a641c..1179e3261bef 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -80,6 +80,7 @@ TEST_PROGS += io_uring_zerocopy_tx.sh TEST_GEN_FILES += bind_bhash TEST_GEN_PROGS += sk_bind_sendto_listen TEST_GEN_PROGS += sk_connect_zero_addr +TEST_GEN_PROGS += tcp_so_peek_off TEST_PROGS += test_ingress_egress_chaining.sh TEST_GEN_PROGS += so_incoming_cpu TEST_PROGS += sctp_vrf.sh diff --git a/tools/testing/selftests/net/tcp_so_peek_off.c b/tools/testing/selftests/net/tcp_so_peek_off.c new file mode 100644 index 000000000000..df8a39d9d3c3 --- /dev/null +++ b/tools/testing/selftests/net/tcp_so_peek_off.c @@ -0,0 +1,183 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include "../kselftest.h" + +static char *afstr(int af) +{ + return af == AF_INET ? 
"TCP/IPv4" : "TCP/IPv6"; +} + +int tcp_peek_offset_probe(sa_family_t af) +{ + int optv = 0; + int ret = 0; + int s; + + s = socket(af, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP); + if (s < 0) { + ksft_perror("Temporary TCP socket creation failed"); + } else { + if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &optv, sizeof(int))) + ret = 1; + else + printf("%s does not support SO_PEEK_OFF\n", afstr(af)); + close(s); + } + return ret; +} + +static void tcp_peek_offset_set(int s, int offset) +{ + if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset))) + ksft_perror("Failed to set SO_PEEK_OFF value\n"); +} + +static int tcp_peek_offset_get(int s) +{ + int offset; + socklen_t len = sizeof(offset); + + if (getsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, &len)) + ksft_perror("Failed to get SO_PEEK_OFF value\n"); + return offset; +} + +static int tcp_peek_offset_test(sa_family_t af) +{ + union { + struct sockaddr sa; + struct sockaddr_in a4; + struct sockaddr_in6 a6; + } a; + int res = 0; + int s[2] = {0, 0}; + int recv_sock = 0; + int offset = 0; + ssize_t len; + char buf; + + memset(&a, 0, sizeof(a)); + a.sa.sa_family = af; + + s[0] = socket(af, SOCK_STREAM, IPPROTO_TCP); + s[1] = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP); + + if (s[0] < 0 || s[1] < 0) { + ksft_perror("Temporary socket creation failed\n"); + goto out; + } + if (bind(s[0], &a.sa, sizeof(a)) < 0) { + ksft_perror("Temporary socket bind() failed\n"); + goto out; + } + if (getsockname(s[0], &a.sa, &((socklen_t) { sizeof(a) })) < 0) { + ksft_perror("Temporary socket getsockname() failed\n"); + goto out; + } + if (listen(s[0], 0) < 0) { + ksft_perror("Temporary socket listen() failed\n"); + goto out; + } + if (connect(s[1], &a.sa, sizeof(a)) >= 0 || errno != EINPROGRESS) { + ksft_perror("Temporary socket connect() failed\n"); + goto out; + } + recv_sock = accept(s[0], NULL, NULL); + if (recv_sock <= 0) { + ksft_perror("Temporary socket accept() failed\n"); + goto out; + } + + /* Some basic tests of getting/setting offset */ + offset = tcp_peek_offset_get(recv_sock); + if (offset != -1) { + ksft_perror("Initial value of socket offset not -1\n"); + goto out; + } + tcp_peek_offset_set(recv_sock, 0); + offset = tcp_peek_offset_get(recv_sock); + if (offset != 0) { + ksft_perror("Failed to set socket offset to 0\n"); + goto out; + } + + /* Transfer a message */ + if (send(s[1], (char *)("ab"), 2, 0) <= 0 || errno != EINPROGRESS) { + ksft_perror("Temporary probe socket send() failed\n"); + goto out; + } + /* Read first byte */ + len = recv(recv_sock, &buf, 1, MSG_PEEK); + if (len != 1 || buf != 'a') { + ksft_perror("Failed to read first byte of message\n"); + goto out; + } + offset = tcp_peek_offset_get(recv_sock); + if (offset != 1) { + ksft_perror("Offset not forwarded correctly at first byte\n"); + goto out; + } + /* Try to read beyond last byte */ + len = recv(recv_sock, &buf, 2, MSG_PEEK); + if (len != 1 || buf != 'b') { + ksft_perror("Failed to read last byte of message\n"); + goto out; + } + offset = tcp_peek_offset_get(recv_sock); + if (offset != 2) { + ksft_perror("Offset not forwarded correctly at last byte\n"); + goto out; + } + /* Flush message */ + len = recv(recv_sock, NULL, 2, MSG_TRUNC); + if (len != 2) { + ksft_perror("Failed to flush message\n"); + goto out; + } + offset = tcp_peek_offset_get(recv_sock); + if (offset != 0) { + ksft_perror("Offset not reverted correctly after flush\n"); + goto out; + } + + printf("%s with MSG_PEEK_OFF works correctly\n", afstr(af)); + res = 1; +out: + if (recv_sock >= 0) + 
close(recv_sock); + if (s[1] >= 0) + close(s[1]); + if (s[0] >= 0) + close(s[0]); + return res; +} + +int main(void) +{ + int res4, res6; + + res4 = tcp_peek_offset_probe(AF_INET); + res6 = tcp_peek_offset_probe(AF_INET6); + + if (!res4 && !res6) + return KSFT_SKIP; + + if (res4) + res4 = tcp_peek_offset_test(AF_INET); + + if (res6) + res6 = tcp_peek_offset_test(AF_INET6); + + if (!res4 || !res6) + return KSFT_FAIL; + + return KSFT_PASS; +} -- cgit v1.2.3 From d1c2cdca5a08f422b791670c11f9d4e3ed0a5518 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Jul 2024 16:51:01 -0700 Subject: KVM: selftests: Open code vcpu_run() equivalent in guest_printf test Open code a version of vcpu_run() in the guest_printf test in anticipation of adding UCALL_ABORT handling to _vcpu_run(). The guest_printf test intentionally generates asserts to verify the output, and thus needs to bypass common assert handling. Open code a helper in the guest_printf test, as it's not expected that any other test would want to skip _only_ the UCALL_ABORT handling. Link: https://lore.kernel.org/r/20240719235107.3023592-5-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/guest_print_test.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/guest_print_test.c b/tools/testing/selftests/kvm/guest_print_test.c index 8092c2d0f5d6..bcf582852db9 100644 --- a/tools/testing/selftests/kvm/guest_print_test.c +++ b/tools/testing/selftests/kvm/guest_print_test.c @@ -107,6 +107,21 @@ static void ucall_abort(const char *assert_msg, const char *expected_assert_msg) expected_assert_msg, &assert_msg[offset]); } +/* + * Open code vcpu_run(), sans the UCALL_ABORT handling, so that intentional + * guest asserts can be verified instead of being reported as failures. + */ +static void do_vcpu_run(struct kvm_vcpu *vcpu) +{ + int r; + + do { + r = __vcpu_run(vcpu); + } while (r == -1 && errno == EINTR); + + TEST_ASSERT(!r, KVM_IOCTL_ERROR(KVM_RUN, r)); +} + static void run_test(struct kvm_vcpu *vcpu, const char *expected_printf, const char *expected_assert) { @@ -114,7 +129,7 @@ static void run_test(struct kvm_vcpu *vcpu, const char *expected_printf, struct ucall uc; while (1) { - vcpu_run(vcpu); + do_vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == UCALL_EXIT_REASON, "Unexpected exit reason: %u (%s),", @@ -159,7 +174,7 @@ static void test_limits(void) vm = vm_create_with_one_vcpu(&vcpu, guest_code_limits); run = vcpu->run; - vcpu_run(vcpu); + do_vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == UCALL_EXIT_REASON, "Unexpected exit reason: %u (%s),", -- cgit v1.2.3 From ed24ba6c2c3450c96dcd804f81893d11e52463cc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Jul 2024 16:51:02 -0700 Subject: KVM: selftests: Report unhandled exceptions on x86 as regular guest asserts Now that selftests support printf() in the guest, report unexpected exceptions via the regular assertion framework. Exceptions were special cased purely to provide a better error message. Convert only x86 for now, as it's low-hanging fruit (already formats the assertion in the guest), and converting x86 will allow adding asserts in x86 library code without needing to update multiple tests. Once all other architectures are converted, this will allow moving the reporting to common code, which will in turn allow adding asserts in common library code, and will also allow removing UCALL_UNHANDLED.
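Once an unhandled exception is reported as an ordinary guest assert, the host side can funnel everything through one path. A rough sketch of what such common handling can look like (hypothetical helper; the selftests' real run loops differ per test):

static void handle_guest_ucall(struct kvm_vcpu *vcpu)
{
	struct ucall uc;

	switch (get_ucall(vcpu, &uc)) {
	case UCALL_ABORT:
		/* Unhandled guest exceptions land here too, as regular
		 * asserts, instead of via the special UCALL_UNHANDLED.
		 */
		REPORT_GUEST_ASSERT(uc);
		break;
	case UCALL_DONE:
		break;
	default:
		TEST_FAIL("Unexpected ucall: %lu", uc.cmd);
	}
}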
Link: https://lore.kernel.org/r/20240719235107.3023592-6-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/x86_64/processor.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 153739f2e201..814a604c0891 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -566,10 +566,8 @@ void route_exception(struct ex_regs *regs) if (kvm_fixup_exception(regs)) return; - ucall_assert(UCALL_UNHANDLED, - "Unhandled exception in guest", __FILE__, __LINE__, - "Unhandled exception '0x%lx' at guest RIP '0x%lx'", - regs->vector, regs->rip); + GUEST_FAIL("Unhandled exception '0x%lx' at guest RIP '0x%lx'", + regs->vector, regs->rip); } static void vm_init_descriptor_tables(struct kvm_vm *vm) @@ -611,7 +609,7 @@ void assert_on_unhandled_exception(struct kvm_vcpu *vcpu) { struct ucall uc; - if (get_ucall(vcpu, &uc) == UCALL_UNHANDLED) + if (get_ucall(vcpu, &uc) == UCALL_ABORT) REPORT_GUEST_ASSERT(uc); } -- cgit v1.2.3 From f2e91e874179a27d4c29b3f31706b37e1e6bcf54 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Jul 2024 16:51:03 -0700 Subject: KVM: selftests: Add x86 helpers to play nice with x2APIC MSR #GPs Add helpers to allow and expect #GP on x2APIC MSRs, and opportunistically have the existing helper spit out a more useful error message if an unexpected exception occurs. Link: https://lore.kernel.org/r/20240719235107.3023592-7-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/x86_64/apic.h | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/x86_64/apic.h b/tools/testing/selftests/kvm/include/x86_64/apic.h index 0f268b55fa06..51990094effd 100644 --- a/tools/testing/selftests/kvm/include/x86_64/apic.h +++ b/tools/testing/selftests/kvm/include/x86_64/apic.h @@ -11,6 +11,7 @@ #include #include "processor.h" +#include "ucall_common.h" #define APIC_DEFAULT_GPA 0xfee00000ULL @@ -93,9 +94,27 @@ static inline uint64_t x2apic_read_reg(unsigned int reg) return rdmsr(APIC_BASE_MSR + (reg >> 4)); } +static inline uint8_t x2apic_write_reg_safe(unsigned int reg, uint64_t value) +{ + return wrmsr_safe(APIC_BASE_MSR + (reg >> 4), value); +} + static inline void x2apic_write_reg(unsigned int reg, uint64_t value) { - wrmsr(APIC_BASE_MSR + (reg >> 4), value); + uint8_t fault = x2apic_write_reg_safe(reg, value); + + __GUEST_ASSERT(!fault, "Unexpected fault 0x%x on WRMSR(%x) = %lx\n", + fault, APIC_BASE_MSR + (reg >> 4), value); } +static inline void x2apic_write_reg_fault(unsigned int reg, uint64_t value) +{ + uint8_t fault = x2apic_write_reg_safe(reg, value); + + __GUEST_ASSERT(fault == GP_VECTOR, + "Wanted #GP on WRMSR(%x) = %lx, got 0x%x\n", + APIC_BASE_MSR + (reg >> 4), value, fault); +} + + #endif /* SELFTEST_KVM_APIC_H */ -- cgit v1.2.3 From faf06a238254cec0c8c9bc0876caede63fcfeb24 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Jul 2024 16:51:04 -0700 Subject: KVM: selftests: Skip ICR.BUSY test in xapic_state_test if x2APIC is enabled Don't test the ICR BUSY bit when x2APIC is enabled as AMD and Intel have different behavior (AMD #GPs, Intel ignores), and the fact that the CPU performs the reserved bit checks when IPI virtualization is enabled makes it impossible for KVM to precisely emulate one or 
the other. Link: https://lore.kernel.org/r/20240719235107.3023592-8-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86_64/xapic_state_test.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c index 618cd2442390..d5a7adaa9502 100644 --- a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c +++ b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c @@ -70,12 +70,10 @@ static void ____test_icr(struct xapic_vcpu *x, uint64_t val) vcpu_ioctl(vcpu, KVM_GET_LAPIC, &xapic); icr = (u64)(*((u32 *)&xapic.regs[APIC_ICR])) | (u64)(*((u32 *)&xapic.regs[APIC_ICR2])) << 32; - if (!x->is_x2apic) { + if (!x->is_x2apic) val &= (-1u | (0xffull << (32 + 24))); - TEST_ASSERT_EQ(icr, val & ~APIC_ICR_BUSY); - } else { - TEST_ASSERT_EQ(icr & ~APIC_ICR_BUSY, val & ~APIC_ICR_BUSY); - } + + TEST_ASSERT_EQ(icr, val & ~APIC_ICR_BUSY); } #define X2APIC_RSVED_BITS_MASK (GENMASK_ULL(31,20) | \ @@ -91,7 +89,15 @@ static void __test_icr(struct xapic_vcpu *x, uint64_t val) */ val &= ~X2APIC_RSVED_BITS_MASK; } - ____test_icr(x, val | APIC_ICR_BUSY); + + /* + * The BUSY bit is reserved on both AMD and Intel, but only AMD treats + * it is as _must_ be zero. Intel simply ignores the bit. Don't test + * the BUSY bit for x2APIC, as there is no single correct behavior. + */ + if (!x->is_x2apic) + ____test_icr(x, val | APIC_ICR_BUSY); + ____test_icr(x, val & ~(u64)APIC_ICR_BUSY); } -- cgit v1.2.3 From 3426cb48adb4dc00b75e89c95d257d699f4d75ce Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Jul 2024 16:51:05 -0700 Subject: KVM: selftests: Test x2APIC ICR reserved bits Actually test x2APIC ICR reserved bits instead of deliberately skipping them. The behavior that is observed when IPI virtualization is enabled is the architecturally correct behavior, KVM is the one who was wrong, i.e. KVM was missing reserved bit checks. 
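For reference, the reserved bits in question are ICR[31:20], ICR[17:16] and ICR[13]; the mask used in the diff below works out as follows (a quick check, not part of the patch):

/* GENMASK_ULL(31, 20) = 0x00000000fff00000
 * GENMASK_ULL(17, 16) = 0x0000000000030000
 * GENMASK_ULL(13, 13) = 0x0000000000002000
 *                       ------------------
 * X2APIC_RSVD_BITS_MASK 0x00000000fff32000
 */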
Fixes: 4b88b1a518b3 ("KVM: selftests: Enhance handling WRMSR ICR register in x2APIC mode") Link: https://lore.kernel.org/r/20240719235107.3023592-9-seanjc@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/x86_64/xapic_state_test.c | 23 ++++++++++------------ 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c index d5a7adaa9502..a17b75fb2506 100644 --- a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c +++ b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c @@ -31,6 +31,10 @@ static void xapic_guest_code(void) } } +#define X2APIC_RSVD_BITS_MASK (GENMASK_ULL(31, 20) | \ + GENMASK_ULL(17, 16) | \ + GENMASK_ULL(13, 13)) + static void x2apic_guest_code(void) { asm volatile("cli"); @@ -41,7 +45,10 @@ static void x2apic_guest_code(void) uint64_t val = x2apic_read_reg(APIC_IRR) | x2apic_read_reg(APIC_IRR + 0x10) << 32; - x2apic_write_reg(APIC_ICR, val); + if (val & X2APIC_RSVD_BITS_MASK) + x2apic_write_reg_fault(APIC_ICR, val); + else + x2apic_write_reg(APIC_ICR, val); GUEST_SYNC(val); } while (1); } @@ -72,24 +79,14 @@ static void ____test_icr(struct xapic_vcpu *x, uint64_t val) (u64)(*((u32 *)&xapic.regs[APIC_ICR2])) << 32; if (!x->is_x2apic) val &= (-1u | (0xffull << (32 + 24))); + else if (val & X2APIC_RSVD_BITS_MASK) + return; TEST_ASSERT_EQ(icr, val & ~APIC_ICR_BUSY); } -#define X2APIC_RSVED_BITS_MASK (GENMASK_ULL(31,20) | \ - GENMASK_ULL(17,16) | \ - GENMASK_ULL(13,13)) - static void __test_icr(struct xapic_vcpu *x, uint64_t val) { - if (x->is_x2apic) { - /* Hardware writing vICR register requires reserved bits 31:20, - * 17:16 and 13 kept as zero to avoid #GP exception. Data value - * written to vICR should mask out those bits above. - */ - val &= ~X2APIC_RSVED_BITS_MASK; - } - /* * The BUSY bit is reserved on both AMD and Intel, but only AMD treats * it is as _must_ be zero. Intel simply ignores the bit. Don't test -- cgit v1.2.3 From 0cb26ec320851f685280ff061f84855d0e97bf86 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Jul 2024 16:51:06 -0700 Subject: KVM: selftests: Verify the guest can read back the x2APIC ICR it wrote Now that the BUSY bit mess is gone (for x2APIC), verify that the *guest* can read back the ICR value that it wrote. Due to the divergent behavior between AMD and Intel with respect to the backing storage of the ICR in the vAPIC page, emulating a seemingly simple MSR write is quite complex. 
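The guest-side shape of the check is simple even though the emulation behind it is not (a sketch; the actual change is in the diff below):

/* In the guest: whatever was just written to the ICR must read back
 * verbatim (BUSY-bit quirks aside, see the next patch).
 */
x2apic_write_reg(APIC_ICR, val);
GUEST_ASSERT_EQ(x2apic_read_reg(APIC_ICR), val);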
Link: https://lore.kernel.org/r/20240719235107.3023592-10-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86_64/xapic_state_test.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c index a17b75fb2506..dbd4f23ce92e 100644 --- a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c +++ b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c @@ -45,10 +45,12 @@ static void x2apic_guest_code(void) uint64_t val = x2apic_read_reg(APIC_IRR) | x2apic_read_reg(APIC_IRR + 0x10) << 32; - if (val & X2APIC_RSVD_BITS_MASK) + if (val & X2APIC_RSVD_BITS_MASK) { x2apic_write_reg_fault(APIC_ICR, val); - else + } else { x2apic_write_reg(APIC_ICR, val); + GUEST_ASSERT_EQ(x2apic_read_reg(APIC_ICR), val); + } GUEST_SYNC(val); } while (1); } -- cgit v1.2.3 From 5a7c7d148e488f43cf9c8e64fa5e1bd715ae0485 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Jul 2024 16:51:07 -0700 Subject: KVM: selftests: Play nice with AMD's AVIC errata When AVIC, and thus IPI virtualization on AMD, is enabled, the CPU will virtualize ICR writes. Unfortunately, the CPU doesn't do a very good job, as it fails to clear the BUSY bit and also allows writing ICR2[23:0], despite them being "RESERVED MBZ". Account for the quirky behavior in the xapic_state test to avoid failures in a configuration that likely has no hope of ever being enabled in production. Link: https://lore.kernel.org/r/20240719235107.3023592-11-seanjc@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/x86_64/xapic_state_test.c | 23 ++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c index dbd4f23ce92e..88bcca188799 100644 --- a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c +++ b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c @@ -13,6 +13,7 @@ struct xapic_vcpu { struct kvm_vcpu *vcpu; bool is_x2apic; + bool has_xavic_errata; }; static void xapic_guest_code(void) @@ -79,12 +80,17 @@ static void ____test_icr(struct xapic_vcpu *x, uint64_t val) vcpu_ioctl(vcpu, KVM_GET_LAPIC, &xapic); icr = (u64)(*((u32 *)&xapic.regs[APIC_ICR])) | (u64)(*((u32 *)&xapic.regs[APIC_ICR2])) << 32; - if (!x->is_x2apic) - val &= (-1u | (0xffull << (32 + 24))); - else if (val & X2APIC_RSVD_BITS_MASK) + if (!x->is_x2apic) { + if (!x->has_xavic_errata) + val &= (-1u | (0xffull << (32 + 24))); + } else if (val & X2APIC_RSVD_BITS_MASK) { return; + } - TEST_ASSERT_EQ(icr, val & ~APIC_ICR_BUSY); + if (x->has_xavic_errata) + TEST_ASSERT_EQ(icr & ~APIC_ICR_BUSY, val & ~APIC_ICR_BUSY); + else + TEST_ASSERT_EQ(icr, val & ~APIC_ICR_BUSY); } static void __test_icr(struct xapic_vcpu *x, uint64_t val) @@ -236,6 +242,15 @@ int main(int argc, char *argv[]) vm = vm_create_with_one_vcpu(&x.vcpu, xapic_guest_code); x.is_x2apic = false; + /* + * AMD's AVIC implementation is buggy (fails to clear the ICR BUSY bit), + * and also diverges from KVM with respect to ICR2[23:0] (KVM and Intel + * drops writes, AMD does not). Account for the errata when checking + * that KVM reads back what was written. 
+ */ + x.has_xavic_errata = host_cpu_is_amd && + get_kvm_amd_param_bool("avic"); + vcpu_clear_cpuid_feature(x.vcpu, X86_FEATURE_X2APIC); virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); -- cgit v1.2.3 From a0dbf6d0b21e197bf919591081ff2eb7a34ef982 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 29 Aug 2024 14:08:27 -0700 Subject: selftests/bpf: attach struct_ops maps before test prog runs In test_loader based tests, call bpf_map__attach_struct_ops() before the call to bpf_prog_test_run_opts() in order to trigger the bpf_struct_ops->reg() callbacks on the kernel side. This allows the __retval macro to be used for struct_ops tests. Signed-off-by: Eduard Zingerman Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240829210833.388152-6-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_loader.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index 4223cffc090e..3e9b009580d4 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -890,11 +890,13 @@ void run_subtest(struct test_loader *tester, { struct test_subspec *subspec = unpriv ? &spec->unpriv : &spec->priv; struct bpf_program *tprog = NULL, *tprog_iter; + struct bpf_link *link, *links[32] = {}; struct test_spec *spec_iter; struct cap_state caps = {}; struct bpf_object *tobj; struct bpf_map *map; int retval, err, i; + int links_cnt = 0; bool should_load; if (!test__start_subtest(subspec->name)) @@ -999,6 +1001,26 @@ void run_subtest(struct test_loader *tester, if (restore_capabilities(&caps)) goto tobj_cleanup; + /* Do bpf_map__attach_struct_ops() for each struct_ops map. + * This should trigger bpf_struct_ops->reg callback on kernel side. + */ + bpf_object__for_each_map(map, tobj) { + if (!bpf_map__autocreate(map) || + bpf_map__type(map) != BPF_MAP_TYPE_STRUCT_OPS) + continue; + if (links_cnt >= ARRAY_SIZE(links)) { + PRINT_FAIL("too many struct_ops maps"); + goto tobj_cleanup; + } + link = bpf_map__attach_struct_ops(map); + if (!link) { + PRINT_FAIL("bpf_map__attach_struct_ops failed for map %s: err=%d\n", + bpf_map__name(map), -errno); + goto tobj_cleanup; + } + links[links_cnt++] = link; + } + if (tester->pre_execution_cb) { err = tester->pre_execution_cb(tobj); if (err) { @@ -1013,9 +1035,14 @@ void run_subtest(struct test_loader *tester, PRINT_FAIL("Unexpected retval: %d != %d\n", retval, subspec->retval); goto tobj_cleanup; } + /* redo bpf_map__attach_struct_ops for each test */ + while (links_cnt > 0) + bpf_link__destroy(links[--links_cnt]); } tobj_cleanup: + while (links_cnt > 0) + bpf_link__destroy(links[--links_cnt]); bpf_object__close(tobj); subtest_cleanup: test__end_subtest(); -- cgit v1.2.3 From 47e69431b57aee6c3c134014e7fca490ca8ca9d1 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 29 Aug 2024 14:08:28 -0700 Subject: selftests/bpf: Test gen_prologue and gen_epilogue This test adds a new struct_ops "bpf_testmod_st_ops" in bpf_testmod. The ops of the bpf_testmod_st_ops are triggered by new kfunc calls "bpf_kfunc_st_ops_test_*logue". These new kfunc calls are primarily used by the SEC("syscall") program. The test triggering sequence is like: SEC("syscall") syscall_prologue(struct st_ops_args *args) bpf_kfunc_st_ops_test_prologue(args) st_ops->test_prologue(args) .gen_prologue adds 1000 to args->a .gen_epilogue adds 10000 to args->a .gen_epilogue will also set the r0 to 2 * args->a, as sketched below.
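A C-level sketch of the combined effect (an illustration only, mirroring the description above; the verifier actually emits these steps as BPF instructions via .gen_prologue/.gen_epilogue):

static int test_pro_epilogue_effect(struct st_ops_args *args)
{
	args->a += 1000;	/* .gen_prologue */
	args->a += 10;		/* bpf_kfunc_st_ops_inc10() */
	args->a += 1;		/* subprog() */
	args->a += 10000;	/* .gen_epilogue */
	return args->a * 2;	/* .gen_epilogue sets r0 = 2 * args->a */
}
/* Starting from a == 0: args->a ends up as 11011, returned as 22022. */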
The .gen_prologue and .gen_epilogue of the bpf_testmod_st_ops will test the prog->aux->attach_func_name to decide if it needs to generate codes. The main programs of the pro_epilogue.c will call a new kfunc bpf_kfunc_st_ops_inc10 which does "args->a += 10". It will also call a subprog() which does "args->a += 1". This patch uses the test_loader infra to check the __xlated instructions patched after gen_prologue and/or gen_epilogue. The __xlated check is based on Eduard's example (Thanks!) in v1. args->a is returned by the struct_ops prog (either the main prog or the epilogue). Thus, the __retval of the SEC("syscall") prog is checked. For example, when triggering the ops in the 'SEC("struct_ops/test_epilogue") int test_epilogue' The expected args->a is +1 (subprog call) + 10 (kfunc call) + 10000 (.gen_epilogue) = 10011. The expected return value is 2 * 10011 (.gen_epilogue). Suggested-by: Eduard Zingerman Acked-by: Eduard Zingerman Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240829210833.388152-7-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/bpf_testmod/bpf_testmod.c | 190 +++++++++++++++++++++ .../selftests/bpf/bpf_testmod/bpf_testmod.h | 11 ++ .../selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h | 6 + .../selftests/bpf/prog_tests/pro_epilogue.c | 10 ++ tools/testing/selftests/bpf/progs/pro_epilogue.c | 154 +++++++++++++++++ 5 files changed, 371 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/pro_epilogue.c create mode 100644 tools/testing/selftests/bpf/progs/pro_epilogue.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index 8a71a91b752d..42c38dbb6835 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include "bpf_testmod.h" @@ -945,6 +946,51 @@ out: return err; } +static DEFINE_MUTEX(st_ops_mutex); +static struct bpf_testmod_st_ops *st_ops; + +__bpf_kfunc int bpf_kfunc_st_ops_test_prologue(struct st_ops_args *args) +{ + int ret = -1; + + mutex_lock(&st_ops_mutex); + if (st_ops && st_ops->test_prologue) + ret = st_ops->test_prologue(args); + mutex_unlock(&st_ops_mutex); + + return ret; +} + +__bpf_kfunc int bpf_kfunc_st_ops_test_epilogue(struct st_ops_args *args) +{ + int ret = -1; + + mutex_lock(&st_ops_mutex); + if (st_ops && st_ops->test_epilogue) + ret = st_ops->test_epilogue(args); + mutex_unlock(&st_ops_mutex); + + return ret; +} + +__bpf_kfunc int bpf_kfunc_st_ops_test_pro_epilogue(struct st_ops_args *args) +{ + int ret = -1; + + mutex_lock(&st_ops_mutex); + if (st_ops && st_ops->test_pro_epilogue) + ret = st_ops->test_pro_epilogue(args); + mutex_unlock(&st_ops_mutex); + + return ret; +} + +__bpf_kfunc int bpf_kfunc_st_ops_inc10(struct st_ops_args *args) +{ + args->a += 10; + return args->a; +} + BTF_KFUNCS_START(bpf_testmod_check_kfunc_ids) BTF_ID_FLAGS(func, bpf_testmod_test_mod_kfunc) BTF_ID_FLAGS(func, bpf_kfunc_call_test1) @@ -981,6 +1027,10 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_sendmsg, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_call_sock_sendmsg, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_getsockname, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_getpeername, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_prologue, KF_TRUSTED_ARGS | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_epilogue, KF_TRUSTED_ARGS | 
KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_pro_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_inc10, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids) static int bpf_testmod_ops_init(struct btf *btf) @@ -1100,6 +1150,144 @@ struct bpf_struct_ops bpf_testmod_ops2 = { .owner = THIS_MODULE, }; +static int bpf_test_mod_st_ops__test_prologue(struct st_ops_args *args) +{ + return 0; +} + +static int bpf_test_mod_st_ops__test_epilogue(struct st_ops_args *args) +{ + return 0; +} + +static int bpf_test_mod_st_ops__test_pro_epilogue(struct st_ops_args *args) +{ + return 0; +} + +static int st_ops_gen_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + if (strcmp(prog->aux->attach_func_name, "test_prologue") && + strcmp(prog->aux->attach_func_name, "test_pro_epilogue")) + return 0; + + /* r6 = r1[0]; // r6 will be "struct st_ops *args". r1 is "u64 *ctx". + * r7 = r6->a; + * r7 += 1000; + * r6->a = r7; + */ + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_6, offsetof(struct st_ops_args, a)); + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 1000); + *insn++ = BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_7, offsetof(struct st_ops_args, a)); + *insn++ = prog->insnsi[0]; + + return insn - insn_buf; +} + +static int st_ops_gen_epilogue(struct bpf_insn *insn_buf, const struct bpf_prog *prog, + s16 ctx_stack_off) +{ + struct bpf_insn *insn = insn_buf; + + if (strcmp(prog->aux->attach_func_name, "test_epilogue") && + strcmp(prog->aux->attach_func_name, "test_pro_epilogue")) + return 0; + + /* r1 = stack[ctx_stack_off]; // r1 will be "u64 *ctx" + * r1 = r1[0]; // r1 will be "struct st_ops *args" + * r6 = r1->a; + * r6 += 10000; + * r1->a = r6; + * r0 = r6; + * r0 *= 2; + * BPF_EXIT; + */ + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_FP, ctx_stack_off); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, offsetof(struct st_ops_args, a)); + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 10000); + *insn++ = BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6, offsetof(struct st_ops_args, a)); + *insn++ = BPF_MOV64_REG(BPF_REG_0, BPF_REG_6); + *insn++ = BPF_ALU64_IMM(BPF_MUL, BPF_REG_0, 2); + *insn++ = BPF_EXIT_INSN(); + + return insn - insn_buf; +} + +static int st_ops_btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, int size) +{ + if (off < 0 || off + size > sizeof(struct st_ops_args)) + return -EACCES; + return 0; +} + +static const struct bpf_verifier_ops st_ops_verifier_ops = { + .is_valid_access = bpf_testmod_ops_is_valid_access, + .btf_struct_access = st_ops_btf_struct_access, + .gen_prologue = st_ops_gen_prologue, + .gen_epilogue = st_ops_gen_epilogue, + .get_func_proto = bpf_base_func_proto, +}; + +static struct bpf_testmod_st_ops st_ops_cfi_stubs = { + .test_prologue = bpf_test_mod_st_ops__test_prologue, + .test_epilogue = bpf_test_mod_st_ops__test_epilogue, + .test_pro_epilogue = bpf_test_mod_st_ops__test_pro_epilogue, +}; + +static int st_ops_reg(void *kdata, struct bpf_link *link) +{ + int err = 0; + + mutex_lock(&st_ops_mutex); + if (st_ops) { + pr_err("st_ops has already been registered\n"); + err = -EEXIST; + goto unlock; + } + st_ops = kdata; + +unlock: + mutex_unlock(&st_ops_mutex); + return err; +} + +static void st_ops_unreg(void *kdata, struct bpf_link *link) +{ + mutex_lock(&st_ops_mutex); + st_ops = NULL; + 
mutex_unlock(&st_ops_mutex); +} + +static int st_ops_init(struct btf *btf) +{ + return 0; +} + +static int st_ops_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + return 0; +} + +static struct bpf_struct_ops testmod_st_ops = { + .verifier_ops = &st_ops_verifier_ops, + .init = st_ops_init, + .init_member = st_ops_init_member, + .reg = st_ops_reg, + .unreg = st_ops_unreg, + .cfi_stubs = &st_ops_cfi_stubs, + .name = "bpf_testmod_st_ops", + .owner = THIS_MODULE, +}; + extern int bpf_fentry_test1(int a); static int bpf_testmod_init(void) @@ -1117,8 +1305,10 @@ static int bpf_testmod_init(void) ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_testmod_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_testmod_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &bpf_testmod_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_testmod_kfunc_set); ret = ret ?: register_bpf_struct_ops(&bpf_bpf_testmod_ops, bpf_testmod_ops); ret = ret ?: register_bpf_struct_ops(&bpf_testmod_ops2, bpf_testmod_ops2); + ret = ret ?: register_bpf_struct_ops(&testmod_st_ops, bpf_testmod_st_ops); ret = ret ?: register_btf_id_dtor_kfuncs(bpf_testmod_dtors, ARRAY_SIZE(bpf_testmod_dtors), THIS_MODULE); diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h index fe0d402b0d65..fb7dff47597a 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h @@ -94,4 +94,15 @@ struct bpf_testmod_ops2 { int (*test_1)(void); }; +struct st_ops_args { + u64 a; +}; + +struct bpf_testmod_st_ops { + int (*test_prologue)(struct st_ops_args *args); + int (*test_epilogue)(struct st_ops_args *args); + int (*test_pro_epilogue)(struct st_ops_args *args); + struct module *owner; +}; + #endif /* _BPF_TESTMOD_H */ diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h index c6c314965bb1..7e76532be5fa 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h @@ -148,4 +148,10 @@ struct sk_buff *bpf_kfunc_nested_acquire_nonzero_offset_test(struct sk_buff_head struct sk_buff *bpf_kfunc_nested_acquire_zero_offset_test(struct sock_common *ptr) __ksym; void bpf_kfunc_nested_release_test(struct sk_buff *ptr) __ksym; +struct st_ops_args; +int bpf_kfunc_st_ops_test_prologue(struct st_ops_args *args) __ksym; +int bpf_kfunc_st_ops_test_epilogue(struct st_ops_args *args) __ksym; +int bpf_kfunc_st_ops_test_pro_epilogue(struct st_ops_args *args) __ksym; +int bpf_kfunc_st_ops_inc10(struct st_ops_args *args) __ksym; + #endif /* _BPF_TESTMOD_KFUNC_H */ diff --git a/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c b/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c new file mode 100644 index 000000000000..00b806804b99 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ + +#include +#include "pro_epilogue.skel.h" + +void test_pro_epilogue(void) +{ + RUN_TESTS(pro_epilogue); +} diff --git a/tools/testing/selftests/bpf/progs/pro_epilogue.c b/tools/testing/selftests/bpf/progs/pro_epilogue.c new file mode 100644 index 000000000000..44bc3f06b4b6 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/pro_epilogue.c @@ -0,0 +1,154 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include "bpf_misc.h" +#include "../bpf_testmod/bpf_testmod.h" +#include "../bpf_testmod/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +void __kfunc_btf_root(void) +{ + bpf_kfunc_st_ops_inc10(NULL); +} + +static __noinline __used int subprog(struct st_ops_args *args) +{ + args->a += 1; + return args->a; +} + +__success +/* prologue */ +__xlated("0: r6 = *(u64 *)(r1 +0)") +__xlated("1: r7 = *(u64 *)(r6 +0)") +__xlated("2: r7 += 1000") +__xlated("3: *(u64 *)(r6 +0) = r7") +/* main prog */ +__xlated("4: r1 = *(u64 *)(r1 +0)") +__xlated("5: r6 = r1") +__xlated("6: call kernel-function") +__xlated("7: r1 = r6") +__xlated("8: call pc+1") +__xlated("9: exit") +SEC("struct_ops/test_prologue") +__naked int test_prologue(void) +{ + asm volatile ( + "r1 = *(u64 *)(r1 +0);" + "r6 = r1;" + "call %[bpf_kfunc_st_ops_inc10];" + "r1 = r6;" + "call subprog;" + "exit;" + : + : __imm(bpf_kfunc_st_ops_inc10) + : __clobber_all); +} + +__success +/* save __u64 *ctx to stack */ +__xlated("0: *(u64 *)(r10 -8) = r1") +/* main prog */ +__xlated("1: r1 = *(u64 *)(r1 +0)") +__xlated("2: r6 = r1") +__xlated("3: call kernel-function") +__xlated("4: r1 = r6") +__xlated("5: call pc+") +/* epilogue */ +__xlated("6: r1 = *(u64 *)(r10 -8)") +__xlated("7: r1 = *(u64 *)(r1 +0)") +__xlated("8: r6 = *(u64 *)(r1 +0)") +__xlated("9: r6 += 10000") +__xlated("10: *(u64 *)(r1 +0) = r6") +__xlated("11: r0 = r6") +__xlated("12: r0 *= 2") +__xlated("13: exit") +SEC("struct_ops/test_epilogue") +__naked int test_epilogue(void) +{ + asm volatile ( + "r1 = *(u64 *)(r1 +0);" + "r6 = r1;" + "call %[bpf_kfunc_st_ops_inc10];" + "r1 = r6;" + "call subprog;" + "exit;" + : + : __imm(bpf_kfunc_st_ops_inc10) + : __clobber_all); +} + +__success +/* prologue */ +__xlated("0: r6 = *(u64 *)(r1 +0)") +__xlated("1: r7 = *(u64 *)(r6 +0)") +__xlated("2: r7 += 1000") +__xlated("3: *(u64 *)(r6 +0) = r7") +/* save __u64 *ctx to stack */ +__xlated("4: *(u64 *)(r10 -8) = r1") +/* main prog */ +__xlated("5: r1 = *(u64 *)(r1 +0)") +__xlated("6: r6 = r1") +__xlated("7: call kernel-function") +__xlated("8: r1 = r6") +__xlated("9: call pc+") +/* epilogue */ +__xlated("10: r1 = *(u64 *)(r10 -8)") +__xlated("11: r1 = *(u64 *)(r1 +0)") +__xlated("12: r6 = *(u64 *)(r1 +0)") +__xlated("13: r6 += 10000") +__xlated("14: *(u64 *)(r1 +0) = r6") +__xlated("15: r0 = r6") +__xlated("16: r0 *= 2") +__xlated("17: exit") +SEC("struct_ops/test_pro_epilogue") +__naked int test_pro_epilogue(void) +{ + asm volatile ( + "r1 = *(u64 *)(r1 +0);" + "r6 = r1;" + "call %[bpf_kfunc_st_ops_inc10];" + "r1 = r6;" + "call subprog;" + "exit;" + : + : __imm(bpf_kfunc_st_ops_inc10) + : __clobber_all); +} + +SEC("syscall") +__retval(1011) /* PROLOGUE_A [1000] + KFUNC_INC10 + SUBPROG_A [1] */ +int syscall_prologue(void *ctx) +{ + struct st_ops_args args = {}; + + return bpf_kfunc_st_ops_test_prologue(&args); +} + +SEC("syscall") +__retval(20022) /* (KFUNC_INC10 + SUBPROG_A [1] + EPILOGUE_A [10000]) * 2 */ +int syscall_epilogue(void *ctx) +{ + struct st_ops_args args = {}; + + return 
bpf_kfunc_st_ops_test_epilogue(&args); +} + +SEC("syscall") +__retval(22022) /* (PROLOGUE_A [1000] + KFUNC_INC10 + SUBPROG_A [1] + EPILOGUE_A [10000]) * 2 */ +int syscall_pro_epilogue(void *ctx) +{ + struct st_ops_args args = {}; + + return bpf_kfunc_st_ops_test_pro_epilogue(&args); +} + +SEC(".struct_ops.link") +struct bpf_testmod_st_ops pro_epilogue = { + .test_prologue = (void *)test_prologue, + .test_epilogue = (void *)test_epilogue, + .test_pro_epilogue = (void *)test_pro_epilogue, +}; -- cgit v1.2.3 From b191b0fd740062ede672693671c6e6e942fb02f4 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 29 Aug 2024 14:08:29 -0700 Subject: selftests/bpf: Add tailcall epilogue test This patch adds a gen_epilogue test to test a main prog using a bpf_tail_call. A non-test_loader test is used. The tailcall target program, "test_epilogue_subprog", needs to be used in a struct_ops map before it can be loaded. Another struct_ops map is also needed to host the actual "test_epilogue_tailcall" struct_ops program that does the bpf_tail_call. The earlier test_loader patch will attach all struct_ops maps, but bpf_testmod.c does not support more than one attached struct_ops. The earlier patch used the test_loader, which has already covered checking for the patched pro/epilogue instructions. This is done by the __xlated tag. This patch goes for the regular skel load and syscall test to do the tailcall test, which also allows directly passing the "struct st_ops_args *args" as ctx_in to the SEC("syscall") program. Acked-by: Eduard Zingerman Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240829210833.388152-8-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/pro_epilogue.c | 46 +++++++++++++++++ .../selftests/bpf/progs/epilogue_tailcall.c | 58 ++++++++++++++++++++++ 2 files changed, 104 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/epilogue_tailcall.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c b/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c index 00b806804b99..b82525c29de8 100644 --- a/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c +++ b/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c @@ -3,8 +3,54 @@ #include #include "pro_epilogue.skel.h" +#include "epilogue_tailcall.skel.h" + +struct st_ops_args { + __u64 a; +}; + +static void test_tailcall(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct epilogue_tailcall *skel; + struct st_ops_args args; + int err, prog_fd; + + skel = epilogue_tailcall__open_and_load(); + if (!ASSERT_OK_PTR(skel, "epilogue_tailcall__open_and_load")) + return; + + topts.ctx_in = &args; + topts.ctx_size_in = sizeof(args); + + skel->links.epilogue_tailcall = + bpf_map__attach_struct_ops(skel->maps.epilogue_tailcall); + if (!ASSERT_OK_PTR(skel->links.epilogue_tailcall, "attach_struct_ops")) + goto done; + + /* Both test_epilogue_tailcall and test_epilogue_subprog are + * patched with epilogue. When syscall_epilogue_tailcall() + * is run, test_epilogue_tailcall() is triggered. + * It executes a tail call and control is transferred to + * test_epilogue_subprog(). Only test_epilogue_subprog() + * does args->a += 1, thus final args.a value of 10001 + * guarantees that only the epilogue of the + * test_epilogue_subprog is executed.
+ */ + memset(&args, 0, sizeof(args)); + prog_fd = bpf_program__fd(skel->progs.syscall_epilogue_tailcall); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + ASSERT_EQ(args.a, 10001, "args.a"); + ASSERT_EQ(topts.retval, 10001 * 2, "topts.retval"); + +done: + epilogue_tailcall__destroy(skel); +} void test_pro_epilogue(void) { RUN_TESTS(pro_epilogue); + if (test__start_subtest("tailcall")) + test_tailcall(); } diff --git a/tools/testing/selftests/bpf/progs/epilogue_tailcall.c b/tools/testing/selftests/bpf/progs/epilogue_tailcall.c new file mode 100644 index 000000000000..7275dd594de0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/epilogue_tailcall.c @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include "bpf_misc.h" +#include "../bpf_testmod/bpf_testmod.h" +#include "../bpf_testmod/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +static __noinline __used int subprog(struct st_ops_args *args) +{ + args->a += 1; + return args->a; +} + +SEC("struct_ops/test_epilogue_subprog") +int BPF_PROG(test_epilogue_subprog, struct st_ops_args *args) +{ + subprog(args); + return args->a; +} + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); + __array(values, void (void)); +} epilogue_map SEC(".maps") = { + .values = { + [0] = (void *)&test_epilogue_subprog, + } +}; + +SEC("struct_ops/test_epilogue_tailcall") +int test_epilogue_tailcall(unsigned long long *ctx) +{ + bpf_tail_call(ctx, &epilogue_map, 0); + return 0; +} + +SEC(".struct_ops.link") +struct bpf_testmod_st_ops epilogue_tailcall = { + .test_epilogue = (void *)test_epilogue_tailcall, +}; + +SEC(".struct_ops.link") +struct bpf_testmod_st_ops epilogue_subprog = { + .test_epilogue = (void *)test_epilogue_subprog, +}; + +SEC("syscall") +int syscall_epilogue_tailcall(struct st_ops_args *args) +{ + return bpf_kfunc_st_ops_test_epilogue(args); +} -- cgit v1.2.3 From 42fdbbde6cf4159f77de13a40f8c0be6ef48bcc1 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 29 Aug 2024 14:08:30 -0700 Subject: selftests/bpf: A pro/epilogue test when the main prog jumps back to the 1st insn This patch adds a pro/epilogue test when the main prog has a goto insn that goes back to the very first instruction of the prog. It is to test the correctness of the adjust_jmp_off(prog, 0, delta) after the verifier has applied the prologue and/or epilogue patch. 
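Viewed at the C level, the programs in this test reduce to the following (a sketch; the branchy main body only shuffles r1 and never touches args->a, so the result comes entirely from the patched prologue/epilogue):

static int pro_epilogue_goto_start_effect(struct st_ops_args *args)
{
	args->a += 1000;	/* .gen_prologue (absent in the epilogue-only case) */
	/* main prog: branches back toward the first insn, then exits */
	args->a += 10000;	/* .gen_epilogue */
	return args->a * 2;	/* (1000 + 10000) * 2 == 22000 */
}
/* Epilogue-only variant: (0 + 10000) * 2 == 20000, matching __retval(20000). */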
Acked-by: Eduard Zingerman Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240829210833.388152-9-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/pro_epilogue.c | 2 + .../selftests/bpf/progs/pro_epilogue_goto_start.c | 149 +++++++++++++++++++++ 2 files changed, 151 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/pro_epilogue_goto_start.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c b/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c index b82525c29de8..f974ae9ac610 100644 --- a/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c +++ b/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c @@ -4,6 +4,7 @@ #include #include "pro_epilogue.skel.h" #include "epilogue_tailcall.skel.h" +#include "pro_epilogue_goto_start.skel.h" struct st_ops_args { __u64 a; @@ -51,6 +52,7 @@ done: void test_pro_epilogue(void) { RUN_TESTS(pro_epilogue); + RUN_TESTS(pro_epilogue_goto_start); if (test__start_subtest("tailcall")) test_tailcall(); } diff --git a/tools/testing/selftests/bpf/progs/pro_epilogue_goto_start.c b/tools/testing/selftests/bpf/progs/pro_epilogue_goto_start.c new file mode 100644 index 000000000000..3529e53be355 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/pro_epilogue_goto_start.c @@ -0,0 +1,149 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include "bpf_misc.h" +#include "../bpf_testmod/bpf_testmod.h" +#include "../bpf_testmod/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +__success +/* prologue */ +__xlated("0: r6 = *(u64 *)(r1 +0)") +__xlated("1: r7 = *(u64 *)(r6 +0)") +__xlated("2: r7 += 1000") +__xlated("3: *(u64 *)(r6 +0) = r7") +/* main prog */ +__xlated("4: if r1 == 0x0 goto pc+5") +__xlated("5: if r1 == 0x1 goto pc+2") +__xlated("6: r1 = 1") +__xlated("7: goto pc-3") +__xlated("8: r1 = 0") +__xlated("9: goto pc-6") +__xlated("10: r0 = 0") +__xlated("11: exit") +SEC("struct_ops/test_prologue_goto_start") +__naked int test_prologue_goto_start(void) +{ + asm volatile ( + "if r1 == 0 goto +5;" + "if r1 == 1 goto +2;" + "r1 = 1;" + "goto -3;" + "r1 = 0;" + "goto -6;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +__success +/* save __u64 *ctx to stack */ +__xlated("0: *(u64 *)(r10 -8) = r1") +/* main prog */ +__xlated("1: if r1 == 0x0 goto pc+5") +__xlated("2: if r1 == 0x1 goto pc+2") +__xlated("3: r1 = 1") +__xlated("4: goto pc-3") +__xlated("5: r1 = 0") +__xlated("6: goto pc-6") +__xlated("7: r0 = 0") +/* epilogue */ +__xlated("8: r1 = *(u64 *)(r10 -8)") +__xlated("9: r1 = *(u64 *)(r1 +0)") +__xlated("10: r6 = *(u64 *)(r1 +0)") +__xlated("11: r6 += 10000") +__xlated("12: *(u64 *)(r1 +0) = r6") +__xlated("13: r0 = r6") +__xlated("14: r0 *= 2") +__xlated("15: exit") +SEC("struct_ops/test_epilogue_goto_start") +__naked int test_epilogue_goto_start(void) +{ + asm volatile ( + "if r1 == 0 goto +5;" + "if r1 == 1 goto +2;" + "r1 = 1;" + "goto -3;" + "r1 = 0;" + "goto -6;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +__success +/* prologue */ +__xlated("0: r6 = *(u64 *)(r1 +0)") +__xlated("1: r7 = *(u64 *)(r6 +0)") +__xlated("2: r7 += 1000") +__xlated("3: *(u64 *)(r6 +0) = r7") +/* save __u64 *ctx to stack */ +__xlated("4: *(u64 *)(r10 -8) = r1") +/* main prog */ +__xlated("5: if r1 == 0x0 goto pc+5") +__xlated("6: if r1 == 0x1 goto pc+2") +__xlated("7: r1 = 1") +__xlated("8: goto pc-3") +__xlated("9: r1 = 0") +__xlated("10: goto pc-6") +__xlated("11: r0 = 
0") +/* epilogue */ +__xlated("12: r1 = *(u64 *)(r10 -8)") +__xlated("13: r1 = *(u64 *)(r1 +0)") +__xlated("14: r6 = *(u64 *)(r1 +0)") +__xlated("15: r6 += 10000") +__xlated("16: *(u64 *)(r1 +0) = r6") +__xlated("17: r0 = r6") +__xlated("18: r0 *= 2") +__xlated("19: exit") +SEC("struct_ops/test_pro_epilogue_goto_start") +__naked int test_pro_epilogue_goto_start(void) +{ + asm volatile ( + "if r1 == 0 goto +5;" + "if r1 == 1 goto +2;" + "r1 = 1;" + "goto -3;" + "r1 = 0;" + "goto -6;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC(".struct_ops.link") +struct bpf_testmod_st_ops epilogue_goto_start = { + .test_prologue = (void *)test_prologue_goto_start, + .test_epilogue = (void *)test_epilogue_goto_start, + .test_pro_epilogue = (void *)test_pro_epilogue_goto_start, +}; + +SEC("syscall") +__retval(0) +int syscall_prologue_goto_start(void *ctx) +{ + struct st_ops_args args = {}; + + return bpf_kfunc_st_ops_test_prologue(&args); +} + +SEC("syscall") +__retval(20000) /* (EPILOGUE_A [10000]) * 2 */ +int syscall_epilogue_goto_start(void *ctx) +{ + struct st_ops_args args = {}; + + return bpf_kfunc_st_ops_test_epilogue(&args); +} + +SEC("syscall") +__retval(22000) /* (PROLOGUE_A [1000] + EPILOGUE_A [10000]) * 2 */ +int syscall_pro_epilogue_goto_start(void *ctx) +{ + struct st_ops_args args = {}; + + return bpf_kfunc_st_ops_test_pro_epilogue(&args); +} -- cgit v1.2.3 From cada0bdcc471443dbd41bd63286f48be3dae0d89 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 29 Aug 2024 14:08:31 -0700 Subject: selftests/bpf: Test epilogue patching when the main prog has multiple BPF_EXIT This patch tests the epilogue patching when the main prog has multiple BPF_EXIT. The verifier should have patched the 2nd (and later) BPF_EXIT with a BPF_JA that goes back to the earlier patched epilogue instructions. Acked-by: Eduard Zingerman Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240829210833.388152-10-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/pro_epilogue.c | 2 + tools/testing/selftests/bpf/progs/epilogue_exit.c | 82 ++++++++++++++++++++++ 2 files changed, 84 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/epilogue_exit.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c b/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c index f974ae9ac610..509883e6823a 100644 --- a/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c +++ b/tools/testing/selftests/bpf/prog_tests/pro_epilogue.c @@ -5,6 +5,7 @@ #include "pro_epilogue.skel.h" #include "epilogue_tailcall.skel.h" #include "pro_epilogue_goto_start.skel.h" +#include "epilogue_exit.skel.h" struct st_ops_args { __u64 a; @@ -53,6 +54,7 @@ void test_pro_epilogue(void) { RUN_TESTS(pro_epilogue); RUN_TESTS(pro_epilogue_goto_start); + RUN_TESTS(epilogue_exit); if (test__start_subtest("tailcall")) test_tailcall(); } diff --git a/tools/testing/selftests/bpf/progs/epilogue_exit.c b/tools/testing/selftests/bpf/progs/epilogue_exit.c new file mode 100644 index 000000000000..33d3a57bee90 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/epilogue_exit.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ + +#include +#include +#include "bpf_misc.h" +#include "../bpf_testmod/bpf_testmod.h" +#include "../bpf_testmod/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +__success +/* save __u64 *ctx to stack */ +__xlated("0: *(u64 *)(r10 -8) = r1") +/* main prog */ +__xlated("1: r1 = *(u64 *)(r1 +0)") +__xlated("2: r2 = *(u64 *)(r1 +0)") +__xlated("3: r3 = 0") +__xlated("4: r4 = 1") +__xlated("5: if r2 == 0x0 goto pc+10") +__xlated("6: r0 = 0") +__xlated("7: *(u64 *)(r1 +0) = r3") +/* epilogue */ +__xlated("8: r1 = *(u64 *)(r10 -8)") +__xlated("9: r1 = *(u64 *)(r1 +0)") +__xlated("10: r6 = *(u64 *)(r1 +0)") +__xlated("11: r6 += 10000") +__xlated("12: *(u64 *)(r1 +0) = r6") +__xlated("13: r0 = r6") +__xlated("14: r0 *= 2") +__xlated("15: exit") +/* 2nd part of the main prog after the first exit */ +__xlated("16: *(u64 *)(r1 +0) = r4") +__xlated("17: r0 = 1") +/* Clear the r1 to ensure it does not have + * off-by-1 error and ensure it jumps back to the + * beginning of epilogue which initializes + * the r1 with the ctx ptr. + */ +__xlated("18: r1 = 0") +__xlated("19: gotol pc-12") +SEC("struct_ops/test_epilogue_exit") +__naked int test_epilogue_exit(void) +{ + asm volatile ( + "r1 = *(u64 *)(r1 +0);" + "r2 = *(u64 *)(r1 +0);" + "r3 = 0;" + "r4 = 1;" + "if r2 == 0 goto +3;" + "r0 = 0;" + "*(u64 *)(r1 + 0) = r3;" + "exit;" + "*(u64 *)(r1 + 0) = r4;" + "r0 = 1;" + "r1 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC(".struct_ops.link") +struct bpf_testmod_st_ops epilogue_exit = { + .test_epilogue = (void *)test_epilogue_exit, +}; + +SEC("syscall") +__retval(20000) +int syscall_epilogue_exit0(void *ctx) +{ + struct st_ops_args args = { .a = 1 }; + + return bpf_kfunc_st_ops_test_epilogue(&args); +} + +SEC("syscall") +__retval(20002) +int syscall_epilogue_exit1(void *ctx) +{ + struct st_ops_args args = {}; + + return bpf_kfunc_st_ops_test_epilogue(&args); +} -- cgit v1.2.3 From 7c5f7b16fe1b9d1eb0cbb46d20f57db4a912b6e0 Mon Sep 17 00:00:00 2001 From: Juntong Deng Date: Thu, 29 Aug 2024 21:13:15 +0100 Subject: selftests/bpf: Add tests for iter next method returning valid pointer This patch adds test cases for the iter next method returning a valid pointer; these can also be used as usage examples. Currently, the iter next method should return a valid pointer. iter_next_trusted is the correct usage and tests whether the iter next method returns a valid pointer. bpf_iter_task_vma_next has the KF_RET_NULL flag, so the returned pointer may be NULL. We need to check if the pointer is NULL before using it. iter_next_trusted_or_null is the incorrect usage. There is no checking before using the pointer, so it will be rejected by the verifier. iter_next_rcu and iter_next_rcu_or_null are similar test cases for KF_RCU_PROTECTED iterators. iter_next_rcu_not_trusted is used to test that the pointer returned by the iter next method of a KF_RCU_PROTECTED iterator cannot be passed to KF_TRUSTED_ARGS kfuncs. iter_next_ptr_mem_not_trusted is used to test that base type PTR_TO_MEM should not be combined with type flag PTR_TRUSTED.
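The rule these tests exercise condenses into one pattern (a sketch; the complete programs are in the diff below):

/* next() methods are KF_RET_NULL, so the verifier only trusts the
 * result after a NULL check; skipping the check must be rejected.
 */
vma_ptr = bpf_iter_task_vma_next(&vma_it);
if (vma_ptr)					/* required NULL check */
	bpf_kfunc_trusted_vma_test(vma_ptr);	/* KF_TRUSTED_ARGS kfunc */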
Signed-off-by: Juntong Deng Link: https://lore.kernel.org/r/AM6PR03MB5848709758F6922F02AF9F1F99962@AM6PR03MB5848.eurprd03.prod.outlook.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/bpf_testmod/bpf_testmod.c | 20 ++++ .../selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h | 5 + tools/testing/selftests/bpf/prog_tests/iters.c | 5 +- tools/testing/selftests/bpf/progs/iters_testmod.c | 125 +++++++++++++++++++++ 4 files changed, 154 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/progs/iters_testmod.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index 42c38dbb6835..c73d04bc9e9d 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -198,6 +198,22 @@ __bpf_kfunc void bpf_kfunc_nested_release_test(struct sk_buff *ptr) { } +__bpf_kfunc void bpf_kfunc_trusted_vma_test(struct vm_area_struct *ptr) +{ +} + +__bpf_kfunc void bpf_kfunc_trusted_task_test(struct task_struct *ptr) +{ +} + +__bpf_kfunc void bpf_kfunc_trusted_num_test(int *ptr) +{ +} + +__bpf_kfunc void bpf_kfunc_rcu_task_test(struct task_struct *ptr) +{ +} + __bpf_kfunc struct bpf_testmod_ctx * bpf_testmod_ctx_create(int *err) { @@ -559,6 +575,10 @@ BTF_ID_FLAGS(func, bpf_kfunc_dynptr_test) BTF_ID_FLAGS(func, bpf_kfunc_nested_acquire_nonzero_offset_test, KF_ACQUIRE) BTF_ID_FLAGS(func, bpf_kfunc_nested_acquire_zero_offset_test, KF_ACQUIRE) BTF_ID_FLAGS(func, bpf_kfunc_nested_release_test, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_kfunc_trusted_vma_test, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_trusted_task_test, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_trusted_num_test, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_rcu_task_test, KF_RCU) BTF_ID_FLAGS(func, bpf_testmod_ctx_create, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_testmod_ctx_release, KF_RELEASE) BTF_KFUNCS_END(bpf_testmod_common_kfunc_ids) diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h index 7e76532be5fa..b58817938deb 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h @@ -154,4 +154,9 @@ int bpf_kfunc_st_ops_test_epilogue(struct st_ops_args *args) __ksym; int bpf_kfunc_st_ops_test_pro_epilogue(struct st_ops_args *args) __ksym; int bpf_kfunc_st_ops_inc10(struct st_ops_args *args) __ksym; +void bpf_kfunc_trusted_vma_test(struct vm_area_struct *ptr) __ksym; +void bpf_kfunc_trusted_task_test(struct task_struct *ptr) __ksym; +void bpf_kfunc_trusted_num_test(int *ptr) __ksym; +void bpf_kfunc_rcu_task_test(struct task_struct *ptr) __ksym; + #endif /* _BPF_TESTMOD_KFUNC_H */ diff --git a/tools/testing/selftests/bpf/prog_tests/iters.c b/tools/testing/selftests/bpf/prog_tests/iters.c index 3c440370c1f0..89ff23c4a8bc 100644 --- a/tools/testing/selftests/bpf/prog_tests/iters.c +++ b/tools/testing/selftests/bpf/prog_tests/iters.c @@ -14,6 +14,7 @@ #include "iters_state_safety.skel.h" #include "iters_looping.skel.h" #include "iters_num.skel.h" +#include "iters_testmod.skel.h" #include "iters_testmod_seq.skel.h" #include "iters_task_vma.skel.h" #include "iters_task.skel.h" @@ -297,8 +298,10 @@ void test_iters(void) RUN_TESTS(iters); RUN_TESTS(iters_css_task); - if (env.has_testmod) + if (env.has_testmod) { + RUN_TESTS(iters_testmod); RUN_TESTS(iters_testmod_seq); + } if (test__start_subtest("num")) 
subtest_num_iters(); diff --git a/tools/testing/selftests/bpf/progs/iters_testmod.c b/tools/testing/selftests/bpf/progs/iters_testmod.c new file mode 100644 index 000000000000..df1d3db60b1b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/iters_testmod.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include "bpf_experimental.h" +#include +#include "bpf_misc.h" +#include "../bpf_testmod/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +SEC("raw_tp/sys_enter") +__success +int iter_next_trusted(const void *ctx) +{ + struct task_struct *cur_task = bpf_get_current_task_btf(); + struct bpf_iter_task_vma vma_it; + struct vm_area_struct *vma_ptr; + + bpf_iter_task_vma_new(&vma_it, cur_task, 0); + + vma_ptr = bpf_iter_task_vma_next(&vma_it); + if (vma_ptr == NULL) + goto out; + + bpf_kfunc_trusted_vma_test(vma_ptr); +out: + bpf_iter_task_vma_destroy(&vma_it); + return 0; +} + +SEC("raw_tp/sys_enter") +__failure __msg("Possibly NULL pointer passed to trusted arg0") +int iter_next_trusted_or_null(const void *ctx) +{ + struct task_struct *cur_task = bpf_get_current_task_btf(); + struct bpf_iter_task_vma vma_it; + struct vm_area_struct *vma_ptr; + + bpf_iter_task_vma_new(&vma_it, cur_task, 0); + + vma_ptr = bpf_iter_task_vma_next(&vma_it); + + bpf_kfunc_trusted_vma_test(vma_ptr); + + bpf_iter_task_vma_destroy(&vma_it); + return 0; +} + +SEC("raw_tp/sys_enter") +__success +int iter_next_rcu(const void *ctx) +{ + struct task_struct *cur_task = bpf_get_current_task_btf(); + struct bpf_iter_task task_it; + struct task_struct *task_ptr; + + bpf_iter_task_new(&task_it, cur_task, 0); + + task_ptr = bpf_iter_task_next(&task_it); + if (task_ptr == NULL) + goto out; + + bpf_kfunc_rcu_task_test(task_ptr); +out: + bpf_iter_task_destroy(&task_it); + return 0; +} + +SEC("raw_tp/sys_enter") +__failure __msg("Possibly NULL pointer passed to trusted arg0") +int iter_next_rcu_or_null(const void *ctx) +{ + struct task_struct *cur_task = bpf_get_current_task_btf(); + struct bpf_iter_task task_it; + struct task_struct *task_ptr; + + bpf_iter_task_new(&task_it, cur_task, 0); + + task_ptr = bpf_iter_task_next(&task_it); + + bpf_kfunc_rcu_task_test(task_ptr); + + bpf_iter_task_destroy(&task_it); + return 0; +} + +SEC("raw_tp/sys_enter") +__failure __msg("R1 must be referenced or trusted") +int iter_next_rcu_not_trusted(const void *ctx) +{ + struct task_struct *cur_task = bpf_get_current_task_btf(); + struct bpf_iter_task task_it; + struct task_struct *task_ptr; + + bpf_iter_task_new(&task_it, cur_task, 0); + + task_ptr = bpf_iter_task_next(&task_it); + if (task_ptr == NULL) + goto out; + + bpf_kfunc_trusted_task_test(task_ptr); +out: + bpf_iter_task_destroy(&task_it); + return 0; +} + +SEC("raw_tp/sys_enter") +__failure __msg("R1 cannot write into rdonly_mem") +/* Message should not be 'R1 cannot write into rdonly_trusted_mem' */ +int iter_next_ptr_mem_not_trusted(const void *ctx) +{ + struct bpf_iter_num num_it; + int *num_ptr; + + bpf_iter_num_new(&num_it, 0, 10); + + num_ptr = bpf_iter_num_next(&num_it); + if (num_ptr == NULL) + goto out; + + bpf_kfunc_trusted_num_test(num_ptr); +out: + bpf_iter_num_destroy(&num_it); + return 0; +} -- cgit v1.2.3 From ce3b90bd0a165c0473e462b9182e30e0659f99cf Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 13:08:53 -0700 Subject: KVM: selftests: Remove unused kvm_memcmp_hva_gva() Remove selftests' kvm_memcmp_hva_gva(), which has literally never had a single user since it was introduced by commit 783e9e51266eb
("kvm: selftests: add API testing infrastructure"). Link: https://lore.kernel.org/r/20240802200853.336512-1-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/kvm_util.h | 2 - tools/testing/selftests/kvm/lib/kvm_util.c | 70 -------------------------- 2 files changed, 72 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 63c2aaae51f3..acd2db809e83 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -428,8 +428,6 @@ const char *vm_guest_mode_string(uint32_t i); void kvm_vm_free(struct kvm_vm *vmp); void kvm_vm_restart(struct kvm_vm *vmp); void kvm_vm_release(struct kvm_vm *vmp); -int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, const vm_vaddr_t gva, - size_t len); void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename); int kvm_memfd_alloc(size_t size, bool hugepages); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 56b170b725b3..f7b7185dff10 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -794,76 +794,6 @@ int kvm_memfd_alloc(size_t size, bool hugepages) return fd; } -/* - * Memory Compare, host virtual to guest virtual - * - * Input Args: - * hva - Starting host virtual address - * vm - Virtual Machine - * gva - Starting guest virtual address - * len - number of bytes to compare - * - * Output Args: None - * - * Input/Output Args: None - * - * Return: - * Returns 0 if the bytes starting at hva for a length of len - * are equal the guest virtual bytes starting at gva. Returns - * a value < 0, if bytes at hva are less than those at gva. - * Otherwise a value > 0 is returned. - * - * Compares the bytes starting at the host virtual address hva, for - * a length of len, to the guest bytes starting at the guest virtual - * address given by gva. - */ -int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, vm_vaddr_t gva, size_t len) -{ - size_t amt; - - /* - * Compare a batch of bytes until either a match is found - * or all the bytes have been compared. - */ - for (uintptr_t offset = 0; offset < len; offset += amt) { - uintptr_t ptr1 = (uintptr_t)hva + offset; - - /* - * Determine host address for guest virtual address - * at offset. - */ - uintptr_t ptr2 = (uintptr_t)addr_gva2hva(vm, gva + offset); - - /* - * Determine amount to compare on this pass. - * Don't allow the comparsion to cross a page boundary. - */ - amt = len - offset; - if ((ptr1 >> vm->page_shift) != ((ptr1 + amt) >> vm->page_shift)) - amt = vm->page_size - (ptr1 % vm->page_size); - if ((ptr2 >> vm->page_shift) != ((ptr2 + amt) >> vm->page_shift)) - amt = vm->page_size - (ptr2 % vm->page_size); - - assert((ptr1 >> vm->page_shift) == ((ptr1 + amt - 1) >> vm->page_shift)); - assert((ptr2 >> vm->page_shift) == ((ptr2 + amt - 1) >> vm->page_shift)); - - /* - * Perform the comparison. If there is a difference - * return that result to the caller, otherwise need - * to continue on looking for a mismatch. - */ - int ret = memcmp((void *)ptr1, (void *)ptr2, amt); - if (ret != 0) - return ret; - } - - /* - * No mismatch found. Let the caller know the two memory - * areas are equal. 
- */ - return 0; -} - static void vm_userspace_mem_region_gpa_insert(struct rb_root *gpa_tree, struct userspace_mem_region *region) { -- cgit v1.2.3 From c0d1a39d1d20e5e770bad72bbe1e9d4fa1367e28 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 13:14:29 -0700 Subject: KVM: selftests: Always unlink memory regions when deleting (VM free) Unlink memory regions when freeing a VM, even though it's not strictly necessary since all tracking structures are freed soon after. The time spent deleting entries is negligible, and not unlinking entries is confusing, e.g. it's easy to overlook that the tree structures are freed by the caller. Link: https://lore.kernel.org/r/20240802201429.338412-1-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/kvm_util.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index f7b7185dff10..a2b7df5f1d39 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -712,16 +712,13 @@ void kvm_vm_release(struct kvm_vm *vmp) } static void __vm_mem_region_delete(struct kvm_vm *vm, - struct userspace_mem_region *region, - bool unlink) + struct userspace_mem_region *region) { int ret; - if (unlink) { - rb_erase(&region->gpa_node, &vm->regions.gpa_tree); - rb_erase(&region->hva_node, &vm->regions.hva_tree); - hash_del(&region->slot_node); - } + rb_erase(&region->gpa_node, &vm->regions.gpa_tree); + rb_erase(&region->hva_node, &vm->regions.hva_tree); + hash_del(&region->slot_node); region->region.memory_size = 0; vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &region->region); @@ -762,7 +759,7 @@ void kvm_vm_free(struct kvm_vm *vmp) /* Free userspace_mem_regions. */ hash_for_each_safe(vmp->regions.slot_hash, ctr, node, region, slot_node) - __vm_mem_region_delete(vmp, region, false); + __vm_mem_region_delete(vmp, region); /* Free sparsebit arrays. */ sparsebit_free(&vmp->vpages_valid); @@ -1200,7 +1197,7 @@ void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa) */ void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot) { - __vm_mem_region_delete(vm, memslot2region(vm, slot), true); + __vm_mem_region_delete(vm, memslot2region(vm, slot)); } void vm_guest_mem_fallocate(struct kvm_vm *vm, uint64_t base, uint64_t size, -- cgit v1.2.3 From 2f6fcfa1f4264c1f035ddd092ebd046499f7cbea Mon Sep 17 00:00:00 2001 From: Peter Gonda Date: Tue, 9 Jul 2024 11:29:36 -0700 Subject: KVM: selftests: Add SEV-ES shutdown test Regression test for ae20eef5 ("KVM: SVM: Update SEV-ES shutdown intercepts with more metadata"). The test confirms that userspace is correctly notified of a guest shutdown, rather than the previous behavior of KVM_RUN failing with EINVAL. Cc: Paolo Bonzini Cc: Sean Christopherson Cc: Alper Gun Cc: Tom Lendacky Cc: Michael Roth Cc: kvm@vger.kernel.org Cc: linux-kselftest@vger.kernel.org Signed-off-by: Peter Gonda Tested-by: Pratik R.
Sampat Link: https://lore.kernel.org/r/20240709182936.146487-1-pgonda@google.com [sean: clobber IDT to ensure #UD leads to SHUTDOWN] Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/x86_64/sev_smoke_test.c | 32 ++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c b/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c index 7c70c0da4fb7..2e9197eb1652 100644 --- a/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c +++ b/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c @@ -160,6 +160,36 @@ static void test_sev(void *guest_code, uint64_t policy) kvm_vm_free(vm); } +static void guest_shutdown_code(void) +{ + struct desc_ptr idt; + + /* Clobber the IDT so that #UD is guaranteed to trigger SHUTDOWN. */ + memset(&idt, 0, sizeof(idt)); + __asm__ __volatile__("lidt %0" :: "m"(idt)); + + __asm__ __volatile__("ud2"); +} + +static void test_sev_es_shutdown(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + uint32_t type = KVM_X86_SEV_ES_VM; + + vm = vm_sev_create_with_one_vcpu(type, guest_shutdown_code, &vcpu); + + vm_sev_launch(vm, SEV_POLICY_ES, NULL); + + vcpu_run(vcpu); + TEST_ASSERT(vcpu->run->exit_reason == KVM_EXIT_SHUTDOWN, + "Wanted SHUTDOWN, got %s", + exit_reason_str(vcpu->run->exit_reason)); + + kvm_vm_free(vm); +} + int main(int argc, char *argv[]) { TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SEV)); @@ -171,6 +201,8 @@ int main(int argc, char *argv[]) test_sev(guest_sev_es_code, SEV_POLICY_ES | SEV_POLICY_NO_DBG); test_sev(guest_sev_es_code, SEV_POLICY_ES); + test_sev_es_shutdown(); + if (kvm_has_cap(KVM_CAP_XCRS) && (xgetbv(0) & XFEATURE_MASK_X87_AVX) == XFEATURE_MASK_X87_AVX) { test_sync_vmsa(0); -- cgit v1.2.3 From 215b3cb7a84f8d97b81fe8536cec05a11496da51 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 28 Aug 2024 11:14:45 -0700 Subject: KVM: selftests: Add a test for coalesced MMIO (and PIO on x86) Add a test to verify that KVM correctly exits (or not) when a vCPU's coalesced I/O ring is full (or isn't). Iterate over all legal starting points in the ring (with an empty ring), and verify that KVM doesn't exit until the ring is full. Opportunistically verify that KVM exits immediately on non-coalesced I/O, either because the MMIO/PIO region was never registered, or because a previous region was unregistered. This is a regression test for a KVM bug where KVM would prematurely exit due to bad math resulting in a false positive if the first entry in the ring was before the halfway mark. See commit 92f6d4130497 ("KVM: Fix coalesced_mmio_has_room() to avoid premature userspace exit"). Enable the test for x86, arm64, and risc-v, i.e. all architectures except s390, which doesn't have MMIO. On x86, which has both MMIO and PIO, interleave MMIO and PIO into the same ring, as KVM shouldn't exit until a non-coalesced I/O is encountered, regardless of whether the ring is filled with MMIO, PIO, or both. Lastly, wrap the coalesced I/O ring in a structure to prepare for a potential future where KVM supports multiple ring buffers beyond KVM's "default" built-in buffer. 
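For background, the ring lives one page past the vCPU's kvm_run mapping, at the page offset returned by KVM_CAP_COALESCED_MMIO, and a VMM drains it roughly as in the sketch below (illustrative only, not part of this patch; handle_io(), run, page_size and ring_size are placeholders, with ring_size derived from the host page size exactly as the new test does):

	struct kvm_coalesced_mmio_ring *ring =
		(void *)run + kvm_check_cap(KVM_CAP_COALESCED_MMIO) * page_size;

	while (ring->first != ring->last) {
		struct kvm_coalesced_mmio *e = &ring->coalesced_mmio[ring->first];

		handle_io(e->phys_addr, e->data, e->len, e->pio);	/* placeholder */
		/* Publish the freed slot back to KVM. */
		ring->first = (ring->first + 1) % ring_size;
	}

KVM stops coalescing, and instead exits to userspace, once (last + 1) % ring_size == first, i.e. the ring holds at most ring_size - 1 pending entries.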
Link: https://lore.kernel.org/all/20240820133333.1724191-1-ilstam@amazon.com Cc: Ilias Stamatis Cc: Marc Zyngier Cc: Oliver Upton Cc: Anup Patel Link: https://lore.kernel.org/r/20240828181446.652474-2-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile | 3 + tools/testing/selftests/kvm/coalesced_io_test.c | 236 ++++++++++++++++++++++++ tools/testing/selftests/kvm/include/kvm_util.h | 26 +++ 3 files changed, 265 insertions(+) create mode 100644 tools/testing/selftests/kvm/coalesced_io_test.c (limited to 'tools') diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 48d32c5aa3eb..45cb70c048bb 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -130,6 +130,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/max_vcpuid_cap_test TEST_GEN_PROGS_x86_64 += x86_64/triple_fault_event_test TEST_GEN_PROGS_x86_64 += x86_64/recalc_apic_map_test TEST_GEN_PROGS_x86_64 += access_tracking_perf_test +TEST_GEN_PROGS_x86_64 += coalesced_io_test TEST_GEN_PROGS_x86_64 += demand_paging_test TEST_GEN_PROGS_x86_64 += dirty_log_test TEST_GEN_PROGS_x86_64 += dirty_log_perf_test @@ -165,6 +166,7 @@ TEST_GEN_PROGS_aarch64 += aarch64/vgic_lpi_stress TEST_GEN_PROGS_aarch64 += aarch64/vpmu_counter_access TEST_GEN_PROGS_aarch64 += access_tracking_perf_test TEST_GEN_PROGS_aarch64 += arch_timer +TEST_GEN_PROGS_aarch64 += coalesced_io_test TEST_GEN_PROGS_aarch64 += demand_paging_test TEST_GEN_PROGS_aarch64 += dirty_log_test TEST_GEN_PROGS_aarch64 += dirty_log_perf_test @@ -198,6 +200,7 @@ TEST_GEN_PROGS_s390x += kvm_binary_stats_test TEST_GEN_PROGS_riscv += riscv/sbi_pmu_test TEST_GEN_PROGS_riscv += riscv/ebreak_test TEST_GEN_PROGS_riscv += arch_timer +TEST_GEN_PROGS_riscv += coalesced_io_test TEST_GEN_PROGS_riscv += demand_paging_test TEST_GEN_PROGS_riscv += dirty_log_test TEST_GEN_PROGS_riscv += get-reg-list diff --git a/tools/testing/selftests/kvm/coalesced_io_test.c b/tools/testing/selftests/kvm/coalesced_io_test.c new file mode 100644 index 000000000000..60cb25454899 --- /dev/null +++ b/tools/testing/selftests/kvm/coalesced_io_test.c @@ -0,0 +1,236 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include "ucall_common.h" + +struct kvm_coalesced_io { + struct kvm_coalesced_mmio_ring *ring; + uint32_t ring_size; + uint64_t mmio_gpa; + uint64_t *mmio; + + /* + * x86-only, but define pio_port for all architectures to minimize the + * amount of #ifdeffery and complexity, without having to sacrifice + * verbose error messages. + */ + uint8_t pio_port; +}; + +static struct kvm_coalesced_io kvm_builtin_io_ring; + +#ifdef __x86_64__ +static const int has_pio = 1; +#else +static const int has_pio = 0; +#endif + +static void guest_code(struct kvm_coalesced_io *io) +{ + int i, j; + + for (;;) { + for (j = 0; j < 1 + has_pio; j++) { + /* + * KVM always leaves one free entry, i.e. exits to + * userspace before the last entry is filled. 
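+ * The ring is considered full when (last + 1) % ring_size == first, so at most ring_size - 1 entries can be coalesced before the next coalesced access forces an exit.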
+ */ + for (i = 0; i < io->ring_size - 1; i++) { +#ifdef __x86_64__ + if (i & 1) + outl(io->pio_port, io->pio_port + i); + else +#endif + WRITE_ONCE(*io->mmio, io->mmio_gpa + i); + } +#ifdef __x86_64__ + if (j & 1) + outl(io->pio_port, io->pio_port + i); + else +#endif + WRITE_ONCE(*io->mmio, io->mmio_gpa + i); + } + GUEST_SYNC(0); + + WRITE_ONCE(*io->mmio, io->mmio_gpa + i); +#ifdef __x86_64__ + outl(io->pio_port, io->pio_port + i); +#endif + } +} + +static void vcpu_run_and_verify_io_exit(struct kvm_vcpu *vcpu, + struct kvm_coalesced_io *io, + uint32_t ring_start, + uint32_t expected_exit) +{ + const bool want_pio = expected_exit == KVM_EXIT_IO; + struct kvm_coalesced_mmio_ring *ring = io->ring; + struct kvm_run *run = vcpu->run; + uint32_t pio_value; + + WRITE_ONCE(ring->first, ring_start); + WRITE_ONCE(ring->last, ring_start); + + vcpu_run(vcpu); + + /* + * Annoyingly, reading PIO data is safe only for PIO exits, otherwise + * data_offset is garbage, e.g. an MMIO gpa. + */ + if (run->exit_reason == KVM_EXIT_IO) + pio_value = *(uint32_t *)((void *)run + run->io.data_offset); + else + pio_value = 0; + + TEST_ASSERT((!want_pio && (run->exit_reason == KVM_EXIT_MMIO && run->mmio.is_write && + run->mmio.phys_addr == io->mmio_gpa && run->mmio.len == 8 && + *(uint64_t *)run->mmio.data == io->mmio_gpa + io->ring_size - 1)) || + (want_pio && (run->exit_reason == KVM_EXIT_IO && run->io.port == io->pio_port && + run->io.direction == KVM_EXIT_IO_OUT && run->io.count == 1 && + pio_value == io->pio_port + io->ring_size - 1)), + "For start = %u, expected exit on %u-byte %s write 0x%llx = %lx, got exit_reason = %u (%s)\n " + "(MMIO addr = 0x%llx, write = %u, len = %u, data = %lx)\n " + "(PIO port = 0x%x, write = %u, len = %u, count = %u, data = %x", + ring_start, want_pio ? 4 : 8, want_pio ? "PIO" : "MMIO", + want_pio ? (unsigned long long)io->pio_port : io->mmio_gpa, + (want_pio ? io->pio_port : io->mmio_gpa) + io->ring_size - 1, run->exit_reason, + run->exit_reason == KVM_EXIT_MMIO ? "MMIO" : run->exit_reason == KVM_EXIT_IO ? "PIO" : "other", + run->mmio.phys_addr, run->mmio.is_write, run->mmio.len, *(uint64_t *)run->mmio.data, + run->io.port, run->io.direction, run->io.size, run->io.count, pio_value); +} + +static void vcpu_run_and_verify_coalesced_io(struct kvm_vcpu *vcpu, + struct kvm_coalesced_io *io, + uint32_t ring_start, + uint32_t expected_exit) +{ + struct kvm_coalesced_mmio_ring *ring = io->ring; + int i; + + vcpu_run_and_verify_io_exit(vcpu, io, ring_start, expected_exit); + + TEST_ASSERT((ring->last + 1) % io->ring_size == ring->first, + "Expected ring to be full (minus 1), first = %u, last = %u, max = %u, start = %u", + ring->first, ring->last, io->ring_size, ring_start); + + for (i = 0; i < io->ring_size - 1; i++) { + uint32_t idx = (ring->first + i) % io->ring_size; + struct kvm_coalesced_mmio *entry = &ring->coalesced_mmio[idx]; + +#ifdef __x86_64__ + if (i & 1) + TEST_ASSERT(entry->phys_addr == io->pio_port && + entry->len == 4 && entry->pio && + *(uint32_t *)entry->data == io->pio_port + i, + "Wanted 4-byte port I/O 0x%x = 0x%x in entry %u, got %u-byte %s 0x%llx = 0x%x", + io->pio_port, io->pio_port + i, i, + entry->len, entry->pio ? "PIO" : "MMIO", + entry->phys_addr, *(uint32_t *)entry->data); + else +#endif + TEST_ASSERT(entry->phys_addr == io->mmio_gpa && + entry->len == 8 && !entry->pio, + "Wanted 8-byte MMIO to 0x%lx = %lx in entry %u, got %u-byte %s 0x%llx = 0x%lx", + io->mmio_gpa, io->mmio_gpa + i, i, + entry->len, entry->pio ? 
"PIO" : "MMIO", + entry->phys_addr, *(uint64_t *)entry->data); + } +} + +static void test_coalesced_io(struct kvm_vcpu *vcpu, + struct kvm_coalesced_io *io, uint32_t ring_start) +{ + struct kvm_coalesced_mmio_ring *ring = io->ring; + + kvm_vm_register_coalesced_io(vcpu->vm, io->mmio_gpa, 8, false /* pio */); +#ifdef __x86_64__ + kvm_vm_register_coalesced_io(vcpu->vm, io->pio_port, 8, true /* pio */); +#endif + + vcpu_run_and_verify_coalesced_io(vcpu, io, ring_start, KVM_EXIT_MMIO); +#ifdef __x86_64__ + vcpu_run_and_verify_coalesced_io(vcpu, io, ring_start, KVM_EXIT_IO); +#endif + + /* + * Verify ucall, which may use non-coalesced MMIO or PIO, generates an + * immediate exit. + */ + WRITE_ONCE(ring->first, ring_start); + WRITE_ONCE(ring->last, ring_start); + vcpu_run(vcpu); + TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_SYNC); + TEST_ASSERT_EQ(ring->first, ring_start); + TEST_ASSERT_EQ(ring->last, ring_start); + + /* Verify that non-coalesced MMIO/PIO generates an exit to userspace. */ + kvm_vm_unregister_coalesced_io(vcpu->vm, io->mmio_gpa, 8, false /* pio */); + vcpu_run_and_verify_io_exit(vcpu, io, ring_start, KVM_EXIT_MMIO); + +#ifdef __x86_64__ + kvm_vm_unregister_coalesced_io(vcpu->vm, io->pio_port, 8, true /* pio */); + vcpu_run_and_verify_io_exit(vcpu, io, ring_start, KVM_EXIT_IO); +#endif +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int i; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_COALESCED_MMIO)); + +#ifdef __x86_64__ + TEST_REQUIRE(kvm_has_cap(KVM_CAP_COALESCED_PIO)); +#endif + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + kvm_builtin_io_ring = (struct kvm_coalesced_io) { + /* + * The I/O ring is a kernel-allocated page whose address is + * relative to each vCPU's run page, with the page offset + * provided by KVM in the return of KVM_CAP_COALESCED_MMIO. + */ + .ring = (void *)vcpu->run + + (kvm_check_cap(KVM_CAP_COALESCED_MMIO) * getpagesize()), + + /* + * The size of the I/O ring is fixed, but KVM defines the sized + * based on the kernel's PAGE_SIZE. Thus, userspace must query + * the host's page size at runtime to compute the ring size. + */ + .ring_size = (getpagesize() - sizeof(struct kvm_coalesced_mmio_ring)) / + sizeof(struct kvm_coalesced_mmio), + + /* + * Arbitrary address+port (MMIO mustn't overlap memslots), with + * the MMIO GPA identity mapped in the guest. 
+ */ + .mmio_gpa = 4ull * SZ_1G, + .mmio = (uint64_t *)(4ull * SZ_1G), + .pio_port = 0x80, + }; + + virt_map(vm, (uint64_t)kvm_builtin_io_ring.mmio, kvm_builtin_io_ring.mmio_gpa, 1); + + sync_global_to_guest(vm, kvm_builtin_io_ring); + vcpu_args_set(vcpu, 1, &kvm_builtin_io_ring); + + for (i = 0; i < kvm_builtin_io_ring.ring_size; i++) + test_coalesced_io(vcpu, &kvm_builtin_io_ring, i); + + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 63c2aaae51f3..b297a84f7be5 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -460,6 +460,32 @@ static inline uint32_t kvm_vm_reset_dirty_ring(struct kvm_vm *vm) return __vm_ioctl(vm, KVM_RESET_DIRTY_RINGS, NULL); } +static inline void kvm_vm_register_coalesced_io(struct kvm_vm *vm, + uint64_t address, + uint64_t size, bool pio) +{ + struct kvm_coalesced_mmio_zone zone = { + .addr = address, + .size = size, + .pio = pio, + }; + + vm_ioctl(vm, KVM_REGISTER_COALESCED_MMIO, &zone); +} + +static inline void kvm_vm_unregister_coalesced_io(struct kvm_vm *vm, + uint64_t address, + uint64_t size, bool pio) +{ + struct kvm_coalesced_mmio_zone zone = { + .addr = address, + .size = size, + .pio = pio, + }; + + vm_ioctl(vm, KVM_UNREGISTER_COALESCED_MMIO, &zone); +} + static inline int vm_get_stats_fd(struct kvm_vm *vm) { int fd = __vm_ioctl(vm, KVM_GET_STATS_FD, NULL); -- cgit v1.2.3 From 9d15171f39f0996fcfaec1788d3522eb581af347 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 28 Aug 2024 14:58:00 -0700 Subject: KVM: selftests: Explicitly include committed one-off assets in .gitignore Add KVM selftests' one-off assets, e.g. the Makefile, to the .gitignore so that they are explicitly included. The justification for omitting the one-offs was that including them wouldn't help prevent mistakes: Deliberately do not include the one-off assets, e.g. config, settings, .gitignore itself, etc as Git doesn't ignore files that are already in the repository. Adding the one-off assets won't prevent mistakes where developers forget to --force add files that don't match the "allowed". 
Turns out that's not the case, as W=1 will generate warnings, and the amazing-as-always kernel test bot reports new warnings: tools/testing/selftests/kvm/.gitignore: warning: ignored by one of the .gitignore files tools/testing/selftests/kvm/Makefile: warning: ignored by one of the .gitignore files >> tools/testing/selftests/kvm/Makefile.kvm: warning: ignored by one of the .gitignore files tools/testing/selftests/kvm/config: warning: ignored by one of the .gitignore files tools/testing/selftests/kvm/settings: warning: ignored by one of the .gitignore files Fixes: 43e96957e8b8 ("KVM: selftests: Use pattern matching in .gitignore") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202408211818.85zIkDEK-lkp@intel.com Link: https://lore.kernel.org/r/20240828215800.737042-1-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/.gitignore | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 6d9381d60172..7f57abf936e7 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -5,3 +5,7 @@ !*.h !*.S !*.sh +!.gitignore +!config +!settings +!Makefile -- cgit v1.2.3 From ca1a18368d764f9b29ab0c79b3ddd712f5511855 Mon Sep 17 00:00:00 2001 From: Colton Lewis Date: Fri, 23 Aug 2024 17:58:35 +0000 Subject: KVM: arm64: selftests: Ensure pending interrupts are handled in arch_timer test Break up the asm instructions poking daifclr and daifset to handle interrupts. R_RBZYL specifies pending interrupts will be handled after context synchronization events such as an ISB. Introduce a function wrapper for the WFI instruction. Signed-off-by: Colton Lewis Link: https://lore.kernel.org/r/20240823175836.2798235-2-coltonlewis@google.com Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/aarch64/vgic_irq.c | 11 +++++------ tools/testing/selftests/kvm/include/aarch64/processor.h | 3 +++ tools/testing/selftests/kvm/lib/aarch64/processor.c | 6 ++++++ 3 files changed, 14 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/vgic_irq.c b/tools/testing/selftests/kvm/aarch64/vgic_irq.c index a51dbd2a5f84..f4ac28d53747 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_irq.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_irq.c @@ -269,13 +269,12 @@ static void guest_inject(struct test_args *args, KVM_INJECT_MULTI(cmd, first_intid, num); while (irq_handled < num) { - asm volatile("wfi\n" - "msr daifclr, #2\n" - /* handle IRQ */ - "msr daifset, #2\n" - : : : "memory"); + wfi(); + local_irq_enable(); + isb(); /* handle IRQ */ + local_irq_disable(); } - asm volatile("msr daifclr, #2" : : : "memory"); + local_irq_enable(); GUEST_ASSERT_EQ(irq_handled, num); for (i = first_intid; i < num + first_intid; i++) diff --git a/tools/testing/selftests/kvm/include/aarch64/processor.h b/tools/testing/selftests/kvm/include/aarch64/processor.h index 9b20a355d81a..de977d131082 100644 --- a/tools/testing/selftests/kvm/include/aarch64/processor.h +++ b/tools/testing/selftests/kvm/include/aarch64/processor.h @@ -243,4 +243,7 @@ void smccc_smc(uint32_t function_id, uint64_t arg0, uint64_t arg1, uint64_t arg2, uint64_t arg3, uint64_t arg4, uint64_t arg5, uint64_t arg6, struct arm_smccc_res *res); +/* Execute a Wait For Interrupt instruction.
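+ * WFI completes once an interrupt becomes pending, even while IRQs are masked at PSTATE; callers then unmask and execute an ISB (a context synchronization event) so the pending IRQ is actually taken, per the R_RBZYL requirement cited above.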
*/ +void wfi(void); + +#endif /* SELFTEST_KVM_PROCESSOR_H */ diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c index 0ac7cc89f38c..fe4dc3693112 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/processor.c +++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c @@ -639,3 +639,9 @@ void vm_vaddr_populate_bitmap(struct kvm_vm *vm) sparsebit_set_num(vm->vpages_valid, 0, (1ULL << vm->va_bits) >> vm->page_shift); } + +/* Helper to call wfi instruction. */ +void wfi(void) +{ + asm volatile("wfi"); +} -- cgit v1.2.3 From 54306f564441f6bc99514af45552236f28b1768d Mon Sep 17 00:00:00 2001 From: Colton Lewis Date: Fri, 23 Aug 2024 17:58:36 +0000 Subject: KVM: arm64: selftests: Add arch_timer_edge_cases selftest Add a new arch_timer_edge_cases selftest that validates: * timers above the max TVAL value * timers in the past * moving counters ahead and behind pending timers * reprogramming timers * timers fired multiple times * masking/unmasking using the timer control mask These are intentionally unusual scenarios to stress compliance with the arm architecture. Co-developed-by: Ricardo Koller Signed-off-by: Ricardo Koller Signed-off-by: Colton Lewis Link: https://lore.kernel.org/r/20240823175836.2798235-3-coltonlewis@google.com Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/Makefile | 1 + .../selftests/kvm/aarch64/arch_timer_edge_cases.c | 1062 ++++++++++++++++++++ .../selftests/kvm/include/aarch64/arch_timer.h | 18 +- 3 files changed, 1080 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/kvm/aarch64/arch_timer_edge_cases.c (limited to 'tools') diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 48d32c5aa3eb..facf1bc6beb5 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -152,6 +152,7 @@ TEST_GEN_PROGS_x86_64 += pre_fault_memory_test TEST_GEN_PROGS_EXTENDED_x86_64 += x86_64/nx_huge_pages_test TEST_GEN_PROGS_aarch64 += aarch64/aarch32_id_regs +TEST_GEN_PROGS_aarch64 += aarch64/arch_timer_edge_cases TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions TEST_GEN_PROGS_aarch64 += aarch64/hypercalls TEST_GEN_PROGS_aarch64 += aarch64/page_fault_test diff --git a/tools/testing/selftests/kvm/aarch64/arch_timer_edge_cases.c b/tools/testing/selftests/kvm/aarch64/arch_timer_edge_cases.c new file mode 100644 index 000000000000..a36a7e2db434 --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/arch_timer_edge_cases.c @@ -0,0 +1,1062 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * arch_timer_edge_cases.c - Tests the aarch64 timer IRQ functionality. + * + * The test validates some edge cases related to the arch-timer: + * - timers above the max TVAL value. + * - timers in the past + * - moving counters ahead and behind pending timers. + * - reprogramming timers. + * - timers fired multiple times. + * - masking/unmasking using the timer control mask. + * + * Copyright (c) 2021, Google LLC. + */ + +#define _GNU_SOURCE + +#include +#include + +#include "arch_timer.h" +#include "gic.h" +#include "vgic.h" + +static const uint64_t CVAL_MAX = ~0ULL; +/* tval is a signed 32-bit int. */ +static const int32_t TVAL_MAX = INT32_MAX; +static const int32_t TVAL_MIN = INT32_MIN; + +/* After how much time we say there is no IRQ. */ +static const uint32_t TIMEOUT_NO_IRQ_US = 50000; + +/* A nice counter value to use as the starting one for most tests. */ +static const uint64_t DEF_CNT = (CVAL_MAX / 2); + +/* Number of runs.
*/ +static const uint32_t NR_TEST_ITERS_DEF = 5; + +/* Default wait test time in ms. */ +static const uint32_t WAIT_TEST_MS = 10; + +/* Default "long" wait test time in ms. */ +static const uint32_t LONG_WAIT_TEST_MS = 100; + +/* Shared with IRQ handler. */ +struct test_vcpu_shared_data { + atomic_t handled; + atomic_t spurious; +} shared_data; + +struct test_args { + /* Virtual or physical timer and counter tests. */ + enum arch_timer timer; + /* Delay used for most timer tests. */ + uint64_t wait_ms; + /* Delay used in the test_long_timer_delays test. */ + uint64_t long_wait_ms; + /* Number of iterations. */ + int iterations; + /* Whether to test the physical timer. */ + bool test_physical; + /* Whether to test the virtual timer. */ + bool test_virtual; +}; + +struct test_args test_args = { + .wait_ms = WAIT_TEST_MS, + .long_wait_ms = LONG_WAIT_TEST_MS, + .iterations = NR_TEST_ITERS_DEF, + .test_physical = true, + .test_virtual = true, +}; + +static int vtimer_irq, ptimer_irq; + +enum sync_cmd { + SET_COUNTER_VALUE, + USERSPACE_USLEEP, + USERSPACE_SCHED_YIELD, + USERSPACE_MIGRATE_SELF, + NO_USERSPACE_CMD, +}; + +typedef void (*sleep_method_t)(enum arch_timer timer, uint64_t usec); + +static void sleep_poll(enum arch_timer timer, uint64_t usec); +static void sleep_sched_poll(enum arch_timer timer, uint64_t usec); +static void sleep_in_userspace(enum arch_timer timer, uint64_t usec); +static void sleep_migrate(enum arch_timer timer, uint64_t usec); + +sleep_method_t sleep_method[] = { + sleep_poll, + sleep_sched_poll, + sleep_migrate, + sleep_in_userspace, +}; + +typedef void (*irq_wait_method_t)(void); + +static void wait_for_non_spurious_irq(void); +static void wait_poll_for_irq(void); +static void wait_sched_poll_for_irq(void); +static void wait_migrate_poll_for_irq(void); + +irq_wait_method_t irq_wait_method[] = { + wait_for_non_spurious_irq, + wait_poll_for_irq, + wait_sched_poll_for_irq, + wait_migrate_poll_for_irq, +}; + +enum timer_view { + TIMER_CVAL, + TIMER_TVAL, +}; + +static void assert_irqs_handled(uint32_t n) +{ + int h = atomic_read(&shared_data.handled); + + __GUEST_ASSERT(h == n, "Handled %d IRQS but expected %d", h, n); +} + +static void userspace_cmd(uint64_t cmd) +{ + GUEST_SYNC_ARGS(cmd, 0, 0, 0, 0); +} + +static void userspace_migrate_vcpu(void) +{ + userspace_cmd(USERSPACE_MIGRATE_SELF); +} + +static void userspace_sleep(uint64_t usecs) +{ + GUEST_SYNC_ARGS(USERSPACE_USLEEP, usecs, 0, 0, 0); +} + +static void set_counter(enum arch_timer timer, uint64_t counter) +{ + GUEST_SYNC_ARGS(SET_COUNTER_VALUE, counter, timer, 0, 0); +} + +static void guest_irq_handler(struct ex_regs *regs) +{ + unsigned int intid = gic_get_and_ack_irq(); + enum arch_timer timer; + uint64_t cnt, cval; + uint32_t ctl; + bool timer_condition, istatus; + + if (intid == IAR_SPURIOUS) { + atomic_inc(&shared_data.spurious); + goto out; + } + + if (intid == ptimer_irq) + timer = PHYSICAL; + else if (intid == vtimer_irq) + timer = VIRTUAL; + else + goto out; + + ctl = timer_get_ctl(timer); + cval = timer_get_cval(timer); + cnt = timer_get_cntct(timer); + timer_condition = cnt >= cval; + istatus = (ctl & CTL_ISTATUS) && (ctl & CTL_ENABLE); + GUEST_ASSERT_EQ(timer_condition, istatus); + + /* Disable and mask the timer. 
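+ * Writing CTL_IMASK alone also clears CTL_ENABLE, so the timer is left both masked and disabled and cannot immediately re-fire.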
*/ + timer_set_ctl(timer, CTL_IMASK); + + atomic_inc(&shared_data.handled); + +out: + gic_set_eoi(intid); +} + +static void set_cval_irq(enum arch_timer timer, uint64_t cval_cycles, + uint32_t ctl) +{ + atomic_set(&shared_data.handled, 0); + atomic_set(&shared_data.spurious, 0); + timer_set_cval(timer, cval_cycles); + timer_set_ctl(timer, ctl); +} + +static void set_tval_irq(enum arch_timer timer, uint64_t tval_cycles, + uint32_t ctl) +{ + atomic_set(&shared_data.handled, 0); + atomic_set(&shared_data.spurious, 0); + timer_set_ctl(timer, ctl); + timer_set_tval(timer, tval_cycles); +} + +static void set_xval_irq(enum arch_timer timer, uint64_t xval, uint32_t ctl, + enum timer_view tv) +{ + switch (tv) { + case TIMER_CVAL: + set_cval_irq(timer, xval, ctl); + break; + case TIMER_TVAL: + set_tval_irq(timer, xval, ctl); + break; + default: + GUEST_FAIL("Could not get timer %d", timer); + } +} + +/* + * Note that this can theoretically hang forever, so we rely on having + * a timeout mechanism in the "runner", like: + * tools/testing/selftests/kselftest/runner.sh. + */ +static void wait_for_non_spurious_irq(void) +{ + int h; + + local_irq_disable(); + + for (h = atomic_read(&shared_data.handled); h == atomic_read(&shared_data.handled);) { + wfi(); + local_irq_enable(); + isb(); /* handle IRQ */ + local_irq_disable(); + } +} + +/* + * Wait for an non-spurious IRQ by polling in the guest or in + * userspace (e.g. userspace_cmd=USERSPACE_SCHED_YIELD). + * + * Note that this can theoretically hang forever, so we rely on having + * a timeout mechanism in the "runner", like: + * tools/testing/selftests/kselftest/runner.sh. + */ +static void poll_for_non_spurious_irq(enum sync_cmd usp_cmd) +{ + int h; + + local_irq_disable(); + + h = atomic_read(&shared_data.handled); + + local_irq_enable(); + while (h == atomic_read(&shared_data.handled)) { + if (usp_cmd == NO_USERSPACE_CMD) + cpu_relax(); + else + userspace_cmd(usp_cmd); + } + local_irq_disable(); +} + +static void wait_poll_for_irq(void) +{ + poll_for_non_spurious_irq(NO_USERSPACE_CMD); +} + +static void wait_sched_poll_for_irq(void) +{ + poll_for_non_spurious_irq(USERSPACE_SCHED_YIELD); +} + +static void wait_migrate_poll_for_irq(void) +{ + poll_for_non_spurious_irq(USERSPACE_MIGRATE_SELF); +} + +/* + * Sleep for usec microseconds by polling in the guest or in + * userspace (e.g. userspace_cmd=USERSPACE_SCHEDULE). + */ +static void guest_poll(enum arch_timer test_timer, uint64_t usec, + enum sync_cmd usp_cmd) +{ + uint64_t cycles = usec_to_cycles(usec); + /* Whichever timer we are testing with, sleep with the other. */ + enum arch_timer sleep_timer = 1 - test_timer; + uint64_t start = timer_get_cntct(sleep_timer); + + while ((timer_get_cntct(sleep_timer) - start) < cycles) { + if (usp_cmd == NO_USERSPACE_CMD) + cpu_relax(); + else + userspace_cmd(usp_cmd); + } +} + +static void sleep_poll(enum arch_timer timer, uint64_t usec) +{ + guest_poll(timer, usec, NO_USERSPACE_CMD); +} + +static void sleep_sched_poll(enum arch_timer timer, uint64_t usec) +{ + guest_poll(timer, usec, USERSPACE_SCHED_YIELD); +} + +static void sleep_migrate(enum arch_timer timer, uint64_t usec) +{ + guest_poll(timer, usec, USERSPACE_MIGRATE_SELF); +} + +static void sleep_in_userspace(enum arch_timer timer, uint64_t usec) +{ + userspace_sleep(usec); +} + +/* + * Reset the timer state to some nice values like the counter not being close + * to the edge, and the control register masked and disabled. 
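+ * (DEF_CNT = CVAL_MAX / 2 leaves maximal headroom in both directions, so tests can move the counter far forward or backward without wrapping.)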
+ */ +static void reset_timer_state(enum arch_timer timer, uint64_t cnt) +{ + set_counter(timer, cnt); + timer_set_ctl(timer, CTL_IMASK); +} + +static void test_timer_xval(enum arch_timer timer, uint64_t xval, + enum timer_view tv, irq_wait_method_t wm, bool reset_state, + uint64_t reset_cnt) +{ + local_irq_disable(); + + if (reset_state) + reset_timer_state(timer, reset_cnt); + + set_xval_irq(timer, xval, CTL_ENABLE, tv); + + /* This method re-enables IRQs to handle the one we're looking for. */ + wm(); + + assert_irqs_handled(1); + local_irq_enable(); +} + +/* + * The test_timer_* functions will program the timer, wait for it, and assert + * the firing of the correct IRQ. + * + * These functions don't have a timeout and return as soon as they receive an + * IRQ. They can hang (forever), so we rely on having a timeout mechanism in + * the "runner", like: tools/testing/selftests/kselftest/runner.sh. + */ + +static void test_timer_cval(enum arch_timer timer, uint64_t cval, + irq_wait_method_t wm, bool reset_state, + uint64_t reset_cnt) +{ + test_timer_xval(timer, cval, TIMER_CVAL, wm, reset_state, reset_cnt); +} + +static void test_timer_tval(enum arch_timer timer, int32_t tval, + irq_wait_method_t wm, bool reset_state, + uint64_t reset_cnt) +{ + test_timer_xval(timer, (uint64_t) tval, TIMER_TVAL, wm, reset_state, + reset_cnt); +} + +static void test_xval_check_no_irq(enum arch_timer timer, uint64_t xval, + uint64_t usec, enum timer_view timer_view, + sleep_method_t guest_sleep) +{ + local_irq_disable(); + + set_xval_irq(timer, xval, CTL_ENABLE | CTL_IMASK, timer_view); + guest_sleep(timer, usec); + + local_irq_enable(); + isb(); + + /* Assume success (no IRQ) after waiting usec microseconds */ + assert_irqs_handled(0); +} + +static void test_cval_no_irq(enum arch_timer timer, uint64_t cval, + uint64_t usec, sleep_method_t wm) +{ + test_xval_check_no_irq(timer, cval, usec, TIMER_CVAL, wm); +} + +static void test_tval_no_irq(enum arch_timer timer, int32_t tval, uint64_t usec, + sleep_method_t wm) +{ + /* tval will be cast to an int32_t in test_xval_check_no_irq */ + test_xval_check_no_irq(timer, (uint64_t) tval, usec, TIMER_TVAL, wm); +} + +/* Test masking/unmasking a timer using the timer mask (not the IRQ mask). */ +static void test_timer_control_mask_then_unmask(enum arch_timer timer) +{ + reset_timer_state(timer, DEF_CNT); + set_tval_irq(timer, -1, CTL_ENABLE | CTL_IMASK); + + /* Unmask the timer, and then get an IRQ. */ + local_irq_disable(); + timer_set_ctl(timer, CTL_ENABLE); + /* This method re-enables IRQs to handle the one we're looking for. */ + wait_for_non_spurious_irq(); + + assert_irqs_handled(1); + local_irq_enable(); +} + +/* Check that timer control masks actually mask a timer being fired. */ +static void test_timer_control_masks(enum arch_timer timer) +{ + reset_timer_state(timer, DEF_CNT); + + /* Local IRQs are not masked at this point. */ + + set_tval_irq(timer, -1, CTL_ENABLE | CTL_IMASK); + + /* Assume no IRQ after waiting TIMEOUT_NO_IRQ_US microseconds */ + sleep_poll(timer, TIMEOUT_NO_IRQ_US); + + assert_irqs_handled(0); + timer_set_ctl(timer, CTL_IMASK); +} + +static void test_fire_a_timer_multiple_times(enum arch_timer timer, + irq_wait_method_t wm, int num) +{ + int i; + + local_irq_disable(); + reset_timer_state(timer, DEF_CNT); + + set_tval_irq(timer, 0, CTL_ENABLE); + + for (i = 1; i <= num; i++) { + /* This method re-enables IRQs to handle the one we're looking for. */ + wm(); + + /* The IRQ handler masked and disabled the timer. 
+ * Enable and unmmask it again. + */ + timer_set_ctl(timer, CTL_ENABLE); + + assert_irqs_handled(i); + } + + local_irq_enable(); +} + +static void test_timers_fired_multiple_times(enum arch_timer timer) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(irq_wait_method); i++) + test_fire_a_timer_multiple_times(timer, irq_wait_method[i], 10); +} + +/* + * Set a timer for tval=delta_1_ms then reprogram it to + * tval=delta_2_ms. Check that we get the timer fired. There is no + * timeout for the wait: we use the wfi instruction. + */ +static void test_reprogramming_timer(enum arch_timer timer, irq_wait_method_t wm, + int32_t delta_1_ms, int32_t delta_2_ms) +{ + local_irq_disable(); + reset_timer_state(timer, DEF_CNT); + + /* Program the timer to DEF_CNT + delta_1_ms. */ + set_tval_irq(timer, msec_to_cycles(delta_1_ms), CTL_ENABLE); + + /* Reprogram the timer to DEF_CNT + delta_2_ms. */ + timer_set_tval(timer, msec_to_cycles(delta_2_ms)); + + /* This method re-enables IRQs to handle the one we're looking for. */ + wm(); + + /* The IRQ should arrive at DEF_CNT + delta_2_ms (or after). */ + GUEST_ASSERT(timer_get_cntct(timer) >= + DEF_CNT + msec_to_cycles(delta_2_ms)); + + local_irq_enable(); + assert_irqs_handled(1); +}; + +static void test_reprogram_timers(enum arch_timer timer) +{ + int i; + uint64_t base_wait = test_args.wait_ms; + + for (i = 0; i < ARRAY_SIZE(irq_wait_method); i++) { + /* + * Ensure reprogramming works whether going from a + * longer time to a shorter or vice versa. + */ + test_reprogramming_timer(timer, irq_wait_method[i], 2 * base_wait, + base_wait); + test_reprogramming_timer(timer, irq_wait_method[i], base_wait, + 2 * base_wait); + } +} + +static void test_basic_functionality(enum arch_timer timer) +{ + int32_t tval = (int32_t) msec_to_cycles(test_args.wait_ms); + uint64_t cval = DEF_CNT + msec_to_cycles(test_args.wait_ms); + int i; + + for (i = 0; i < ARRAY_SIZE(irq_wait_method); i++) { + irq_wait_method_t wm = irq_wait_method[i]; + + test_timer_cval(timer, cval, wm, true, DEF_CNT); + test_timer_tval(timer, tval, wm, true, DEF_CNT); + } +} + +/* + * This test checks basic timer behavior without actually firing timers, things + * like: the relationship between cval and tval, tval down-counting. + */ +static void timers_sanity_checks(enum arch_timer timer, bool use_sched) +{ + reset_timer_state(timer, DEF_CNT); + + local_irq_disable(); + + /* cval in the past */ + timer_set_cval(timer, + timer_get_cntct(timer) - + msec_to_cycles(test_args.wait_ms)); + if (use_sched) + userspace_migrate_vcpu(); + GUEST_ASSERT(timer_get_tval(timer) < 0); + + /* tval in the past */ + timer_set_tval(timer, -1); + if (use_sched) + userspace_migrate_vcpu(); + GUEST_ASSERT(timer_get_cval(timer) < timer_get_cntct(timer)); + + /* tval larger than TVAL_MAX. This requires programming with + * timer_set_cval instead so the value is expressible + */ + timer_set_cval(timer, + timer_get_cntct(timer) + TVAL_MAX + + msec_to_cycles(test_args.wait_ms)); + if (use_sched) + userspace_migrate_vcpu(); + GUEST_ASSERT(timer_get_tval(timer) <= 0); + + /* + * tval larger than 2 * TVAL_MAX. + * Twice the TVAL_MAX completely loops around the TVAL. + */ + timer_set_cval(timer, + timer_get_cntct(timer) + 2ULL * TVAL_MAX + + msec_to_cycles(test_args.wait_ms)); + if (use_sched) + userspace_migrate_vcpu(); + GUEST_ASSERT(timer_get_tval(timer) <= + msec_to_cycles(test_args.wait_ms)); + + /* negative tval that rollovers from 0. 
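+ * Writing TVAL programs CVAL = CNT + SignExtend(TVAL), so with CNT = msec_to_cycles(1) and TVAL = -msec_to_cycles(wait_ms) the 64-bit CVAL wraps past zero to just below CVAL_MAX, which is what the assertion below checks.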
*/ + set_counter(timer, msec_to_cycles(1)); + timer_set_tval(timer, -1 * msec_to_cycles(test_args.wait_ms)); + if (use_sched) + userspace_migrate_vcpu(); + GUEST_ASSERT(timer_get_cval(timer) >= (CVAL_MAX - msec_to_cycles(test_args.wait_ms))); + + /* tval should keep down-counting from 0 to -1. */ + timer_set_tval(timer, 0); + sleep_poll(timer, 1); + GUEST_ASSERT(timer_get_tval(timer) < 0); + + local_irq_enable(); + + /* Mask and disable any pending timer. */ + timer_set_ctl(timer, CTL_IMASK); +} + +static void test_timers_sanity_checks(enum arch_timer timer) +{ + timers_sanity_checks(timer, false); + /* Check how KVM saves/restores these edge-case values. */ + timers_sanity_checks(timer, true); +} + +static void test_set_cnt_after_tval_max(enum arch_timer timer, irq_wait_method_t wm) +{ + local_irq_disable(); + reset_timer_state(timer, DEF_CNT); + + set_cval_irq(timer, + (uint64_t) TVAL_MAX + + msec_to_cycles(test_args.wait_ms) / 2, CTL_ENABLE); + + set_counter(timer, TVAL_MAX); + + /* This method re-enables IRQs to handle the one we're looking for. */ + wm(); + + assert_irqs_handled(1); + local_irq_enable(); +} + +/* Test timers set for: cval = now + TVAL_MAX + wait_ms / 2 */ +static void test_timers_above_tval_max(enum arch_timer timer) +{ + uint64_t cval; + int i; + + /* + * Test that the system is not implementing cval in terms of + * tval. If that was the case, setting a cval to "cval = now + * + TVAL_MAX + wait_ms" would wrap to "cval = now + + * wait_ms", and the timer would fire immediately. Test that it + * doesn't. + */ + for (i = 0; i < ARRAY_SIZE(sleep_method); i++) { + reset_timer_state(timer, DEF_CNT); + cval = timer_get_cntct(timer) + TVAL_MAX + + msec_to_cycles(test_args.wait_ms); + test_cval_no_irq(timer, cval, + msecs_to_usecs(test_args.wait_ms) + + TIMEOUT_NO_IRQ_US, sleep_method[i]); + } + + for (i = 0; i < ARRAY_SIZE(irq_wait_method); i++) { + /* Get the IRQ by moving the counter forward. */ + test_set_cnt_after_tval_max(timer, irq_wait_method[i]); + } +} + +/* + * Template function to be used by the test_move_counter_ahead_* tests. It + * sets the counter to cnt_1, the [c|t]val, the counter to cnt_2, and + * then waits for an IRQ. + */ +static void test_set_cnt_after_xval(enum arch_timer timer, uint64_t cnt_1, + uint64_t xval, uint64_t cnt_2, + irq_wait_method_t wm, enum timer_view tv) +{ + local_irq_disable(); + + set_counter(timer, cnt_1); + timer_set_ctl(timer, CTL_IMASK); + + set_xval_irq(timer, xval, CTL_ENABLE, tv); + set_counter(timer, cnt_2); + /* This method re-enables IRQs to handle the one we're looking for. */ + wm(); + + assert_irqs_handled(1); + local_irq_enable(); +} + +/* + * Template function to be used by the test_move_counter_ahead_* tests. It + * sets the counter to cnt_1, the [c|t]val, the counter to cnt_2, and + * then waits for an IRQ. 
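+ * Unlike the variant above, this one sleeps for the given timeout instead and then asserts that no IRQ was delivered.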
+ */ +static void test_set_cnt_after_xval_no_irq(enum arch_timer timer, + uint64_t cnt_1, uint64_t xval, + uint64_t cnt_2, + sleep_method_t guest_sleep, + enum timer_view tv) +{ + local_irq_disable(); + + set_counter(timer, cnt_1); + timer_set_ctl(timer, CTL_IMASK); + + set_xval_irq(timer, xval, CTL_ENABLE, tv); + set_counter(timer, cnt_2); + guest_sleep(timer, TIMEOUT_NO_IRQ_US); + + local_irq_enable(); + isb(); + + /* Assume no IRQ after waiting TIMEOUT_NO_IRQ_US microseconds */ + assert_irqs_handled(0); + timer_set_ctl(timer, CTL_IMASK); +} + +static void test_set_cnt_after_tval(enum arch_timer timer, uint64_t cnt_1, + int32_t tval, uint64_t cnt_2, + irq_wait_method_t wm) +{ + test_set_cnt_after_xval(timer, cnt_1, tval, cnt_2, wm, TIMER_TVAL); +} + +static void test_set_cnt_after_cval(enum arch_timer timer, uint64_t cnt_1, + uint64_t cval, uint64_t cnt_2, + irq_wait_method_t wm) +{ + test_set_cnt_after_xval(timer, cnt_1, cval, cnt_2, wm, TIMER_CVAL); +} + +static void test_set_cnt_after_tval_no_irq(enum arch_timer timer, + uint64_t cnt_1, int32_t tval, + uint64_t cnt_2, sleep_method_t wm) +{ + test_set_cnt_after_xval_no_irq(timer, cnt_1, tval, cnt_2, wm, + TIMER_TVAL); +} + +static void test_set_cnt_after_cval_no_irq(enum arch_timer timer, + uint64_t cnt_1, uint64_t cval, + uint64_t cnt_2, sleep_method_t wm) +{ + test_set_cnt_after_xval_no_irq(timer, cnt_1, cval, cnt_2, wm, + TIMER_CVAL); +} + +/* Set a timer and then move the counter ahead of it. */ +static void test_move_counters_ahead_of_timers(enum arch_timer timer) +{ + int i; + int32_t tval; + + for (i = 0; i < ARRAY_SIZE(irq_wait_method); i++) { + irq_wait_method_t wm = irq_wait_method[i]; + + test_set_cnt_after_cval(timer, 0, DEF_CNT, DEF_CNT + 1, wm); + test_set_cnt_after_cval(timer, CVAL_MAX, 1, 2, wm); + + /* Move counter ahead of negative tval. */ + test_set_cnt_after_tval(timer, 0, -1, DEF_CNT + 1, wm); + test_set_cnt_after_tval(timer, 0, -1, TVAL_MAX, wm); + tval = TVAL_MAX; + test_set_cnt_after_tval(timer, 0, tval, (uint64_t) tval + 1, + wm); + } + + for (i = 0; i < ARRAY_SIZE(sleep_method); i++) { + sleep_method_t sm = sleep_method[i]; + + test_set_cnt_after_cval_no_irq(timer, 0, DEF_CNT, CVAL_MAX, sm); + } +} + +/* + * Program a timer, mask it, and then change the tval or counter to cancel it. + * Unmask it and check that nothing fires. + */ +static void test_move_counters_behind_timers(enum arch_timer timer) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(sleep_method); i++) { + sleep_method_t sm = sleep_method[i]; + + test_set_cnt_after_cval_no_irq(timer, DEF_CNT, DEF_CNT - 1, 0, + sm); + test_set_cnt_after_tval_no_irq(timer, DEF_CNT, -1, 0, sm); + } +} + +static void test_timers_in_the_past(enum arch_timer timer) +{ + int32_t tval = -1 * (int32_t) msec_to_cycles(test_args.wait_ms); + uint64_t cval; + int i; + + for (i = 0; i < ARRAY_SIZE(irq_wait_method); i++) { + irq_wait_method_t wm = irq_wait_method[i]; + + /* set a timer wait_ms the past. */ + cval = DEF_CNT - msec_to_cycles(test_args.wait_ms); + test_timer_cval(timer, cval, wm, true, DEF_CNT); + test_timer_tval(timer, tval, wm, true, DEF_CNT); + + /* Set a timer to counter=0 (in the past) */ + test_timer_cval(timer, 0, wm, true, DEF_CNT); + + /* Set a time for tval=0 (now) */ + test_timer_tval(timer, 0, wm, true, DEF_CNT); + + /* Set a timer to as far in the past as possible */ + test_timer_tval(timer, TVAL_MIN, wm, true, DEF_CNT); + } + + /* + * Set the counter to wait_ms, and a tval to -wait_ms. There should be no + * IRQ as that tval means cval=CVAL_MAX-wait_ms. 
+ */ + for (i = 0; i < ARRAY_SIZE(sleep_method); i++) { + sleep_method_t sm = sleep_method[i]; + + set_counter(timer, msec_to_cycles(test_args.wait_ms)); + test_tval_no_irq(timer, tval, TIMEOUT_NO_IRQ_US, sm); + } +} + +static void test_long_timer_delays(enum arch_timer timer) +{ + int32_t tval = (int32_t) msec_to_cycles(test_args.long_wait_ms); + uint64_t cval = DEF_CNT + msec_to_cycles(test_args.long_wait_ms); + int i; + + for (i = 0; i < ARRAY_SIZE(irq_wait_method); i++) { + irq_wait_method_t wm = irq_wait_method[i]; + + test_timer_cval(timer, cval, wm, true, DEF_CNT); + test_timer_tval(timer, tval, wm, true, DEF_CNT); + } +} + +static void guest_run_iteration(enum arch_timer timer) +{ + test_basic_functionality(timer); + test_timers_sanity_checks(timer); + + test_timers_above_tval_max(timer); + test_timers_in_the_past(timer); + + test_move_counters_ahead_of_timers(timer); + test_move_counters_behind_timers(timer); + test_reprogram_timers(timer); + + test_timers_fired_multiple_times(timer); + + test_timer_control_mask_then_unmask(timer); + test_timer_control_masks(timer); +} + +static void guest_code(enum arch_timer timer) +{ + int i; + + local_irq_disable(); + + gic_init(GIC_V3, 1); + + timer_set_ctl(VIRTUAL, CTL_IMASK); + timer_set_ctl(PHYSICAL, CTL_IMASK); + + gic_irq_enable(vtimer_irq); + gic_irq_enable(ptimer_irq); + local_irq_enable(); + + for (i = 0; i < test_args.iterations; i++) { + GUEST_SYNC(i); + guest_run_iteration(timer); + } + + test_long_timer_delays(timer); + GUEST_DONE(); +} + +static uint32_t next_pcpu(void) +{ + uint32_t max = get_nprocs(); + uint32_t cur = sched_getcpu(); + uint32_t next = cur; + cpu_set_t cpuset; + + TEST_ASSERT(max > 1, "Need at least two physical cpus"); + + sched_getaffinity(0, sizeof(cpuset), &cpuset); + + do { + next = (next + 1) % CPU_SETSIZE; + } while (!CPU_ISSET(next, &cpuset)); + + return next; +} + +static void migrate_self(uint32_t new_pcpu) +{ + int ret; + cpu_set_t cpuset; + pthread_t thread; + + thread = pthread_self(); + + CPU_ZERO(&cpuset); + CPU_SET(new_pcpu, &cpuset); + + pr_debug("Migrating from %u to %u\n", sched_getcpu(), new_pcpu); + + ret = pthread_setaffinity_np(thread, sizeof(cpuset), &cpuset); + + TEST_ASSERT(ret == 0, "Failed to migrate to pCPU: %u; ret: %d\n", + new_pcpu, ret); +} + +static void kvm_set_cntxct(struct kvm_vcpu *vcpu, uint64_t cnt, + enum arch_timer timer) +{ + if (timer == PHYSICAL) + vcpu_set_reg(vcpu, KVM_REG_ARM_PTIMER_CNT, cnt); + else + vcpu_set_reg(vcpu, KVM_REG_ARM_TIMER_CNT, cnt); +} + +static void handle_sync(struct kvm_vcpu *vcpu, struct ucall *uc) +{ + enum sync_cmd cmd = uc->args[1]; + uint64_t val = uc->args[2]; + enum arch_timer timer = uc->args[3]; + + switch (cmd) { + case SET_COUNTER_VALUE: + kvm_set_cntxct(vcpu, val, timer); + break; + case USERSPACE_USLEEP: + usleep(val); + break; + case USERSPACE_SCHED_YIELD: + sched_yield(); + break; + case USERSPACE_MIGRATE_SELF: + migrate_self(next_pcpu()); + break; + default: + break; + } +} + +static void test_run(struct kvm_vm *vm, struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + /* Start on CPU 0 */ + migrate_self(0); + + while (true) { + vcpu_run(vcpu); + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + handle_sync(vcpu, &uc); + break; + case UCALL_DONE: + goto out; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + goto out; + default: + TEST_FAIL("Unexpected guest exit\n"); + } + } + + out: + return; +} + +static void test_init_timer_irq(struct kvm_vm *vm, struct kvm_vcpu *vcpu) +{ + vcpu_device_attr_get(vcpu, KVM_ARM_VCPU_TIMER_CTRL, + 
KVM_ARM_VCPU_TIMER_IRQ_PTIMER, &ptimer_irq); + vcpu_device_attr_get(vcpu, KVM_ARM_VCPU_TIMER_CTRL, + KVM_ARM_VCPU_TIMER_IRQ_VTIMER, &vtimer_irq); + + sync_global_to_guest(vm, ptimer_irq); + sync_global_to_guest(vm, vtimer_irq); + + pr_debug("ptimer_irq: %d; vtimer_irq: %d\n", ptimer_irq, vtimer_irq); +} + +static void test_vm_create(struct kvm_vm **vm, struct kvm_vcpu **vcpu, + enum arch_timer timer) +{ + *vm = vm_create_with_one_vcpu(vcpu, guest_code); + TEST_ASSERT(*vm, "Failed to create the test VM\n"); + + vm_init_descriptor_tables(*vm); + vm_install_exception_handler(*vm, VECTOR_IRQ_CURRENT, + guest_irq_handler); + + vcpu_init_descriptor_tables(*vcpu); + vcpu_args_set(*vcpu, 1, timer); + + test_init_timer_irq(*vm, *vcpu); + vgic_v3_setup(*vm, 1, 64); + sync_global_to_guest(*vm, test_args); +} + +static void test_print_help(char *name) +{ + pr_info("Usage: %s [-h] [-b] [-i iterations] [-l long_wait_ms] [-p] [-v] [-w wait_ms]\n" , name); + pr_info("\t-i: Number of iterations (default: %u)\n", + NR_TEST_ITERS_DEF); + pr_info("\t-b: Test both physical and virtual timers (default: true)\n"); + pr_info("\t-l: Delta (in ms) used for long wait time test (default: %u)\n", + LONG_WAIT_TEST_MS); + pr_info("\t-w: Delta (in ms) used for wait times (default: %u)\n", + WAIT_TEST_MS); + pr_info("\t-p: Test physical timer (default: true)\n"); + pr_info("\t-v: Test virtual timer (default: true)\n"); + pr_info("\t-h: Print this help message\n"); +} + +static bool parse_args(int argc, char *argv[]) +{ + int opt; + + while ((opt = getopt(argc, argv, "bhi:l:pvw:")) != -1) { + switch (opt) { + case 'b': + test_args.test_physical = true; + test_args.test_virtual = true; + break; + case 'i': + test_args.iterations = + atoi_positive("Number of iterations", optarg); + break; + case 'l': + test_args.long_wait_ms = + atoi_positive("Long wait time", optarg); + break; + case 'p': + test_args.test_physical = true; + test_args.test_virtual = false; + break; + case 'v': + test_args.test_virtual = true; + test_args.test_physical = false; + break; + case 'w': + test_args.wait_ms = atoi_positive("Wait time", optarg); + break; + case 'h': + default: + goto err; + } + } + + return true; + + err: + test_print_help(argv[0]); + return false; +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + /* Tell stdout not to buffer its content */ + setbuf(stdout, NULL); + + if (!parse_args(argc, argv)) + exit(KSFT_SKIP); + + if (test_args.test_virtual) { + test_vm_create(&vm, &vcpu, VIRTUAL); + test_run(vm, vcpu); + kvm_vm_free(vm); + } + + if (test_args.test_physical) { + test_vm_create(&vm, &vcpu, PHYSICAL); + test_run(vm, vcpu); + kvm_vm_free(vm); + } + + return 0; +} diff --git a/tools/testing/selftests/kvm/include/aarch64/arch_timer.h b/tools/testing/selftests/kvm/include/aarch64/arch_timer.h index b3e97525cb55..bf461de34785 100644 --- a/tools/testing/selftests/kvm/include/aarch64/arch_timer.h +++ b/tools/testing/selftests/kvm/include/aarch64/arch_timer.h @@ -79,7 +79,7 @@ static inline uint64_t timer_get_cval(enum arch_timer timer) return 0; } -static inline void timer_set_tval(enum arch_timer timer, uint32_t tval) +static inline void timer_set_tval(enum arch_timer timer, int32_t tval) { switch (timer) { case VIRTUAL: @@ -95,6 +95,22 @@ static inline void timer_set_tval(enum arch_timer timer, uint32_t tval) isb(); } +static inline int32_t timer_get_tval(enum arch_timer timer) +{ + isb(); + switch (timer) { + case VIRTUAL: + return read_sysreg(cntv_tval_el0); + case PHYSICAL: + return
read_sysreg(cntp_tval_el0); + default: + GUEST_FAIL("Could not get timer %d\n", timer); + } + + /* We should not reach here */ + return 0; +} + static inline void timer_set_ctl(enum arch_timer timer, uint32_t ctl) { switch (timer) { -- cgit v1.2.3 From 9a7db819a184c21f605a4364131762d5db0c7010 Mon Sep 17 00:00:00 2001 From: Zhu Jun Date: Tue, 20 Aug 2024 00:42:42 -0700 Subject: crypto: tools/ccp - Remove unused variable the variable is never referenced in the code, just remove them. Signed-off-by: Zhu Jun Reviewed-by: Mario Limonciello Signed-off-by: Herbert Xu --- tools/crypto/ccp/dbc.c | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/crypto/ccp/dbc.c b/tools/crypto/ccp/dbc.c index a807df0f0597..80248d3d3a5a 100644 --- a/tools/crypto/ccp/dbc.c +++ b/tools/crypto/ccp/dbc.c @@ -57,7 +57,6 @@ int process_param(int fd, int msg_index, __u8 *signature, int *data) .msg_index = msg_index, .param = *data, }; - int ret; assert(signature); assert(data); -- cgit v1.2.3 From 6f0315330af7a57c1c00587fdfb69c7778bf1c50 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 29 Aug 2024 18:20:09 +0100 Subject: kselftest/arm64: Actually test SME vector length changes via sigreturn The test case for SME vector length changes via sigreturn use a bit too much cut'n'paste and only actually changed the SVE vector length in the test itself. Andre's recent factoring out of the initialisation code caused this to be exposed and the test to start failing. Fix the test to actually cover the thing it's supposed to test. Fixes: 4963aeb35a9e ("kselftest/arm64: signal: Add SME signal handling tests") Signed-off-by: Mark Brown Reviewed-by: Andre Przywara Tested-by: Andre Przywara Link: https://lore.kernel.org/r/20240829-arm64-sme-signal-vl-change-test-v1-1-42d7534cb818@kernel.org Signed-off-by: Will Deacon --- .../arm64/signal/testcases/fake_sigreturn_sme_change_vl.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sme_change_vl.c b/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sme_change_vl.c index cb8c051b5c8f..dfd6a2badf9f 100644 --- a/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sme_change_vl.c +++ b/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_sme_change_vl.c @@ -35,30 +35,30 @@ static int fake_sigreturn_ssve_change_vl(struct tdescr *td, { size_t resv_sz, offset; struct _aarch64_ctx *head = GET_SF_RESV_HEAD(sf); - struct sve_context *sve; + struct za_context *za; /* Get a signal context with a SME ZA frame in it */ if (!get_current_context(td, &sf.uc, sizeof(sf.uc))) return 1; resv_sz = GET_SF_RESV_SIZE(sf); - head = get_header(head, SVE_MAGIC, resv_sz, &offset); + head = get_header(head, ZA_MAGIC, resv_sz, &offset); if (!head) { - fprintf(stderr, "No SVE context\n"); + fprintf(stderr, "No ZA context\n"); return 1; } - if (head->size != sizeof(struct sve_context)) { + if (head->size != sizeof(struct za_context)) { fprintf(stderr, "Register data present, aborting\n"); return 1; } - sve = (struct sve_context *)head; + za = (struct za_context *)head; /* No changes are supported; init left us at minimum VL so go to max */ fprintf(stderr, "Attempting to change VL from %d to %d\n", - sve->vl, vls[0]); - sve->vl = vls[0]; + za->vl, vls[0]); + za->vl = vls[0]; fake_sigreturn(&sf, sizeof(sf), 0); -- cgit v1.2.3 From 89d64e72732f3f970a048a46a7b60164b34dfc43 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 29 Aug 2024 08:01:52 -0700 
Subject: perf inject: Overhaul handling of pipe files Previously inject->is_pipe was set if the input or output were a pipe. Determining the input was a pipe had to be done prior to starting the session and opening the file. This was done by comparing the input file name with '-' but it fails if the pipe file is written to disk. Opening a pipe file from disk will correctly set perf_data.is_pipe, but this is too late for 'perf inject' and results in a broken file. A workaround is 'cat pipe_perf|perf inject -i - ...'. This change removes inject->is_pipe and changes the dependent conditions to use the is_pipe flag on the input (inject->session->data) and output files (inject->output). This ensures the is_pipe condition reflects things like the header being read. The change removes the use of perf file header repiping, that is writing the file header out while reading it in. The case of input pipe and output file cannot repipe as the attributes for the file are unknown. To resolve this, write the file header when writing to disk and as the attributes may be unknown, write them after the data. Update sessions repipe variable to be trace_event_repipe as those are the only events now impacted by it. Update __perf_session__new as the repipe_fd no longer needs passing. Fully removing repipe from session header reading will be done in a later change. Committer testing: root@number:~# perf record -e syscalls:sys_enter_*sleep/max-stack=4/ -o - sleep 0.01 | perf report -i - # To display the perf.data header info, please use --header/--header-only options. # [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.050 MB - ] # # Total Lost Samples: 0 # # Samples: 1 of event 'syscalls:sys_enter_clock_nanosleep' # Event count (approx.): 1 # # Overhead Command Shared Object Symbol # ........ ....... ............. ............................... # 100.00% sleep libc.so.6 [.] clock_nanosleep@GLIBC_2.2.5 | ---__libc_start_main@@GLIBC_2.34 __libc_start_call_main 0x562fc2560a9f clock_nanosleep@GLIBC_2.2.5 # # (Tip: Create an archive with symtabs to analyse on other machine: perf archive) # root@number:~# perf record -e syscalls:sys_enter_*sleep/max-stack=4/ -o - sleep 0.01 > pipe.data [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.050 MB - ] root@number:~# perf report --stdio -i pipe.data # To display the perf.data header info, please use --header/--header-only options. # # # Total Lost Samples: 0 # # Samples: 1 of event 'syscalls:sys_enter_clock_nanosleep' # Event count (approx.): 1 # # Overhead Command Shared Object Symbol # ........ ....... ............. ............................... # 100.00% sleep libc.so.6 [.] clock_nanosleep@GLIBC_2.2.5 | ---__libc_start_main@@GLIBC_2.34 __libc_start_call_main 0x55f775975a9f clock_nanosleep@GLIBC_2.2.5 # # (Tip: To set sampling period of individual events use perf record -e cpu/cpu-cycles,period=100001/,cpu/branches,period=10001/ ...) 
# root@number:~# Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Nick Terrell Cc: Peter Zijlstra Cc: Yanteng Si Cc: Yicong Yang Link: https://lore.kernel.org/r/20240829150154.37929-7-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-inject.c | 60 +++++++++++++++++++++++---------------------- tools/perf/util/header.c | 12 ++++----- tools/perf/util/header.h | 3 ++- tools/perf/util/session.c | 8 +++--- tools/perf/util/session.h | 14 ++++------- 5 files changed, 48 insertions(+), 49 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index a7c859db2e15..0ccf80fe8399 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -119,7 +119,6 @@ struct perf_inject { bool jit_mode; bool in_place_update; bool in_place_update_dry_run; - bool is_pipe; bool copy_kcore_dir; const char *input_name; struct perf_data output; @@ -205,7 +204,8 @@ static int perf_event__repipe_attr(const struct perf_tool *tool, if (ret) return ret; - if (!inject->is_pipe) + /* If the output isn't a pipe then the attributes will be written as part of the header. */ + if (!inject->output.is_pipe) return 0; return perf_event__repipe_synth(tool, event); @@ -1966,7 +1966,13 @@ static int __cmd_inject(struct perf_inject *inject) struct guest_session *gs = &inject->guest_session; struct perf_session *session = inject->session; int fd = output_fd(inject); - u64 output_data_offset; + u64 output_data_offset = perf_session__data_offset(session->evlist); + /* + * Pipe input hasn't loaded the attributes and will handle them as + * events. So that the attributes don't overlap the data, write the + * attributes after the data. 
+ */ + bool write_attrs_after_data = !inject->output.is_pipe && inject->session->data->is_pipe; signal(SIGINT, sig_handler); @@ -1980,8 +1986,6 @@ static int __cmd_inject(struct perf_inject *inject) #endif } - output_data_offset = perf_session__data_offset(session->evlist); - if (inject->build_id_style == BID_RWS__INJECT_HEADER_LAZY) { inject->tool.sample = perf_event__inject_buildid; } else if (inject->sched_stat) { @@ -2075,7 +2079,7 @@ static int __cmd_inject(struct perf_inject *inject) if (!inject->itrace_synth_opts.set) auxtrace_index__free(&session->auxtrace_index); - if (!inject->is_pipe && !inject->in_place_update) + if (!inject->output.is_pipe && !inject->in_place_update) lseek(fd, output_data_offset, SEEK_SET); ret = perf_session__process_events(session); @@ -2094,7 +2098,7 @@ static int __cmd_inject(struct perf_inject *inject) } } - if (!inject->is_pipe && !inject->in_place_update) { + if (!inject->output.is_pipe && !inject->in_place_update) { struct inject_fc inj_fc = { .fc.copy = feat_copy_cb, .inject = inject, @@ -2124,7 +2128,8 @@ static int __cmd_inject(struct perf_inject *inject) } session->header.data_offset = output_data_offset; session->header.data_size = inject->bytes_written; - perf_session__inject_header(session, session->evlist, fd, &inj_fc.fc); + perf_session__inject_header(session, session->evlist, fd, &inj_fc.fc, + write_attrs_after_data); if (inject->copy_kcore_dir) { ret = copy_kcore_dir(inject); @@ -2161,7 +2166,6 @@ int cmd_inject(int argc, const char **argv) .use_stdio = true, }; int ret; - bool repipe = true; const char *known_build_ids = NULL; bool build_ids; bool build_id_all; @@ -2273,16 +2277,7 @@ int cmd_inject(int argc, const char **argv) inject.build_id_style = BID_RWS__INJECT_HEADER_ALL; data.path = inject.input_name; - if (!strcmp(inject.input_name, "-") || inject.output.is_pipe) { - inject.is_pipe = true; - /* - * Do not repipe header when input is a regular file - * since either it can rewrite the header at the end - * or write a new pipe header. - */ - if (strcmp(inject.input_name, "-")) - repipe = false; - } + ordered_events = inject.jit_mode || inject.sched_stat || (inject.build_id_style == BID_RWS__INJECT_HEADER_LAZY); perf_tool__init(&inject.tool, ordered_events); @@ -2325,9 +2320,9 @@ int cmd_inject(int argc, const char **argv) inject.tool.compressed = perf_event__repipe_op4_synth; inject.tool.auxtrace = perf_event__repipe_auxtrace; inject.tool.dont_split_sample_group = true; - inject.session = __perf_session__new(&data, repipe, - output_fd(&inject), - &inject.tool); + inject.session = __perf_session__new(&data, &inject.tool, + /*trace_event_repipe=*/inject.output.is_pipe); + if (IS_ERR(inject.session)) { ret = PTR_ERR(inject.session); goto out_close_output; @@ -2341,19 +2336,26 @@ int cmd_inject(int argc, const char **argv) if (ret) goto out_delete; - if (!data.is_pipe && inject.output.is_pipe) { + if (inject.output.is_pipe) { ret = perf_header__write_pipe(perf_data__fd(&inject.output)); if (ret < 0) { pr_err("Couldn't write a new pipe header.\n"); goto out_delete; } - ret = perf_event__synthesize_for_pipe(&inject.tool, - inject.session, - &inject.output, - perf_event__repipe); - if (ret < 0) - goto out_delete; + /* + * If the input is already a pipe then the features and + * attributes don't need synthesizing, they will be present in + * the input. 
+ */ + if (!data.is_pipe) { + ret = perf_event__synthesize_for_pipe(&inject.tool, + inject.session, + &inject.output, + perf_event__repipe); + if (ret < 0) + goto out_delete; + } } if (inject.build_id_style == BID_RWS__INJECT_HEADER_LAZY) { diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 4eb39463067e..59e2f37c1cb4 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -3818,10 +3818,11 @@ size_t perf_session__data_offset(const struct evlist *evlist) int perf_session__inject_header(struct perf_session *session, struct evlist *evlist, int fd, - struct feat_copier *fc) + struct feat_copier *fc, + bool write_attrs_after_data) { return perf_session__do_write_header(session, evlist, fd, true, fc, - /*write_attrs_after_data=*/false); + write_attrs_after_data); } static int perf_header__getbuffer64(struct perf_header *header, @@ -4145,7 +4146,7 @@ static int perf_header__read_pipe(struct perf_session *session, int repipe_fd) struct perf_pipe_file_header f_header; if (perf_file_header__read_pipe(&f_header, header, session->data, - session->repipe, repipe_fd) < 0) { + /*repipe=*/false, repipe_fd) < 0) { pr_debug("incompatible file format\n"); return -EINVAL; } @@ -4560,15 +4561,14 @@ int perf_event__process_tracing_data(struct perf_session *session, SEEK_SET); } - size_read = trace_report(fd, &session->tevent, - session->repipe); + size_read = trace_report(fd, &session->tevent, session->trace_event_repipe); padding = PERF_ALIGN(size_read, sizeof(u64)) - size_read; if (readn(fd, buf, padding) < 0) { pr_err("%s: reading input file", __func__); return -1; } - if (session->repipe) { + if (session->trace_event_repipe) { int retw = write(STDOUT_FILENO, buf, padding); if (retw <= 0 || retw != padding) { pr_err("%s: repiping tracing data padding", __func__); diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index 3285981948d7..7137509cf6d8 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -150,7 +150,8 @@ struct feat_copier { int perf_session__inject_header(struct perf_session *session, struct evlist *evlist, int fd, - struct feat_copier *fc); + struct feat_copier *fc, + bool write_attrs_after_data); size_t perf_session__data_offset(const struct evlist *evlist); diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 774eb3382000..5a51db036ee8 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -135,8 +135,8 @@ static int ordered_events__deliver_event(struct ordered_events *oe, } struct perf_session *__perf_session__new(struct perf_data *data, - bool repipe, int repipe_fd, - struct perf_tool *tool) + struct perf_tool *tool, + bool trace_event_repipe) { int ret = -ENOMEM; struct perf_session *session = zalloc(sizeof(*session)); @@ -144,7 +144,7 @@ struct perf_session *__perf_session__new(struct perf_data *data, if (!session) goto out; - session->repipe = repipe; + session->trace_event_repipe = trace_event_repipe; session->tool = tool; session->decomp_data.zstd_decomp = &session->zstd_data; session->active_decomp = &session->decomp_data; @@ -162,7 +162,7 @@ struct perf_session *__perf_session__new(struct perf_data *data, session->data = data; if (perf_data__is_read(data)) { - ret = perf_session__open(session, repipe_fd); + ret = perf_session__open(session, /*repipe_fd=*/-1); if (ret < 0) goto out_delete; diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index e56518639711..bcf1bcf06959 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -59,12 +59,8 @@ 
struct perf_session { #endif /** @time_conv: Holds contents of last PERF_RECORD_TIME_CONV event. */ struct perf_record_time_conv time_conv; - /** - * @repipe: When set causes certain reading (header and trace events) to - * also write events. The written file descriptor must be provided for - * the header but is implicitly stdout for trace events. - */ - bool repipe; + /** @trace_event_repipe: When set causes read trace events to be written to stdout. */ + bool trace_event_repipe; /** * @one_mmap: The reader will use a single mmap by default. There may be * multiple data files in particular for aux events. If this is true @@ -110,13 +106,13 @@ struct decomp { struct perf_tool; struct perf_session *__perf_session__new(struct perf_data *data, - bool repipe, int repipe_fd, - struct perf_tool *tool); + struct perf_tool *tool, + bool trace_event_repipe); static inline struct perf_session *perf_session__new(struct perf_data *data, struct perf_tool *tool) { - return __perf_session__new(data, false, -1, tool); + return __perf_session__new(data, tool, /*trace_event_repipe=*/false); } void perf_session__delete(struct perf_session *session); -- cgit v1.2.3 From 2d57c32b32fb84c6f6b2342bca85c9ba2b2d759b Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 29 Aug 2024 08:01:53 -0700 Subject: perf header: Remove repipe option No longer used by `perf inject` the repipe_fd is always -1 and repipe is always false. Remove the options and associated code knowing the constant values of the removed variables. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Nick Terrell Cc: Peter Zijlstra Cc: Yanteng Si Cc: Yicong Yang Link: https://lore.kernel.org/r/20240829150154.37929-8-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/header.c | 19 +++++-------------- tools/perf/util/header.h | 2 +- tools/perf/util/session.c | 6 +++--- 3 files changed, 9 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 59e2f37c1cb4..a6386d12afd7 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -4113,13 +4113,8 @@ static int perf_file_section__process(struct perf_file_section *section, static int perf_file_header__read_pipe(struct perf_pipe_file_header *header, struct perf_header *ph, - struct perf_data* data, - bool repipe, int repipe_fd) + struct perf_data *data) { - struct feat_fd ff = { - .fd = repipe_fd, - .ph = ph, - }; ssize_t ret; ret = perf_data__read(data, header, sizeof(*header)); @@ -4134,19 +4129,15 @@ static int perf_file_header__read_pipe(struct perf_pipe_file_header *header, if (ph->needs_swap) header->size = bswap_64(header->size); - if (repipe && do_write(&ff, header, sizeof(*header)) < 0) - return -1; - return 0; } -static int perf_header__read_pipe(struct perf_session *session, int repipe_fd) +static int perf_header__read_pipe(struct perf_session *session) { struct perf_header *header = &session->header; struct perf_pipe_file_header f_header; - if (perf_file_header__read_pipe(&f_header, header, session->data, - /*repipe=*/false, repipe_fd) < 0) { + if (perf_file_header__read_pipe(&f_header, header, session->data) < 0) { pr_debug("incompatible file format\n"); return -EINVAL; } @@ -4246,7 +4237,7 @@ static int evlist__prepare_tracepoint_events(struct evlist *evlist, struct tep_h } #endif -int perf_session__read_header(struct perf_session *session, int repipe_fd) +int 
perf_session__read_header(struct perf_session *session) { struct perf_data *data = session->data; struct perf_header *header = &session->header; @@ -4267,7 +4258,7 @@ int perf_session__read_header(struct perf_session *session, int repipe_fd) * We can read 'pipe' data event from regular file, * check for the pipe header regardless of source. */ - err = perf_header__read_pipe(session, repipe_fd); + err = perf_header__read_pipe(session); if (!err || perf_data__is_pipe(data)) { data->is_pipe = true; return err; diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index 7137509cf6d8..a63a361f20f4 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -131,7 +131,7 @@ union perf_event; extern const char perf_version_string[]; -int perf_session__read_header(struct perf_session *session, int repipe_fd); +int perf_session__read_header(struct perf_session *session); int perf_session__write_header(struct perf_session *session, struct evlist *evlist, int fd, bool at_exit); diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 5a51db036ee8..b492300ec959 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -45,11 +45,11 @@ static int perf_session__deliver_event(struct perf_session *session, u64 file_offset, const char *file_path); -static int perf_session__open(struct perf_session *session, int repipe_fd) +static int perf_session__open(struct perf_session *session) { struct perf_data *data = session->data; - if (perf_session__read_header(session, repipe_fd) < 0) { + if (perf_session__read_header(session) < 0) { pr_err("incompatible file format (rerun with -v to learn more)\n"); return -1; } @@ -162,7 +162,7 @@ struct perf_session *__perf_session__new(struct perf_data *data, session->data = data; if (perf_data__is_read(data)) { - ret = perf_session__open(session, /*repipe_fd=*/-1); + ret = perf_session__open(session); if (ret < 0) goto out_delete; -- cgit v1.2.3 From ccb9004656e55ea13e11e93862aa32aa575bf802 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 29 Aug 2024 08:01:54 -0700 Subject: perf test: Additional pipe tests with pipe output written to a file Additional pipe tests where piped files are written to disk. This means that spotting a file name of "-" isn't a sufficient "is pipe?" test. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Nick Terrell Cc: Peter Zijlstra Cc: Yanteng Si Cc: Yicong Yang Link: https://lore.kernel.org/r/20240829150154.37929-9-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/pipe_test.sh | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'tools') diff --git a/tools/perf/tests/shell/pipe_test.sh b/tools/perf/tests/shell/pipe_test.sh index ad10012fdc29..a3c94b4182c2 100755 --- a/tools/perf/tests/shell/pipe_test.sh +++ b/tools/perf/tests/shell/pipe_test.sh @@ -11,6 +11,7 @@ sym="noploop" skip_test_missing_symbol ${sym} data=$(mktemp /tmp/perf.data.XXXXXX) +data2=$(mktemp /tmp/perf.data2.XXXXXX) prog="perf test -w noploop" err=0 @@ -19,6 +20,8 @@ set -e cleanup() { rm -rf "${data}" rm -rf "${data}".old + rm -rf "${data2}" + rm -rf "${data2}".old trap - EXIT TERM INT } @@ -49,6 +52,14 @@ test_record_report() { return fi + perf record -g -e task-clock:u -o - ${prog} > ${data} + if ! 
perf report -i ${data} --task | grep -q ${task} + then + echo "Record+report pipe test [Failed - cannot find the test file in the perf report #3]" + err=1 + return + fi + echo "Record+report pipe test [Success]" } @@ -86,6 +97,21 @@ test_inject_bids() { return fi + perf record -e task-clock:u -o - ${prog} > ${data} + if ! perf inject ${inject_opt} -i ${data} | perf report -i - | grep -q ${sym}; then + echo "Inject ${inject_opt} build-ids test [Failed - cannot find noploop function in pipe #5]" + err=1 + return + fi + + perf record -e task-clock:u -o - ${prog} > ${data} + perf inject ${inject_opt} -i ${data} -o ${data2} + if ! perf report -i ${data2} | grep -q ${sym}; then + echo "Inject ${inject_opt} build-ids test [Failed - cannot find noploop function in pipe #6]" + err=1 + return + fi + echo "Inject ${inject_opt} build-ids test [Success]" } -- cgit v1.2.3 From 10d6c57c824e21b4cf8eda9491caea06edac9fd7 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 29 Aug 2024 23:51:48 -0700 Subject: perf lock contention: Handle error in a single place It has some duplicate codes to do the same job. Let's add a label and goto there to handle errors in a single place. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Cc: bpf@vger.kernel.org Link: https://lore.kernel.org/r/20240830065150.1758962-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf_skel/lock_contention.bpf.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c index d931a898c434..e8a6f6463019 100644 --- a/tools/perf/util/bpf_skel/lock_contention.bpf.c +++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c @@ -439,11 +439,8 @@ int contention_end(u64 *ctx) duration = bpf_ktime_get_ns() - pelem->timestamp; if ((__s64)duration < 0) { - pelem->lock = 0; - if (need_delete) - bpf_map_delete_elem(&tstamp, &pid); __sync_fetch_and_add(&time_fail, 1); - return 0; + goto out; } switch (aggr_mode) { @@ -477,11 +474,8 @@ int contention_end(u64 *ctx) data = bpf_map_lookup_elem(&lock_stat, &key); if (!data) { if (data_map_full) { - pelem->lock = 0; - if (need_delete) - bpf_map_delete_elem(&tstamp, &pid); __sync_fetch_and_add(&data_fail, 1); - return 0; + goto out; } struct contention_data first = { @@ -502,10 +496,7 @@ int contention_end(u64 *ctx) data_map_full = 1; __sync_fetch_and_add(&data_fail, 1); } - pelem->lock = 0; - if (need_delete) - bpf_map_delete_elem(&tstamp, &pid); - return 0; + goto out; } __sync_fetch_and_add(&data->total_time, duration); @@ -517,6 +508,7 @@ int contention_end(u64 *ctx) if (data->min_time > duration) data->min_time = duration; +out: pelem->lock = 0; if (need_delete) bpf_map_delete_elem(&tstamp, &pid); -- cgit v1.2.3 From 05a5dd1dfd8fdeee6498875c2d78d4385d627bd6 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 29 Aug 2024 23:51:49 -0700 Subject: perf lock contention: Simplify spinlock check The LCB_F_SPIN bit is used for spinlock, rwlock and optimistic spinning in mutex. In get_tstamp_elem() it needs to check spinlock and rwlock only. As mutex sets the LCB_F_MUTEX, it can check those two bits and reduce the number of operations. 
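To make the bit arithmetic concrete, here is a small standalone C sketch of why the single masked comparison is equivalent to the three equality checks it replaces. The flag values below are illustrative stand-ins, not the real LCB_F_* definitions from perf's lock contention headers:

    #include <stdio.h>

    /* Illustrative values; the real LCB_F_* flags live in perf's headers. */
    #define LCB_F_SPIN  (1U << 0)
    #define LCB_F_READ  (1U << 1)
    #define LCB_F_WRITE (1U << 2)
    #define LCB_F_MUTEX (1U << 5)

    /* Old form: three equality tests against exact flag combinations. */
    static int old_check(unsigned int flags)
    {
            return flags == (LCB_F_SPIN | LCB_F_READ) || flags == LCB_F_SPIN ||
                   flags == (LCB_F_SPIN | LCB_F_WRITE);
    }

    /* New form: LCB_F_SPIN set and LCB_F_MUTEX clear, one masked compare. */
    static int new_check(unsigned int flags)
    {
            return (flags & (LCB_F_SPIN | LCB_F_MUTEX)) == LCB_F_SPIN;
    }

    int main(void)
    {
            unsigned int cases[] = {
                    LCB_F_SPIN,                     /* spinlock */
                    LCB_F_SPIN | LCB_F_READ,        /* rwlock, read side */
                    LCB_F_SPIN | LCB_F_WRITE,       /* rwlock, write side */
                    LCB_F_SPIN | LCB_F_MUTEX,       /* optimistic spin in a mutex */
                    LCB_F_MUTEX,                    /* plain mutex */
            };

            for (unsigned long i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
                    printf("flags=%#x old=%d new=%d\n", cases[i],
                           old_check(cases[i]), new_check(cases[i]));
            return 0;
    }

The two forms agree on every case above; the masked compare simply needs fewer comparisons and branches.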
Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Cc: bpf@vger.kernel.org Link: https://lore.kernel.org/r/20240830065150.1758962-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf_skel/lock_contention.bpf.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c index e8a6f6463019..4b7237e178bd 100644 --- a/tools/perf/util/bpf_skel/lock_contention.bpf.c +++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c @@ -323,8 +323,7 @@ static inline struct tstamp_data *get_tstamp_elem(__u32 flags) struct tstamp_data *pelem; /* Use per-cpu array map for spinlock and rwlock */ - if (flags == (LCB_F_SPIN | LCB_F_READ) || flags == LCB_F_SPIN || - flags == (LCB_F_SPIN | LCB_F_WRITE)) { + if ((flags & (LCB_F_SPIN | LCB_F_MUTEX)) == LCB_F_SPIN) { __u32 idx = 0; pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx); -- cgit v1.2.3 From 36cddd105666d45a2cfb70a04c27b0c53bb383cd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 29 Aug 2024 23:51:50 -0700 Subject: perf lock contention: Do not fail EEXIST for update When it updates the lock stat for the first time, it needs to create an element in the BPF hash map. But if there's a concurrent thread waiting for the same lock (like for rwsem or rwlock), it might race with the thread and possibly fail to update with -EEXIST. In that case, it can lookup the map again and put the data there instead of failing. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Cc: bpf@vger.kernel.org Link: https://lore.kernel.org/r/20240830065150.1758962-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf_skel/lock_contention.bpf.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c index 4b7237e178bd..52a876b42699 100644 --- a/tools/perf/util/bpf_skel/lock_contention.bpf.c +++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c @@ -491,6 +491,12 @@ int contention_end(u64 *ctx) err = bpf_map_update_elem(&lock_stat, &key, &first, BPF_NOEXIST); if (err < 0) { + if (err == -EEXIST) { + /* it lost the race, try to get it again */ + data = bpf_map_lookup_elem(&lock_stat, &key); + if (data != NULL) + goto found; + } if (err == -E2BIG) data_map_full = 1; __sync_fetch_and_add(&data_fail, 1); @@ -498,6 +504,7 @@ int contention_end(u64 *ctx) goto out; } +found: __sync_fetch_and_add(&data->total_time, duration); __sync_fetch_and_add(&data->count, 1); -- cgit v1.2.3 From 74fd69a35cae1d53d9c95b700aae312d77b0d1cb Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 27 Aug 2024 22:29:53 -0700 Subject: perf lock contention: Fix spinlock and rwlock accounting The spinlock and rwlock use a single-element per-cpu array to track current locks due to performance reason. But this means the key is always available and it cannot simply account lock stats in the array because some of them are invalid. In fact, the contention_end() program in the BPF invalidates the entry by setting the 'lock' value to 0 instead of deleting the entry for the hashmap. So it should skip entries with the lock value of 0 in the account_end_timestamp(). 
Otherwise, it'd have spurious high contention on an idle machine: $ sudo perf lock con -ab -Y spinlock sleep 3 contended total wait max wait avg wait type caller 8 4.72 s 1.84 s 590.46 ms spinlock rcu_core+0xc7 8 1.87 s 1.87 s 233.48 ms spinlock process_one_work+0x1b5 2 1.87 s 1.87 s 933.92 ms spinlock worker_thread+0x1a2 3 1.81 s 1.81 s 603.93 ms spinlock tmigr_update_events+0x13c 2 1.72 s 1.72 s 861.98 ms spinlock tick_do_update_jiffies64+0x25 6 42.48 us 13.02 us 7.08 us spinlock futex_q_lock+0x2a 1 13.03 us 13.03 us 13.03 us spinlock futex_wake+0xce 1 11.61 us 11.61 us 11.61 us spinlock rcu_core+0xc7 I don't believe it has contention on a spinlock longer than 1 second. After this change, it only reports some small contentions. $ sudo perf lock con -ab -Y spinlock sleep 3 contended total wait max wait avg wait type caller 4 133.51 us 43.29 us 33.38 us spinlock tick_do_update_jiffies64+0x25 4 69.06 us 31.82 us 17.27 us spinlock process_one_work+0x1b5 2 50.66 us 25.77 us 25.33 us spinlock rcu_core+0xc7 1 28.45 us 28.45 us 28.45 us spinlock rcu_core+0xc7 1 24.77 us 24.77 us 24.77 us spinlock tmigr_update_events+0x13c 1 23.34 us 23.34 us 23.34 us spinlock raw_spin_rq_lock_nested+0x15 Fixes: b5711042a1c8cc88 ("perf lock contention: Use per-cpu array map for spinlocks") Reported-by: Xi Wang Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: bpf@vger.kernel.org Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Song Liu Link: https://lore.kernel.org/r/20240828052953.1445862-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf_lock_contention.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c index b4cb3fe5cc25..bc4e92c0c08b 100644 --- a/tools/perf/util/bpf_lock_contention.c +++ b/tools/perf/util/bpf_lock_contention.c @@ -286,6 +286,9 @@ static void account_end_timestamp(struct lock_contention *con) goto next; for (int i = 0; i < total_cpus; i++) { + if (cpu_data[i].lock == 0) + continue; + update_lock_stat(stat_fd, -1, end_ts, aggr_mode, &cpu_data[i]); } -- cgit v1.2.3 From 39c243411bdb8fb35777adf49ee32549633c4e12 Mon Sep 17 00:00:00 2001 From: Yang Jihong Date: Mon, 19 Aug 2024 10:47:20 +0800 Subject: perf sched timehist: Fixed timestamp error when unable to confirm event sched_in time If sched_in event for current task is not recorded, sched_in timestamp will be set to end_time of time window interest, causing an error in timestamp show. In this case, we choose to ignore this event. Test scenario: perf[1229608] does not record the first sched_in event, run time and sch delay are both 0 # perf sched timehist Samples of sched_switch event do not have callchains. 
time cpu task name wait time sch delay run time [tid/pid] (msec) (msec) (msec) --------------- ------ ------------------------------ --------- --------- --------- 2090450.763231 [0000] perf[1229608] 0.000 0.000 0.000 2090450.763235 [0000] migration/0[15] 0.000 0.001 0.003 2090450.763263 [0001] perf[1229608] 0.000 0.000 0.000 2090450.763268 [0001] migration/1[21] 0.000 0.001 0.004 2090450.763302 [0002] perf[1229608] 0.000 0.000 0.000 2090450.763309 [0002] migration/2[27] 0.000 0.001 0.007 2090450.763338 [0003] perf[1229608] 0.000 0.000 0.000 2090450.763343 [0003] migration/3[33] 0.000 0.001 0.004 Before: arbitrarily specify a time window of interest, timestamp will be set to an incorrect value # perf sched timehist --time 100,200 Samples of sched_switch event do not have callchains. time cpu task name wait time sch delay run time [tid/pid] (msec) (msec) (msec) --------------- ------ ------------------------------ --------- --------- --------- 200.000000 [0000] perf[1229608] 0.000 0.000 0.000 200.000000 [0001] perf[1229608] 0.000 0.000 0.000 200.000000 [0002] perf[1229608] 0.000 0.000 0.000 200.000000 [0003] perf[1229608] 0.000 0.000 0.000 200.000000 [0004] perf[1229608] 0.000 0.000 0.000 200.000000 [0005] perf[1229608] 0.000 0.000 0.000 200.000000 [0006] perf[1229608] 0.000 0.000 0.000 200.000000 [0007] perf[1229608] 0.000 0.000 0.000 After: # perf sched timehist --time 100,200 Samples of sched_switch event do not have callchains. time cpu task name wait time sch delay run time [tid/pid] (msec) (msec) (msec) --------------- ------ ------------------------------ --------- --------- --------- Fixes: 853b74071110bed3 ("perf sched timehist: Add option to specify time window of interest") Signed-off-by: Yang Jihong Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: David Ahern Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240819024720.2405244-1-yangjihong@bytedance.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 9c1276dc8ef6..1c386ebe4b98 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -2683,9 +2683,12 @@ static int timehist_sched_change_event(const struct perf_tool *tool, * - previous sched event is out of window - we are done * - sample time is beyond window user cares about - reset it * to close out stats for time window interest + * - If tprev is 0, that is, sched_in event for current task is + * not recorded, cannot determine whether sched_in event is + * within time window interest - ignore it */ if (ptime->end) { - if (tprev > ptime->end) + if (!tprev || tprev > ptime->end) goto out; if (t > ptime->end) -- cgit v1.2.3 From 01b52f01c5a6bdc3b3e4229dccc84ed667e6867b Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 29 Jul 2024 18:02:11 +0200 Subject: selftests: vDSO: simplify getrandom thread local storage and structs Rather than using pthread_get/set_specific, just use gcc's __thread annotation, which is noticeably faster and makes the code more obvious. Also, just have one simplified struct called vgrnd, instead of trying to split things up semantically. Those divisions were useful when this code was split across several commit *messages*, but doesn't make as much sense within a single file. 
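As background, a minimal sketch contrasting the two thread-local storage styles; the counter example and its names are hypothetical, not the test's actual code:

    #include <pthread.h>
    #include <stdlib.h>

    /* pthread-keyed TLS: one-time key setup plus explicit get/set per use. */
    static pthread_key_t counter_key;
    static pthread_once_t counter_once = PTHREAD_ONCE_INIT;

    static void counter_key_init(void)
    {
            pthread_key_create(&counter_key, free);
    }

    static long bump_pthread_tls(void)
    {
            long *c;

            pthread_once(&counter_once, counter_key_init);
            c = pthread_getspecific(counter_key);
            if (!c) {
                    c = calloc(1, sizeof(*c));
                    if (!c)
                            return -1;      /* allocation failed */
                    pthread_setspecific(counter_key, c);
            }
            return ++(*c);
    }

    /* gcc __thread TLS: the compiler reserves a per-thread slot directly. */
    static long bump_gcc_tls(void)
    {
            static __thread long c;

            return ++c;
    }
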
This should make the code more clear and provide a better example for implementers. Signed-off-by: Jason A. Donenfeld --- tools/testing/selftests/vDSO/vdso_test_getrandom.c | 67 +++++++++------------- 1 file changed, 27 insertions(+), 40 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/vDSO/vdso_test_getrandom.c b/tools/testing/selftests/vDSO/vdso_test_getrandom.c index 05122425a873..89c961175956 100644 --- a/tools/testing/selftests/vDSO/vdso_test_getrandom.c +++ b/tools/testing/selftests/vDSO/vdso_test_getrandom.c @@ -38,50 +38,43 @@ static struct { pthread_mutex_t lock; void **states; size_t len, cap; -} grnd_allocator = { - .lock = PTHREAD_MUTEX_INITIALIZER -}; - -static struct { ssize_t(*fn)(void *, size_t, unsigned long, void *, size_t); - pthread_key_t key; - pthread_once_t initialized; struct vgetrandom_opaque_params params; -} grnd_ctx = { - .initialized = PTHREAD_ONCE_INIT +} vgrnd = { + .lock = PTHREAD_MUTEX_INITIALIZER }; static void *vgetrandom_get_state(void) { void *state = NULL; - pthread_mutex_lock(&grnd_allocator.lock); - if (!grnd_allocator.len) { + pthread_mutex_lock(&vgrnd.lock); + if (!vgrnd.len) { size_t page_size = getpagesize(); size_t new_cap; size_t alloc_size, num = sysconf(_SC_NPROCESSORS_ONLN); /* Just a decent heuristic. */ void *new_block, *new_states; - alloc_size = (num * grnd_ctx.params.size_of_opaque_state + page_size - 1) & (~(page_size - 1)); - num = (page_size / grnd_ctx.params.size_of_opaque_state) * (alloc_size / page_size); - new_block = mmap(0, alloc_size, grnd_ctx.params.mmap_prot, grnd_ctx.params.mmap_flags, -1, 0); + alloc_size = (num * vgrnd.params.size_of_opaque_state + page_size - 1) & (~(page_size - 1)); + num = (page_size / vgrnd.params.size_of_opaque_state) * (alloc_size / page_size); + new_block = mmap(0, alloc_size, vgrnd.params.mmap_prot, vgrnd.params.mmap_flags, -1, 0); if (new_block == MAP_FAILED) goto out; - new_cap = grnd_allocator.cap + num; - new_states = reallocarray(grnd_allocator.states, new_cap, sizeof(*grnd_allocator.states)); + new_cap = vgrnd.cap + num; + new_states = reallocarray(vgrnd.states, new_cap, sizeof(*vgrnd.states)); if (!new_states) goto unmap; - grnd_allocator.cap = new_cap; - grnd_allocator.states = new_states; + vgrnd.cap = new_cap; + vgrnd.states = new_states; for (size_t i = 0; i < num; ++i) { - if (((uintptr_t)new_block & (page_size - 1)) + grnd_ctx.params.size_of_opaque_state > page_size) + if (((uintptr_t)new_block & (page_size - 1)) + vgrnd.params.size_of_opaque_state > page_size) new_block = (void *)(((uintptr_t)new_block + page_size - 1) & (~(page_size - 1))); - grnd_allocator.states[i] = new_block; - new_block += grnd_ctx.params.size_of_opaque_state; + vgrnd.states[i] = new_block; + new_block += vgrnd.params.size_of_opaque_state; } - grnd_allocator.len = num; + vgrnd.len = num; goto success; unmap: @@ -89,10 +82,10 @@ static void *vgetrandom_get_state(void) goto out; } success: - state = grnd_allocator.states[--grnd_allocator.len]; + state = vgrnd.states[--vgrnd.len]; out: - pthread_mutex_unlock(&grnd_allocator.lock); + pthread_mutex_unlock(&vgrnd.lock); return state; } @@ -100,27 +93,25 @@ static void vgetrandom_put_state(void *state) { if (!state) return; - pthread_mutex_lock(&grnd_allocator.lock); - grnd_allocator.states[grnd_allocator.len++] = state; - pthread_mutex_unlock(&grnd_allocator.lock); + pthread_mutex_lock(&vgrnd.lock); + vgrnd.states[vgrnd.len++] = state; + pthread_mutex_unlock(&vgrnd.lock); } static void vgetrandom_init(void) { - if 
(pthread_key_create(&grnd_ctx.key, vgetrandom_put_state) != 0) - return; unsigned long sysinfo_ehdr = getauxval(AT_SYSINFO_EHDR); if (!sysinfo_ehdr) { printf("AT_SYSINFO_EHDR is not present!\n"); exit(KSFT_SKIP); } vdso_init_from_sysinfo_ehdr(sysinfo_ehdr); - grnd_ctx.fn = (__typeof__(grnd_ctx.fn))vdso_sym("LINUX_2.6", "__vdso_getrandom"); - if (!grnd_ctx.fn) { + vgrnd.fn = (__typeof__(vgrnd.fn))vdso_sym("LINUX_2.6", "__vdso_getrandom"); + if (!vgrnd.fn) { printf("__vdso_getrandom is missing!\n"); exit(KSFT_FAIL); } - if (grnd_ctx.fn(NULL, 0, 0, &grnd_ctx.params, ~0UL) != 0) { + if (vgrnd.fn(NULL, 0, 0, &vgrnd.params, ~0UL) != 0) { printf("failed to fetch vgetrandom params!\n"); exit(KSFT_FAIL); } @@ -128,22 +119,16 @@ static void vgetrandom_init(void) static ssize_t vgetrandom(void *buf, size_t len, unsigned long flags) { - void *state; + static __thread void *state; - pthread_once(&grnd_ctx.initialized, vgetrandom_init); - state = pthread_getspecific(grnd_ctx.key); if (!state) { state = vgetrandom_get_state(); - if (pthread_setspecific(grnd_ctx.key, state) != 0) { - vgetrandom_put_state(state); - state = NULL; - } if (!state) { printf("vgetrandom_get_state failed!\n"); exit(KSFT_FAIL); } } - return grnd_ctx.fn(buf, len, flags, state, grnd_ctx.params.size_of_opaque_state); + return vgrnd.fn(buf, len, flags, state, vgrnd.params.size_of_opaque_state); } enum { TRIALS = 25000000, THREADS = 256 }; @@ -265,6 +250,8 @@ static void usage(const char *argv0) int main(int argc, char *argv[]) { + vgetrandom_init(); + if (argc == 1) { kselftest(); return 0; -- cgit v1.2.3 From 20a9af057cd7c4057c72c803dcf83c97eb5deb95 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 22 Aug 2024 09:13:21 +0200 Subject: selftests: vDSO: don't hard-code location of vDSO sources Architectures use different location for vDSO sources: arch/mips/vdso arch/sparc/vdso arch/arm64/kernel/vdso arch/riscv/kernel/vdso arch/csky/kernel/vdso arch/x86/um/vdso arch/x86/entry/vdso arch/powerpc/kernel/vdso arch/arm/vdso arch/loongarch/vdso Don't hard-code vdso sources location in selftest Makefile. Instead create a vdso/ symbolic link in tools/arch/$arch/ and update Makefile accordingly. Signed-off-by: Christophe Leroy Signed-off-by: Jason A. 
Donenfeld --- tools/arch/x86/vdso | 1 + tools/testing/selftests/vDSO/Makefile | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) create mode 120000 tools/arch/x86/vdso (limited to 'tools') diff --git a/tools/arch/x86/vdso b/tools/arch/x86/vdso new file mode 120000 index 000000000000..7eb962fd3454 --- /dev/null +++ b/tools/arch/x86/vdso @@ -0,0 +1 @@ +../../../arch/x86/entry/vdso/ \ No newline at end of file diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile index 3de8e7e052ae..c9a819cacbf2 100644 --- a/tools/testing/selftests/vDSO/Makefile +++ b/tools/testing/selftests/vDSO/Makefile @@ -40,7 +40,7 @@ $(OUTPUT)/vdso_test_getrandom: parse_vdso.c $(OUTPUT)/vdso_test_getrandom: CFLAGS += -isystem $(top_srcdir)/tools/include \ -isystem $(top_srcdir)/include/uapi -$(OUTPUT)/vdso_test_chacha: $(top_srcdir)/arch/$(ARCH)/entry/vdso/vgetrandom-chacha.S +$(OUTPUT)/vdso_test_chacha: $(top_srcdir)/tools/arch/$(ARCH)/vdso/vgetrandom-chacha.S $(OUTPUT)/vdso_test_chacha: CFLAGS += -idirafter $(top_srcdir)/tools/include \ -isystem $(top_srcdir)/arch/$(ARCH)/include \ -isystem $(top_srcdir)/include \ -- cgit v1.2.3 From f8d92fc527ff0388b2e94c101afb0e5c7496199f Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 22 Aug 2024 09:13:23 +0200 Subject: selftests: vDSO: fix include order in build of test_vdso_chacha Building test_vdso_chacha currently leads to following issue: In file included from /home/chleroy/linux-powerpc/include/linux/limits.h:7, from /opt/powerpc64-e5500--glibc--stable-2024.02-1/powerpc64-buildroot-linux-gnu/sysroot/usr/include/bits/local_lim.h:38, from /opt/powerpc64-e5500--glibc--stable-2024.02-1/powerpc64-buildroot-linux-gnu/sysroot/usr/include/bits/posix1_lim.h:161, from /opt/powerpc64-e5500--glibc--stable-2024.02-1/powerpc64-buildroot-linux-gnu/sysroot/usr/include/limits.h:195, from /opt/powerpc64-e5500--glibc--stable-2024.02-1/lib/gcc/powerpc64-buildroot-linux-gnu/12.3.0/include-fixed/limits.h:203, from /opt/powerpc64-e5500--glibc--stable-2024.02-1/lib/gcc/powerpc64-buildroot-linux-gnu/12.3.0/include-fixed/syslimits.h:7, from /opt/powerpc64-e5500--glibc--stable-2024.02-1/lib/gcc/powerpc64-buildroot-linux-gnu/12.3.0/include-fixed/limits.h:34, from /tmp/sodium/usr/local/include/sodium/export.h:7, from /tmp/sodium/usr/local/include/sodium/crypto_stream_chacha20.h:14, from vdso_test_chacha.c:6: /opt/powerpc64-e5500--glibc--stable-2024.02-1/powerpc64-buildroot-linux-gnu/sysroot/usr/include/bits/xopen_lim.h:99:6: error: missing binary operator before token "(" 99 | # if INT_MAX == 32767 | ^~~~~~~ /opt/powerpc64-e5500--glibc--stable-2024.02-1/powerpc64-buildroot-linux-gnu/sysroot/usr/include/bits/xopen_lim.h:102:7: error: missing binary operator before token "(" 102 | # if INT_MAX == 2147483647 | ^~~~~~~ /opt/powerpc64-e5500--glibc--stable-2024.02-1/powerpc64-buildroot-linux-gnu/sysroot/usr/include/bits/xopen_lim.h:126:6: error: missing binary operator before token "(" 126 | # if LONG_MAX == 2147483647 | ^~~~~~~~ This is due to kernel include/linux/limits.h being included instead of libc's limits.h. This is because directory include/ is added through option -isystem so it goes prior to glibc's include directory. Replace -isystem by -idirafter. But this implies that now tools/include/linux/linkage.h is included instead of include/linux/linkage.h, so define a stub for SYM_FUNC_START() and SYM_FUNC_END(). Signed-off-by: Christophe Leroy Signed-off-by: Jason A. 
Donenfeld --- tools/include/linux/linkage.h | 4 ++++ tools/testing/selftests/vDSO/Makefile | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/include/linux/linkage.h b/tools/include/linux/linkage.h index bc763d500262..a48ff086899c 100644 --- a/tools/include/linux/linkage.h +++ b/tools/include/linux/linkage.h @@ -1,4 +1,8 @@ #ifndef _TOOLS_INCLUDE_LINUX_LINKAGE_H #define _TOOLS_INCLUDE_LINUX_LINKAGE_H +#define SYM_FUNC_START(x) .globl x; x: + +#define SYM_FUNC_END(x) + #endif /* _TOOLS_INCLUDE_LINUX_LINKAGE_H */ diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile index c9a819cacbf2..10ffdda3f2fa 100644 --- a/tools/testing/selftests/vDSO/Makefile +++ b/tools/testing/selftests/vDSO/Makefile @@ -42,7 +42,7 @@ $(OUTPUT)/vdso_test_getrandom: CFLAGS += -isystem $(top_srcdir)/tools/include \ $(OUTPUT)/vdso_test_chacha: $(top_srcdir)/tools/arch/$(ARCH)/vdso/vgetrandom-chacha.S $(OUTPUT)/vdso_test_chacha: CFLAGS += -idirafter $(top_srcdir)/tools/include \ - -isystem $(top_srcdir)/arch/$(ARCH)/include \ - -isystem $(top_srcdir)/include \ + -idirafter $(top_srcdir)/arch/$(ARCH)/include \ + -idirafter $(top_srcdir)/include \ -D__ASSEMBLY__ -DBULID_VDSO -DCONFIG_FUNCTION_ALIGNMENT=0 \ -Wa,--noexecstack $(SODIUM) -- cgit v1.2.3 From e1bbcab496f745d963e43a6e0f669359e82c4934 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 22 Aug 2024 09:13:22 +0200 Subject: selftests: vDSO: look for arch-specific function name in getrandom test Don't hard-code x86 specific names. Rather, use vdso_config definitions to find the correct function matching the architecture. Add random VDSO function names in names[][]. Remove the #ifdef CONFIG_VDSO32, as having the name there all the time is harmless and guaranties a steady index for following strings. Signed-off-by: Christophe Leroy [Jason: add [6] to variable declaration rather than each usage site.] Signed-off-by: Jason A. 
Donenfeld --- tools/testing/selftests/vDSO/vdso_config.h | 8 +++----- tools/testing/selftests/vDSO/vdso_test_getrandom.c | 8 ++++++-- 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/vDSO/vdso_config.h b/tools/testing/selftests/vDSO/vdso_config.h index 7b543e7f04d7..b5ba5ded3a61 100644 --- a/tools/testing/selftests/vDSO/vdso_config.h +++ b/tools/testing/selftests/vDSO/vdso_config.h @@ -68,16 +68,15 @@ static const char *versions[7] = { "LINUX_5.10" }; -static const char *names[2][6] = { +static const char *names[2][7] = { { "__kernel_gettimeofday", "__kernel_clock_gettime", "__kernel_time", "__kernel_clock_getres", "__kernel_getcpu", -#if defined(VDSO_32BIT) "__kernel_clock_gettime64", -#endif + "__kernel_getrandom", }, { "__vdso_gettimeofday", @@ -85,9 +84,8 @@ static const char *names[2][6] = { "__vdso_time", "__vdso_clock_getres", "__vdso_getcpu", -#if defined(VDSO_32BIT) "__vdso_clock_gettime64", -#endif + "__vdso_getrandom", }, }; diff --git a/tools/testing/selftests/vDSO/vdso_test_getrandom.c b/tools/testing/selftests/vDSO/vdso_test_getrandom.c index 89c961175956..20bbef992c48 100644 --- a/tools/testing/selftests/vDSO/vdso_test_getrandom.c +++ b/tools/testing/selftests/vDSO/vdso_test_getrandom.c @@ -21,6 +21,7 @@ #include "../kselftest.h" #include "parse_vdso.h" +#include "vdso_config.h" #ifndef timespecsub #define timespecsub(tsp, usp, vsp) \ @@ -100,15 +101,18 @@ static void vgetrandom_put_state(void *state) static void vgetrandom_init(void) { + const char *version = versions[VDSO_VERSION]; + const char *name = names[VDSO_NAMES][6]; unsigned long sysinfo_ehdr = getauxval(AT_SYSINFO_EHDR); + if (!sysinfo_ehdr) { printf("AT_SYSINFO_EHDR is not present!\n"); exit(KSFT_SKIP); } vdso_init_from_sysinfo_ehdr(sysinfo_ehdr); - vgrnd.fn = (__typeof__(vgrnd.fn))vdso_sym("LINUX_2.6", "__vdso_getrandom"); + vgrnd.fn = (__typeof__(vgrnd.fn))vdso_sym(version, name); if (!vgrnd.fn) { - printf("__vdso_getrandom is missing!\n"); + printf("%s is missing!\n", name); exit(KSFT_FAIL); } if (vgrnd.fn(NULL, 0, 0, &vgrnd.params, ~0UL) != 0) { -- cgit v1.2.3 From 1e661b349041c3d5026d9e95930b1d396c0c6e6d Mon Sep 17 00:00:00 2001 From: Xi Ruoyao Date: Tue, 27 Aug 2024 21:20:15 +0800 Subject: selftests: vDSO: add --cflags for pkg-config command querying libsodium When libsodium is installed into its own prefix, the --cflags output is needed for the compiler to find libsodium headers. Signed-off-by: Xi Ruoyao Signed-off-by: Jason A. Donenfeld --- tools/testing/selftests/vDSO/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile index 10ffdda3f2fa..180854eb9fec 100644 --- a/tools/testing/selftests/vDSO/Makefile +++ b/tools/testing/selftests/vDSO/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 uname_M := $(shell uname -m 2>/dev/null || echo not) ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/) -SODIUM := $(shell pkg-config --libs libsodium 2>/dev/null) +SODIUM := $(shell pkg-config --libs --cflags libsodium 2>/dev/null) TEST_GEN_PROGS := vdso_test_gettimeofday TEST_GEN_PROGS += vdso_test_getcpu -- cgit v1.2.3 From a5330eb3bcd87f3ce7738a2e898671a0ff561e85 Mon Sep 17 00:00:00 2001 From: "Jason A. 
Donenfeld" Date: Tue, 27 Aug 2024 16:27:13 +0200 Subject: selftests: vDSO: separate LDLIBS from CFLAGS for libsodium On systems that set -Wl,--as-needed, putting the -lsodium in the wrong place on the command line means we get a linker error: CC vdso_test_chacha /usr/bin/ld: /tmp/ccKpjnSM.o: in function `main': vdso_test_chacha.c:(.text+0x276): undefined reference to `crypto_stream_chacha20' collect2: error: ld returned 1 exit status Fix this by passing pkg-config's --libs output to the LDFLAGS field instead of the CFLAGS field, as is customary. Reported-by: Adhemerval Zanella Netto Signed-off-by: Jason A. Donenfeld --- tools/testing/selftests/vDSO/Makefile | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile index 180854eb9fec..834aa862ba2c 100644 --- a/tools/testing/selftests/vDSO/Makefile +++ b/tools/testing/selftests/vDSO/Makefile @@ -1,7 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 uname_M := $(shell uname -m 2>/dev/null || echo not) ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/) -SODIUM := $(shell pkg-config --libs --cflags libsodium 2>/dev/null) +SODIUM_LIBS := $(shell pkg-config --libs libsodium 2>/dev/null) +SODIUM_CFLAGS := $(shell pkg-config --cflags libsodium 2>/dev/null) TEST_GEN_PROGS := vdso_test_gettimeofday TEST_GEN_PROGS += vdso_test_getcpu @@ -13,7 +14,7 @@ endif TEST_GEN_PROGS += vdso_test_correctness ifeq ($(uname_M),x86_64) TEST_GEN_PROGS += vdso_test_getrandom -ifneq ($(SODIUM),) +ifneq ($(SODIUM_LIBS),) TEST_GEN_PROGS += vdso_test_chacha endif endif @@ -41,8 +42,9 @@ $(OUTPUT)/vdso_test_getrandom: CFLAGS += -isystem $(top_srcdir)/tools/include \ -isystem $(top_srcdir)/include/uapi $(OUTPUT)/vdso_test_chacha: $(top_srcdir)/tools/arch/$(ARCH)/vdso/vgetrandom-chacha.S +$(OUTPUT)/vdso_test_chacha: LDLIBS += $(SODIUM_LIBS) $(OUTPUT)/vdso_test_chacha: CFLAGS += -idirafter $(top_srcdir)/tools/include \ -idirafter $(top_srcdir)/arch/$(ARCH)/include \ -idirafter $(top_srcdir)/include \ -D__ASSEMBLY__ -DBULID_VDSO -DCONFIG_FUNCTION_ALIGNMENT=0 \ - -Wa,--noexecstack $(SODIUM) + -Wa,--noexecstack $(SODIUM_CFLAGS) -- cgit v1.2.3 From be9155154bc39e910ba8e56f32be299cf83e21d0 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 27 Aug 2024 16:52:36 +0200 Subject: selftests: vDSO: remove unnecessary command line defs from chacha test CONFIG_FUNCTION_ALIGNMENT=0 is no longer necessary and BULID_VDSO wasn't spelled right while BUILD_VDSO isn't necessary, so just remove these. Reported-by: Adhemerval Zanella Netto Signed-off-by: Jason A. 
Donenfeld --- tools/testing/selftests/vDSO/Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile index 834aa862ba2c..d1452c7d6d4f 100644 --- a/tools/testing/selftests/vDSO/Makefile +++ b/tools/testing/selftests/vDSO/Makefile @@ -46,5 +46,4 @@ $(OUTPUT)/vdso_test_chacha: LDLIBS += $(SODIUM_LIBS) $(OUTPUT)/vdso_test_chacha: CFLAGS += -idirafter $(top_srcdir)/tools/include \ -idirafter $(top_srcdir)/arch/$(ARCH)/include \ -idirafter $(top_srcdir)/include \ - -D__ASSEMBLY__ -DBULID_VDSO -DCONFIG_FUNCTION_ALIGNMENT=0 \ - -Wa,--noexecstack $(SODIUM_CFLAGS) + -D__ASSEMBLY__ -Wa,--noexecstack $(SODIUM_CFLAGS) -- cgit v1.2.3 From b90eeff1badd9c3d12123e2721ae08dfb689cda7 Mon Sep 17 00:00:00 2001 From: Xi Ruoyao Date: Wed, 28 Aug 2024 13:29:57 +0200 Subject: selftests: vDSO: use KHDR_INCLUDES for UAPI headers for getrandom test Building test_vdso_getrandom currently leads to following issue: In file included from /home/xry111/git-repos/linux/tools/include/linux/compiler_types.h:36, from /home/xry111/git-repos/linux/include/uapi/linux/stddef.h:5, from /home/xry111/git-repos/linux/include/uapi/linux/posix_types.h:5, from /usr/include/asm/sigcontext.h:12, from /usr/include/bits/sigcontext.h:30, from /usr/include/signal.h:301, from vdso_test_getrandom.c:14: /home/xry111/git-repos/linux/tools/include/linux/compiler-gcc.h:3:2: error: #error "Please don't include directly, include instead." 3 | #error "Please don't include directly, include instead." | ^~~~~ It's because the compiler_types.h inclusion in include/uapi/linux/stddef.h is expected to be removed by the header_install.sh script, as compiler_types.h shouldn't be used from user space. Add KHDR_INCLUDES before the existing include/uapi inclusion so that usr/include takes precedence if it's populated. Signed-off-by: Xi Ruoyao Signed-off-by: Jason A. Donenfeld --- tools/testing/selftests/vDSO/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile index d1452c7d6d4f..32ea4c2a9a23 100644 --- a/tools/testing/selftests/vDSO/Makefile +++ b/tools/testing/selftests/vDSO/Makefile @@ -39,6 +39,7 @@ $(OUTPUT)/vdso_test_correctness: LDFLAGS += -ldl $(OUTPUT)/vdso_test_getrandom: parse_vdso.c $(OUTPUT)/vdso_test_getrandom: CFLAGS += -isystem $(top_srcdir)/tools/include \ + $(KHDR_INCLUDES) \ -isystem $(top_srcdir)/include/uapi $(OUTPUT)/vdso_test_chacha: $(top_srcdir)/tools/arch/$(ARCH)/vdso/vgetrandom-chacha.S -- cgit v1.2.3 From f78280b1a3cedd9f68d5f596179675514a15bd1d Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Thu, 29 Aug 2024 20:23:23 +0200 Subject: selftests: vDSO: skip getrandom test if architecture is unsupported If the getrandom test compiles for an arch, don't exit fatally if the actual cpu it's running on is unsupported. Suggested-by: Adhemerval Zanella Netto Signed-off-by: Jason A. 
Donenfeld --- tools/testing/selftests/vDSO/vdso_test_getrandom.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/vDSO/vdso_test_getrandom.c b/tools/testing/selftests/vDSO/vdso_test_getrandom.c index 20bbef992c48..5db8ac8999cd 100644 --- a/tools/testing/selftests/vDSO/vdso_test_getrandom.c +++ b/tools/testing/selftests/vDSO/vdso_test_getrandom.c @@ -104,6 +104,7 @@ static void vgetrandom_init(void) const char *version = versions[VDSO_VERSION]; const char *name = names[VDSO_NAMES][6]; unsigned long sysinfo_ehdr = getauxval(AT_SYSINFO_EHDR); + size_t ret; if (!sysinfo_ehdr) { printf("AT_SYSINFO_EHDR is not present!\n"); @@ -115,7 +116,11 @@ static void vgetrandom_init(void) printf("%s is missing!\n", name); exit(KSFT_FAIL); } - if (vgrnd.fn(NULL, 0, 0, &vgrnd.params, ~0UL) != 0) { + ret = vgrnd.fn(NULL, 0, 0, &vgrnd.params, ~0UL); + if (ret == -ENOSYS) { + printf("unsupported architecture\n"); + exit(KSFT_SKIP); + } else if (ret) { printf("failed to fetch vgetrandom params!\n"); exit(KSFT_FAIL); } -- cgit v1.2.3 From 59eb856c3ed9b3552befd240c0c339f22eed3fa1 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 30 Aug 2024 14:28:35 +0200 Subject: selftests: vDSO: fix vDSO name for powerpc Following error occurs when running vdso_test_correctness on powerpc: ~ # ./vdso_test_correctness [WARN] failed to find vDSO [SKIP] No vDSO, so skipping clock_gettime() tests [SKIP] No vDSO, so skipping clock_gettime64() tests [RUN] Testing getcpu... [OK] CPU 0: syscall: cpu 0, node 0 On powerpc, vDSO is neither called linux-vdso.so.1 nor linux-gate.so.1 but linux-vdso32.so.1 or linux-vdso64.so.1. Also search those two names before giving up. Fixes: c7e5789b24d3 ("kselftest: Move test_vdso to the vDSO test suite") Signed-off-by: Christophe Leroy Acked-by: Shuah Khan Signed-off-by: Jason A. Donenfeld --- tools/testing/selftests/vDSO/vdso_test_correctness.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/vDSO/vdso_test_correctness.c b/tools/testing/selftests/vDSO/vdso_test_correctness.c index e691a3cf1491..cdb697ae8343 100644 --- a/tools/testing/selftests/vDSO/vdso_test_correctness.c +++ b/tools/testing/selftests/vDSO/vdso_test_correctness.c @@ -114,6 +114,12 @@ static void fill_function_pointers() if (!vdso) vdso = dlopen("linux-gate.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD); + if (!vdso) + vdso = dlopen("linux-vdso32.so.1", + RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD); + if (!vdso) + vdso = dlopen("linux-vdso64.so.1", + RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD); if (!vdso) { printf("[WARN]\tfailed to find vDSO\n"); return; -- cgit v1.2.3 From 7d297c419b08eafa69ce27243ee9bbecab4fcaa4 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 30 Aug 2024 14:28:36 +0200 Subject: selftests: vDSO: fix vdso_config for powerpc Running vdso_test_correctness on powerpc64 gives the following warning: ~ # ./vdso_test_correctness Warning: failed to find clock_gettime64 in vDSO This is because vdso_test_correctness was built with VDSO_32BIT defined. __powerpc__ macro is defined on both powerpc32 and powerpc64 so __powerpc64__ needs to be checked first in vdso_config.h Fixes: 693f5ca08ca0 ("kselftest: Extend vDSO selftest") Signed-off-by: Christophe Leroy Acked-by: Shuah Khan Signed-off-by: Jason A. 
Donenfeld --- tools/testing/selftests/vDSO/vdso_config.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/vDSO/vdso_config.h b/tools/testing/selftests/vDSO/vdso_config.h index b5ba5ded3a61..740ce8c98d2e 100644 --- a/tools/testing/selftests/vDSO/vdso_config.h +++ b/tools/testing/selftests/vDSO/vdso_config.h @@ -18,13 +18,13 @@ #elif defined(__aarch64__) #define VDSO_VERSION 3 #define VDSO_NAMES 0 -#elif defined(__powerpc__) +#elif defined(__powerpc64__) #define VDSO_VERSION 1 #define VDSO_NAMES 0 -#define VDSO_32BIT 1 -#elif defined(__powerpc64__) +#elif defined(__powerpc__) #define VDSO_VERSION 1 #define VDSO_NAMES 0 +#define VDSO_32BIT 1 #elif defined (__s390__) #define VDSO_VERSION 2 #define VDSO_NAMES 0 -- cgit v1.2.3 From ba83b3239e657469709d15dcea5f9b65bf9dbf34 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 30 Aug 2024 14:28:37 +0200 Subject: selftests: vDSO: fix vDSO symbols lookup for powerpc64 On powerpc64, following tests fail locating vDSO functions: ~ # ./vdso_test_abi TAP version 13 1..16 # [vDSO kselftest] VDSO_VERSION: LINUX_2.6.15 # Couldn't find __kernel_gettimeofday ok 1 # SKIP __kernel_gettimeofday # clock_id: CLOCK_REALTIME # Couldn't find __kernel_clock_gettime ok 2 # SKIP __kernel_clock_gettime CLOCK_REALTIME # Couldn't find __kernel_clock_getres ok 3 # SKIP __kernel_clock_getres CLOCK_REALTIME ... # Couldn't find __kernel_time ok 16 # SKIP __kernel_time # Totals: pass:0 fail:0 xfail:0 xpass:0 skip:16 error:0 ~ # ./vdso_test_getrandom __kernel_getrandom is missing! ~ # ./vdso_test_gettimeofday Could not find __kernel_gettimeofday ~ # ./vdso_test_getcpu Could not find __kernel_getcpu On powerpc64, as shown below by readelf, vDSO functions symbols have type NOTYPE, so also accept that type when looking for symbols. $ powerpc64-linux-gnu-readelf -a arch/powerpc/kernel/vdso/vdso64.so.dbg ELF Header: Magic: 7f 45 4c 46 02 02 01 00 00 00 00 00 00 00 00 00 Class: ELF64 Data: 2's complement, big endian Version: 1 (current) OS/ABI: UNIX - System V ABI Version: 0 Type: DYN (Shared object file) Machine: PowerPC64 Version: 0x1 ... Symbol table '.dynsym' contains 12 entries: Num: Value Size Type Bind Vis Ndx Name 0: 0000000000000000 0 NOTYPE LOCAL DEFAULT UND 1: 0000000000000524 84 NOTYPE GLOBAL DEFAULT 8 __[...]@@LINUX_2.6.15 2: 00000000000005f0 36 NOTYPE GLOBAL DEFAULT 8 __[...]@@LINUX_2.6.15 3: 0000000000000578 68 NOTYPE GLOBAL DEFAULT 8 __[...]@@LINUX_2.6.15 4: 0000000000000000 0 OBJECT GLOBAL DEFAULT ABS LINUX_2.6.15 5: 00000000000006c0 48 NOTYPE GLOBAL DEFAULT 8 __[...]@@LINUX_2.6.15 6: 0000000000000614 172 NOTYPE GLOBAL DEFAULT 8 __[...]@@LINUX_2.6.15 7: 00000000000006f0 84 NOTYPE GLOBAL DEFAULT 8 __[...]@@LINUX_2.6.15 8: 000000000000047c 84 NOTYPE GLOBAL DEFAULT 8 __[...]@@LINUX_2.6.15 9: 0000000000000454 12 NOTYPE GLOBAL DEFAULT 8 __[...]@@LINUX_2.6.15 10: 00000000000004d0 84 NOTYPE GLOBAL DEFAULT 8 __[...]@@LINUX_2.6.15 11: 00000000000005bc 52 NOTYPE GLOBAL DEFAULT 8 __[...]@@LINUX_2.6.15 Symbol table '.symtab' contains 56 entries: Num: Value Size Type Bind Vis Ndx Name ... 
45: 0000000000000000 0 OBJECT GLOBAL DEFAULT ABS LINUX_2.6.15 46: 00000000000006c0 48 NOTYPE GLOBAL DEFAULT 8 __kernel_getcpu 47: 0000000000000524 84 NOTYPE GLOBAL DEFAULT 8 __kernel_clock_getres 48: 00000000000005f0 36 NOTYPE GLOBAL DEFAULT 8 __kernel_get_tbfreq 49: 000000000000047c 84 NOTYPE GLOBAL DEFAULT 8 __kernel_gettimeofday 50: 0000000000000614 172 NOTYPE GLOBAL DEFAULT 8 __kernel_sync_dicache 51: 00000000000006f0 84 NOTYPE GLOBAL DEFAULT 8 __kernel_getrandom 52: 0000000000000454 12 NOTYPE GLOBAL DEFAULT 8 __kernel_sigtram[...] 53: 0000000000000578 68 NOTYPE GLOBAL DEFAULT 8 __kernel_time 54: 00000000000004d0 84 NOTYPE GLOBAL DEFAULT 8 __kernel_clock_g[...] 55: 00000000000005bc 52 NOTYPE GLOBAL DEFAULT 8 __kernel_get_sys[...] Fixes: 98eedc3a9dbf ("Document the vDSO and add a reference parser") Signed-off-by: Christophe Leroy Acked-by: Shuah Khan Signed-off-by: Jason A. Donenfeld --- tools/testing/selftests/vDSO/parse_vdso.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/vDSO/parse_vdso.c b/tools/testing/selftests/vDSO/parse_vdso.c index 4ae417372e9e..d9ccc5acac18 100644 --- a/tools/testing/selftests/vDSO/parse_vdso.c +++ b/tools/testing/selftests/vDSO/parse_vdso.c @@ -216,7 +216,8 @@ void *vdso_sym(const char *version, const char *name) ELF(Sym) *sym = &vdso_info.symtab[chain]; /* Check for a defined global or weak function w/ right name. */ - if (ELF64_ST_TYPE(sym->st_info) != STT_FUNC) + if (ELF64_ST_TYPE(sym->st_info) != STT_FUNC && + ELF64_ST_TYPE(sym->st_info) != STT_NOTYPE) continue; if (ELF64_ST_BIND(sym->st_info) != STB_GLOBAL && ELF64_ST_BIND(sym->st_info) != STB_WEAK) -- cgit v1.2.3 From 6eda706a535c3d0119eaefaad5fc119609639ed2 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 30 Aug 2024 14:28:38 +0200 Subject: selftests: vDSO: fix the way vDSO functions are called for powerpc vdso_test_correctness test fails on powerpc: ~ # ./vdso_test_correctness ... [RUN] Testing clock_gettime for clock CLOCK_REALTIME_ALARM (8)... [FAIL] No such clock, but __vdso_clock_gettime returned 22 [RUN] Testing clock_gettime for clock CLOCK_BOOTTIME_ALARM (9)... [FAIL] No such clock, but __vdso_clock_gettime returned 22 [RUN] Testing clock_gettime for clock CLOCK_SGI_CYCLE (10)... [FAIL] No such clock, but __vdso_clock_gettime returned 22 ... [RUN] Testing clock_gettime for clock invalid (-1)... [FAIL] No such clock, but __vdso_clock_gettime returned 22 [RUN] Testing clock_gettime for clock invalid (-2147483648)... [FAIL] No such clock, but __vdso_clock_gettime returned 22 [RUN] Testing clock_gettime for clock invalid (2147483647)... [FAIL] No such clock, but __vdso_clock_gettime returned 22 On powerpc, a call to a VDSO function is not an ordinary C function call. Unlike several architectures which returns a negative error code in case of an error, powerpc sets CR[SO] and returns the error code as a positive value. Define and use a macro called VDSO_CALL() which takes a pointer to the function to call, the number of arguments and the arguments. Also update ABI vdso documentation to reflect this subtlety. Provide a specific version of VDSO_CALL() for powerpc that negates the error code on return when CR[SO] is set. 
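For illustration only (not part of the patch), here is a minimal sketch of how a test can funnel a vDSO call through VDSO_CALL(); the "LINUX_2.6"/"__vdso_gettimeofday" symbol naming follows the existing x86 tests, and it assumes parse_vdso has already been initialized:

/*
 * Hypothetical usage sketch: on powerpc VDSO_CALL() runs the inline-asm
 * wrapper that checks CR[SO] and negates the return value; elsewhere it
 * expands to a plain indirect call.
 */
#include <stddef.h>
#include <sys/time.h>
#include "parse_vdso.h"
#include "vdso_call.h"

typedef long (*gtod_t)(struct timeval *tv, struct timezone *tz);

static long sample_gettimeofday(void)
{
	/* Assumes vdso_init_from_sysinfo_ehdr() has already run. */
	gtod_t gtod = (gtod_t)vdso_sym("LINUX_2.6", "__vdso_gettimeofday");
	struct timeval tv;

	if (!gtod)
		return -1;

	/* Two arguments, so nr = 2; errors come back as -errno. */
	return VDSO_CALL(gtod, 2, &tv, NULL);
}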
Fixes: c7e5789b24d3 ("kselftest: Move test_vdso to the vDSO test suite") Fixes: 2e9a97256616 ("selftests: vdso: Add a selftest for vDSO getcpu()") Fixes: 693f5ca08ca0 ("kselftest: Extend vDSO selftest") Fixes: b2f1c3db2887 ("kselftest: Extend vdso correctness test to clock_gettime64") Fixes: 4920a2590e91 ("selftests/vDSO: add tests for vgetrandom") Signed-off-by: Christophe Leroy Acked-by: Shuah Khan Signed-off-by: Jason A. Donenfeld --- Documentation/ABI/stable/vdso | 8 ++- tools/testing/selftests/vDSO/vdso_call.h | 70 ++++++++++++++++++++++ tools/testing/selftests/vDSO/vdso_test_abi.c | 9 +-- .../testing/selftests/vDSO/vdso_test_correctness.c | 15 ++--- tools/testing/selftests/vDSO/vdso_test_getcpu.c | 3 +- tools/testing/selftests/vDSO/vdso_test_getrandom.c | 5 +- .../selftests/vDSO/vdso_test_gettimeofday.c | 3 +- 7 files changed, 95 insertions(+), 18 deletions(-) create mode 100644 tools/testing/selftests/vDSO/vdso_call.h (limited to 'tools') diff --git a/Documentation/ABI/stable/vdso b/Documentation/ABI/stable/vdso index 951838d42781..85dbb6a160df 100644 --- a/Documentation/ABI/stable/vdso +++ b/Documentation/ABI/stable/vdso @@ -9,9 +9,11 @@ maps an ELF DSO into that program's address space. This DSO is called the vDSO and it often contains useful and highly-optimized alternatives to real syscalls. -These functions are called just like ordinary C function according to -your platform's ABI. Call them from a sensible context. (For example, -if you set CS on x86 to something strange, the vDSO functions are +These functions are called according to your platform's ABI. On many +platforms they are called just like ordinary C function. On other platforms +(ex: powerpc) they are called with the same convention as system calls which +is different from ordinary C functions. Call them from a sensible context. +(For example, if you set CS on x86 to something strange, the vDSO functions are within their rights to crash.) In addition, if you pass a bad pointer to a vDSO function, you might get SIGSEGV instead of -EFAULT. diff --git a/tools/testing/selftests/vDSO/vdso_call.h b/tools/testing/selftests/vDSO/vdso_call.h new file mode 100644 index 000000000000..bb237d771051 --- /dev/null +++ b/tools/testing/selftests/vDSO/vdso_call.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Macro to call vDSO functions + * + * Copyright (C) 2024 Christophe Leroy , CS GROUP France + */ +#ifndef __VDSO_CALL_H__ +#define __VDSO_CALL_H__ + +#ifdef __powerpc__ + +#define LOADARGS_1(fn, __arg1) do { \ + _r0 = fn; \ + _r3 = (long)__arg1; \ +} while (0) + +#define LOADARGS_2(fn, __arg1, __arg2) do { \ + _r0 = fn; \ + _r3 = (long)__arg1; \ + _r4 = (long)__arg2; \ +} while (0) + +#define LOADARGS_3(fn, __arg1, __arg2, __arg3) do { \ + _r0 = fn; \ + _r3 = (long)__arg1; \ + _r4 = (long)__arg2; \ + _r5 = (long)__arg3; \ +} while (0) + +#define LOADARGS_5(fn, __arg1, __arg2, __arg3, __arg4, __arg5) do { \ + _r0 = fn; \ + _r3 = (long)__arg1; \ + _r4 = (long)__arg2; \ + _r5 = (long)__arg3; \ + _r6 = (long)__arg4; \ + _r7 = (long)__arg5; \ +} while (0) + +#define VDSO_CALL(fn, nr, args...) 
({ \ + register void *_r0 asm ("r0"); \ + register long _r3 asm ("r3"); \ + register long _r4 asm ("r4"); \ + register long _r5 asm ("r5"); \ + register long _r6 asm ("r6"); \ + register long _r7 asm ("r7"); \ + register long _r8 asm ("r8"); \ + register long _rval asm ("r3"); \ + \ + LOADARGS_##nr(fn, args); \ + \ + asm volatile( \ + " mtctr %0\n" \ + " bctrl\n" \ + " bns+ 1f\n" \ + " neg 3, 3\n" \ + "1:" \ + : "+r" (_r0), "=r" (_r3), "+r" (_r4), "+r" (_r5), \ + "+r" (_r6), "+r" (_r7), "+r" (_r8) \ + : "r" (_rval) \ + : "r9", "r10", "r11", "r12", "cr0", "cr1", "cr5", \ + "cr6", "cr7", "xer", "lr", "ctr", "memory" \ + ); \ + _rval; \ +}) + +#else +#define VDSO_CALL(fn, nr, args...) fn(args) +#endif + +#endif diff --git a/tools/testing/selftests/vDSO/vdso_test_abi.c b/tools/testing/selftests/vDSO/vdso_test_abi.c index 96d32fd65b42..00034208c4c6 100644 --- a/tools/testing/selftests/vDSO/vdso_test_abi.c +++ b/tools/testing/selftests/vDSO/vdso_test_abi.c @@ -20,6 +20,7 @@ #include "../kselftest.h" #include "vdso_config.h" +#include "vdso_call.h" extern void *vdso_sym(const char *version, const char *name); extern void vdso_init_from_sysinfo_ehdr(uintptr_t base); @@ -61,7 +62,7 @@ static void vdso_test_gettimeofday(void) } struct timeval tv; - long ret = vdso_gettimeofday(&tv, 0); + long ret = VDSO_CALL(vdso_gettimeofday, 2, &tv, 0); if (ret == 0) { ksft_print_msg("The time is %lld.%06lld\n", @@ -86,7 +87,7 @@ static void vdso_test_clock_gettime(clockid_t clk_id) } struct timespec ts; - long ret = vdso_clock_gettime(clk_id, &ts); + long ret = VDSO_CALL(vdso_clock_gettime, 2, clk_id, &ts); if (ret == 0) { ksft_print_msg("The time is %lld.%06lld\n", @@ -111,7 +112,7 @@ static void vdso_test_time(void) return; } - long ret = vdso_time(NULL); + long ret = VDSO_CALL(vdso_time, 1, NULL); if (ret > 0) { ksft_print_msg("The time in hours since January 1, 1970 is %lld\n", @@ -138,7 +139,7 @@ static void vdso_test_clock_getres(clockid_t clk_id) } struct timespec ts, sys_ts; - long ret = vdso_clock_getres(clk_id, &ts); + long ret = VDSO_CALL(vdso_clock_getres, 2, clk_id, &ts); if (ret == 0) { ksft_print_msg("The vdso resolution is %lld %lld\n", diff --git a/tools/testing/selftests/vDSO/vdso_test_correctness.c b/tools/testing/selftests/vDSO/vdso_test_correctness.c index cdb697ae8343..5fb97ad67eea 100644 --- a/tools/testing/selftests/vDSO/vdso_test_correctness.c +++ b/tools/testing/selftests/vDSO/vdso_test_correctness.c @@ -20,6 +20,7 @@ #include #include "vdso_config.h" +#include "vdso_call.h" #include "../kselftest.h" static const char **name; @@ -186,7 +187,7 @@ static void test_getcpu(void) ret_sys = sys_getcpu(&cpu_sys, &node_sys, 0); if (vdso_getcpu) - ret_vdso = vdso_getcpu(&cpu_vdso, &node_vdso, 0); + ret_vdso = VDSO_CALL(vdso_getcpu, 3, &cpu_vdso, &node_vdso, 0); if (vgetcpu) ret_vsys = vgetcpu(&cpu_vsys, &node_vsys, 0); @@ -269,7 +270,7 @@ static void test_one_clock_gettime(int clock, const char *name) if (sys_clock_gettime(clock, &start) < 0) { if (errno == EINVAL) { - vdso_ret = vdso_clock_gettime(clock, &vdso); + vdso_ret = VDSO_CALL(vdso_clock_gettime, 2, clock, &vdso); if (vdso_ret == -EINVAL) { printf("[OK]\tNo such clock.\n"); } else { @@ -282,7 +283,7 @@ static void test_one_clock_gettime(int clock, const char *name) return; } - vdso_ret = vdso_clock_gettime(clock, &vdso); + vdso_ret = VDSO_CALL(vdso_clock_gettime, 2, clock, &vdso); end_ret = sys_clock_gettime(clock, &end); if (vdso_ret != 0 || end_ret != 0) { @@ -331,7 +332,7 @@ static void test_one_clock_gettime64(int clock, const char 
*name) if (sys_clock_gettime64(clock, &start) < 0) { if (errno == EINVAL) { - vdso_ret = vdso_clock_gettime64(clock, &vdso); + vdso_ret = VDSO_CALL(vdso_clock_gettime64, 2, clock, &vdso); if (vdso_ret == -EINVAL) { printf("[OK]\tNo such clock.\n"); } else { @@ -344,7 +345,7 @@ static void test_one_clock_gettime64(int clock, const char *name) return; } - vdso_ret = vdso_clock_gettime64(clock, &vdso); + vdso_ret = VDSO_CALL(vdso_clock_gettime64, 2, clock, &vdso); end_ret = sys_clock_gettime64(clock, &end); if (vdso_ret != 0 || end_ret != 0) { @@ -401,7 +402,7 @@ static void test_gettimeofday(void) return; } - vdso_ret = vdso_gettimeofday(&vdso, &vdso_tz); + vdso_ret = VDSO_CALL(vdso_gettimeofday, 2, &vdso, &vdso_tz); end_ret = sys_gettimeofday(&end, NULL); if (vdso_ret != 0 || end_ret != 0) { @@ -431,7 +432,7 @@ static void test_gettimeofday(void) } /* And make sure that passing NULL for tz doesn't crash. */ - vdso_gettimeofday(&vdso, NULL); + VDSO_CALL(vdso_gettimeofday, 2, &vdso, NULL); } int main(int argc, char **argv) diff --git a/tools/testing/selftests/vDSO/vdso_test_getcpu.c b/tools/testing/selftests/vDSO/vdso_test_getcpu.c index b758f68c6c9c..cdeaed45fb26 100644 --- a/tools/testing/selftests/vDSO/vdso_test_getcpu.c +++ b/tools/testing/selftests/vDSO/vdso_test_getcpu.c @@ -14,6 +14,7 @@ #include "../kselftest.h" #include "parse_vdso.h" #include "vdso_config.h" +#include "vdso_call.h" struct getcpu_cache; typedef long (*getcpu_t)(unsigned int *, unsigned int *, @@ -42,7 +43,7 @@ int main(int argc, char **argv) return KSFT_SKIP; } - ret = get_cpu(&cpu, &node, 0); + ret = VDSO_CALL(get_cpu, 3, &cpu, &node, 0); if (ret == 0) { printf("Running on CPU %u node %u\n", cpu, node); } else { diff --git a/tools/testing/selftests/vDSO/vdso_test_getrandom.c b/tools/testing/selftests/vDSO/vdso_test_getrandom.c index 5db8ac8999cd..351daeb649c8 100644 --- a/tools/testing/selftests/vDSO/vdso_test_getrandom.c +++ b/tools/testing/selftests/vDSO/vdso_test_getrandom.c @@ -22,6 +22,7 @@ #include "../kselftest.h" #include "parse_vdso.h" #include "vdso_config.h" +#include "vdso_call.h" #ifndef timespecsub #define timespecsub(tsp, usp, vsp) \ @@ -116,7 +117,7 @@ static void vgetrandom_init(void) printf("%s is missing!\n", name); exit(KSFT_FAIL); } - ret = vgrnd.fn(NULL, 0, 0, &vgrnd.params, ~0UL); + ret = VDSO_CALL(vgrnd.fn, 5, NULL, 0, 0, &vgrnd.params, ~0UL); if (ret == -ENOSYS) { printf("unsupported architecture\n"); exit(KSFT_SKIP); @@ -137,7 +138,7 @@ static ssize_t vgetrandom(void *buf, size_t len, unsigned long flags) exit(KSFT_FAIL); } } - return vgrnd.fn(buf, len, flags, state, vgrnd.params.size_of_opaque_state); + return VDSO_CALL(vgrnd.fn, 5, buf, len, flags, state, vgrnd.params.size_of_opaque_state); } enum { TRIALS = 25000000, THREADS = 256 }; diff --git a/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c b/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c index ee4f1ca56a71..e31b18ffae33 100644 --- a/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c +++ b/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c @@ -19,6 +19,7 @@ #include "../kselftest.h" #include "parse_vdso.h" #include "vdso_config.h" +#include "vdso_call.h" int main(int argc, char **argv) { @@ -43,7 +44,7 @@ int main(int argc, char **argv) } struct timeval tv; - long ret = gtod(&tv, 0); + long ret = VDSO_CALL(gtod, 2, &tv, 0); if (ret == 0) { printf("The time is %lld.%06lld\n", -- cgit v1.2.3 From f0d0dbbc101a5ed2cd844eae0c2cc0546327ef89 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 30 Aug 2024 14:28:39 
+0200 Subject: selftests: vDSO: use parse_vdso.h in vdso_test_abi Don't duplicate parse_vdso function prototypes, include the header instead. Fixes: 693f5ca08ca0 ("kselftest: Extend vDSO selftest") Signed-off-by: Christophe Leroy Acked-by: Shuah Khan Signed-off-by: Jason A. Donenfeld --- tools/testing/selftests/vDSO/vdso_test_abi.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/vDSO/vdso_test_abi.c b/tools/testing/selftests/vDSO/vdso_test_abi.c index 00034208c4c6..a54424e2336f 100644 --- a/tools/testing/selftests/vDSO/vdso_test_abi.c +++ b/tools/testing/selftests/vDSO/vdso_test_abi.c @@ -21,10 +21,7 @@ #include "../kselftest.h" #include "vdso_config.h" #include "vdso_call.h" - -extern void *vdso_sym(const char *version, const char *name); -extern void vdso_init_from_sysinfo_ehdr(uintptr_t base); -extern void vdso_init_from_auxv(void *auxv); +#include "parse_vdso.h" static const char *version; static const char **name; -- cgit v1.2.3 From d736d4fc763090f9a02dc5556174de9768093f43 Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Fri, 30 Aug 2024 10:59:11 +0530 Subject: kselftest/arm64: Fix build warnings for ptrace A "%s" is missing in ksft_exit_fail_msg(); instead, use the newly introduced ksft_exit_fail_perror(). Signed-off-by: Dev Jain Reviewed-by: Shuah Khan Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240830052911.4040970-1-dev.jain@arm.com Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/abi/ptrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/arm64/abi/ptrace.c b/tools/testing/selftests/arm64/abi/ptrace.c index e4fa507cbdd0..b51d21f78cf9 100644 --- a/tools/testing/selftests/arm64/abi/ptrace.c +++ b/tools/testing/selftests/arm64/abi/ptrace.c @@ -163,10 +163,10 @@ static void test_hw_debug(pid_t child, int type, const char *type_name) static int do_child(void) { if (ptrace(PTRACE_TRACEME, -1, NULL, NULL)) - ksft_exit_fail_msg("PTRACE_TRACEME", strerror(errno)); + ksft_exit_fail_perror("PTRACE_TRACEME"); if (raise(SIGSTOP)) - ksft_exit_fail_msg("raise(SIGSTOP)", strerror(errno)); + ksft_exit_fail_perror("raise(SIGSTOP)"); return EXIT_SUCCESS; } -- cgit v1.2.3 From 33ffa2dd0de28ded1fa856b3fb9cbf2b873d7dd0 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Fri, 30 Aug 2024 17:21:36 +0200 Subject: selftests: vDSO: quash clang omitted parameter warning in getrandom test When building with clang, there's this warning: vdso_test_getrandom.c:145:40: warning: omitting the parameter name in a function definition is a C23 extension [-Wc23-extensions] 145 | static void *test_vdso_getrandom(void *) | ^ vdso_test_getrandom.c:155:40: warning: omitting the parameter name in a function definition is a C23 extension [-Wc23-extensions] 155 | static void *test_libc_getrandom(void *) | ^ vdso_test_getrandom.c:165:43: warning: omitting the parameter name in a function definition is a C23 extension [-Wc23-extensions] 165 | static void *test_syscall_getrandom(void *) Add the named ctx parameter to quash it. Reported-by: Mark Brown Reviewed-by: Mark Brown Signed-off-by: Jason A. 
Donenfeld --- tools/testing/selftests/vDSO/vdso_test_getrandom.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/vDSO/vdso_test_getrandom.c b/tools/testing/selftests/vDSO/vdso_test_getrandom.c index 351daeb649c8..8866b65a4605 100644 --- a/tools/testing/selftests/vDSO/vdso_test_getrandom.c +++ b/tools/testing/selftests/vDSO/vdso_test_getrandom.c @@ -143,7 +143,7 @@ static ssize_t vgetrandom(void *buf, size_t len, unsigned long flags) enum { TRIALS = 25000000, THREADS = 256 }; -static void *test_vdso_getrandom(void *) +static void *test_vdso_getrandom(void *ctx) { for (size_t i = 0; i < TRIALS; ++i) { unsigned int val; @@ -153,7 +153,7 @@ static void *test_vdso_getrandom(void *) return NULL; } -static void *test_libc_getrandom(void *) +static void *test_libc_getrandom(void *ctx) { for (size_t i = 0; i < TRIALS; ++i) { unsigned int val; @@ -163,7 +163,7 @@ static void *test_libc_getrandom(void *) return NULL; } -static void *test_syscall_getrandom(void *) +static void *test_syscall_getrandom(void *ctx) { for (size_t i = 0; i < TRIALS; ++i) { unsigned int val; -- cgit v1.2.3 From da18bfa59d403040d8bcba1284285916fe9e4425 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Fri, 30 Aug 2024 02:51:50 -0700 Subject: libbpf: Ensure new BTF objects inherit input endianness New split BTF needs to preserve base's endianness. Similarly, when creating a distilled BTF, we need to preserve original endianness. Fix by updating libbpf's btf__distill_base() and btf_new_empty() to retain the byte order of any source BTF objects when creating new ones. Fixes: ba451366bf44 ("libbpf: Implement basic split BTF support") Fixes: 58e185a0dc35 ("libbpf: Add btf__distill_base() creating split BTF with distilled base BTF") Reported-by: Song Liu Reported-by: Eduard Zingerman Suggested-by: Eduard Zingerman Signed-off-by: Tony Ambardar Signed-off-by: Andrii Nakryiko Tested-by: Alan Maguire Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/6358db36c5f68b07873a0a5be2d062b1af5ea5f8.camel@gmail.com/ Link: https://lore.kernel.org/bpf/20240830095150.278881-1-tony.ambardar@gmail.com --- tools/lib/bpf/btf.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 32c00db3b91b..40aae244e35f 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -996,6 +996,7 @@ static struct btf *btf_new_empty(struct btf *base_btf) btf->base_btf = base_btf; btf->start_id = btf__type_cnt(base_btf); btf->start_str_off = base_btf->hdr->str_len; + btf->swapped_endian = base_btf->swapped_endian; } /* +1 for empty string at offset 0 */ @@ -5394,6 +5395,9 @@ int btf__distill_base(const struct btf *src_btf, struct btf **new_base_btf, new_base = btf__new_empty(); if (!new_base) return libbpf_err(-ENOMEM); + + btf__set_endianness(new_base, btf__endianness(src_btf)); + dist.id_map = calloc(n, sizeof(*dist.id_map)); if (!dist.id_map) { err = -ENOMEM; -- cgit v1.2.3 From 181b0d1af5d5b2ba8996fa715382cbfbfef46b6e Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 30 Aug 2024 10:34:06 -0700 Subject: selftests/bpf: Check if distilled base inherits source endianness Create a BTF with endianness different from host, make a distilled base/split BTF pair from it, dump as raw bytes, import again and verify that endianness is preserved. 
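For illustration (not part of the patch), a condensed sketch of the round trip the test performs, using public libbpf APIs; error handling and cleanup are elided:

/*
 * Hypothetical sketch: build non-host-endian BTF, distill it, serialize
 * the distilled base, re-import it, and confirm the byte order survived.
 */
#include <bpf/btf.h>

static int endianness_round_trip(void)
{
	struct btf *base, *split, *new_base, *new_split;
	enum btf_endianness non_host;
	const void *raw;
	__u32 size;

	base = btf__new_empty();
	non_host = btf__endianness(base) == BTF_LITTLE_ENDIAN ?
		   BTF_BIG_ENDIAN : BTF_LITTLE_ENDIAN;
	btf__set_endianness(base, non_host);
	btf__add_int(base, "int", 4, BTF_INT_SIGNED);	/* [1] int */

	split = btf__new_empty_split(base);
	btf__add_ptr(split, 1);				/* [2] ptr -> [1] */

	/* distill, serialize, and re-import the new base */
	btf__distill_base(split, &new_base, &new_split);
	raw = btf__raw_data(new_base, &size);

	/* with the libbpf fix, the re-imported BTF keeps non_host */
	return btf__endianness(btf__new(raw, size)) == non_host ? 0 : -1;
}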
Reviewed-by: Alan Maguire Tested-by: Alan Maguire Signed-off-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240830173406.1581007-1-eddyz87@gmail.com --- .../testing/selftests/bpf/prog_tests/btf_distill.c | 68 ++++++++++++++++++++++ 1 file changed, 68 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/btf_distill.c b/tools/testing/selftests/bpf/prog_tests/btf_distill.c index bfbe795823a2..ca84726d5ac1 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_distill.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_distill.c @@ -535,6 +535,72 @@ cleanup: btf__free(vmlinux_btf); } +/* Split and new base BTFs should inherit endianness from source BTF. */ +static void test_distilled_endianness(void) +{ + struct btf *base = NULL, *split = NULL, *new_base = NULL, *new_split = NULL; + struct btf *new_base1 = NULL, *new_split1 = NULL; + enum btf_endianness inverse_endianness; + const void *raw_data; + __u32 size; + + base = btf__new_empty(); + if (!ASSERT_OK_PTR(base, "empty_main_btf")) + return; + inverse_endianness = btf__endianness(base) == BTF_LITTLE_ENDIAN ? BTF_BIG_ENDIAN + : BTF_LITTLE_ENDIAN; + btf__set_endianness(base, inverse_endianness); + btf__add_int(base, "int", 4, BTF_INT_SIGNED); /* [1] int */ + VALIDATE_RAW_BTF( + base, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED"); + split = btf__new_empty_split(base); + if (!ASSERT_OK_PTR(split, "empty_split_btf")) + goto cleanup; + btf__add_ptr(split, 1); + VALIDATE_RAW_BTF( + split, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=1"); + if (!ASSERT_EQ(0, btf__distill_base(split, &new_base, &new_split), + "distilled_base") || + !ASSERT_OK_PTR(new_base, "distilled_base") || + !ASSERT_OK_PTR(new_split, "distilled_split") || + !ASSERT_EQ(2, btf__type_cnt(new_base), "distilled_base_type_cnt")) + goto cleanup; + VALIDATE_RAW_BTF( + new_split, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=1"); + + raw_data = btf__raw_data(new_base, &size); + if (!ASSERT_OK_PTR(raw_data, "btf__raw_data #1")) + goto cleanup; + new_base1 = btf__new(raw_data, size); + if (!ASSERT_OK_PTR(new_base1, "new_base1 = btf__new()")) + goto cleanup; + raw_data = btf__raw_data(new_split, &size); + if (!ASSERT_OK_PTR(raw_data, "btf__raw_data #2")) + goto cleanup; + new_split1 = btf__new_split(raw_data, size, new_base1); + if (!ASSERT_OK_PTR(new_split1, "new_split1 = btf__new()")) + goto cleanup; + + ASSERT_EQ(btf__endianness(new_base1), inverse_endianness, "new_base1 endianness"); + ASSERT_EQ(btf__endianness(new_split1), inverse_endianness, "new_split1 endianness"); + VALIDATE_RAW_BTF( + new_split1, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=1"); +cleanup: + btf__free(new_split1); + btf__free(new_base1); + btf__free(new_split); + btf__free(new_base); + btf__free(split); + btf__free(base); +} + void test_btf_distill(void) { if (test__start_subtest("distilled_base")) @@ -549,4 +615,6 @@ void test_btf_distill(void) test_distilled_base_multi_err2(); if (test__start_subtest("distilled_base_vmlinux")) test_distilled_base_vmlinux(); + if (test__start_subtest("distilled_endianness")) + test_distilled_endianness(); } -- cgit v1.2.3 From 38960ac8f916d1e60d4ceb4735a93740c4308d9c Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Wed, 28 Aug 2024 17:46:14 +0000 Subject: selftests/bpf: Specify libbpf headers required for %.bpf.o progs Test %.bpf.o objects 
actually depend only on some libbpf headers. Define a list of required headers and use it as TRUNNER_BPF_OBJS dependency. The bpf_*.h list was determined by: $ grep -rh 'include <bpf/bpf_' progs | sort -u Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240828174608.377204-1-ihor.solodrai@pm.me Link: https://lore.kernel.org/bpf/CAEf4BzYQ-j2i_xjs94Nn=8+FVfkWt51mLZyiYKiz9oA4Z=pCeA@mail.gmail.com/ --- tools/testing/selftests/bpf/Makefile | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index c120617b64ad..53cc13b92ee2 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -516,6 +516,12 @@ xdp_features.skel.h-deps := xdp_features.bpf.o LINKED_BPF_OBJS := $(foreach skel,$(LINKED_SKELS),$($(skel)-deps)) LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.c,$(LINKED_BPF_OBJS)) +HEADERS_FOR_BPF_OBJS := $(wildcard $(BPFDIR)/*.bpf.h) \ + $(addprefix $(BPFDIR)/, bpf_core_read.h \ + bpf_endian.h \ + bpf_helpers.h \ + bpf_tracing.h) + # Set up extra TRUNNER_XXX "temporary" variables in the environment (relies on # $eval()) and pass control to DEFINE_TEST_RUNNER_RULES. # Parameters: @@ -566,8 +572,7 @@ $(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.bpf.o: \ $(TRUNNER_BPF_PROGS_DIR)/%.c \ $(TRUNNER_BPF_PROGS_DIR)/*.h \ $$(INCLUDE_DIR)/vmlinux.h \ - $(wildcard $(BPFDIR)/bpf_*.h) \ - $(wildcard $(BPFDIR)/*.bpf.h) \ + $(HEADERS_FOR_BPF_OBJS) \ | $(TRUNNER_OUTPUT) $$(BPFOBJ) $$(call $(TRUNNER_BPF_BUILD_RULE),$$<,$$@, \ $(TRUNNER_BPF_CFLAGS) \ -- cgit v1.2.3 From 2ad6d23f465a4f851e3bcf6d74c315ce7b2c205b Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Wed, 28 Aug 2024 17:46:23 +0000 Subject: selftests/bpf: Do not update vmlinux.h unnecessarily %.bpf.o objects depend on vmlinux.h, which makes them transitively dependent on unnecessary libbpf headers. However, vmlinux.h doesn't actually change that often. When generating vmlinux.h, compare it to the previous version and update it only if there are changes. Example of build time improvement (after first clean build): $ touch ../../../lib/bpf/bpf.h $ time make -j8 Before: real 1m37.592s After: real 0m27.310s Notice that the %.bpf.o gen step is skipped if vmlinux.h hasn't changed. Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/CAEf4BzY1z5cC7BKye8=A8aTVxpsCzD=p1jdTfKC7i0XVuYoHUQ@mail.gmail.com Link: https://lore.kernel.org/bpf/20240828174608.377204-2-ihor.solodrai@pm.me --- tools/testing/selftests/bpf/Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 53cc13b92ee2..7660d19b66c2 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -399,10 +399,14 @@ $(HOST_BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ DESTDIR=$(HOST_SCRATCH_DIR)/ prefix= all install_headers endif +# vmlinux.h is first dumped to a temporary file and then compared to +# the previous version. 
This helps to avoid unnecessary re-builds of +# $(TRUNNER_BPF_OBJS) $(INCLUDE_DIR)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) | $(INCLUDE_DIR) ifeq ($(VMLINUX_H),) $(call msg,GEN,,$@) - $(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $@ + $(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $(INCLUDE_DIR)/.vmlinux.h.tmp + $(Q)cmp -s $(INCLUDE_DIR)/.vmlinux.h.tmp $@ || mv $(INCLUDE_DIR)/.vmlinux.h.tmp $@ else $(call msg,CP,,$@) $(Q)cp "$(VMLINUX_H)" $@ -- cgit v1.2.3 From 43a17fcfcc4294f53ece15425ba362f5e28664c2 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 20 Aug 2024 15:55:36 -0400 Subject: selftest/cgroup: Make test_cpuset_prs.sh deal with pre-isolated CPUs Since isolated CPUs can be reserved at boot time via the "isolcpus" boot command line option, these pre-isolated CPUs may interfere with testing done by test_cpuset_prs.sh. With the previous commit that incorporates those boot time isolated CPUs into "cpuset.cpus.isolated", we can check for those before testing is started to make sure that there will be no interference. If interference is possible and could cause incorrect test failures, this test will be skipped instead. As "cpuset.cpus.isolated" is now available in a non-cgroup_debug kernel, we don't need to check for its existence anymore. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_cpuset_prs.sh | 44 +++++++++++++++++------ 1 file changed, 33 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh index 7295424502b9..03c1bdaed2c3 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh @@ -84,6 +84,20 @@ echo member > test/cpuset.cpus.partition echo "" > test/cpuset.cpus [[ $RESULT -eq 0 ]] && skip_test "Child cgroups are using cpuset!" +# +# If isolated CPUs have been reserved at boot time (as shown in +# cpuset.cpus.isolated), these isolated CPUs should be outside of CPUs 0-7 +# that will be used by this script for testing purpose. If not, some of +# the tests may fail incorrectly. These isolated CPUs will also be removed +# before being compared with the expected results. +# +BOOT_ISOLCPUS=$(cat $CGROUP2/cpuset.cpus.isolated) +if [[ -n "$BOOT_ISOLCPUS" ]] +then + [[ $(echo $BOOT_ISOLCPUS | sed -e "s/[,-].*//") -le 7 ]] && + skip_test "Pre-isolated CPUs ($BOOT_ISOLCPUS) overlap CPUs to be tested" + echo "Pre-isolated CPUs: $BOOT_ISOLCPUS" +fi cleanup() { online_cpus @@ -642,7 +656,8 @@ check_cgroup_states() # Note that isolated CPUs from the sched/domains context include offline # CPUs as well as CPUs in non-isolated 1-CPU partition. Those CPUs may # not be included in the cpuset.cpus.isolated control file which contains -# only CPUs in isolated partitions. +# only CPUs in isolated partitions as well as those that are isolated at +# boot time. 
# # $1 - expected isolated cpu list(s) {,} # - expected sched/domains value @@ -669,18 +684,21 @@ check_isolcpus() fi # - # Check the debug isolated cpumask, if present + # Check cpuset.cpus.isolated cpumask # - [[ -f $ISCPUS ]] && { + if [[ -z "$BOOT_ISOLCPUS" ]] + then + ISOLCPUS=$(cat $ISCPUS) + else + ISOLCPUS=$(cat $ISCPUS | sed -e "s/,*$BOOT_ISOLCPUS//") + fi + [[ "$EXPECT_VAL2" != "$ISOLCPUS" ]] && { + # Take a 50ms pause and try again + pause 0.05 ISOLCPUS=$(cat $ISCPUS) - [[ "$EXPECT_VAL2" != "$ISOLCPUS" ]] && { - # Take a 50ms pause and try again - pause 0.05 - ISOLCPUS=$(cat $ISCPUS) - } - [[ "$EXPECT_VAL2" != "$ISOLCPUS" ]] && return 1 - ISOLCPUS= } + [[ "$EXPECT_VAL2" != "$ISOLCPUS" ]] && return 1 + ISOLCPUS= # # Use the sched domain in debugfs to check isolated CPUs, if available @@ -713,6 +731,9 @@ check_isolcpus() fi done [[ "$ISOLCPUS" = *- ]] && ISOLCPUS=${ISOLCPUS}$LASTISOLCPU + [[ -n "$BOOT_ISOLCPUS" ]] && + ISOLCPUS=$(echo $ISOLCPUS | sed -e "s/,*$BOOT_ISOLCPUS//") + [[ "$EXPECT_VAL" = "$ISOLCPUS" ]] } @@ -730,7 +751,8 @@ test_fail() } # -# Check to see if there are unexpected isolated CPUs left +# Check to see if there are unexpected isolated CPUs left beyond the boot +# time isolated ones. # null_isolcpus_check() { -- cgit v1.2.3 From 3f9319c6914c45a2d406faf3aa8cd0dee8bc4c2f Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Fri, 30 Aug 2024 10:02:29 +0000 Subject: cgroup/cpuset: add selftest for cpuset v1 There is only a hotplug test for cpuset v1, so add a basic read/write test for the cpuset v1 interfaces. Signed-off-by: Chen Ridong Acked-by: Waiman Long Signed-off-by: Tejun Heo --- MAINTAINERS | 1 + .../selftests/cgroup/test_cpuset_v1_base.sh | 77 ++++++++++++++++++++++ 2 files changed, 78 insertions(+) create mode 100755 tools/testing/selftests/cgroup/test_cpuset_v1_base.sh (limited to 'tools') diff --git a/MAINTAINERS b/MAINTAINERS index 3b5ec1cafd95..b59f54e1e30d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5703,6 +5703,7 @@ F: kernel/cgroup/cpuset-v1.c F: kernel/cgroup/cpuset.c F: tools/testing/selftests/cgroup/test_cpuset.c F: tools/testing/selftests/cgroup/test_cpuset_prs.sh +F: tools/testing/selftests/cgroup/test_cpuset_v1_base.sh CONTROL GROUP - MEMORY RESOURCE CONTROLLER (MEMCG) M: Johannes Weiner diff --git a/tools/testing/selftests/cgroup/test_cpuset_v1_base.sh b/tools/testing/selftests/cgroup/test_cpuset_v1_base.sh new file mode 100755 index 000000000000..42a6628fb8bc --- /dev/null +++ b/tools/testing/selftests/cgroup/test_cpuset_v1_base.sh @@ -0,0 +1,77 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Basic test for cpuset v1 interfaces write/read +# + +skip_test() { + echo "$1" + echo "Test SKIPPED" + exit 4 # ksft_skip +} + +write_test() { + dir=$1 + interface=$2 + value=$3 + original=$(cat $dir/$interface) + echo "testing $interface $value" + echo $value > $dir/$interface + new=$(cat $dir/$interface) + [[ $value -ne $(cat $dir/$interface) ]] && { + echo "$interface write $value failed: new:$new" + exit 1 + } +} + +[[ $(id -u) -eq 0 ]] || skip_test "Test must be run as root!" + +# Find cpuset v1 mount point +CPUSET=$(mount -t cgroup | grep cpuset | head -1 | awk '{print $3}') +[[ -n "$CPUSET" ]] || skip_test "cpuset v1 mount point not found!" 
+ +# +# Create a test cpuset, read write test +# +TDIR=test$$ +[[ -d $CPUSET/$TDIR ]] || mkdir $CPUSET/$TDIR + +ITF_MATRIX=( + #interface value expect root_only + 'cpuset.cpus 0-1 0-1 0' + 'cpuset.mem_exclusive 1 1 0' + 'cpuset.mem_exclusive 0 0 0' + 'cpuset.mem_hardwall 1 1 0' + 'cpuset.mem_hardwall 0 0 0' + 'cpuset.memory_migrate 1 1 0' + 'cpuset.memory_migrate 0 0 0' + 'cpuset.memory_spread_page 1 1 0' + 'cpuset.memory_spread_page 0 0 0' + 'cpuset.memory_spread_slab 1 1 0' + 'cpuset.memory_spread_slab 0 0 0' + 'cpuset.mems 0 0 0' + 'cpuset.sched_load_balance 1 1 0' + 'cpuset.sched_load_balance 0 0 0' + 'cpuset.sched_relax_domain_level 2 2 0' + 'cpuset.memory_pressure_enabled 1 1 1' + 'cpuset.memory_pressure_enabled 0 0 1' +) + +run_test() +{ + cnt="${ITF_MATRIX[@]}" + for i in "${ITF_MATRIX[@]}" ; do + args=($i) + root_only=${args[3]} + [[ $root_only -eq 1 ]] && { + write_test "$CPUSET" "${args[0]}" "${args[1]}" "${args[2]}" + continue + } + write_test "$CPUSET/$TDIR" "${args[0]}" "${args[1]}" "${args[2]}" + done +} + +run_test +rmdir $CPUSET/$TDIR +echo "Test PASSED" +exit 0 -- cgit v1.2.3 From b808f629215685c1941b1cd567c7b7ccb3c90278 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 9 Aug 2024 13:25:11 +0500 Subject: selftests: mm: fix build errors on armhf __NR_mmap isn't found on armhf. mmap() is a commonly available system call whose wrapper is present on all architectures, so it should be used directly. This solves the problem for armhf and doesn't create problems for other architectures. Remove the sys_mmap() functions, as they don't do anything other than call mmap(). There is no need to set errno = 0 manually as glibc always resets it. For reference, the errors are as follows: CC seal_elf seal_elf.c: In function 'sys_mmap': seal_elf.c:39:33: error: '__NR_mmap' undeclared (first use in this function) 39 | sret = (void *) syscall(__NR_mmap, addr, len, prot, | ^~~~~~~~~ mseal_test.c: In function 'sys_mmap': mseal_test.c:90:33: error: '__NR_mmap' undeclared (first use in this function) 90 | sret = (void *) syscall(__NR_mmap, addr, len, prot, | ^~~~~~~~~ Link: https://lkml.kernel.org/r/20240809082511.497266-1-usama.anjum@collabora.com Fixes: 4926c7a52de7 ("selftest mm/mseal memory sealing") Signed-off-by: Muhammad Usama Anjum Cc: Jeff Xu Cc: Kees Cook Cc: Liam R. 
Howlett Cc: Shuah Khan Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mseal_test.c | 37 ++++++++++++--------------------- tools/testing/selftests/mm/seal_elf.c | 13 +----------- 2 files changed, 14 insertions(+), 36 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/mseal_test.c b/tools/testing/selftests/mm/mseal_test.c index a818f010de47..bfcea5cf9a48 100644 --- a/tools/testing/selftests/mm/mseal_test.c +++ b/tools/testing/selftests/mm/mseal_test.c @@ -81,17 +81,6 @@ static int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, return sret; } -static void *sys_mmap(void *addr, unsigned long len, unsigned long prot, - unsigned long flags, unsigned long fd, unsigned long offset) -{ - void *sret; - - errno = 0; - sret = (void *) syscall(__NR_mmap, addr, len, prot, - flags, fd, offset); - return sret; -} - static int sys_munmap(void *ptr, size_t size) { int sret; @@ -172,7 +161,7 @@ static void setup_single_address(int size, void **ptrOut) { void *ptr; - ptr = sys_mmap(NULL, size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + ptr = mmap(NULL, size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); *ptrOut = ptr; } @@ -181,7 +170,7 @@ static void setup_single_address_rw(int size, void **ptrOut) void *ptr; unsigned long mapflags = MAP_ANONYMOUS | MAP_PRIVATE; - ptr = sys_mmap(NULL, size, PROT_READ | PROT_WRITE, mapflags, -1, 0); + ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, mapflags, -1, 0); *ptrOut = ptr; } @@ -205,7 +194,7 @@ bool seal_support(void) void *ptr; unsigned long page_size = getpagesize(); - ptr = sys_mmap(NULL, page_size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + ptr = mmap(NULL, page_size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if (ptr == (void *) -1) return false; @@ -481,8 +470,8 @@ static void test_seal_zero_address(void) int prot; /* use mmap to change protection. */ - ptr = sys_mmap(0, size, PROT_NONE, - MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + ptr = mmap(0, size, PROT_NONE, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); FAIL_TEST_IF_FALSE(ptr == 0); size = get_vma_size(ptr, &prot); @@ -1209,8 +1198,8 @@ static void test_seal_mmap_overwrite_prot(bool seal) } /* use mmap to change protection. */ - ret2 = sys_mmap(ptr, size, PROT_NONE, - MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + ret2 = mmap(ptr, size, PROT_NONE, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); if (seal) { FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); FAIL_TEST_IF_FALSE(errno == EPERM); @@ -1240,8 +1229,8 @@ static void test_seal_mmap_expand(bool seal) } /* use mmap to expand. */ - ret2 = sys_mmap(ptr, size, PROT_READ, - MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + ret2 = mmap(ptr, size, PROT_READ, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); if (seal) { FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); FAIL_TEST_IF_FALSE(errno == EPERM); @@ -1268,8 +1257,8 @@ static void test_seal_mmap_shrink(bool seal) } /* use mmap to shrink. 
*/ - ret2 = sys_mmap(ptr, 8 * page_size, PROT_READ, - MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + ret2 = mmap(ptr, 8 * page_size, PROT_READ, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); if (seal) { FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); FAIL_TEST_IF_FALSE(errno == EPERM); @@ -1650,7 +1639,7 @@ static void test_seal_discard_ro_anon_on_filebacked(bool seal) ret = fallocate(fd, 0, 0, size); FAIL_TEST_IF_FALSE(!ret); - ptr = sys_mmap(NULL, size, PROT_READ, mapflags, fd, 0); + ptr = mmap(NULL, size, PROT_READ, mapflags, fd, 0); FAIL_TEST_IF_FALSE(ptr != MAP_FAILED); if (seal) { @@ -1680,7 +1669,7 @@ static void test_seal_discard_ro_anon_on_shared(bool seal) int ret; unsigned long mapflags = MAP_ANONYMOUS | MAP_SHARED; - ptr = sys_mmap(NULL, size, PROT_READ, mapflags, -1, 0); + ptr = mmap(NULL, size, PROT_READ, mapflags, -1, 0); FAIL_TEST_IF_FALSE(ptr != (void *)-1); if (seal) { diff --git a/tools/testing/selftests/mm/seal_elf.c b/tools/testing/selftests/mm/seal_elf.c index 7aa1366063e4..d9f8ba8d5050 100644 --- a/tools/testing/selftests/mm/seal_elf.c +++ b/tools/testing/selftests/mm/seal_elf.c @@ -30,17 +30,6 @@ static int sys_mseal(void *start, size_t len) return sret; } -static void *sys_mmap(void *addr, unsigned long len, unsigned long prot, - unsigned long flags, unsigned long fd, unsigned long offset) -{ - void *sret; - - errno = 0; - sret = (void *) syscall(__NR_mmap, addr, len, prot, - flags, fd, offset); - return sret; -} - static inline int sys_mprotect(void *ptr, size_t size, unsigned long prot) { int sret; @@ -56,7 +45,7 @@ static bool seal_support(void) void *ptr; unsigned long page_size = getpagesize(); - ptr = sys_mmap(NULL, page_size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + ptr = mmap(NULL, page_size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if (ptr == (void *) -1) return false; -- cgit v1.2.3 From d075bccec082be326e08d8b4f7cd491029438f0b Mon Sep 17 00:00:00 2001 From: David Finkel Date: Mon, 29 Jul 2024 10:37:43 -0400 Subject: mm, memcg: cg2 memory{.swap,}.peak write tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend two existing tests to cover extracting memory usage through the newly mutable memory.peak and memory.swap.peak handlers. In particular, make sure to exercise adding and removing watchers with overlapping lifetimes so the less-trivial logic gets tested. The new/updated tests attempt to detect a lack of the write handler by fstat'ing the memory.peak and memory.swap.peak files and skip the tests if that's the case. Additionally, skip if the file doesn't exist at all. 
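For illustration (not part of the patch series), a minimal sketch of the fd-local reset protocol the extended tests exercise; the path handling and lack of error checking on write() are simplifying assumptions:

/*
 * Hypothetical sketch: writing any non-empty string to an open
 * memory.peak fd switches that fd into local-peak tracking, and later
 * reads through the same fd report the high watermark seen since the
 * reset, independent of other readers of the file.
 */
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

static long local_peak_since_reset(const char *peak_path)
{
	char buf[64];
	ssize_t n;
	int fd = open(peak_path, O_RDWR | O_APPEND | O_CLOEXEC);

	if (fd < 0)
		return -1;
	write(fd, "reset\n", 6);	/* arm fd-local tracking */

	/* ... run the workload being measured here ... */

	n = pread(fd, buf, sizeof(buf) - 1, 0);
	close(fd);
	if (n <= 0)
		return -1;
	buf[n] = '\0';
	return atol(buf);
}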
[davidf@vimeo.com: update tests] Link: https://lkml.kernel.org/r/20240730231304.761942-3-davidf@vimeo.com Link: https://lkml.kernel.org/r/20240729143743.34236-3-davidf@vimeo.com Signed-off-by: David Finkel Acked-by: Tejun Heo Reviewed-by: Roman Gushchin Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Shakeel Butt Cc: Shuah Khan Cc: Waiman Long Cc: Zefan Li Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/cgroup_util.c | 22 ++ tools/testing/selftests/cgroup/cgroup_util.h | 2 + tools/testing/selftests/cgroup/test_memcontrol.c | 264 ++++++++++++++++++++++- 3 files changed, 280 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/cgroup_util.c index 432db923bced..1e2d46636a0c 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.c +++ b/tools/testing/selftests/cgroup/cgroup_util.c @@ -141,6 +141,16 @@ long cg_read_long(const char *cgroup, const char *control) return atol(buf); } +long cg_read_long_fd(int fd) +{ + char buf[128]; + + if (pread(fd, buf, sizeof(buf), 0) <= 0) + return -1; + + return atol(buf); +} + long cg_read_key_long(const char *cgroup, const char *control, const char *key) { char buf[PAGE_SIZE]; @@ -183,6 +193,18 @@ int cg_write(const char *cgroup, const char *control, char *buf) return ret == len ? 0 : ret; } +/* + * Returns fd on success, or -1 on failure. + * (fd should be closed with close() as usual) + */ +int cg_open(const char *cgroup, const char *control, int flags) +{ + char path[PATH_MAX]; + + snprintf(path, sizeof(path), "%s/%s", cgroup, control); + return open(path, flags); +} + int cg_write_numeric(const char *cgroup, const char *control, long value) { char buf[64]; diff --git a/tools/testing/selftests/cgroup/cgroup_util.h b/tools/testing/selftests/cgroup/cgroup_util.h index e8d04ac9e3d2..19b131ee7707 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.h +++ b/tools/testing/selftests/cgroup/cgroup_util.h @@ -34,9 +34,11 @@ extern int cg_read_strcmp(const char *cgroup, const char *control, extern int cg_read_strstr(const char *cgroup, const char *control, const char *needle); extern long cg_read_long(const char *cgroup, const char *control); +extern long cg_read_long_fd(int fd); long cg_read_key_long(const char *cgroup, const char *control, const char *key); extern long cg_read_lc(const char *cgroup, const char *control); extern int cg_write(const char *cgroup, const char *control, char *buf); +extern int cg_open(const char *cgroup, const char *control, int flags); int cg_write_numeric(const char *cgroup, const char *control, long value); extern int cg_run(const char *cgroup, int (*fn)(const char *cgroup, void *arg), diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index 41ae8047b889..16f5d74ae762 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -161,13 +161,16 @@ cleanup: /* * This test create a memory cgroup, allocates * some anonymous memory and some pagecache - * and check memory.current and some memory.stat values. + * and checks memory.current, memory.peak, and some memory.stat values. 
*/ -static int test_memcg_current(const char *root) +static int test_memcg_current_peak(const char *root) { int ret = KSFT_FAIL; - long current; + long current, peak, peak_reset; char *memcg; + bool fd2_closed = false, fd3_closed = false, fd4_closed = false; + int peak_fd = -1, peak_fd2 = -1, peak_fd3 = -1, peak_fd4 = -1; + struct stat ss; memcg = cg_name(root, "memcg_test"); if (!memcg) @@ -180,15 +183,124 @@ static int test_memcg_current(const char *root) if (current != 0) goto cleanup; + peak = cg_read_long(memcg, "memory.peak"); + if (peak != 0) + goto cleanup; + if (cg_run(memcg, alloc_anon_50M_check, NULL)) goto cleanup; + peak = cg_read_long(memcg, "memory.peak"); + if (peak < MB(50)) + goto cleanup; + + /* + * We'll open a few FDs for the same memory.peak file to exercise the free-path + * We need at least three to be closed in a different order than writes occurred to test + * the linked-list handling. + */ + peak_fd = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC); + + if (peak_fd == -1) { + if (errno == ENOENT) + ret = KSFT_SKIP; + goto cleanup; + } + + /* + * Before we try to use memory.peak's fd, try to figure out whether + * this kernel supports writing to that file in the first place. (by + * checking the writable bit on the file's st_mode) + */ + if (fstat(peak_fd, &ss)) + goto cleanup; + + if ((ss.st_mode & S_IWUSR) == 0) { + ret = KSFT_SKIP; + goto cleanup; + } + + peak_fd2 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC); + + if (peak_fd2 == -1) + goto cleanup; + + peak_fd3 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC); + + if (peak_fd3 == -1) + goto cleanup; + + /* any non-empty string resets, but make it clear */ + static const char reset_string[] = "reset\n"; + + peak_reset = write(peak_fd, reset_string, sizeof(reset_string)); + if (peak_reset != sizeof(reset_string)) + goto cleanup; + + peak_reset = write(peak_fd2, reset_string, sizeof(reset_string)); + if (peak_reset != sizeof(reset_string)) + goto cleanup; + + peak_reset = write(peak_fd3, reset_string, sizeof(reset_string)); + if (peak_reset != sizeof(reset_string)) + goto cleanup; + + /* Make sure a completely independent read isn't affected by our FD-local reset above*/ + peak = cg_read_long(memcg, "memory.peak"); + if (peak < MB(50)) + goto cleanup; + + fd2_closed = true; + if (close(peak_fd2)) + goto cleanup; + + peak_fd4 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC); + + if (peak_fd4 == -1) + goto cleanup; + + peak_reset = write(peak_fd4, reset_string, sizeof(reset_string)); + if (peak_reset != sizeof(reset_string)) + goto cleanup; + + peak = cg_read_long_fd(peak_fd); + if (peak > MB(30) || peak < 0) + goto cleanup; + if (cg_run(memcg, alloc_pagecache_50M_check, NULL)) goto cleanup; + peak = cg_read_long(memcg, "memory.peak"); + if (peak < MB(50)) + goto cleanup; + + /* Make sure everything is back to normal */ + peak = cg_read_long_fd(peak_fd); + if (peak < MB(50)) + goto cleanup; + + peak = cg_read_long_fd(peak_fd4); + if (peak < MB(50)) + goto cleanup; + + fd3_closed = true; + if (close(peak_fd3)) + goto cleanup; + + fd4_closed = true; + if (close(peak_fd4)) + goto cleanup; + ret = KSFT_PASS; cleanup: + close(peak_fd); + if (!fd2_closed) + close(peak_fd2); + if (!fd3_closed) + close(peak_fd3); + if (!fd4_closed) + close(peak_fd4); cg_destroy(memcg); free(memcg); @@ -817,13 +929,19 @@ cleanup: /* * This test checks that memory.swap.max limits the amount of - * anonymous memory which can be swapped out. 
+ * anonymous memory which can be swapped out. Additionally, it verifies that + * memory.swap.peak reflects the high watermark and can be reset. */ -static int test_memcg_swap_max(const char *root) +static int test_memcg_swap_max_peak(const char *root) { int ret = KSFT_FAIL; char *memcg; - long max; + long max, peak; + struct stat ss; + int swap_peak_fd = -1, mem_peak_fd = -1; + + /* any non-empty string resets */ + static const char reset_string[] = "foobarbaz"; if (!is_swap_enabled()) return KSFT_SKIP; @@ -840,6 +958,61 @@ static int test_memcg_swap_max(const char *root) goto cleanup; } + swap_peak_fd = cg_open(memcg, "memory.swap.peak", + O_RDWR | O_APPEND | O_CLOEXEC); + + if (swap_peak_fd == -1) { + if (errno == ENOENT) + ret = KSFT_SKIP; + goto cleanup; + } + + /* + * Before we try to use memory.swap.peak's fd, try to figure out + * whether this kernel supports writing to that file in the first + * place. (by checking the writable bit on the file's st_mode) + */ + if (fstat(swap_peak_fd, &ss)) + goto cleanup; + + if ((ss.st_mode & S_IWUSR) == 0) { + ret = KSFT_SKIP; + goto cleanup; + } + + mem_peak_fd = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC); + + if (mem_peak_fd == -1) + goto cleanup; + + if (cg_read_long(memcg, "memory.swap.peak")) + goto cleanup; + + if (cg_read_long_fd(swap_peak_fd)) + goto cleanup; + + /* switch the swap and mem fds into local-peak tracking mode*/ + int peak_reset = write(swap_peak_fd, reset_string, sizeof(reset_string)); + + if (peak_reset != sizeof(reset_string)) + goto cleanup; + + if (cg_read_long_fd(swap_peak_fd)) + goto cleanup; + + if (cg_read_long(memcg, "memory.peak")) + goto cleanup; + + if (cg_read_long_fd(mem_peak_fd)) + goto cleanup; + + peak_reset = write(mem_peak_fd, reset_string, sizeof(reset_string)); + if (peak_reset != sizeof(reset_string)) + goto cleanup; + + if (cg_read_long_fd(mem_peak_fd)) + goto cleanup; + if (cg_read_strcmp(memcg, "memory.max", "max\n")) goto cleanup; @@ -862,6 +1035,61 @@ static int test_memcg_swap_max(const char *root) if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1) goto cleanup; + peak = cg_read_long(memcg, "memory.peak"); + if (peak < MB(29)) + goto cleanup; + + peak = cg_read_long(memcg, "memory.swap.peak"); + if (peak < MB(29)) + goto cleanup; + + peak = cg_read_long_fd(mem_peak_fd); + if (peak < MB(29)) + goto cleanup; + + peak = cg_read_long_fd(swap_peak_fd); + if (peak < MB(29)) + goto cleanup; + + /* + * open, reset and close the peak swap on another FD to make sure + * multiple extant fds don't corrupt the linked-list + */ + peak_reset = cg_write(memcg, "memory.swap.peak", (char *)reset_string); + if (peak_reset) + goto cleanup; + + peak_reset = cg_write(memcg, "memory.peak", (char *)reset_string); + if (peak_reset) + goto cleanup; + + /* actually reset on the fds */ + peak_reset = write(swap_peak_fd, reset_string, sizeof(reset_string)); + if (peak_reset != sizeof(reset_string)) + goto cleanup; + + peak_reset = write(mem_peak_fd, reset_string, sizeof(reset_string)); + if (peak_reset != sizeof(reset_string)) + goto cleanup; + + peak = cg_read_long_fd(swap_peak_fd); + if (peak > MB(10)) + goto cleanup; + + /* + * The cgroup is now empty, but there may be a page or two associated + * with the open FD accounted to it. 
+ */ + peak = cg_read_long_fd(mem_peak_fd); + if (peak > MB(1)) + goto cleanup; + + if (cg_read_long(memcg, "memory.peak") < MB(29)) + goto cleanup; + + if (cg_read_long(memcg, "memory.swap.peak") < MB(29)) + goto cleanup; + if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30))) goto cleanup; @@ -869,9 +1097,29 @@ static int test_memcg_swap_max(const char *root) if (max <= 0) goto cleanup; + peak = cg_read_long(memcg, "memory.peak"); + if (peak < MB(29)) + goto cleanup; + + peak = cg_read_long(memcg, "memory.swap.peak"); + if (peak < MB(29)) + goto cleanup; + + peak = cg_read_long_fd(mem_peak_fd); + if (peak < MB(29)) + goto cleanup; + + peak = cg_read_long_fd(swap_peak_fd); + if (peak < MB(19)) + goto cleanup; + ret = KSFT_PASS; cleanup: + if (mem_peak_fd != -1 && close(mem_peak_fd)) + ret = KSFT_FAIL; + if (swap_peak_fd != -1 && close(swap_peak_fd)) + ret = KSFT_FAIL; cg_destroy(memcg); free(memcg); @@ -1295,7 +1543,7 @@ struct memcg_test { const char *name; } tests[] = { T(test_memcg_subtree_control), - T(test_memcg_current), + T(test_memcg_current_peak), T(test_memcg_min), T(test_memcg_low), T(test_memcg_high), @@ -1303,7 +1551,7 @@ struct memcg_test { T(test_memcg_max), T(test_memcg_reclaim), T(test_memcg_oom_events), - T(test_memcg_swap_max), + T(test_memcg_swap_max_peak), T(test_memcg_sock), T(test_memcg_oom_group_leaf_events), T(test_memcg_oom_group_parent_events), -- cgit v1.2.3 From 74579d8dab476b66cd28715e73832ab777f20984 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 29 Jul 2024 12:50:40 +0100 Subject: tools: separate out shared radix-tree components The core components contained within the radix-tree tests which provide shims for kernel headers and access to the maple tree are useful for testing other things, so separate them out and make the radix tree tests dependent on the shared components. This lays the groundwork for us to add VMA tests of the newly introduced vma.c file. Link: https://lkml.kernel.org/r/1ee720c265808168e0d75608e687607d77c36719.1722251717.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Reviewed-by: Liam R. Howlett Cc: Alexander Viro Cc: Brendan Higgins Cc: Christian Brauner Cc: David Gow Cc: Eric W. 
Biederman Cc: Jan Kara Cc: Kees Cook Cc: Matthew Wilcox (Oracle) Cc: Rae Moar Cc: SeongJae Park Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Pengfei Xu Signed-off-by: Andrew Morton --- tools/testing/radix-tree/.gitignore | 1 + tools/testing/radix-tree/Makefile | 72 +----- tools/testing/radix-tree/generated/autoconf.h | 2 - tools/testing/radix-tree/linux.c | 271 --------------------- tools/testing/radix-tree/linux/bug.h | 2 - tools/testing/radix-tree/linux/cpu.h | 1 - tools/testing/radix-tree/linux/idr.h | 1 - tools/testing/radix-tree/linux/init.h | 2 - tools/testing/radix-tree/linux/kconfig.h | 1 - tools/testing/radix-tree/linux/kernel.h | 29 --- tools/testing/radix-tree/linux/kmemleak.h | 1 - tools/testing/radix-tree/linux/local_lock.h | 8 - tools/testing/radix-tree/linux/lockdep.h | 16 -- tools/testing/radix-tree/linux/maple_tree.h | 7 - tools/testing/radix-tree/linux/percpu.h | 11 - tools/testing/radix-tree/linux/preempt.h | 15 -- tools/testing/radix-tree/linux/radix-tree.h | 26 -- tools/testing/radix-tree/linux/rcupdate.h | 12 - tools/testing/radix-tree/linux/xarray.h | 2 - tools/testing/radix-tree/trace/events/maple_tree.h | 5 - tools/testing/radix-tree/xarray.c | 10 +- tools/testing/shared/autoconf.h | 2 + tools/testing/shared/linux.c | 271 +++++++++++++++++++++ tools/testing/shared/linux/bug.h | 2 + tools/testing/shared/linux/cpu.h | 1 + tools/testing/shared/linux/idr.h | 1 + tools/testing/shared/linux/init.h | 2 + tools/testing/shared/linux/kconfig.h | 1 + tools/testing/shared/linux/kernel.h | 29 +++ tools/testing/shared/linux/kmemleak.h | 1 + tools/testing/shared/linux/local_lock.h | 8 + tools/testing/shared/linux/lockdep.h | 16 ++ tools/testing/shared/linux/maple_tree.h | 7 + tools/testing/shared/linux/percpu.h | 11 + tools/testing/shared/linux/preempt.h | 15 ++ tools/testing/shared/linux/radix-tree.h | 26 ++ tools/testing/shared/linux/rcupdate.h | 12 + tools/testing/shared/linux/xarray.h | 2 + tools/testing/shared/maple-shared.h | 9 + tools/testing/shared/maple-shim.c | 7 + tools/testing/shared/shared.h | 33 +++ tools/testing/shared/shared.mk | 72 ++++++ tools/testing/shared/trace/events/maple_tree.h | 5 + tools/testing/shared/xarray-shared.c | 5 + tools/testing/shared/xarray-shared.h | 4 + 45 files changed, 556 insertions(+), 481 deletions(-) delete mode 100644 tools/testing/radix-tree/generated/autoconf.h delete mode 100644 tools/testing/radix-tree/linux.c delete mode 100644 tools/testing/radix-tree/linux/bug.h delete mode 100644 tools/testing/radix-tree/linux/cpu.h delete mode 100644 tools/testing/radix-tree/linux/idr.h delete mode 100644 tools/testing/radix-tree/linux/init.h delete mode 100644 tools/testing/radix-tree/linux/kconfig.h delete mode 100644 tools/testing/radix-tree/linux/kernel.h delete mode 100644 tools/testing/radix-tree/linux/kmemleak.h delete mode 100644 tools/testing/radix-tree/linux/local_lock.h delete mode 100644 tools/testing/radix-tree/linux/lockdep.h delete mode 100644 tools/testing/radix-tree/linux/maple_tree.h delete mode 100644 tools/testing/radix-tree/linux/percpu.h delete mode 100644 tools/testing/radix-tree/linux/preempt.h delete mode 100644 tools/testing/radix-tree/linux/radix-tree.h delete mode 100644 tools/testing/radix-tree/linux/rcupdate.h delete mode 100644 tools/testing/radix-tree/linux/xarray.h delete mode 100644 tools/testing/radix-tree/trace/events/maple_tree.h create mode 100644 tools/testing/shared/autoconf.h create mode 100644 tools/testing/shared/linux.c create mode 100644 tools/testing/shared/linux/bug.h create mode 100644 
tools/testing/shared/linux/cpu.h create mode 100644 tools/testing/shared/linux/idr.h create mode 100644 tools/testing/shared/linux/init.h create mode 100644 tools/testing/shared/linux/kconfig.h create mode 100644 tools/testing/shared/linux/kernel.h create mode 100644 tools/testing/shared/linux/kmemleak.h create mode 100644 tools/testing/shared/linux/local_lock.h create mode 100644 tools/testing/shared/linux/lockdep.h create mode 100644 tools/testing/shared/linux/maple_tree.h create mode 100644 tools/testing/shared/linux/percpu.h create mode 100644 tools/testing/shared/linux/preempt.h create mode 100644 tools/testing/shared/linux/radix-tree.h create mode 100644 tools/testing/shared/linux/rcupdate.h create mode 100644 tools/testing/shared/linux/xarray.h create mode 100644 tools/testing/shared/maple-shared.h create mode 100644 tools/testing/shared/maple-shim.c create mode 100644 tools/testing/shared/shared.h create mode 100644 tools/testing/shared/shared.mk create mode 100644 tools/testing/shared/trace/events/maple_tree.h create mode 100644 tools/testing/shared/xarray-shared.c create mode 100644 tools/testing/shared/xarray-shared.h (limited to 'tools') diff --git a/tools/testing/radix-tree/.gitignore b/tools/testing/radix-tree/.gitignore index 49bccb90c35b..ce167a761981 100644 --- a/tools/testing/radix-tree/.gitignore +++ b/tools/testing/radix-tree/.gitignore @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only +generated/autoconf.h generated/bit-length.h generated/map-shift.h idr.c diff --git a/tools/testing/radix-tree/Makefile b/tools/testing/radix-tree/Makefile index d1acd7d58850..8b3591a51e1f 100644 --- a/tools/testing/radix-tree/Makefile +++ b/tools/testing/radix-tree/Makefile @@ -1,77 +1,29 @@ # SPDX-License-Identifier: GPL-2.0 -CFLAGS += -I. 
-I../../include -I../../../lib -g -Og -Wall \ - -D_LGPL_SOURCE -fsanitize=address -fsanitize=undefined -LDFLAGS += -fsanitize=address -fsanitize=undefined -LDLIBS+= -lpthread -lurcu -TARGETS = main idr-test multiorder xarray maple -LIBS := slab.o find_bit.o bitmap.o hweight.o vsprintf.o -CORE_OFILES := xarray.o radix-tree.o idr.o linux.o test.o maple.o $(LIBS) -OFILES = main.o $(CORE_OFILES) regression1.o regression2.o regression3.o \ - regression4.o tag_check.o multiorder.o idr-test.o iteration_check.o \ - iteration_check_2.o benchmark.o - -ifndef SHIFT - SHIFT=3 -endif +.PHONY: clean -ifeq ($(BUILD), 32) - CFLAGS += -m32 - LDFLAGS += -m32 -LONG_BIT := 32 -endif - -ifndef LONG_BIT -LONG_BIT := $(shell getconf LONG_BIT) -endif +TARGETS = main idr-test multiorder xarray maple +CORE_OFILES = $(SHARED_OFILES) xarray.o maple.o test.o +OFILES = main.o $(CORE_OFILES) regression1.o regression2.o \ + regression3.o regression4.o tag_check.o multiorder.o idr-test.o \ + iteration_check.o iteration_check_2.o benchmark.o targets: generated/map-shift.h generated/bit-length.h $(TARGETS) +include ../shared/shared.mk + main: $(OFILES) idr-test.o: ../../../lib/test_ida.c idr-test: idr-test.o $(CORE_OFILES) -xarray: $(CORE_OFILES) +xarray: $(CORE_OFILES) xarray.o -maple: $(CORE_OFILES) +maple: $(CORE_OFILES) maple.o multiorder: multiorder.o $(CORE_OFILES) clean: - $(RM) $(TARGETS) *.o radix-tree.c idr.c generated/map-shift.h generated/bit-length.h - -vpath %.c ../../lib - -$(OFILES): Makefile *.h */*.h generated/map-shift.h generated/bit-length.h \ - ../../include/linux/*.h \ - ../../include/asm/*.h \ - ../../../include/linux/xarray.h \ - ../../../include/linux/maple_tree.h \ - ../../../include/linux/radix-tree.h \ - ../../../lib/radix-tree.h \ - ../../../include/linux/idr.h - -radix-tree.c: ../../../lib/radix-tree.c - sed -e 's/^static //' -e 's/__always_inline //' -e 's/inline //' < $< > $@ - -idr.c: ../../../lib/idr.c - sed -e 's/^static //' -e 's/__always_inline //' -e 's/inline //' < $< > $@ - -xarray.o: ../../../lib/xarray.c ../../../lib/test_xarray.c - -maple.o: ../../../lib/maple_tree.c ../../../lib/test_maple_tree.c - -generated/map-shift.h: - @if ! grep -qws $(SHIFT) generated/map-shift.h; then \ - echo "#define XA_CHUNK_SHIFT $(SHIFT)" > \ - generated/map-shift.h; \ - fi - -generated/bit-length.h: FORCE - @if ! 
grep -qws CONFIG_$(LONG_BIT)BIT generated/bit-length.h; then \ - echo "Generating $@"; \ - echo "#define CONFIG_$(LONG_BIT)BIT 1" > $@; \ - fi + $(RM) $(TARGETS) *.o radix-tree.c idr.c generated/* -FORCE: ; +$(OFILES): $(SHARED_DEPS) *.h diff --git a/tools/testing/radix-tree/generated/autoconf.h b/tools/testing/radix-tree/generated/autoconf.h deleted file mode 100644 index 92dc474c349b..000000000000 --- a/tools/testing/radix-tree/generated/autoconf.h +++ /dev/null @@ -1,2 +0,0 @@ -#include "bit-length.h" -#define CONFIG_XARRAY_MULTI 1 diff --git a/tools/testing/radix-tree/linux.c b/tools/testing/radix-tree/linux.c deleted file mode 100644 index 4eb442206d01..000000000000 --- a/tools/testing/radix-tree/linux.c +++ /dev/null @@ -1,271 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -int nr_allocated; -int preempt_count; -int test_verbose; - -struct kmem_cache { - pthread_mutex_t lock; - unsigned int size; - unsigned int align; - int nr_objs; - void *objs; - void (*ctor)(void *); - unsigned int non_kernel; - unsigned long nr_allocated; - unsigned long nr_tallocated; -}; - -void kmem_cache_set_non_kernel(struct kmem_cache *cachep, unsigned int val) -{ - cachep->non_kernel = val; -} - -unsigned long kmem_cache_get_alloc(struct kmem_cache *cachep) -{ - return cachep->size * cachep->nr_allocated; -} - -unsigned long kmem_cache_nr_allocated(struct kmem_cache *cachep) -{ - return cachep->nr_allocated; -} - -unsigned long kmem_cache_nr_tallocated(struct kmem_cache *cachep) -{ - return cachep->nr_tallocated; -} - -void kmem_cache_zero_nr_tallocated(struct kmem_cache *cachep) -{ - cachep->nr_tallocated = 0; -} - -void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, - int gfp) -{ - void *p; - - if (!(gfp & __GFP_DIRECT_RECLAIM)) { - if (!cachep->non_kernel) - return NULL; - - cachep->non_kernel--; - } - - pthread_mutex_lock(&cachep->lock); - if (cachep->nr_objs) { - struct radix_tree_node *node = cachep->objs; - cachep->nr_objs--; - cachep->objs = node->parent; - pthread_mutex_unlock(&cachep->lock); - node->parent = NULL; - p = node; - } else { - pthread_mutex_unlock(&cachep->lock); - if (cachep->align) - posix_memalign(&p, cachep->align, cachep->size); - else - p = malloc(cachep->size); - if (cachep->ctor) - cachep->ctor(p); - else if (gfp & __GFP_ZERO) - memset(p, 0, cachep->size); - } - - uatomic_inc(&cachep->nr_allocated); - uatomic_inc(&nr_allocated); - uatomic_inc(&cachep->nr_tallocated); - if (kmalloc_verbose) - printf("Allocating %p from slab\n", p); - return p; -} - -void __kmem_cache_free_locked(struct kmem_cache *cachep, void *objp) -{ - assert(objp); - if (cachep->nr_objs > 10 || cachep->align) { - memset(objp, POISON_FREE, cachep->size); - free(objp); - } else { - struct radix_tree_node *node = objp; - cachep->nr_objs++; - node->parent = cachep->objs; - cachep->objs = node; - } -} - -void kmem_cache_free_locked(struct kmem_cache *cachep, void *objp) -{ - uatomic_dec(&nr_allocated); - uatomic_dec(&cachep->nr_allocated); - if (kmalloc_verbose) - printf("Freeing %p to slab\n", objp); - __kmem_cache_free_locked(cachep, objp); -} - -void kmem_cache_free(struct kmem_cache *cachep, void *objp) -{ - pthread_mutex_lock(&cachep->lock); - kmem_cache_free_locked(cachep, objp); - pthread_mutex_unlock(&cachep->lock); -} - -void kmem_cache_free_bulk(struct kmem_cache *cachep, size_t size, void **list) -{ - if (kmalloc_verbose) - pr_debug("Bulk free %p[0-%lu]\n", list, size 
- 1); - - pthread_mutex_lock(&cachep->lock); - for (int i = 0; i < size; i++) - kmem_cache_free_locked(cachep, list[i]); - pthread_mutex_unlock(&cachep->lock); -} - -void kmem_cache_shrink(struct kmem_cache *cachep) -{ -} - -int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, - void **p) -{ - size_t i; - - if (kmalloc_verbose) - pr_debug("Bulk alloc %lu\n", size); - - pthread_mutex_lock(&cachep->lock); - if (cachep->nr_objs >= size) { - struct radix_tree_node *node; - - for (i = 0; i < size; i++) { - if (!(gfp & __GFP_DIRECT_RECLAIM)) { - if (!cachep->non_kernel) - break; - cachep->non_kernel--; - } - - node = cachep->objs; - cachep->nr_objs--; - cachep->objs = node->parent; - p[i] = node; - node->parent = NULL; - } - pthread_mutex_unlock(&cachep->lock); - } else { - pthread_mutex_unlock(&cachep->lock); - for (i = 0; i < size; i++) { - if (!(gfp & __GFP_DIRECT_RECLAIM)) { - if (!cachep->non_kernel) - break; - cachep->non_kernel--; - } - - if (cachep->align) { - posix_memalign(&p[i], cachep->align, - cachep->size); - } else { - p[i] = malloc(cachep->size); - if (!p[i]) - break; - } - if (cachep->ctor) - cachep->ctor(p[i]); - else if (gfp & __GFP_ZERO) - memset(p[i], 0, cachep->size); - } - } - - if (i < size) { - size = i; - pthread_mutex_lock(&cachep->lock); - for (i = 0; i < size; i++) - __kmem_cache_free_locked(cachep, p[i]); - pthread_mutex_unlock(&cachep->lock); - return 0; - } - - for (i = 0; i < size; i++) { - uatomic_inc(&nr_allocated); - uatomic_inc(&cachep->nr_allocated); - uatomic_inc(&cachep->nr_tallocated); - if (kmalloc_verbose) - printf("Allocating %p from slab\n", p[i]); - } - - return size; -} - -struct kmem_cache * -kmem_cache_create(const char *name, unsigned int size, unsigned int align, - unsigned int flags, void (*ctor)(void *)) -{ - struct kmem_cache *ret = malloc(sizeof(*ret)); - - pthread_mutex_init(&ret->lock, NULL); - ret->size = size; - ret->align = align; - ret->nr_objs = 0; - ret->nr_allocated = 0; - ret->nr_tallocated = 0; - ret->objs = NULL; - ret->ctor = ctor; - ret->non_kernel = 0; - return ret; -} - -/* - * Test the test infrastructure for kem_cache_alloc/free and bulk counterparts. 
- */ -void test_kmem_cache_bulk(void) -{ - int i; - void *list[12]; - static struct kmem_cache *test_cache, *test_cache2; - - /* - * Testing the bulk allocators without aligned kmem_cache to force the - * bulk alloc/free to reuse - */ - test_cache = kmem_cache_create("test_cache", 256, 0, SLAB_PANIC, NULL); - - for (i = 0; i < 5; i++) - list[i] = kmem_cache_alloc(test_cache, __GFP_DIRECT_RECLAIM); - - for (i = 0; i < 5; i++) - kmem_cache_free(test_cache, list[i]); - assert(test_cache->nr_objs == 5); - - kmem_cache_alloc_bulk(test_cache, __GFP_DIRECT_RECLAIM, 5, list); - kmem_cache_free_bulk(test_cache, 5, list); - - for (i = 0; i < 12 ; i++) - list[i] = kmem_cache_alloc(test_cache, __GFP_DIRECT_RECLAIM); - - for (i = 0; i < 12; i++) - kmem_cache_free(test_cache, list[i]); - - /* The last free will not be kept around */ - assert(test_cache->nr_objs == 11); - - /* Aligned caches will immediately free */ - test_cache2 = kmem_cache_create("test_cache2", 128, 128, SLAB_PANIC, NULL); - - kmem_cache_alloc_bulk(test_cache2, __GFP_DIRECT_RECLAIM, 10, list); - kmem_cache_free_bulk(test_cache2, 10, list); - assert(!test_cache2->nr_objs); - - -} diff --git a/tools/testing/radix-tree/linux/bug.h b/tools/testing/radix-tree/linux/bug.h deleted file mode 100644 index 03dc8a57eb99..000000000000 --- a/tools/testing/radix-tree/linux/bug.h +++ /dev/null @@ -1,2 +0,0 @@ -#include -#include "asm/bug.h" diff --git a/tools/testing/radix-tree/linux/cpu.h b/tools/testing/radix-tree/linux/cpu.h deleted file mode 100644 index a45530d78107..000000000000 --- a/tools/testing/radix-tree/linux/cpu.h +++ /dev/null @@ -1 +0,0 @@ -#define cpuhp_setup_state_nocalls(a, b, c, d) (0) diff --git a/tools/testing/radix-tree/linux/idr.h b/tools/testing/radix-tree/linux/idr.h deleted file mode 100644 index 4e342f2e37cf..000000000000 --- a/tools/testing/radix-tree/linux/idr.h +++ /dev/null @@ -1 +0,0 @@ -#include "../../../../include/linux/idr.h" diff --git a/tools/testing/radix-tree/linux/init.h b/tools/testing/radix-tree/linux/init.h deleted file mode 100644 index 81563c3dfce7..000000000000 --- a/tools/testing/radix-tree/linux/init.h +++ /dev/null @@ -1,2 +0,0 @@ -#define __init -#define __exit diff --git a/tools/testing/radix-tree/linux/kconfig.h b/tools/testing/radix-tree/linux/kconfig.h deleted file mode 100644 index 6c8675859913..000000000000 --- a/tools/testing/radix-tree/linux/kconfig.h +++ /dev/null @@ -1 +0,0 @@ -#include "../../../../include/linux/kconfig.h" diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/radix-tree/linux/kernel.h deleted file mode 100644 index c0a2bb785b92..000000000000 --- a/tools/testing/radix-tree/linux/kernel.h +++ /dev/null @@ -1,29 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _KERNEL_H -#define _KERNEL_H - -#include "../../include/linux/kernel.h" -#include -#include -#include - -#include -#include -#include -#include -#include "../../../include/linux/kconfig.h" - -#define printk printf -#define pr_err printk -#define pr_info printk -#define pr_debug printk -#define pr_cont printk -#define schedule() -#define PAGE_SHIFT 12 - -#define __acquires(x) -#define __releases(x) -#define __must_hold(x) - -#define EXPORT_PER_CPU_SYMBOL_GPL(x) -#endif /* _KERNEL_H */ diff --git a/tools/testing/radix-tree/linux/kmemleak.h b/tools/testing/radix-tree/linux/kmemleak.h deleted file mode 100644 index 155f112786c4..000000000000 --- a/tools/testing/radix-tree/linux/kmemleak.h +++ /dev/null @@ -1 +0,0 @@ -static inline void kmemleak_update_trace(const void *ptr) { } diff --git 
a/tools/testing/radix-tree/linux/local_lock.h b/tools/testing/radix-tree/linux/local_lock.h deleted file mode 100644 index b3cf8b233ca4..000000000000 --- a/tools/testing/radix-tree/linux/local_lock.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef _LINUX_LOCAL_LOCK -#define _LINUX_LOCAL_LOCK -typedef struct { } local_lock_t; - -static inline void local_lock(local_lock_t *lock) { } -static inline void local_unlock(local_lock_t *lock) { } -#define INIT_LOCAL_LOCK(x) { } -#endif diff --git a/tools/testing/radix-tree/linux/lockdep.h b/tools/testing/radix-tree/linux/lockdep.h deleted file mode 100644 index 62473ab57f99..000000000000 --- a/tools/testing/radix-tree/linux/lockdep.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef _LINUX_LOCKDEP_H -#define _LINUX_LOCKDEP_H - -#include - -struct lock_class_key { - unsigned int a; -}; - -static inline void lockdep_set_class(spinlock_t *lock, - struct lock_class_key *key) -{ -} - -extern int lockdep_is_held(const void *); -#endif /* _LINUX_LOCKDEP_H */ diff --git a/tools/testing/radix-tree/linux/maple_tree.h b/tools/testing/radix-tree/linux/maple_tree.h deleted file mode 100644 index 06c89bdcc515..000000000000 --- a/tools/testing/radix-tree/linux/maple_tree.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ -#define atomic_t int32_t -#define atomic_inc(x) uatomic_inc(x) -#define atomic_read(x) uatomic_read(x) -#define atomic_set(x, y) do {} while (0) -#define U8_MAX UCHAR_MAX -#include "../../../../include/linux/maple_tree.h" diff --git a/tools/testing/radix-tree/linux/percpu.h b/tools/testing/radix-tree/linux/percpu.h deleted file mode 100644 index b2403aa743b2..000000000000 --- a/tools/testing/radix-tree/linux/percpu.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#define DECLARE_PER_CPU(type, val) extern type val -#define DEFINE_PER_CPU(type, val) type val - -#define __get_cpu_var(var) var -#define this_cpu_ptr(var) var -#define this_cpu_read(var) var -#define this_cpu_xchg(var, val) uatomic_xchg(&var, val) -#define this_cpu_cmpxchg(var, old, new) uatomic_cmpxchg(&var, old, new) -#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) -#define per_cpu(var, cpu) (*per_cpu_ptr(&(var), cpu)) diff --git a/tools/testing/radix-tree/linux/preempt.h b/tools/testing/radix-tree/linux/preempt.h deleted file mode 100644 index edb10302b903..000000000000 --- a/tools/testing/radix-tree/linux/preempt.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __LINUX_PREEMPT_H -#define __LINUX_PREEMPT_H - -extern int preempt_count; - -#define preempt_disable() uatomic_inc(&preempt_count) -#define preempt_enable() uatomic_dec(&preempt_count) - -static inline int in_interrupt(void) -{ - return 0; -} - -#endif /* __LINUX_PREEMPT_H */ diff --git a/tools/testing/radix-tree/linux/radix-tree.h b/tools/testing/radix-tree/linux/radix-tree.h deleted file mode 100644 index d1635a5bef02..000000000000 --- a/tools/testing/radix-tree/linux/radix-tree.h +++ /dev/null @@ -1,26 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _TEST_RADIX_TREE_H -#define _TEST_RADIX_TREE_H - -#include "../../../../include/linux/radix-tree.h" - -extern int kmalloc_verbose; -extern int test_verbose; - -static inline void trace_call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *head)) -{ - if (kmalloc_verbose) - printf("Delaying free of %p to slab\n", (char *)head - - offsetof(struct radix_tree_node, rcu_head)); - call_rcu(head, func); -} - -#define printv(verbosity_level, fmt, ...) 
\ - if(test_verbose >= verbosity_level) \ - printf(fmt, ##__VA_ARGS__) - -#undef call_rcu -#define call_rcu(x, y) trace_call_rcu(x, y) - -#endif /* _TEST_RADIX_TREE_H */ diff --git a/tools/testing/radix-tree/linux/rcupdate.h b/tools/testing/radix-tree/linux/rcupdate.h deleted file mode 100644 index fed468fb0c78..000000000000 --- a/tools/testing/radix-tree/linux/rcupdate.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _RCUPDATE_H -#define _RCUPDATE_H - -#include - -#define rcu_dereference_raw(p) rcu_dereference(p) -#define rcu_dereference_protected(p, cond) rcu_dereference(p) -#define rcu_dereference_check(p, cond) rcu_dereference(p) -#define RCU_INIT_POINTER(p, v) do { (p) = (v); } while (0) - -#endif diff --git a/tools/testing/radix-tree/linux/xarray.h b/tools/testing/radix-tree/linux/xarray.h deleted file mode 100644 index df3812cda376..000000000000 --- a/tools/testing/radix-tree/linux/xarray.h +++ /dev/null @@ -1,2 +0,0 @@ -#include "generated/map-shift.h" -#include "../../../../include/linux/xarray.h" diff --git a/tools/testing/radix-tree/trace/events/maple_tree.h b/tools/testing/radix-tree/trace/events/maple_tree.h deleted file mode 100644 index 97d0e1ddcf08..000000000000 --- a/tools/testing/radix-tree/trace/events/maple_tree.h +++ /dev/null @@ -1,5 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ - -#define trace_ma_op(a, b) do {} while (0) -#define trace_ma_read(a, b) do {} while (0) -#define trace_ma_write(a, b, c, d) do {} while (0) diff --git a/tools/testing/radix-tree/xarray.c b/tools/testing/radix-tree/xarray.c index d0e53bff1eb6..253208a8541b 100644 --- a/tools/testing/radix-tree/xarray.c +++ b/tools/testing/radix-tree/xarray.c @@ -4,17 +4,9 @@ * Copyright (c) 2018 Matthew Wilcox */ -#define XA_DEBUG +#include "xarray-shared.h" #include "test.h" -#define module_init(x) -#define module_exit(x) -#define MODULE_AUTHOR(x) -#define MODULE_DESCRIPTION(X) -#define MODULE_LICENSE(x) -#define dump_stack() assert(0) - -#include "../../../lib/xarray.c" #undef XA_DEBUG #include "../../../lib/test_xarray.c" diff --git a/tools/testing/shared/autoconf.h b/tools/testing/shared/autoconf.h new file mode 100644 index 000000000000..92dc474c349b --- /dev/null +++ b/tools/testing/shared/autoconf.h @@ -0,0 +1,2 @@ +#include "bit-length.h" +#define CONFIG_XARRAY_MULTI 1 diff --git a/tools/testing/shared/linux.c b/tools/testing/shared/linux.c new file mode 100644 index 000000000000..4eb442206d01 --- /dev/null +++ b/tools/testing/shared/linux.c @@ -0,0 +1,271 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +int nr_allocated; +int preempt_count; +int test_verbose; + +struct kmem_cache { + pthread_mutex_t lock; + unsigned int size; + unsigned int align; + int nr_objs; + void *objs; + void (*ctor)(void *); + unsigned int non_kernel; + unsigned long nr_allocated; + unsigned long nr_tallocated; +}; + +void kmem_cache_set_non_kernel(struct kmem_cache *cachep, unsigned int val) +{ + cachep->non_kernel = val; +} + +unsigned long kmem_cache_get_alloc(struct kmem_cache *cachep) +{ + return cachep->size * cachep->nr_allocated; +} + +unsigned long kmem_cache_nr_allocated(struct kmem_cache *cachep) +{ + return cachep->nr_allocated; +} + +unsigned long kmem_cache_nr_tallocated(struct kmem_cache *cachep) +{ + return cachep->nr_tallocated; +} + +void kmem_cache_zero_nr_tallocated(struct kmem_cache *cachep) +{ + cachep->nr_tallocated = 0; +} + +void *kmem_cache_alloc_lru(struct 
kmem_cache *cachep, struct list_lru *lru, + int gfp) +{ + void *p; + + if (!(gfp & __GFP_DIRECT_RECLAIM)) { + if (!cachep->non_kernel) + return NULL; + + cachep->non_kernel--; + } + + pthread_mutex_lock(&cachep->lock); + if (cachep->nr_objs) { + struct radix_tree_node *node = cachep->objs; + cachep->nr_objs--; + cachep->objs = node->parent; + pthread_mutex_unlock(&cachep->lock); + node->parent = NULL; + p = node; + } else { + pthread_mutex_unlock(&cachep->lock); + if (cachep->align) + posix_memalign(&p, cachep->align, cachep->size); + else + p = malloc(cachep->size); + if (cachep->ctor) + cachep->ctor(p); + else if (gfp & __GFP_ZERO) + memset(p, 0, cachep->size); + } + + uatomic_inc(&cachep->nr_allocated); + uatomic_inc(&nr_allocated); + uatomic_inc(&cachep->nr_tallocated); + if (kmalloc_verbose) + printf("Allocating %p from slab\n", p); + return p; +} + +void __kmem_cache_free_locked(struct kmem_cache *cachep, void *objp) +{ + assert(objp); + if (cachep->nr_objs > 10 || cachep->align) { + memset(objp, POISON_FREE, cachep->size); + free(objp); + } else { + struct radix_tree_node *node = objp; + cachep->nr_objs++; + node->parent = cachep->objs; + cachep->objs = node; + } +} + +void kmem_cache_free_locked(struct kmem_cache *cachep, void *objp) +{ + uatomic_dec(&nr_allocated); + uatomic_dec(&cachep->nr_allocated); + if (kmalloc_verbose) + printf("Freeing %p to slab\n", objp); + __kmem_cache_free_locked(cachep, objp); +} + +void kmem_cache_free(struct kmem_cache *cachep, void *objp) +{ + pthread_mutex_lock(&cachep->lock); + kmem_cache_free_locked(cachep, objp); + pthread_mutex_unlock(&cachep->lock); +} + +void kmem_cache_free_bulk(struct kmem_cache *cachep, size_t size, void **list) +{ + if (kmalloc_verbose) + pr_debug("Bulk free %p[0-%lu]\n", list, size - 1); + + pthread_mutex_lock(&cachep->lock); + for (int i = 0; i < size; i++) + kmem_cache_free_locked(cachep, list[i]); + pthread_mutex_unlock(&cachep->lock); +} + +void kmem_cache_shrink(struct kmem_cache *cachep) +{ +} + +int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, + void **p) +{ + size_t i; + + if (kmalloc_verbose) + pr_debug("Bulk alloc %lu\n", size); + + pthread_mutex_lock(&cachep->lock); + if (cachep->nr_objs >= size) { + struct radix_tree_node *node; + + for (i = 0; i < size; i++) { + if (!(gfp & __GFP_DIRECT_RECLAIM)) { + if (!cachep->non_kernel) + break; + cachep->non_kernel--; + } + + node = cachep->objs; + cachep->nr_objs--; + cachep->objs = node->parent; + p[i] = node; + node->parent = NULL; + } + pthread_mutex_unlock(&cachep->lock); + } else { + pthread_mutex_unlock(&cachep->lock); + for (i = 0; i < size; i++) { + if (!(gfp & __GFP_DIRECT_RECLAIM)) { + if (!cachep->non_kernel) + break; + cachep->non_kernel--; + } + + if (cachep->align) { + posix_memalign(&p[i], cachep->align, + cachep->size); + } else { + p[i] = malloc(cachep->size); + if (!p[i]) + break; + } + if (cachep->ctor) + cachep->ctor(p[i]); + else if (gfp & __GFP_ZERO) + memset(p[i], 0, cachep->size); + } + } + + if (i < size) { + size = i; + pthread_mutex_lock(&cachep->lock); + for (i = 0; i < size; i++) + __kmem_cache_free_locked(cachep, p[i]); + pthread_mutex_unlock(&cachep->lock); + return 0; + } + + for (i = 0; i < size; i++) { + uatomic_inc(&nr_allocated); + uatomic_inc(&cachep->nr_allocated); + uatomic_inc(&cachep->nr_tallocated); + if (kmalloc_verbose) + printf("Allocating %p from slab\n", p[i]); + } + + return size; +} + +struct kmem_cache * +kmem_cache_create(const char *name, unsigned int size, unsigned int align, + unsigned 
int flags, void (*ctor)(void *)) +{ + struct kmem_cache *ret = malloc(sizeof(*ret)); + + pthread_mutex_init(&ret->lock, NULL); + ret->size = size; + ret->align = align; + ret->nr_objs = 0; + ret->nr_allocated = 0; + ret->nr_tallocated = 0; + ret->objs = NULL; + ret->ctor = ctor; + ret->non_kernel = 0; + return ret; +} + +/* + * Test the test infrastructure for kem_cache_alloc/free and bulk counterparts. + */ +void test_kmem_cache_bulk(void) +{ + int i; + void *list[12]; + static struct kmem_cache *test_cache, *test_cache2; + + /* + * Testing the bulk allocators without aligned kmem_cache to force the + * bulk alloc/free to reuse + */ + test_cache = kmem_cache_create("test_cache", 256, 0, SLAB_PANIC, NULL); + + for (i = 0; i < 5; i++) + list[i] = kmem_cache_alloc(test_cache, __GFP_DIRECT_RECLAIM); + + for (i = 0; i < 5; i++) + kmem_cache_free(test_cache, list[i]); + assert(test_cache->nr_objs == 5); + + kmem_cache_alloc_bulk(test_cache, __GFP_DIRECT_RECLAIM, 5, list); + kmem_cache_free_bulk(test_cache, 5, list); + + for (i = 0; i < 12 ; i++) + list[i] = kmem_cache_alloc(test_cache, __GFP_DIRECT_RECLAIM); + + for (i = 0; i < 12; i++) + kmem_cache_free(test_cache, list[i]); + + /* The last free will not be kept around */ + assert(test_cache->nr_objs == 11); + + /* Aligned caches will immediately free */ + test_cache2 = kmem_cache_create("test_cache2", 128, 128, SLAB_PANIC, NULL); + + kmem_cache_alloc_bulk(test_cache2, __GFP_DIRECT_RECLAIM, 10, list); + kmem_cache_free_bulk(test_cache2, 10, list); + assert(!test_cache2->nr_objs); + + +} diff --git a/tools/testing/shared/linux/bug.h b/tools/testing/shared/linux/bug.h new file mode 100644 index 000000000000..03dc8a57eb99 --- /dev/null +++ b/tools/testing/shared/linux/bug.h @@ -0,0 +1,2 @@ +#include +#include "asm/bug.h" diff --git a/tools/testing/shared/linux/cpu.h b/tools/testing/shared/linux/cpu.h new file mode 100644 index 000000000000..a45530d78107 --- /dev/null +++ b/tools/testing/shared/linux/cpu.h @@ -0,0 +1 @@ +#define cpuhp_setup_state_nocalls(a, b, c, d) (0) diff --git a/tools/testing/shared/linux/idr.h b/tools/testing/shared/linux/idr.h new file mode 100644 index 000000000000..4e342f2e37cf --- /dev/null +++ b/tools/testing/shared/linux/idr.h @@ -0,0 +1 @@ +#include "../../../../include/linux/idr.h" diff --git a/tools/testing/shared/linux/init.h b/tools/testing/shared/linux/init.h new file mode 100644 index 000000000000..81563c3dfce7 --- /dev/null +++ b/tools/testing/shared/linux/init.h @@ -0,0 +1,2 @@ +#define __init +#define __exit diff --git a/tools/testing/shared/linux/kconfig.h b/tools/testing/shared/linux/kconfig.h new file mode 100644 index 000000000000..6c8675859913 --- /dev/null +++ b/tools/testing/shared/linux/kconfig.h @@ -0,0 +1 @@ +#include "../../../../include/linux/kconfig.h" diff --git a/tools/testing/shared/linux/kernel.h b/tools/testing/shared/linux/kernel.h new file mode 100644 index 000000000000..c0a2bb785b92 --- /dev/null +++ b/tools/testing/shared/linux/kernel.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _KERNEL_H +#define _KERNEL_H + +#include "../../include/linux/kernel.h" +#include +#include +#include + +#include +#include +#include +#include +#include "../../../include/linux/kconfig.h" + +#define printk printf +#define pr_err printk +#define pr_info printk +#define pr_debug printk +#define pr_cont printk +#define schedule() +#define PAGE_SHIFT 12 + +#define __acquires(x) +#define __releases(x) +#define __must_hold(x) + +#define EXPORT_PER_CPU_SYMBOL_GPL(x) +#endif /* _KERNEL_H */ diff 
--git a/tools/testing/shared/linux/kmemleak.h b/tools/testing/shared/linux/kmemleak.h new file mode 100644 index 000000000000..155f112786c4 --- /dev/null +++ b/tools/testing/shared/linux/kmemleak.h @@ -0,0 +1 @@ +static inline void kmemleak_update_trace(const void *ptr) { } diff --git a/tools/testing/shared/linux/local_lock.h b/tools/testing/shared/linux/local_lock.h new file mode 100644 index 000000000000..b3cf8b233ca4 --- /dev/null +++ b/tools/testing/shared/linux/local_lock.h @@ -0,0 +1,8 @@ +#ifndef _LINUX_LOCAL_LOCK +#define _LINUX_LOCAL_LOCK +typedef struct { } local_lock_t; + +static inline void local_lock(local_lock_t *lock) { } +static inline void local_unlock(local_lock_t *lock) { } +#define INIT_LOCAL_LOCK(x) { } +#endif diff --git a/tools/testing/shared/linux/lockdep.h b/tools/testing/shared/linux/lockdep.h new file mode 100644 index 000000000000..62473ab57f99 --- /dev/null +++ b/tools/testing/shared/linux/lockdep.h @@ -0,0 +1,16 @@ +#ifndef _LINUX_LOCKDEP_H +#define _LINUX_LOCKDEP_H + +#include + +struct lock_class_key { + unsigned int a; +}; + +static inline void lockdep_set_class(spinlock_t *lock, + struct lock_class_key *key) +{ +} + +extern int lockdep_is_held(const void *); +#endif /* _LINUX_LOCKDEP_H */ diff --git a/tools/testing/shared/linux/maple_tree.h b/tools/testing/shared/linux/maple_tree.h new file mode 100644 index 000000000000..06c89bdcc515 --- /dev/null +++ b/tools/testing/shared/linux/maple_tree.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#define atomic_t int32_t +#define atomic_inc(x) uatomic_inc(x) +#define atomic_read(x) uatomic_read(x) +#define atomic_set(x, y) do {} while (0) +#define U8_MAX UCHAR_MAX +#include "../../../../include/linux/maple_tree.h" diff --git a/tools/testing/shared/linux/percpu.h b/tools/testing/shared/linux/percpu.h new file mode 100644 index 000000000000..b2403aa743b2 --- /dev/null +++ b/tools/testing/shared/linux/percpu.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#define DECLARE_PER_CPU(type, val) extern type val +#define DEFINE_PER_CPU(type, val) type val + +#define __get_cpu_var(var) var +#define this_cpu_ptr(var) var +#define this_cpu_read(var) var +#define this_cpu_xchg(var, val) uatomic_xchg(&var, val) +#define this_cpu_cmpxchg(var, old, new) uatomic_cmpxchg(&var, old, new) +#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) +#define per_cpu(var, cpu) (*per_cpu_ptr(&(var), cpu)) diff --git a/tools/testing/shared/linux/preempt.h b/tools/testing/shared/linux/preempt.h new file mode 100644 index 000000000000..edb10302b903 --- /dev/null +++ b/tools/testing/shared/linux/preempt.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_PREEMPT_H +#define __LINUX_PREEMPT_H + +extern int preempt_count; + +#define preempt_disable() uatomic_inc(&preempt_count) +#define preempt_enable() uatomic_dec(&preempt_count) + +static inline int in_interrupt(void) +{ + return 0; +} + +#endif /* __LINUX_PREEMPT_H */ diff --git a/tools/testing/shared/linux/radix-tree.h b/tools/testing/shared/linux/radix-tree.h new file mode 100644 index 000000000000..d1635a5bef02 --- /dev/null +++ b/tools/testing/shared/linux/radix-tree.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _TEST_RADIX_TREE_H +#define _TEST_RADIX_TREE_H + +#include "../../../../include/linux/radix-tree.h" + +extern int kmalloc_verbose; +extern int test_verbose; + +static inline void trace_call_rcu(struct rcu_head *head, + void (*func)(struct rcu_head *head)) +{ + if (kmalloc_verbose) + printf("Delaying free of %p to 
slab\n", (char *)head - + offsetof(struct radix_tree_node, rcu_head)); + call_rcu(head, func); +} + +#define printv(verbosity_level, fmt, ...) \ + if(test_verbose >= verbosity_level) \ + printf(fmt, ##__VA_ARGS__) + +#undef call_rcu +#define call_rcu(x, y) trace_call_rcu(x, y) + +#endif /* _TEST_RADIX_TREE_H */ diff --git a/tools/testing/shared/linux/rcupdate.h b/tools/testing/shared/linux/rcupdate.h new file mode 100644 index 000000000000..fed468fb0c78 --- /dev/null +++ b/tools/testing/shared/linux/rcupdate.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _RCUPDATE_H +#define _RCUPDATE_H + +#include + +#define rcu_dereference_raw(p) rcu_dereference(p) +#define rcu_dereference_protected(p, cond) rcu_dereference(p) +#define rcu_dereference_check(p, cond) rcu_dereference(p) +#define RCU_INIT_POINTER(p, v) do { (p) = (v); } while (0) + +#endif diff --git a/tools/testing/shared/linux/xarray.h b/tools/testing/shared/linux/xarray.h new file mode 100644 index 000000000000..df3812cda376 --- /dev/null +++ b/tools/testing/shared/linux/xarray.h @@ -0,0 +1,2 @@ +#include "generated/map-shift.h" +#include "../../../../include/linux/xarray.h" diff --git a/tools/testing/shared/maple-shared.h b/tools/testing/shared/maple-shared.h new file mode 100644 index 000000000000..3d847edd149d --- /dev/null +++ b/tools/testing/shared/maple-shared.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#define CONFIG_DEBUG_MAPLE_TREE +#define CONFIG_MAPLE_SEARCH +#define MAPLE_32BIT (MAPLE_NODE_SLOTS > 31) +#include "shared.h" +#include +#include +#include "linux/init.h" diff --git a/tools/testing/shared/maple-shim.c b/tools/testing/shared/maple-shim.c new file mode 100644 index 000000000000..640df76f483e --- /dev/null +++ b/tools/testing/shared/maple-shim.c @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* Very simple shim around the maple tree. */ + +#include "maple-shared.h" + +#include "../../../lib/maple_tree.c" diff --git a/tools/testing/shared/shared.h b/tools/testing/shared/shared.h new file mode 100644 index 000000000000..f08f683812ad --- /dev/null +++ b/tools/testing/shared/shared.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include +#include +#include +#include + +#include +#include + +#ifndef module_init +#define module_init(x) +#endif + +#ifndef module_exit +#define module_exit(x) +#endif + +#ifndef MODULE_AUTHOR +#define MODULE_AUTHOR(x) +#endif + +#ifndef MODULE_LICENSE +#define MODULE_LICENSE(x) +#endif + +#ifndef MODULE_DESCRIPTION +#define MODULE_DESCRIPTION(x) +#endif + +#ifndef dump_stack +#define dump_stack() assert(0) +#endif diff --git a/tools/testing/shared/shared.mk b/tools/testing/shared/shared.mk new file mode 100644 index 000000000000..a05f0588513a --- /dev/null +++ b/tools/testing/shared/shared.mk @@ -0,0 +1,72 @@ +# SPDX-License-Identifier: GPL-2.0 + +CFLAGS += -I../shared -I. 
-I../../include -I../../../lib -g -Og -Wall \
+	-D_LGPL_SOURCE -fsanitize=address -fsanitize=undefined
+LDFLAGS += -fsanitize=address -fsanitize=undefined
+LDLIBS += -lpthread -lurcu
+LIBS := slab.o find_bit.o bitmap.o hweight.o vsprintf.o
+SHARED_OFILES = xarray-shared.o radix-tree.o idr.o linux.o $(LIBS)
+
+SHARED_DEPS = Makefile ../shared/shared.mk ../shared/*.h generated/map-shift.h \
+	generated/bit-length.h generated/autoconf.h \
+	../../include/linux/*.h \
+	../../include/asm/*.h \
+	../../../include/linux/xarray.h \
+	../../../include/linux/maple_tree.h \
+	../../../include/linux/radix-tree.h \
+	../../../lib/radix-tree.h \
+	../../../include/linux/idr.h
+
+ifndef SHIFT
+	SHIFT=3
+endif
+
+ifeq ($(BUILD), 32)
+	CFLAGS += -m32
+	LDFLAGS += -m32
+LONG_BIT := 32
+endif
+
+ifndef LONG_BIT
+LONG_BIT := $(shell getconf LONG_BIT)
+endif
+
+%.o: ../shared/%.c
+	$(CC) -c $(CFLAGS) $< -o $@
+
+vpath %.c ../../lib
+
+$(SHARED_OFILES): $(SHARED_DEPS)
+
+radix-tree.c: ../../../lib/radix-tree.c
+	sed -e 's/^static //' -e 's/__always_inline //' -e 's/inline //' < $< > $@
+
+idr.c: ../../../lib/idr.c
+	sed -e 's/^static //' -e 's/__always_inline //' -e 's/inline //' < $< > $@
+
+xarray-shared.o: ../shared/xarray-shared.c ../../../lib/xarray.c \
+	../../../lib/test_xarray.c
+
+maple-shared.o: ../shared/maple-shared.c ../../../lib/maple_tree.c \
+	../../../lib/test_maple_tree.c
+
+generated/autoconf.h:
+	@mkdir -p generated
+	cp ../shared/autoconf.h generated/autoconf.h
+
+generated/map-shift.h:
+	@mkdir -p generated
+	@if ! grep -qws $(SHIFT) generated/map-shift.h; then \
+		echo "Generating $@"; \
+		echo "#define XA_CHUNK_SHIFT $(SHIFT)" > \
+			generated/map-shift.h; \
+	fi
+
+generated/bit-length.h: FORCE
+	@mkdir -p generated
+	@if ! grep -qws CONFIG_$(LONG_BIT)BIT generated/bit-length.h; then \
+		echo "Generating $@"; \
+		echo "#define CONFIG_$(LONG_BIT)BIT 1" > $@; \
+	fi
+
+FORCE: ;
diff --git a/tools/testing/shared/trace/events/maple_tree.h b/tools/testing/shared/trace/events/maple_tree.h
new file mode 100644
index 000000000000..97d0e1ddcf08
--- /dev/null
+++ b/tools/testing/shared/trace/events/maple_tree.h
@@ -0,0 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+
+#define trace_ma_op(a, b) do {} while (0)
+#define trace_ma_read(a, b) do {} while (0)
+#define trace_ma_write(a, b, c, d) do {} while (0)
diff --git a/tools/testing/shared/xarray-shared.c b/tools/testing/shared/xarray-shared.c
new file mode 100644
index 000000000000..e90901958dcd
--- /dev/null
+++ b/tools/testing/shared/xarray-shared.c
@@ -0,0 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "xarray-shared.h"
+
+#include "../../../lib/xarray.c"
diff --git a/tools/testing/shared/xarray-shared.h b/tools/testing/shared/xarray-shared.h
new file mode 100644
index 000000000000..ac2d16ff53ae
--- /dev/null
+++ b/tools/testing/shared/xarray-shared.h
@@ -0,0 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+
+#define XA_DEBUG
+#include "shared.h"
-- cgit v1.2.3

From 9325b8b5a1cba1fbabe6e8217e248c5f90fd0e13 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes
Date: Mon, 29 Jul 2024 12:50:41 +0100
Subject: tools: add skeleton code for userland testing of VMA logic

Establish a new userland VMA unit testing implementation under
tools/testing, which utilises the now-shared code previously exclusive to
radix tree testing to provide maple tree support in userland.

This provides fundamental VMA operations whose API is defined in mm/vma.h,
while stubbing out superfluous functionality.
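To make the shape of the harness concrete before the diff: a minimal sketch of how a
userland test binary can compile the kernel's VMA logic directly, following the include
pattern used by tools/testing/vma/vma.c below (a sketch only; the real test program adds
assertion macros and several test cases):

#include <stdlib.h>

#include "maple-shared.h"	/* userland maple tree support from the shared code */
#include "vma_internal.h"	/* userland stand-ins for kernel mm internals */

/* Compile the real kernel implementation straight into the test binary. */
#include "../../../mm/vma.c"

int main(void)
{
	struct mm_struct mm = {};
	VMA_ITERATOR(vmi, &mm, 0);

	maple_tree_init();
	/* ... drive vma_expand(), vma_shrink(), etc. through &vmi ... */
	mtree_destroy(&mm.mm_mt);
	return EXIT_SUCCESS;
}
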
This exists as a proof-of-concept, with the test implementation functional and sufficient to allow userland compilation of vma.c, but containing only cursory tests to demonstrate basic functionality. Link: https://lkml.kernel.org/r/533ffa2eec771cbe6b387dd049a7f128a53eb616.1722251717.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Tested-by: SeongJae Park Acked-by: Vlastimil Babka Reviewed-by: Liam R. Howlett Cc: Alexander Viro Cc: Brendan Higgins Cc: Christian Brauner Cc: David Gow Cc: Eric W. Biederman Cc: Jan Kara Cc: Kees Cook Cc: Matthew Wilcox (Oracle) Cc: Rae Moar Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Pengfei Xu Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + tools/testing/vma/.gitignore | 7 + tools/testing/vma/Makefile | 16 + tools/testing/vma/linux/atomic.h | 12 + tools/testing/vma/linux/mmzone.h | 38 ++ tools/testing/vma/vma.c | 207 +++++++++ tools/testing/vma/vma_internal.h | 882 +++++++++++++++++++++++++++++++++++++++ 7 files changed, 1163 insertions(+) create mode 100644 tools/testing/vma/.gitignore create mode 100644 tools/testing/vma/Makefile create mode 100644 tools/testing/vma/linux/atomic.h create mode 100644 tools/testing/vma/linux/mmzone.h create mode 100644 tools/testing/vma/vma.c create mode 100644 tools/testing/vma/vma_internal.h (limited to 'tools') diff --git a/MAINTAINERS b/MAINTAINERS index 4a7b6e25bc77..41b15083e4a1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -24423,6 +24423,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm F: mm/vma.c F: mm/vma.h F: mm/vma_internal.h +F: tools/testing/vma/ VMALLOC M: Andrew Morton diff --git a/tools/testing/vma/.gitignore b/tools/testing/vma/.gitignore new file mode 100644 index 000000000000..b003258eba79 --- /dev/null +++ b/tools/testing/vma/.gitignore @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only +generated/bit-length.h +generated/map-shift.h +generated/autoconf.h +idr.c +radix-tree.c +vma diff --git a/tools/testing/vma/Makefile b/tools/testing/vma/Makefile new file mode 100644 index 000000000000..bfc905d222cf --- /dev/null +++ b/tools/testing/vma/Makefile @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +.PHONY: default + +default: vma + +include ../shared/shared.mk + +OFILES = $(SHARED_OFILES) vma.o maple-shim.o +TARGETS = vma + +vma: $(OFILES) vma_internal.h ../../../mm/vma.c ../../../mm/vma.h + $(CC) $(CFLAGS) -o $@ $(OFILES) $(LDLIBS) + +clean: + $(RM) $(TARGETS) *.o radix-tree.c idr.c generated/map-shift.h generated/bit-length.h generated/autoconf.h diff --git a/tools/testing/vma/linux/atomic.h b/tools/testing/vma/linux/atomic.h new file mode 100644 index 000000000000..e01f66f98982 --- /dev/null +++ b/tools/testing/vma/linux/atomic.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _LINUX_ATOMIC_H +#define _LINUX_ATOMIC_H + +#define atomic_t int32_t +#define atomic_inc(x) uatomic_inc(x) +#define atomic_read(x) uatomic_read(x) +#define atomic_set(x, y) do {} while (0) +#define U8_MAX UCHAR_MAX + +#endif /* _LINUX_ATOMIC_H */ diff --git a/tools/testing/vma/linux/mmzone.h b/tools/testing/vma/linux/mmzone.h new file mode 100644 index 000000000000..33cd1517f7a3 --- /dev/null +++ b/tools/testing/vma/linux/mmzone.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _LINUX_MMZONE_H +#define _LINUX_MMZONE_H + +#include + +struct pglist_data *first_online_pgdat(void); +struct pglist_data *next_online_pgdat(struct pglist_data *pgdat); + +#define for_each_online_pgdat(pgdat) \ + for (pgdat = first_online_pgdat(); \ 
+ pgdat; \ + pgdat = next_online_pgdat(pgdat)) + +enum zone_type { + __MAX_NR_ZONES +}; + +#define MAX_NR_ZONES __MAX_NR_ZONES +#define MAX_PAGE_ORDER 10 +#define MAX_ORDER_NR_PAGES (1 << MAX_PAGE_ORDER) + +#define pageblock_order MAX_PAGE_ORDER +#define pageblock_nr_pages BIT(pageblock_order) +#define pageblock_align(pfn) ALIGN((pfn), pageblock_nr_pages) +#define pageblock_start_pfn(pfn) ALIGN_DOWN((pfn), pageblock_nr_pages) + +struct zone { + atomic_long_t managed_pages; +}; + +typedef struct pglist_data { + struct zone node_zones[MAX_NR_ZONES]; + +} pg_data_t; + +#endif /* _LINUX_MMZONE_H */ diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c new file mode 100644 index 000000000000..48e033c60d87 --- /dev/null +++ b/tools/testing/vma/vma.c @@ -0,0 +1,207 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include + +#include "maple-shared.h" +#include "vma_internal.h" + +/* + * Directly import the VMA implementation here. Our vma_internal.h wrapper + * provides userland-equivalent functionality for everything vma.c uses. + */ +#include "../../../mm/vma.c" + +const struct vm_operations_struct vma_dummy_vm_ops; + +#define ASSERT_TRUE(_expr) \ + do { \ + if (!(_expr)) { \ + fprintf(stderr, \ + "Assert FAILED at %s:%d:%s(): %s is FALSE.\n", \ + __FILE__, __LINE__, __FUNCTION__, #_expr); \ + return false; \ + } \ + } while (0) +#define ASSERT_FALSE(_expr) ASSERT_TRUE(!(_expr)) +#define ASSERT_EQ(_val1, _val2) ASSERT_TRUE((_val1) == (_val2)) +#define ASSERT_NE(_val1, _val2) ASSERT_TRUE((_val1) != (_val2)) + +static struct vm_area_struct *alloc_vma(struct mm_struct *mm, + unsigned long start, + unsigned long end, + pgoff_t pgoff, + vm_flags_t flags) +{ + struct vm_area_struct *ret = vm_area_alloc(mm); + + if (ret == NULL) + return NULL; + + ret->vm_start = start; + ret->vm_end = end; + ret->vm_pgoff = pgoff; + ret->__vm_flags = flags; + + return ret; +} + +static bool test_simple_merge(void) +{ + struct vm_area_struct *vma; + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + struct vm_area_struct *vma_left = alloc_vma(&mm, 0, 0x1000, 0, flags); + struct vm_area_struct *vma_middle = alloc_vma(&mm, 0x1000, 0x2000, 1, flags); + struct vm_area_struct *vma_right = alloc_vma(&mm, 0x2000, 0x3000, 2, flags); + VMA_ITERATOR(vmi, &mm, 0x1000); + + ASSERT_FALSE(vma_link(&mm, vma_left)); + ASSERT_FALSE(vma_link(&mm, vma_middle)); + ASSERT_FALSE(vma_link(&mm, vma_right)); + + vma = vma_merge_new_vma(&vmi, vma_left, vma_middle, 0x1000, + 0x2000, 1); + ASSERT_NE(vma, NULL); + + ASSERT_EQ(vma->vm_start, 0); + ASSERT_EQ(vma->vm_end, 0x3000); + ASSERT_EQ(vma->vm_pgoff, 0); + ASSERT_EQ(vma->vm_flags, flags); + + vm_area_free(vma); + mtree_destroy(&mm.mm_mt); + + return true; +} + +static bool test_simple_modify(void) +{ + struct vm_area_struct *vma; + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + struct vm_area_struct *init_vma = alloc_vma(&mm, 0, 0x3000, 0, flags); + VMA_ITERATOR(vmi, &mm, 0x1000); + + ASSERT_FALSE(vma_link(&mm, init_vma)); + + /* + * The flags will not be changed, the vma_modify_flags() function + * performs the merge/split only. + */ + vma = vma_modify_flags(&vmi, init_vma, init_vma, + 0x1000, 0x2000, VM_READ | VM_MAYREAD); + ASSERT_NE(vma, NULL); + /* We modify the provided VMA, and on split allocate new VMAs. 
*/ + ASSERT_EQ(vma, init_vma); + + ASSERT_EQ(vma->vm_start, 0x1000); + ASSERT_EQ(vma->vm_end, 0x2000); + ASSERT_EQ(vma->vm_pgoff, 1); + + /* + * Now walk through the three split VMAs and make sure they are as + * expected. + */ + + vma_iter_set(&vmi, 0); + vma = vma_iter_load(&vmi); + + ASSERT_EQ(vma->vm_start, 0); + ASSERT_EQ(vma->vm_end, 0x1000); + ASSERT_EQ(vma->vm_pgoff, 0); + + vm_area_free(vma); + vma_iter_clear(&vmi); + + vma = vma_next(&vmi); + + ASSERT_EQ(vma->vm_start, 0x1000); + ASSERT_EQ(vma->vm_end, 0x2000); + ASSERT_EQ(vma->vm_pgoff, 1); + + vm_area_free(vma); + vma_iter_clear(&vmi); + + vma = vma_next(&vmi); + + ASSERT_EQ(vma->vm_start, 0x2000); + ASSERT_EQ(vma->vm_end, 0x3000); + ASSERT_EQ(vma->vm_pgoff, 2); + + vm_area_free(vma); + mtree_destroy(&mm.mm_mt); + + return true; +} + +static bool test_simple_expand(void) +{ + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + struct vm_area_struct *vma = alloc_vma(&mm, 0, 0x1000, 0, flags); + VMA_ITERATOR(vmi, &mm, 0); + + ASSERT_FALSE(vma_link(&mm, vma)); + + ASSERT_FALSE(vma_expand(&vmi, vma, 0, 0x3000, 0, NULL)); + + ASSERT_EQ(vma->vm_start, 0); + ASSERT_EQ(vma->vm_end, 0x3000); + ASSERT_EQ(vma->vm_pgoff, 0); + + vm_area_free(vma); + mtree_destroy(&mm.mm_mt); + + return true; +} + +static bool test_simple_shrink(void) +{ + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + struct vm_area_struct *vma = alloc_vma(&mm, 0, 0x3000, 0, flags); + VMA_ITERATOR(vmi, &mm, 0); + + ASSERT_FALSE(vma_link(&mm, vma)); + + ASSERT_FALSE(vma_shrink(&vmi, vma, 0, 0x1000, 0)); + + ASSERT_EQ(vma->vm_start, 0); + ASSERT_EQ(vma->vm_end, 0x1000); + ASSERT_EQ(vma->vm_pgoff, 0); + + vm_area_free(vma); + mtree_destroy(&mm.mm_mt); + + return true; +} + +int main(void) +{ + int num_tests = 0, num_fail = 0; + + maple_tree_init(); + +#define TEST(name) \ + do { \ + num_tests++; \ + if (!test_##name()) { \ + num_fail++; \ + fprintf(stderr, "Test " #name " FAILED\n"); \ + } \ + } while (0) + + TEST(simple_merge); + TEST(simple_modify); + TEST(simple_expand); + TEST(simple_shrink); + +#undef TEST + + printf("%d tests run, %d passed, %d failed.\n", + num_tests, num_tests - num_fail, num_fail); + + return num_fail == 0 ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h new file mode 100644 index 000000000000..093560e5b2ac --- /dev/null +++ b/tools/testing/vma/vma_internal.h @@ -0,0 +1,882 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * vma_internal.h + * + * Header providing userland wrappers and shims for the functionality provided + * by mm/vma_internal.h. + * + * We make the header guard the same as mm/vma_internal.h, so if this shim + * header is included, it precludes the inclusion of the kernel one. 
+ */ + +#ifndef __MM_VMA_INTERNAL_H +#define __MM_VMA_INTERNAL_H + +#define __private +#define __bitwise +#define __randomize_layout + +#define CONFIG_MMU +#define CONFIG_PER_VMA_LOCK + +#include + +#include +#include +#include +#include +#include + +#define VM_WARN_ON(_expr) (WARN_ON(_expr)) +#define VM_WARN_ON_ONCE(_expr) (WARN_ON_ONCE(_expr)) +#define VM_BUG_ON(_expr) (BUG_ON(_expr)) +#define VM_BUG_ON_VMA(_expr, _vma) (BUG_ON(_expr)) + +#define VM_NONE 0x00000000 +#define VM_READ 0x00000001 +#define VM_WRITE 0x00000002 +#define VM_EXEC 0x00000004 +#define VM_SHARED 0x00000008 +#define VM_MAYREAD 0x00000010 +#define VM_MAYWRITE 0x00000020 +#define VM_GROWSDOWN 0x00000100 +#define VM_PFNMAP 0x00000400 +#define VM_LOCKED 0x00002000 +#define VM_IO 0x00004000 +#define VM_DONTEXPAND 0x00040000 +#define VM_ACCOUNT 0x00100000 +#define VM_MIXEDMAP 0x10000000 +#define VM_STACK VM_GROWSDOWN +#define VM_SHADOW_STACK VM_NONE +#define VM_SOFTDIRTY 0 + +#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) +#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) + +#define FIRST_USER_ADDRESS 0UL +#define USER_PGTABLES_CEILING 0UL + +#define vma_policy(vma) NULL + +#define down_write_nest_lock(sem, nest_lock) + +#define pgprot_val(x) ((x).pgprot) +#define __pgprot(x) ((pgprot_t) { (x) } ) + +#define for_each_vma(__vmi, __vma) \ + while (((__vma) = vma_next(&(__vmi))) != NULL) + +/* The MM code likes to work with exclusive end addresses */ +#define for_each_vma_range(__vmi, __vma, __end) \ + while (((__vma) = vma_find(&(__vmi), (__end))) != NULL) + +#define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) + +#define PHYS_PFN(x) ((unsigned long)((x) >> PAGE_SHIFT)) + +#define test_and_set_bit(nr, addr) __test_and_set_bit(nr, addr) +#define test_and_clear_bit(nr, addr) __test_and_clear_bit(nr, addr) + +#define TASK_SIZE ((1ul << 47)-PAGE_SIZE) + +#define AS_MM_ALL_LOCKS 2 + +#define current NULL + +/* We hardcode this for now. */ +#define sysctl_max_map_count 0x1000000UL + +#define pgoff_t unsigned long +typedef unsigned long pgprotval_t; +typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; +typedef unsigned long vm_flags_t; +typedef __bitwise unsigned int vm_fault_t; + +typedef struct refcount_struct { + atomic_t refs; +} refcount_t; + +struct kref { + refcount_t refcount; +}; + +struct anon_vma { + struct anon_vma *root; + struct rb_root_cached rb_root; +}; + +struct anon_vma_chain { + struct anon_vma *anon_vma; + struct list_head same_vma; +}; + +struct anon_vma_name { + struct kref kref; + /* The name needs to be at the end because it is dynamically sized. 
*/ + char name[]; +}; + +struct vma_iterator { + struct ma_state mas; +}; + +#define VMA_ITERATOR(name, __mm, __addr) \ + struct vma_iterator name = { \ + .mas = { \ + .tree = &(__mm)->mm_mt, \ + .index = __addr, \ + .node = NULL, \ + .status = ma_start, \ + }, \ + } + +struct address_space { + struct rb_root_cached i_mmap; + unsigned long flags; + atomic_t i_mmap_writable; +}; + +struct vm_userfaultfd_ctx {}; +struct mempolicy {}; +struct mmu_gather {}; +struct mutex {}; +#define DEFINE_MUTEX(mutexname) \ + struct mutex mutexname = {} + +struct mm_struct { + struct maple_tree mm_mt; + int map_count; /* number of VMAs */ + unsigned long total_vm; /* Total pages mapped */ + unsigned long locked_vm; /* Pages that have PG_mlocked set */ + unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */ + unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ + unsigned long stack_vm; /* VM_STACK */ +}; + +struct vma_lock { + struct rw_semaphore lock; +}; + + +struct file { + struct address_space *f_mapping; +}; + +struct vm_area_struct { + /* The first cache line has the info for VMA tree walking. */ + + union { + struct { + /* VMA covers [vm_start; vm_end) addresses within mm */ + unsigned long vm_start; + unsigned long vm_end; + }; +#ifdef CONFIG_PER_VMA_LOCK + struct rcu_head vm_rcu; /* Used for deferred freeing. */ +#endif + }; + + struct mm_struct *vm_mm; /* The address space we belong to. */ + pgprot_t vm_page_prot; /* Access permissions of this VMA. */ + + /* + * Flags, see mm.h. + * To modify use vm_flags_{init|reset|set|clear|mod} functions. + */ + union { + const vm_flags_t vm_flags; + vm_flags_t __private __vm_flags; + }; + +#ifdef CONFIG_PER_VMA_LOCK + /* Flag to indicate areas detached from the mm->mm_mt tree */ + bool detached; + + /* + * Can only be written (using WRITE_ONCE()) while holding both: + * - mmap_lock (in write mode) + * - vm_lock->lock (in write mode) + * Can be read reliably while holding one of: + * - mmap_lock (in read or write mode) + * - vm_lock->lock (in read or write mode) + * Can be read unreliably (using READ_ONCE()) for pessimistic bailout + * while holding nothing (except RCU to keep the VMA struct allocated). + * + * This sequence counter is explicitly allowed to overflow; sequence + * counter reuse can only lead to occasional unnecessary use of the + * slowpath. + */ + int vm_lock_seq; + struct vma_lock *vm_lock; +#endif + + /* + * For areas with an address space and backing store, + * linkage into the address_space->i_mmap interval tree. + * + */ + struct { + struct rb_node rb; + unsigned long rb_subtree_last; + } shared; + + /* + * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma + * list, after a COW of one of the file pages. A MAP_SHARED vma + * can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack + * or brk vma (with NULL file) can only be in an anon_vma list. + */ + struct list_head anon_vma_chain; /* Serialized by mmap_lock & + * page_table_lock */ + struct anon_vma *anon_vma; /* Serialized by page_table_lock */ + + /* Function pointers to deal with this struct. */ + const struct vm_operations_struct *vm_ops; + + /* Information about our backing store: */ + unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE + units */ + struct file * vm_file; /* File we map to (can be NULL). 
*/ + void * vm_private_data; /* was vm_pte (shared mem) */ + +#ifdef CONFIG_ANON_VMA_NAME + /* + * For private and shared anonymous mappings, a pointer to a null + * terminated string containing the name given to the vma, or NULL if + * unnamed. Serialized by mmap_lock. Use anon_vma_name to access. + */ + struct anon_vma_name *anon_name; +#endif +#ifdef CONFIG_SWAP + atomic_long_t swap_readahead_info; +#endif +#ifndef CONFIG_MMU + struct vm_region *vm_region; /* NOMMU mapping region */ +#endif +#ifdef CONFIG_NUMA + struct mempolicy *vm_policy; /* NUMA policy for the VMA */ +#endif +#ifdef CONFIG_NUMA_BALANCING + struct vma_numab_state *numab_state; /* NUMA Balancing state */ +#endif + struct vm_userfaultfd_ctx vm_userfaultfd_ctx; +} __randomize_layout; + +struct vm_fault {}; + +struct vm_operations_struct { + void (*open)(struct vm_area_struct * area); + /** + * @close: Called when the VMA is being removed from the MM. + * Context: User context. May sleep. Caller holds mmap_lock. + */ + void (*close)(struct vm_area_struct * area); + /* Called any time before splitting to check if it's allowed */ + int (*may_split)(struct vm_area_struct *area, unsigned long addr); + int (*mremap)(struct vm_area_struct *area); + /* + * Called by mprotect() to make driver-specific permission + * checks before mprotect() is finalised. The VMA must not + * be modified. Returns 0 if mprotect() can proceed. + */ + int (*mprotect)(struct vm_area_struct *vma, unsigned long start, + unsigned long end, unsigned long newflags); + vm_fault_t (*fault)(struct vm_fault *vmf); + vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order); + vm_fault_t (*map_pages)(struct vm_fault *vmf, + pgoff_t start_pgoff, pgoff_t end_pgoff); + unsigned long (*pagesize)(struct vm_area_struct * area); + + /* notification that a previously read-only page is about to become + * writable, if an error is returned it will cause a SIGBUS */ + vm_fault_t (*page_mkwrite)(struct vm_fault *vmf); + + /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */ + vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf); + + /* called by access_process_vm when get_user_pages() fails, typically + * for use by special VMAs. See also generic_access_phys() for a generic + * implementation useful for any iomem mapping. + */ + int (*access)(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write); + + /* Called by the /proc/PID/maps code to ask the vma whether it + * has a special name. Returning non-NULL will also cause this + * vma to be dumped unconditionally. */ + const char *(*name)(struct vm_area_struct *vma); + +#ifdef CONFIG_NUMA + /* + * set_policy() op must add a reference to any non-NULL @new mempolicy + * to hold the policy upon return. Caller should pass NULL @new to + * remove a policy and fall back to surrounding context--i.e. do not + * install a MPOL_DEFAULT policy, nor the task or system default + * mempolicy. + */ + int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new); + + /* + * get_policy() op must add reference [mpol_get()] to any policy at + * (vma,addr) marked as MPOL_SHARED. The shared policy infrastructure + * in mm/mempolicy.c will do this automatically. + * get_policy() must NOT add a ref if the policy at (vma,addr) is not + * marked as MPOL_SHARED. vma policies are protected by the mmap_lock. + * If no [shared/vma] mempolicy exists at the addr, get_policy() op + * must return NULL--i.e., do not "fallback" to task or system default + * policy. 
+ */ + struct mempolicy *(*get_policy)(struct vm_area_struct *vma, + unsigned long addr, pgoff_t *ilx); +#endif + /* + * Called by vm_normal_page() for special PTEs to find the + * page for @addr. This is useful if the default behavior + * (using pte_page()) would not find the correct page. + */ + struct page *(*find_special_page)(struct vm_area_struct *vma, + unsigned long addr); +}; + +static inline void vma_iter_invalidate(struct vma_iterator *vmi) +{ + mas_pause(&vmi->mas); +} + +static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) +{ + return __pgprot(pgprot_val(oldprot) | pgprot_val(newprot)); +} + +static inline pgprot_t vm_get_page_prot(unsigned long vm_flags) +{ + return __pgprot(vm_flags); +} + +static inline bool is_shared_maywrite(vm_flags_t vm_flags) +{ + return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == + (VM_SHARED | VM_MAYWRITE); +} + +static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma) +{ + return is_shared_maywrite(vma->vm_flags); +} + +static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi) +{ + /* + * Uses mas_find() to get the first VMA when the iterator starts. + * Calling mas_next() could skip the first entry. + */ + return mas_find(&vmi->mas, ULONG_MAX); +} + +static inline bool vma_lock_alloc(struct vm_area_struct *vma) +{ + vma->vm_lock = calloc(1, sizeof(struct vma_lock)); + + if (!vma->vm_lock) + return false; + + init_rwsem(&vma->vm_lock->lock); + vma->vm_lock_seq = -1; + + return true; +} + +static inline void vma_assert_write_locked(struct vm_area_struct *); +static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) +{ + /* When detaching vma should be write-locked */ + if (detached) + vma_assert_write_locked(vma); + vma->detached = detached; +} + +extern const struct vm_operations_struct vma_dummy_vm_ops; + +static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) +{ + memset(vma, 0, sizeof(*vma)); + vma->vm_mm = mm; + vma->vm_ops = &vma_dummy_vm_ops; + INIT_LIST_HEAD(&vma->anon_vma_chain); + vma_mark_detached(vma, false); +} + +static inline struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) +{ + struct vm_area_struct *vma = calloc(1, sizeof(struct vm_area_struct)); + + if (!vma) + return NULL; + + vma_init(vma, mm); + if (!vma_lock_alloc(vma)) { + free(vma); + return NULL; + } + + return vma; +} + +static inline struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) +{ + struct vm_area_struct *new = calloc(1, sizeof(struct vm_area_struct)); + + if (!new) + return NULL; + + memcpy(new, orig, sizeof(*new)); + if (!vma_lock_alloc(new)) { + free(new); + return NULL; + } + INIT_LIST_HEAD(&new->anon_vma_chain); + + return new; +} + +/* + * These are defined in vma.h, but sadly vm_stat_account() is referenced by + * kernel/fork.c, so we have to make these broadly available there, and + * temporarily define them here to resolve the dependency cycle.
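The calloc()-backed vm_area_alloc()/vm_area_dup() above are close enough to the kernel's slab versions for tests to drive real VMA code. A hedged usage sketch (helper name illustrative; teardown uses vm_area_free(), defined further down):

static struct vm_area_struct *make_test_vma(struct mm_struct *mm,
					    unsigned long start,
					    unsigned long end)
{
	struct vm_area_struct *vma = vm_area_alloc(mm);

	if (!vma)
		return NULL;

	/* [start, end) are byte addresses; vm_pgoff is in PAGE_SIZE units. */
	vma->vm_start = start;
	vma->vm_end = end;
	vma->vm_pgoff = start >> PAGE_SHIFT;

	return vma;	/* release with vm_area_free() */
}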
+ */ + +#define is_exec_mapping(flags) \ + ((flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC) + +#define is_stack_mapping(flags) \ + (((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK)) + +#define is_data_mapping(flags) \ + ((flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE) + +static inline void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, + long npages) +{ + WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm)+npages); + + if (is_exec_mapping(flags)) + mm->exec_vm += npages; + else if (is_stack_mapping(flags)) + mm->stack_vm += npages; + else if (is_data_mapping(flags)) + mm->data_vm += npages; +} + +#undef is_exec_mapping +#undef is_stack_mapping +#undef is_data_mapping + +/* Currently stubbed but we may later wish to un-stub. */ +static inline void vm_acct_memory(long pages); +static inline void vm_unacct_memory(long pages) +{ + vm_acct_memory(-pages); +} + +static inline void mapping_allow_writable(struct address_space *mapping) +{ + atomic_inc(&mapping->i_mmap_writable); +} + +static inline void vma_set_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + pgoff_t pgoff) +{ + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; +} + +static inline +struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) +{ + return mas_find(&vmi->mas, max - 1); +} + +static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, + unsigned long start, unsigned long end, gfp_t gfp) +{ + __mas_set_range(&vmi->mas, start, end - 1); + mas_store_gfp(&vmi->mas, NULL, gfp); + if (unlikely(mas_is_err(&vmi->mas))) + return -ENOMEM; + + return 0; +} + +static inline void mmap_assert_locked(struct mm_struct *); +static inline struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, + unsigned long start_addr, + unsigned long end_addr) +{ + unsigned long index = start_addr; + + mmap_assert_locked(mm); + return mt_find(&mm->mm_mt, &index, end_addr - 1); +} + +static inline +struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr) +{ + return mtree_load(&mm->mm_mt, addr); +} + +static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi) +{ + return mas_prev(&vmi->mas, 0); +} + +static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr) +{ + mas_set(&vmi->mas, addr); +} + +static inline bool vma_is_anonymous(struct vm_area_struct *vma) +{ + return !vma->vm_ops; +} + +/* Defined in vma.h, so temporarily define here to avoid circular dependency. */ +#define vma_iter_load(vmi) \ + mas_walk(&(vmi)->mas) + +static inline struct vm_area_struct * +find_vma_prev(struct mm_struct *mm, unsigned long addr, + struct vm_area_struct **pprev) +{ + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, addr); + + vma = vma_iter_load(&vmi); + *pprev = vma_prev(&vmi); + if (!vma) + vma = vma_next(&vmi); + return vma; +} + +#undef vma_iter_load + +static inline void vma_iter_init(struct vma_iterator *vmi, + struct mm_struct *mm, unsigned long addr) +{ + mas_init(&vmi->mas, &mm->mm_mt, addr); +} + +/* Stubbed functions. 
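To make the accounting above concrete: with those classification macros, a private read/write mapping only moves total_vm and data_vm, while an executable, non-writable one moves exec_vm. A worked example (illustrative, not part of the patch):

static void example_accounting(struct mm_struct *mm)
{
	/* VM_WRITE set, VM_SHARED/VM_STACK clear: is_data_mapping() holds. */
	vm_stat_account(mm, VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE, 4);
	/* mm->total_vm += 4, mm->data_vm += 4 */

	/* VM_EXEC set, VM_WRITE/VM_STACK clear: is_exec_mapping() holds. */
	vm_stat_account(mm, VM_EXEC | VM_READ, 2);
	/* mm->total_vm += 2, mm->exec_vm += 2 */
}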
*/ + +static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) +{ + return NULL; +} + +static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, + struct vm_userfaultfd_ctx vm_ctx) +{ + return true; +} + +static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, + struct anon_vma_name *anon_name2) +{ + return true; +} + +static inline void might_sleep(void) +{ +} + +static inline unsigned long vma_pages(struct vm_area_struct *vma) +{ + return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; +} + +static inline void fput(struct file *) +{ +} + +static inline void mpol_put(struct mempolicy *) +{ +} + +static inline void vma_lock_free(struct vm_area_struct *vma) +{ + free(vma->vm_lock); +} + +static inline void __vm_area_free(struct vm_area_struct *vma) +{ + vma_lock_free(vma); + free(vma); +} + +static inline void vm_area_free(struct vm_area_struct *vma) +{ + __vm_area_free(vma); +} + +static inline void lru_add_drain(void) +{ +} + +static inline void tlb_gather_mmu(struct mmu_gather *, struct mm_struct *) +{ +} + +static inline void update_hiwater_rss(struct mm_struct *) +{ +} + +static inline void update_hiwater_vm(struct mm_struct *) +{ +} + +static inline void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, + struct vm_area_struct *vma, unsigned long start_addr, + unsigned long end_addr, unsigned long tree_end, + bool mm_wr_locked) +{ + (void)tlb; + (void)mas; + (void)vma; + (void)start_addr; + (void)end_addr; + (void)tree_end; + (void)mm_wr_locked; +} + +static inline void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, + struct vm_area_struct *vma, unsigned long floor, + unsigned long ceiling, bool mm_wr_locked) +{ + (void)tlb; + (void)mas; + (void)vma; + (void)floor; + (void)ceiling; + (void)mm_wr_locked; +} + +static inline void mapping_unmap_writable(struct address_space *) +{ +} + +static inline void flush_dcache_mmap_lock(struct address_space *) +{ +} + +static inline void tlb_finish_mmu(struct mmu_gather *) +{ +} + +static inline void get_file(struct file *) +{ +} + +static inline int vma_dup_policy(struct vm_area_struct *, struct vm_area_struct *) +{ + return 0; +} + +static inline int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *) +{ + return 0; +} + +static inline void vma_start_write(struct vm_area_struct *) +{ +} + +static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + long adjust_next) +{ + (void)vma; + (void)start; + (void)end; + (void)adjust_next; +} + +static inline void vma_iter_free(struct vma_iterator *vmi) +{ + mas_destroy(&vmi->mas); +} + +static inline void vm_acct_memory(long pages) +{ +} + +static inline void vma_interval_tree_insert(struct vm_area_struct *, + struct rb_root_cached *) +{ +} + +static inline void vma_interval_tree_remove(struct vm_area_struct *, + struct rb_root_cached *) +{ +} + +static inline void flush_dcache_mmap_unlock(struct address_space *) +{ +} + +static inline void anon_vma_interval_tree_insert(struct anon_vma_chain*, + struct rb_root_cached *) +{ +} + +static inline void anon_vma_interval_tree_remove(struct anon_vma_chain*, + struct rb_root_cached *) +{ +} + +static inline void uprobe_mmap(struct vm_area_struct *) +{ +} + +static inline void uprobe_munmap(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + (void)vma; + (void)start; + (void)end; +} + +static inline void i_mmap_lock_write(struct address_space *) +{ +} + +static inline void anon_vma_lock_write(struct 
anon_vma *) +{ +} + +static inline void vma_assert_write_locked(struct vm_area_struct *) +{ +} + +static inline void unlink_anon_vmas(struct vm_area_struct *) +{ +} + +static inline void anon_vma_unlock_write(struct anon_vma *) +{ +} + +static inline void i_mmap_unlock_write(struct address_space *) +{ +} + +static inline void anon_vma_merge(struct vm_area_struct *, + struct vm_area_struct *) +{ +} + +static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + struct list_head *unmaps) +{ + (void)vma; + (void)start; + (void)end; + (void)unmaps; + + return 0; +} + +static inline void mmap_write_downgrade(struct mm_struct *) +{ +} + +static inline void mmap_read_unlock(struct mm_struct *) +{ +} + +static inline void mmap_write_unlock(struct mm_struct *) +{ +} + +static inline bool can_modify_mm(struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + (void)mm; + (void)start; + (void)end; + + return true; +} + +static inline void arch_unmap(struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + (void)mm; + (void)start; + (void)end; +} + +static inline void mmap_assert_locked(struct mm_struct *) +{ +} + +static inline bool mpol_equal(struct mempolicy *, struct mempolicy *) +{ + return true; +} + +static inline void khugepaged_enter_vma(struct vm_area_struct *vma, + unsigned long vm_flags) +{ + (void)vma; + (void)vm_flags; +} + +static inline bool mapping_can_writeback(struct address_space *) +{ + return true; +} + +static inline bool is_vm_hugetlb_page(struct vm_area_struct *) +{ + return false; +} + +static inline bool vma_soft_dirty_enabled(struct vm_area_struct *) +{ + return false; +} + +static inline bool userfaultfd_wp(struct vm_area_struct *) +{ + return false; +} + +static inline void mmap_assert_write_locked(struct mm_struct *) +{ +} + +static inline void mutex_lock(struct mutex *) +{ +} + +static inline void mutex_unlock(struct mutex *) +{ +} + +static inline bool mutex_is_locked(struct mutex *) +{ + return true; +} + +static inline bool signal_pending(void *) +{ + return false; +} + +#endif /* __MM_VMA_INTERNAL_H */ -- cgit v1.2.3 From 67203f3f2a63d429272f0c80451e5fcc469fdb46 Mon Sep 17 00:00:00 2001 From: Pedro Falcato Date: Wed, 7 Aug 2024 18:33:36 +0100 Subject: selftests/mm: add mseal test for no-discard madvise Add an mseal test for madvise() operations that aren't considered "discard" (e.g purely advisory ops such as MADV_RANDOM). [pedro.falcato@gmail.com: adjust the mseal test's plan] Link: https://lkml.kernel.org/r/20240807203724.2686144-1-pedro.falcato@gmail.com Link: https://lkml.kernel.org/r/20240807173336.2523757-3-pedro.falcato@gmail.com Signed-off-by: Pedro Falcato Tested-by: Jeff Xu Reviewed-by: Jeff Xu Cc: Kees Cook Cc: Liam R. 
Howlett Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mseal_test.c | 36 ++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/mseal_test.c b/tools/testing/selftests/mm/mseal_test.c index a818f010de47..7eec3f0152e3 100644 --- a/tools/testing/selftests/mm/mseal_test.c +++ b/tools/testing/selftests/mm/mseal_test.c @@ -1731,6 +1731,38 @@ static void test_seal_discard_ro_anon(bool seal) REPORT_TEST_PASS(); } +static void test_seal_madvise_nodiscard(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = seal_single_address(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* + * Test a random madvise flag like MADV_RANDOM that does not touch page + * contents (and thus should work for msealed VMAs). RANDOM also happens to + * share bits with other discard-ish flags like REMOVE. + */ + ret = sys_madvise(ptr, size, MADV_RANDOM); + FAIL_TEST_IF_FALSE(!ret); + + ret = sys_munmap(ptr, size); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + REPORT_TEST_PASS(); +} + int main(int argc, char **argv) { bool test_seal = seal_support(); @@ -1743,7 +1775,7 @@ int main(int argc, char **argv) if (!pkey_supported()) ksft_print_msg("PKEY not supported\n"); - ksft_set_plan(80); + ksft_set_plan(82); test_seal_addseal(); test_seal_unmapped_start(); @@ -1822,6 +1854,8 @@ int main(int argc, char **argv) test_seal_mremap_move_fixed_zero(true); test_seal_mremap_move_dontunmap_anyaddr(false); test_seal_mremap_move_dontunmap_anyaddr(true); + test_seal_madvise_nodiscard(false); + test_seal_madvise_nodiscard(true); test_seal_discard_ro_anon(false); test_seal_discard_ro_anon(true); test_seal_discard_ro_anon_on_rw(false); -- cgit v1.2.3 From 072cd213b75eb01fcf40eff898f8d5c008ce1457 Mon Sep 17 00:00:00 2001 From: Jeff Xu Date: Wed, 7 Aug 2024 21:23:20 +0000 Subject: selftest mm/mseal: fix test_seal_mremap_move_dontunmap_anyaddr the syscall remap accepts following: mremap(src, size, size, MREMAP_MAYMOVE | MREMAP_DONTUNMAP, dst) when the src is sealed, the call will fail with error code: EPERM Previously, the test uses hard-coded 0xdeaddead as dst, and it will fail on the system with newer glibc installed. This patch removes test's dependency on glibc for mremap(), also fix the test and remove the hardcoded address. Link: https://lkml.kernel.org/r/20240807212320.2831848-1-jeffxu@chromium.org Fixes: 4926c7a52de7 ("selftest mm/mseal memory sealing") Signed-off-by: Jeff Xu Reported-by: Pedro Falcato Cc: Dave Hansen Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mseal_test.c | 57 +++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 21 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/mseal_test.c b/tools/testing/selftests/mm/mseal_test.c index 7eec3f0152e3..bd0bfda7aae7 100644 --- a/tools/testing/selftests/mm/mseal_test.c +++ b/tools/testing/selftests/mm/mseal_test.c @@ -110,6 +110,16 @@ static int sys_madvise(void *start, size_t len, int types) return sret; } +static void *sys_mremap(void *addr, size_t old_len, size_t new_len, + unsigned long flags, void *new_addr) +{ + void *sret; + + errno = 0; + sret = (void *) syscall(__NR_mremap, addr, old_len, new_len, flags, new_addr); + return sret; +} + static int sys_pkey_alloc(unsigned long flags, unsigned long init_val) { int ret = syscall(__NR_pkey_alloc, flags, init_val); @@ -1115,12 +1125,12 @@ static void test_seal_mremap_shrink(bool seal) } /* shrink from 4 pages to 2 pages. */ - ret2 = mremap(ptr, size, 2 * page_size, 0, 0); + ret2 = sys_mremap(ptr, size, 2 * page_size, 0, 0); if (seal) { - FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(ret2 == (void *) MAP_FAILED); FAIL_TEST_IF_FALSE(errno == EPERM); } else { - FAIL_TEST_IF_FALSE(ret2 != MAP_FAILED); + FAIL_TEST_IF_FALSE(ret2 != (void *) MAP_FAILED); } @@ -1147,7 +1157,7 @@ static void test_seal_mremap_expand(bool seal) } /* expand from 2 page to 4 pages. */ - ret2 = mremap(ptr, 2 * page_size, 4 * page_size, 0, 0); + ret2 = sys_mremap(ptr, 2 * page_size, 4 * page_size, 0, 0); if (seal) { FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); FAIL_TEST_IF_FALSE(errno == EPERM); @@ -1180,7 +1190,7 @@ static void test_seal_mremap_move(bool seal) } /* move from ptr to fixed address. 
*/ - ret2 = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, newPtr); + ret2 = sys_mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, newPtr); if (seal) { FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); FAIL_TEST_IF_FALSE(errno == EPERM); @@ -1299,7 +1309,7 @@ static void test_seal_mremap_shrink_fixed(bool seal) } /* mremap to move and shrink to fixed address */ - ret2 = mremap(ptr, size, 2 * page_size, MREMAP_MAYMOVE | MREMAP_FIXED, + ret2 = sys_mremap(ptr, size, 2 * page_size, MREMAP_MAYMOVE | MREMAP_FIXED, newAddr); if (seal) { FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); @@ -1330,7 +1340,7 @@ static void test_seal_mremap_expand_fixed(bool seal) } /* mremap to move and expand to fixed address */ - ret2 = mremap(ptr, page_size, size, MREMAP_MAYMOVE | MREMAP_FIXED, + ret2 = sys_mremap(ptr, page_size, size, MREMAP_MAYMOVE | MREMAP_FIXED, newAddr); if (seal) { FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); @@ -1361,7 +1371,7 @@ static void test_seal_mremap_move_fixed(bool seal) } /* mremap to move to fixed address */ - ret2 = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, newAddr); + ret2 = sys_mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, newAddr); if (seal) { FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); FAIL_TEST_IF_FALSE(errno == EPERM); @@ -1390,14 +1400,13 @@ static void test_seal_mremap_move_fixed_zero(bool seal) /* * MREMAP_FIXED can move the mapping to zero address */ - ret2 = mremap(ptr, size, 2 * page_size, MREMAP_MAYMOVE | MREMAP_FIXED, + ret2 = sys_mremap(ptr, size, 2 * page_size, MREMAP_MAYMOVE | MREMAP_FIXED, 0); if (seal) { FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); FAIL_TEST_IF_FALSE(errno == EPERM); } else { FAIL_TEST_IF_FALSE(ret2 == 0); - } REPORT_TEST_PASS(); @@ -1420,13 +1429,13 @@ static void test_seal_mremap_move_dontunmap(bool seal) } /* mremap to move, and don't unmap src addr. */ - ret2 = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_DONTUNMAP, 0); + ret2 = sys_mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_DONTUNMAP, 0); if (seal) { FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); FAIL_TEST_IF_FALSE(errno == EPERM); } else { + /* kernel will allocate a new address */ FAIL_TEST_IF_FALSE(ret2 != MAP_FAILED); - } REPORT_TEST_PASS(); @@ -1434,7 +1443,7 @@ static void test_seal_mremap_move_dontunmap(bool seal) static void test_seal_mremap_move_dontunmap_anyaddr(bool seal) { - void *ptr; + void *ptr, *ptr2; unsigned long page_size = getpagesize(); unsigned long size = 4 * page_size; int ret; @@ -1449,24 +1458,30 @@ static void test_seal_mremap_move_dontunmap_anyaddr(bool seal) } /* - * The 0xdeaddead should not have effect on dest addr - * when MREMAP_DONTUNMAP is set. + * The new address is any address that not allocated. + * use allocate/free to similate that. */ - ret2 = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_DONTUNMAP, - 0xdeaddead); + setup_single_address(size, &ptr2); + FAIL_TEST_IF_FALSE(ptr2 != (void *)-1); + ret = sys_munmap(ptr2, size); + FAIL_TEST_IF_FALSE(!ret); + + /* + * remap to any address. + */ + ret2 = sys_mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_DONTUNMAP, + (void *) ptr2); if (seal) { FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); FAIL_TEST_IF_FALSE(errno == EPERM); } else { - FAIL_TEST_IF_FALSE(ret2 != MAP_FAILED); - FAIL_TEST_IF_FALSE((long)ret2 != 0xdeaddead); - + /* remap success and return ptr2 */ + FAIL_TEST_IF_FALSE(ret2 == ptr2); } REPORT_TEST_PASS(); } - static void test_seal_merge_and_split(void) { void *ptr; -- cgit v1.2.3 From 5adfeaecc487e7023f1c7bbdc081707d7a93110f Mon Sep 17 00:00:00 2001 From: "Kirill A. 
Shutemov" Date: Fri, 9 Aug 2024 14:48:51 +0300 Subject: mm: rework accept memory helpers Make accept_memory() and range_contains_unaccepted_memory() take 'start' and 'size' arguments instead of 'start' and 'end'. Remove accept_page(), replacing it with direct calls to accept_memory(). The accept_page() name is going to be used for a different function. Link: https://lkml.kernel.org/r/20240809114854.3745464-6-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Suggested-by: David Hildenbrand Cc: Borislav Petkov Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Mel Gorman Cc: Mike Rapoport (Microsoft) Cc: Tom Lendacky Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/boot/compressed/misc.c | 2 +- arch/x86/boot/compressed/misc.h | 2 +- drivers/firmware/efi/libstub/efistub.h | 2 +- drivers/firmware/efi/libstub/unaccepted_memory.c | 3 ++- drivers/firmware/efi/unaccepted_memory.c | 18 ++++++++++-------- include/linux/mm.h | 12 +++++------- mm/memblock.c | 2 +- mm/mm_init.c | 2 +- mm/page_alloc.c | 19 +++---------------- tools/testing/memblock/internal.h | 2 +- 10 files changed, 26 insertions(+), 38 deletions(-) (limited to 'tools') diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 944454306ef4..04a35b2c26e9 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -511,7 +511,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output) if (init_unaccepted_memory()) { debug_putstr("Accepting memory... "); - accept_memory(__pa(output), __pa(output) + needed_size); + accept_memory(__pa(output), needed_size); } entry_offset = decompress_kernel(output, virt_addr, error); diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index b353a7be380c..dd8d1a85f671 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -256,6 +256,6 @@ static inline bool init_unaccepted_memory(void) { return false; } /* Defined in EFI stub */ extern struct efi_unaccepted_memory *unaccepted_table; -void accept_memory(phys_addr_t start, phys_addr_t end); +void accept_memory(phys_addr_t start, unsigned long size); #endif /* BOOT_COMPRESSED_MISC_H */ diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index d33ccbc4a2c6..685098f9626f 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -1229,7 +1229,7 @@ efi_zboot_entry(efi_handle_t handle, efi_system_table_t *systab); efi_status_t allocate_unaccepted_bitmap(__u32 nr_desc, struct efi_boot_memmap *map); void process_unaccepted_memory(u64 start, u64 end); -void accept_memory(phys_addr_t start, phys_addr_t end); +void accept_memory(phys_addr_t start, unsigned long size); void arch_accept_memory(phys_addr_t start, phys_addr_t end); #endif diff --git a/drivers/firmware/efi/libstub/unaccepted_memory.c b/drivers/firmware/efi/libstub/unaccepted_memory.c index c295ea3a6efc..757dbe734a47 100644 --- a/drivers/firmware/efi/libstub/unaccepted_memory.c +++ b/drivers/firmware/efi/libstub/unaccepted_memory.c @@ -177,9 +177,10 @@ void process_unaccepted_memory(u64 start, u64 end) start / unit_size, (end - start) / unit_size); } -void accept_memory(phys_addr_t start, phys_addr_t end) +void accept_memory(phys_addr_t start, unsigned long size) { unsigned long range_start, range_end; + phys_addr_t end = start + size; unsigned long bitmap_size; u64 unit_size; diff --git a/drivers/firmware/efi/unaccepted_memory.c b/drivers/firmware/efi/unaccepted_memory.c 
index 50f6503fe49f..c2c067eff634 100644 --- a/drivers/firmware/efi/unaccepted_memory.c +++ b/drivers/firmware/efi/unaccepted_memory.c @@ -30,11 +30,12 @@ static LIST_HEAD(accepting_list); * - memory that is below phys_base; * - memory that is above the memory that addressable by the bitmap; */ -void accept_memory(phys_addr_t start, phys_addr_t end) +void accept_memory(phys_addr_t start, unsigned long size) { struct efi_unaccepted_memory *unaccepted; unsigned long range_start, range_end; struct accept_range range, *entry; + phys_addr_t end = start + size; unsigned long flags; u64 unit_size; @@ -74,13 +75,13 @@ void accept_memory(phys_addr_t start, phys_addr_t end) * "guard" page is accepted in addition to the memory that needs to be * used: * - * 1. Implicitly extend the range_contains_unaccepted_memory(start, end) - * checks up to end+unit_size if 'end' is aligned on a unit_size - * boundary. + * 1. Implicitly extend the range_contains_unaccepted_memory(start, size) + * checks up to the next unit_size if 'start+size' is aligned on a + * unit_size boundary. * - * 2. Implicitly extend accept_memory(start, end) to end+unit_size if - * 'end' is aligned on a unit_size boundary. (immediately following - * this comment) + * 2. Implicitly extend accept_memory(start, size) to the next unit_size + * if 'size+end' is aligned on a unit_size boundary. (immediately + * following this comment) */ if (!(end % unit_size)) end += unit_size; @@ -156,9 +157,10 @@ retry: spin_unlock_irqrestore(&unaccepted_memory_lock, flags); } -bool range_contains_unaccepted_memory(phys_addr_t start, phys_addr_t end) +bool range_contains_unaccepted_memory(phys_addr_t start, unsigned long size) { struct efi_unaccepted_memory *unaccepted; + phys_addr_t end = start + size; unsigned long flags; bool ret = false; u64 unit_size; diff --git a/include/linux/mm.h b/include/linux/mm.h index ee8cea73d415..b1eed30fdc06 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4062,18 +4062,18 @@ madvise_set_anon_name(struct mm_struct *mm, unsigned long start, #ifdef CONFIG_UNACCEPTED_MEMORY -bool range_contains_unaccepted_memory(phys_addr_t start, phys_addr_t end); -void accept_memory(phys_addr_t start, phys_addr_t end); +bool range_contains_unaccepted_memory(phys_addr_t start, unsigned long size); +void accept_memory(phys_addr_t start, unsigned long size); #else static inline bool range_contains_unaccepted_memory(phys_addr_t start, - phys_addr_t end) + unsigned long size) { return false; } -static inline void accept_memory(phys_addr_t start, phys_addr_t end) +static inline void accept_memory(phys_addr_t start, unsigned long size) { } @@ -4081,9 +4081,7 @@ static inline void accept_memory(phys_addr_t start, phys_addr_t end) static inline bool pfn_is_unaccepted_memory(unsigned long pfn) { - phys_addr_t paddr = pfn << PAGE_SHIFT; - - return range_contains_unaccepted_memory(paddr, paddr + PAGE_SIZE); + return range_contains_unaccepted_memory(pfn << PAGE_SHIFT, PAGE_SIZE); } void vma_pgtable_walk_begin(struct vm_area_struct *vma); diff --git a/mm/memblock.c b/mm/memblock.c index 3b9dc2d89b8a..0a77a748a8eb 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1500,7 +1500,7 @@ done: * * Accept the memory of the allocated buffer. 
*/ - accept_memory(found, found + size); + accept_memory(found, size); return found; } diff --git a/mm/mm_init.c b/mm/mm_init.c index 51960079875b..f3106b6299d3 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1939,7 +1939,7 @@ static void __init deferred_free_pages(unsigned long pfn, } /* Accept chunks smaller than MAX_PAGE_ORDER upfront */ - accept_memory(PFN_PHYS(pfn), PFN_PHYS(pfn + nr_pages)); + accept_memory(PFN_PHYS(pfn), nr_pages * PAGE_SIZE); for (i = 0; i < nr_pages; i++, page++, pfn++) { if (pageblock_aligned(pfn)) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fec7ed5a04f2..6ba88929c1e3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -286,7 +286,6 @@ EXPORT_SYMBOL(nr_online_nodes); #endif static bool page_contains_unaccepted(struct page *page, unsigned int order); -static void accept_page(struct page *page, unsigned int order); static bool cond_accept_memory(struct zone *zone, unsigned int order); static inline bool has_unaccepted_memory(void); static bool __free_unaccepted(struct page *page); @@ -1268,7 +1267,7 @@ void __meminit __free_pages_core(struct page *page, unsigned int order, if (order == MAX_PAGE_ORDER && __free_unaccepted(page)) return; - accept_page(page, order); + accept_memory(page_to_phys(page), PAGE_SIZE << order); } /* @@ -6932,16 +6931,8 @@ early_param("accept_memory", accept_memory_parse); static bool page_contains_unaccepted(struct page *page, unsigned int order) { phys_addr_t start = page_to_phys(page); - phys_addr_t end = start + (PAGE_SIZE << order); - return range_contains_unaccepted_memory(start, end); -} - -static void accept_page(struct page *page, unsigned int order) -{ - phys_addr_t start = page_to_phys(page); - - accept_memory(start, start + (PAGE_SIZE << order)); + return range_contains_unaccepted_memory(start, PAGE_SIZE << order); } static bool try_to_accept_memory_one(struct zone *zone) @@ -6966,7 +6957,7 @@ static bool try_to_accept_memory_one(struct zone *zone) __ClearPageUnaccepted(page); spin_unlock_irqrestore(&zone->lock, flags); - accept_page(page, MAX_PAGE_ORDER); + accept_memory(page_to_phys(page), PAGE_SIZE << MAX_PAGE_ORDER); __free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL); @@ -7038,10 +7029,6 @@ static bool page_contains_unaccepted(struct page *page, unsigned int order) return false; } -static void accept_page(struct page *page, unsigned int order) -{ -} - static bool cond_accept_memory(struct zone *zone, unsigned int order) { return false; diff --git a/tools/testing/memblock/internal.h b/tools/testing/memblock/internal.h index f6c6e5474c3a..1cf82acb2a3e 100644 --- a/tools/testing/memblock/internal.h +++ b/tools/testing/memblock/internal.h @@ -20,7 +20,7 @@ void memblock_free_pages(struct page *page, unsigned long pfn, { } -static inline void accept_memory(phys_addr_t start, phys_addr_t end) +static inline void accept_memory(phys_addr_t start, unsigned long size) { } -- cgit v1.2.3 From 617f8e4d76b81361884db852005f519012ddd244 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Mon, 12 Aug 2024 15:05:43 -0400 Subject: maple_tree: add test to replicate low memory race conditions Add new callback fields to the userspace implementation of struct kmem_cache. This allows for executing callback functions in order to further test low memory scenarios where node allocation is retried. This callback can help test race conditions by calling a function when a low memory event is tested. Link: https://lkml.kernel.org/r/20240812190543.71967-2-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Liam R. 
Howlett Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- lib/maple_tree.c | 13 +++++++++ tools/testing/radix-tree/maple.c | 63 ++++++++++++++++++++++++++++++++++++++++ tools/testing/shared/linux.c | 26 ++++++++++++++++- 3 files changed, 101 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index b547ff211ac7..14d7864b8d53 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -7005,6 +7005,19 @@ void mt_set_non_kernel(unsigned int val) kmem_cache_set_non_kernel(maple_node_cache, val); } +extern void kmem_cache_set_callback(struct kmem_cache *cachep, + void (*callback)(void *)); +void mt_set_callback(void (*callback)(void *)) +{ + kmem_cache_set_callback(maple_node_cache, callback); +} + +extern void kmem_cache_set_private(struct kmem_cache *cachep, void *private); +void mt_set_private(void *private) +{ + kmem_cache_set_private(maple_node_cache, private); +} + extern unsigned long kmem_cache_get_alloc(struct kmem_cache *); unsigned long mt_get_alloc_size(void) { diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index cd1cf05503b4..ef5b83cf94ea 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -36224,6 +36224,65 @@ static noinline void __init check_mtree_dup(struct maple_tree *mt) extern void test_kmem_cache_bulk(void); +/* callback function used for check_nomem_writer_race() */ +static void writer2(void *maple_tree) +{ + struct maple_tree *mt = (struct maple_tree *)maple_tree; + MA_STATE(mas, mt, 6, 10); + + mtree_lock(mas.tree); + mas_store(&mas, xa_mk_value(0xC)); + mas_destroy(&mas); + mtree_unlock(mas.tree); +} + +/* + * check_nomem_writer_race() - test a possible race in the mas_nomem() path + * @mt: The tree to build. + * + * There is a possible race condition in low memory conditions when mas_nomem() + * gives up its lock. A second writer can chagne the entry that the primary + * writer executing the mas_nomem() path is modifying. This test recreates this + * scenario to ensure we are handling it correctly. 
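Put differently, the window under test opens when the primary writer drops the tree lock inside mas_nomem() to allocate. A sketch of the interleaving the test recreates (an illustration inferred from the code, not text from the patch):

/*
 * writer 1: mas_erase()/mas_store_gfp() needs nodes; allocation is denied
 * writer 1: mas_nomem() drops mtree_lock() and allocates with GFP_KERNEL
 * writer 2: runs via the kmem_cache callback, stores 0xC over [6, 10]
 * writer 1: retakes the lock and retries; the retry must not clobber [6, 10]
 */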
+ */ +static void check_nomem_writer_race(struct maple_tree *mt) +{ + MA_STATE(mas, mt, 0, 5); + + mt_set_non_kernel(0); + /* setup root with 2 values with NULL in between */ + mtree_store_range(mt, 0, 5, xa_mk_value(0xA), GFP_KERNEL); + mtree_store_range(mt, 6, 10, NULL, GFP_KERNEL); + mtree_store_range(mt, 11, 15, xa_mk_value(0xB), GFP_KERNEL); + + /* setup writer 2 that will trigger the race condition */ + mt_set_private(mt); + mt_set_callback(writer2); + + mtree_lock(mt); + /* erase 0-5 */ + mas_erase(&mas); + + /* index 6-10 should retain the value from writer 2 */ + check_load(mt, 6, xa_mk_value(0xC)); + mtree_unlock(mt); + + /* test for the same race but with mas_store_gfp() */ + mtree_store_range(mt, 0, 5, xa_mk_value(0xA), GFP_KERNEL); + mtree_store_range(mt, 6, 10, NULL, GFP_KERNEL); + + mas_set_range(&mas, 0, 5); + mtree_lock(mt); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + + /* ensure write made by writer 2 is retained */ + check_load(mt, 6, xa_mk_value(0xC)); + + mt_set_private(NULL); + mt_set_callback(NULL); + mtree_unlock(mt); +} + void farmer_tests(void) { struct maple_node *node; @@ -36257,6 +36316,10 @@ void farmer_tests(void) check_dfs_preorder(&tree); mtree_destroy(&tree); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_USE_RCU); + check_nomem_writer_race(&tree); + mtree_destroy(&tree); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); check_prealloc(&tree); mtree_destroy(&tree); diff --git a/tools/testing/shared/linux.c b/tools/testing/shared/linux.c index 4eb442206d01..17263696b5d8 100644 --- a/tools/testing/shared/linux.c +++ b/tools/testing/shared/linux.c @@ -26,8 +26,21 @@ struct kmem_cache { unsigned int non_kernel; unsigned long nr_allocated; unsigned long nr_tallocated; + bool exec_callback; + void (*callback)(void *); + void *private; }; +void kmem_cache_set_callback(struct kmem_cache *cachep, void (*callback)(void *)) +{ + cachep->callback = callback; +} + +void kmem_cache_set_private(struct kmem_cache *cachep, void *private) +{ + cachep->private = private; +} + void kmem_cache_set_non_kernel(struct kmem_cache *cachep, unsigned int val) { cachep->non_kernel = val; @@ -58,9 +71,17 @@ void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, { void *p; + if (cachep->exec_callback) { + if (cachep->callback) + cachep->callback(cachep->private); + cachep->exec_callback = false; + } + if (!(gfp & __GFP_DIRECT_RECLAIM)) { - if (!cachep->non_kernel) + if (!cachep->non_kernel) { + cachep->exec_callback = true; return NULL; + } cachep->non_kernel--; } @@ -223,6 +244,9 @@ kmem_cache_create(const char *name, unsigned int size, unsigned int align, ret->objs = NULL; ret->ctor = ctor; ret->non_kernel = 0; + ret->exec_callback = false; + ret->callback = NULL; + ret->private = NULL; return ret; } -- cgit v1.2.3 From 5d659bbb52a24f91cc6c0cf546ffcc0faa7e8f1a Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 14 Aug 2024 12:19:31 -0400 Subject: maple_tree: introduce mas_wr_store_type() Introduce mas_wr_store_type() which will set the correct store type based on a walk of the tree. In mas_wr_node_store() the <= min_slots condition is changed to < as if new_end is = to mt_min_slots then there is not enough room. mas_prealloc_calc() is also introduced to abstract the calculation used to determine the number of nodes needed for a store operation. In this change a call to mas_reset() is removed in the error case of mas_prealloc(). This is only needed in the MA_STATE_REBALANCE case of mas_destroy(). 
We can move the call to mas_reset() directly to mas_destroy(). Also, add a test case to validate the order that we check the store type in is correct. This test models a vma expanding and then shrinking which is part of the boot process. Link: https://lkml.kernel.org/r/20240814161944.55347-5-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 217 ++++++++++++++++++++++++++++----------- tools/testing/radix-tree/maple.c | 36 +++++++ 2 files changed, 193 insertions(+), 60 deletions(-) (limited to 'tools') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index de4a91ced8ca..d0b9b3795b96 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1372,9 +1372,9 @@ retry: return NULL; } + mas->node = NULL; /* empty tree */ if (unlikely(!root)) { - mas->node = NULL; mas->status = ma_none; mas->offset = MAPLE_NODE_SLOTS; return NULL; @@ -3890,7 +3890,7 @@ static inline bool mas_wr_node_store(struct ma_wr_state *wr_mas, bool in_rcu = mt_in_rcu(mas->tree); /* Check if there is enough data. The room is enough. */ - if (!mte_is_root(mas->node) && (new_end <= mt_min_slots[wr_mas->type]) && + if (!mte_is_root(mas->node) && (new_end < mt_min_slots[wr_mas->type]) && !(mas->mas_flags & MA_STATE_BULK)) return false; @@ -4275,6 +4275,146 @@ static inline void mas_wr_prealloc_setup(struct ma_wr_state *wr_mas) wr_mas->content = mas_start(mas); } +/** + * mas_prealloc_calc() - Calculate number of nodes needed for a + * given store oepration + * @mas: The maple state + * @entry: The entry to store into the tree + * + * Return: Number of nodes required for preallocation. + */ +static inline int mas_prealloc_calc(struct ma_state *mas, void *entry) +{ + int ret = mas_mt_height(mas) * 3 + 1; + + switch (mas->store_type) { + case wr_invalid: + WARN_ON_ONCE(1); + break; + case wr_new_root: + ret = 1; + break; + case wr_store_root: + if (likely((mas->last != 0) || (mas->index != 0))) + ret = 1; + else if (((unsigned long) (entry) & 3) == 2) + ret = 1; + else + ret = 0; + break; + case wr_spanning_store: + ret = mas_mt_height(mas) * 3 + 1; + break; + case wr_split_store: + ret = mas_mt_height(mas) * 2 + 1; + break; + case wr_rebalance: + ret = mas_mt_height(mas) * 2 - 1; + break; + case wr_node_store: + ret = mt_in_rcu(mas->tree) ? 1 : 0; + break; + case wr_append: + case wr_exact_fit: + case wr_slot_store: + ret = 0; + } + + return ret; +} + +/* + * mas_wr_store_type() - Set the store type for a given + * store operation. + * @wr_mas: The maple write state + */ +static inline void mas_wr_store_type(struct ma_wr_state *wr_mas) +{ + struct ma_state *mas = wr_mas->mas; + unsigned char new_end; + + if (unlikely(mas_is_none(mas) || mas_is_ptr(mas))) { + mas->store_type = wr_store_root; + return; + } + + if (unlikely(!mas_wr_walk(wr_mas))) { + mas->store_type = wr_spanning_store; + return; + } + + /* At this point, we are at the leaf node that needs to be altered. 
*/ + mas_wr_end_piv(wr_mas); + if (!wr_mas->entry) + mas_wr_extend_null(wr_mas); + + new_end = mas_wr_new_end(wr_mas); + if ((wr_mas->r_min == mas->index) && (wr_mas->r_max == mas->last)) { + mas->store_type = wr_exact_fit; + return; + } + + if (unlikely(!mas->index && mas->last == ULONG_MAX)) { + mas->store_type = wr_new_root; + return; + } + + /* Potential spanning rebalance collapsing a node */ + if (new_end < mt_min_slots[wr_mas->type]) { + if (!mte_is_root(mas->node)) { + mas->store_type = wr_rebalance; + return; + } + mas->store_type = wr_node_store; + return; + } + + if (new_end >= mt_slots[wr_mas->type]) { + mas->store_type = wr_split_store; + return; + } + + if (!mt_in_rcu(mas->tree) && (mas->offset == mas->end)) { + mas->store_type = wr_append; + return; + } + + if ((new_end == mas->end) && (!mt_in_rcu(mas->tree) || + (wr_mas->offset_end - mas->offset == 1))) { + mas->store_type = wr_slot_store; + return; + } + + if (mte_is_root(mas->node) || (new_end >= mt_min_slots[wr_mas->type]) || + (mas->mas_flags & MA_STATE_BULK)) { + mas->store_type = wr_node_store; + return; + } + + mas->store_type = wr_invalid; + MAS_WARN_ON(mas, 1); +} + +/** + * mas_wr_preallocate() - Preallocate enough nodes for a store operation + * @wr_mas: The maple write state + * @entry: The entry that will be stored + * + */ +static inline void mas_wr_preallocate(struct ma_wr_state *wr_mas, void *entry) +{ + struct ma_state *mas = wr_mas->mas; + int request; + + mas_wr_prealloc_setup(wr_mas); + mas_wr_store_type(wr_mas); + request = mas_prealloc_calc(mas, entry); + if (!request) + return; + + mas_node_count(mas, request); +} + /** * mas_insert() - Internal call to insert a value * @mas: The maple state @@ -5508,69 +5648,25 @@ EXPORT_SYMBOL_GPL(mas_store_prealloc); int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) { MA_WR_STATE(wr_mas, mas, entry); - unsigned char node_size; - int request = 1; - int ret; - - - if (unlikely(!mas->index && mas->last == ULONG_MAX)) - goto ask_now; + int ret = 0; + int request; mas_wr_prealloc_setup(&wr_mas); - /* Root expand */ - if (unlikely(mas_is_none(mas) || mas_is_ptr(mas))) - goto ask_now; - - if (unlikely(!mas_wr_walk(&wr_mas))) { - /* Spanning store, use worst case for now */ - request = 1 + mas_mt_height(mas) * 3; - goto ask_now; - } - - /* At this point, we are at the leaf node that needs to be altered. */ - /* Exact fit, no nodes needed. */ - if (wr_mas.r_min == mas->index && wr_mas.r_max == mas->last) - return 0; - - mas_wr_end_piv(&wr_mas); - node_size = mas_wr_new_end(&wr_mas); - - /* Slot store, does not require additional nodes */ - if (node_size == mas->end) { - /* reuse node */ - if (!mt_in_rcu(mas->tree)) - return 0; - /* shifting boundary */ - if (wr_mas.offset_end - mas->offset == 1) - return 0; - } + mas_wr_store_type(&wr_mas); + request = mas_prealloc_calc(mas, entry); + if (!request) + return ret; - if (node_size >= mt_slots[wr_mas.type]) { - /* Split, worst case for now. 
*/ - request = 1 + mas_mt_height(mas) * 2; - goto ask_now; + mas_node_count_gfp(mas, request, gfp); + if (mas_is_err(mas)) { + mas_set_alloc_req(mas, 0); + ret = xa_err(mas->node); + mas_destroy(mas); + mas_reset(mas); + return ret; } - /* New root needs a single node */ - if (unlikely(mte_is_root(mas->node))) - goto ask_now; - - /* Potential spanning rebalance collapsing a node, use worst-case */ - if (node_size - 1 <= mt_min_slots[wr_mas.type]) - request = mas_mt_height(mas) * 2 - 1; - - /* node store, slot store needs one node */ -ask_now: - mas_node_count_gfp(mas, request, gfp); mas->mas_flags |= MA_STATE_PREALLOC; - if (likely(!mas_is_err(mas))) - return 0; - - mas_set_alloc_req(mas, 0); - ret = xa_err(mas->node); - mas_reset(mas); - mas_destroy(mas); - mas_reset(mas); return ret; } EXPORT_SYMBOL_GPL(mas_preallocate); @@ -5596,7 +5692,8 @@ void mas_destroy(struct ma_state *mas) */ if (mas->mas_flags & MA_STATE_REBALANCE) { unsigned char end; - + if (mas_is_err(mas)) + mas_reset(mas); mas_start(mas); mtree_range_walk(mas); end = mas->end + 1; diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index ef5b83cf94ea..ad42a36231fb 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -36283,6 +36283,38 @@ static void check_nomem_writer_race(struct maple_tree *mt) mtree_unlock(mt); } + /* test to simulate expanding a vma from [0x7fffffffe000, 0x7ffffffff000) + * to [0x7ffde4ca1000, 0x7ffffffff000) and then shrinking the vma to + * [0x7ffde4ca1000, 0x7ffde4ca2000) + */ +static inline int check_vma_modification(struct maple_tree *mt) +{ + MA_STATE(mas, mt, 0, 0); + + mtree_lock(mt); + /* vma with old start and old end */ + __mas_set_range(&mas, 0x7fffffffe000, 0x7ffffffff000 - 1); + mas_preallocate(&mas, xa_mk_value(1), GFP_KERNEL); + mas_store_prealloc(&mas, xa_mk_value(1)); + + /* next write occurs partly in previous range [0, 0x7fffffffe000)*/ + mas_prev_range(&mas, 0); + /* expand vma to {0x7ffde4ca1000, 0x7ffffffff000) */ + __mas_set_range(&mas, 0x7ffde4ca1000, 0x7ffffffff000 - 1); + mas_preallocate(&mas, xa_mk_value(1), GFP_KERNEL); + mas_store_prealloc(&mas, xa_mk_value(1)); + + /* shrink vma to [0x7ffde4ca1000, 7ffde4ca2000) */ + __mas_set_range(&mas, 0x7ffde4ca2000, 0x7ffffffff000 - 1); + mas_preallocate(&mas, NULL, GFP_KERNEL); + mas_store_prealloc(&mas, NULL); + mt_dump(mt, mt_dump_hex); + + mas_destroy(&mas); + mtree_unlock(mt); + return 0; +} + void farmer_tests(void) { struct maple_node *node; @@ -36290,6 +36322,10 @@ void farmer_tests(void) mt_dump(&tree, mt_dump_dec); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | MT_FLAGS_USE_RCU); + check_vma_modification(&tree); + mtree_destroy(&tree); + tree.ma_root = xa_mk_value(0); mt_dump(&tree, mt_dump_dec); -- cgit v1.2.3 From 3cd9e92e009de9a036cbf9ec27bad33367fca6f3 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 14 Aug 2024 12:19:32 -0400 Subject: maple_tree: remove mas_destroy() from mas_nomem() Separate call to mas_destroy() from mas_nomem() so we can check for no memory errors without destroying the current maple state in mas_store_gfp(). We then add calls to mas_destroy() to callers of mas_nomem(). Link: https://lkml.kernel.org/r/20240814161944.55347-6-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Liam R. 
Howlett Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 37 ++++++++++++++++++++++++------------- tools/testing/radix-tree/maple.c | 10 ++++++---- 2 files changed, 30 insertions(+), 17 deletions(-) (limited to 'tools') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index d0b9b3795b96..58985107cf00 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4519,6 +4519,7 @@ int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp, if (*next == 0) mas->tree->ma_flags |= MT_FLAGS_ALLOC_WRAPPED; + mas_destroy(mas); return ret; } EXPORT_SYMBOL(mas_alloc_cyclic); @@ -5601,21 +5602,25 @@ int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp) unsigned long index = mas->index; unsigned long last = mas->last; MA_WR_STATE(wr_mas, mas, entry); + int ret = 0; - mas_wr_store_setup(&wr_mas); - trace_ma_write(__func__, mas, 0, entry); retry: - mas_wr_store_entry(&wr_mas); + mas_wr_preallocate(&wr_mas, entry); if (unlikely(mas_nomem(mas, gfp))) { if (!entry) __mas_set_range(mas, index, last); goto retry; } - if (unlikely(mas_is_err(mas))) - return xa_err(mas->node); + if (mas_is_err(mas)) { + ret = xa_err(mas->node); + goto out; + } - return 0; + mas_wr_store_entry(&wr_mas); +out: + mas_destroy(mas); + return ret; } EXPORT_SYMBOL_GPL(mas_store_gfp); @@ -6374,6 +6379,7 @@ write_retry: goto write_retry; } + mas_destroy(mas); return entry; } EXPORT_SYMBOL_GPL(mas_erase); @@ -6388,10 +6394,8 @@ EXPORT_SYMBOL_GPL(mas_erase); bool mas_nomem(struct ma_state *mas, gfp_t gfp) __must_hold(mas->tree->ma_lock) { - if (likely(mas->node != MA_ERROR(-ENOMEM))) { - mas_destroy(mas); + if (likely(mas->node != MA_ERROR(-ENOMEM))) return false; - } if (gfpflags_allow_blocking(gfp) && !mt_external_lock(mas->tree)) { mtree_unlock(mas->tree); @@ -6469,6 +6473,7 @@ int mtree_store_range(struct maple_tree *mt, unsigned long index, { MA_STATE(mas, mt, index, last); MA_WR_STATE(wr_mas, &mas, entry); + int ret = 0; trace_ma_write(__func__, &mas, 0, entry); if (WARN_ON_ONCE(xa_is_advanced(entry))) @@ -6484,10 +6489,12 @@ retry: goto retry; mtree_unlock(mt); + if (mas_is_err(&mas)) - return xa_err(mas.node); + ret = xa_err(mas.node); - return 0; + mas_destroy(&mas); + return ret; } EXPORT_SYMBOL(mtree_store_range); @@ -6523,6 +6530,7 @@ int mtree_insert_range(struct maple_tree *mt, unsigned long first, unsigned long last, void *entry, gfp_t gfp) { MA_STATE(ms, mt, first, last); + int ret = 0; if (WARN_ON_ONCE(xa_is_advanced(entry))) return -EINVAL; @@ -6538,9 +6546,10 @@ retry: mtree_unlock(mt); if (mas_is_err(&ms)) - return xa_err(ms.node); + ret = xa_err(ms.node); - return 0; + mas_destroy(&ms); + return ret; } EXPORT_SYMBOL(mtree_insert_range); @@ -6595,6 +6604,7 @@ retry: unlock: mtree_unlock(mt); + mas_destroy(&mas); return ret; } EXPORT_SYMBOL(mtree_alloc_range); @@ -6676,6 +6686,7 @@ retry: unlock: mtree_unlock(mt); + mas_destroy(&mas); return ret; } EXPORT_SYMBOL(mtree_alloc_rrange); diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index ad42a36231fb..c5b00aca9def 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -120,7 +120,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) MT_BUG_ON(mt, mas.alloc->slot[0] == NULL); mas_push_node(&mas, mn); mas_reset(&mas); - mas_nomem(&mas, GFP_KERNEL); /* free */ + mas_destroy(&mas); mtree_unlock(mt); @@ -144,7 +144,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) mn->parent = ma_parent_ptr(mn); 
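/*
 * With this patch, mas_nomem() no longer frees preallocated nodes, so the
 * test conversions here and below replace each final
 * mas_nomem(..., GFP_KERNEL) "free" call with an explicit mas_destroy().
 */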
ma_free_rcu(mn); mas.status = ma_start; - mas_nomem(&mas, GFP_KERNEL); + mas_destroy(&mas); /* Allocate 3 nodes, will fail. */ mas_node_count(&mas, 3); /* Drop the lock and allocate 3 nodes. */ @@ -161,7 +161,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) MT_BUG_ON(mt, mas_allocated(&mas) != 3); /* Free. */ mas_reset(&mas); - mas_nomem(&mas, GFP_KERNEL); + mas_destroy(&mas); /* Set allocation request to 1. */ mas_set_alloc_req(&mas, 1); @@ -277,6 +277,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) } mas_reset(&mas); MT_BUG_ON(mt, mas_nomem(&mas, GFP_KERNEL)); + mas_destroy(&mas); } @@ -299,7 +300,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) } MT_BUG_ON(mt, mas_allocated(&mas) != total); mas_reset(&mas); - mas_nomem(&mas, GFP_KERNEL); /* Free. */ + mas_destroy(&mas); /* Free. */ MT_BUG_ON(mt, mas_allocated(&mas) != 0); for (i = 1; i < 128; i++) { @@ -35847,6 +35848,7 @@ static noinline void __init check_nomem(struct maple_tree *mt) mas_store(&ms, &ms); /* insert 1 -> &ms */ mas_nomem(&ms, GFP_KERNEL); /* Node allocated in here. */ mtree_unlock(mt); + mas_destroy(&ms); mtree_destroy(mt); } -- cgit v1.2.3 From 11ee88a0f98770719f29b7d1efb2a2ca6a83af3c Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 29 Jul 2024 01:45:08 -0700 Subject: fault-injection: enhance failcmd to exit on non-hex address input The failcmd.sh script in the fault-injection toolkit does not currently validate whether the provided address is in hexadecimal format. This can lead to silent failures if the address is sourced from places like `/proc/kallsyms`, which omits the '0x' prefix, potentially causing users to operate under incorrect assumptions. Introduce a new function, `exit_if_not_hex`, which checks the format of the provided address and exits with an error message if the address is not a valid hexadecimal number. This enhancement prevents users from running the command with improperly formatted addresses, thus improving the robustness and usability of the failcmd tool. Link: https://lkml.kernel.org/r/20240729084512.3349928-1-leitao@debian.org Signed-off-by: Breno Leitao Reviewed-by: Akinobu Mita Signed-off-by: Andrew Morton --- tools/testing/fault-injection/failcmd.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'tools') diff --git a/tools/testing/fault-injection/failcmd.sh b/tools/testing/fault-injection/failcmd.sh index 78dac34264be..c4f2432750f4 100644 --- a/tools/testing/fault-injection/failcmd.sh +++ b/tools/testing/fault-injection/failcmd.sh @@ -64,6 +64,14 @@ ENVIRONMENT EOF } +exit_if_not_hex() { + local value="$1" + if ! [[ $value =~ ^0x[0-9a-fA-F]+$ ]]; then + echo "Error: The provided value '$value' is not a valid hexadecimal number." >&2 + exit 1 + fi +} + if [ $UID != 0 ]; then echo must be run as root >&2 exit 1 @@ -160,18 +168,22 @@ while true; do shift 2 ;; --require-start) + exit_if_not_hex "$2" echo $2 > $FAULTATTR/require-start shift 2 ;; --require-end) + exit_if_not_hex "$2" echo $2 > $FAULTATTR/require-end shift 2 ;; --reject-start) + exit_if_not_hex "$2" echo $2 > $FAULTATTR/reject-start shift 2 ;; --reject-end) + exit_if_not_hex "$2" echo $2 > $FAULTATTR/reject-end shift 2 ;; -- cgit v1.2.3 From 8af2caf7307d8002d88c377c58eba8cae75a2109 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 29 Jul 2024 01:52:11 -0700 Subject: failcmd: make failcmd.sh executable Change the file permissions of tools/testing/fault-injection/failcmd.sh to allow execution. 
This ensures the script can be run directly without explicitly invoking a shell. Link: https://lkml.kernel.org/r/20240729085215.3403417-1-leitao@debian.org Signed-off-by: Breno Leitao Reviewed-by: Akinobu Mita Signed-off-by: Andrew Morton --- tools/testing/fault-injection/failcmd.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 tools/testing/fault-injection/failcmd.sh (limited to 'tools') diff --git a/tools/testing/fault-injection/failcmd.sh b/tools/testing/fault-injection/failcmd.sh old mode 100644 new mode 100755 -- cgit v1.2.3 From 931a36c4138ac418d487bd4db0d03780b46a77ba Mon Sep 17 00:00:00 2001 From: zhangjiao Date: Thu, 29 Aug 2024 14:29:42 +0800 Subject: tools: gpio: rm .*.cmd on make clean rm .*.cmd when calling make clean Signed-off-by: zhangjiao Link: https://lore.kernel.org/r/20240829062942.11487-1-zhangjiao2@cmss.chinamobile.com Signed-off-by: Bartosz Golaszewski --- tools/gpio/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/gpio/Makefile b/tools/gpio/Makefile index d29c9c49e251..ed565eb52275 100644 --- a/tools/gpio/Makefile +++ b/tools/gpio/Makefile @@ -78,7 +78,7 @@ $(OUTPUT)gpio-watch: $(GPIO_WATCH_IN) clean: rm -f $(ALL_PROGRAMS) rm -f $(OUTPUT)include/linux/gpio.h - find $(or $(OUTPUT),.) -name '*.o' -delete -o -name '\.*.d' -delete + find $(or $(OUTPUT),.) -name '*.o' -delete -o -name '\.*.d' -delete -o -name '\.*.cmd' -delete install: $(ALL_PROGRAMS) install -d -m 755 $(DESTDIR)$(bindir); \ -- cgit v1.2.3 From 0e7eb23668948585f3f0ea8c6249338f33fde872 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 30 Aug 2024 19:53:47 -0300 Subject: perf tools: Build x86 32-bit syscall table from arch/x86/entry/syscalls/syscall_32.tbl To remove one more use of the audit libs and address a problem reported with a recent change where a function isn't available when using the audit libs method, that should really go away, this being one step in that direction. The script used to generate the 64-bit syscall table was already parametrized to generate for both 64-bit and 32-bit, so just use it and wire the generated table to the syscalltbl.c routines. 
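For the 32-bit case, the generated syscalls_32.c boils down to a number-indexed array of names built from syscall_32.tbl. The sketch below approximates its shape (the array name and exact layout are assumptions about what syscalltbl.sh emits, shown only to illustrate the lookup), with the first entries taken from the table added below:

static const char *const syscalltbl_x86[] = {
	[0] = "restart_syscall",
	[1] = "exit",
	[2] = "fork",
	[3] = "read",
	[4] = "write",
	/* ... one designated initializer per syscall_32.tbl line ... */
};

/* An id-to-name query is then a bounds-checked array access. */
static const char *syscall_id_to_name(int id)
{
	int max = (int)(sizeof(syscalltbl_x86) / sizeof(syscalltbl_x86[0]));

	if (id < 0 || id >= max)
		return NULL;

	return syscalltbl_x86[id];	/* NULL for holes in the table */
}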
Reported-by: Jiri Slaby Suggested-by: Ian Rogers Reviewed-by: Ian Rogers Tested-by: Jiri Slaby Cc: Adrian Hunter Cc: Howard Chu Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/6fe63fa3-6c63-4b75-ac09-884d26f6fb95@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.config | 13 +- tools/perf/arch/x86/Makefile | 6 +- tools/perf/arch/x86/entry/syscalls/syscall_32.tbl | 470 ++++++++++++++++++++++ tools/perf/check-headers.sh | 1 + tools/perf/util/syscalltbl.c | 4 + 5 files changed, 484 insertions(+), 10 deletions(-) create mode 100644 tools/perf/arch/x86/entry/syscalls/syscall_32.tbl (limited to 'tools') diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 4eb1fc897baf..9998cd7a8792 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -31,14 +31,8 @@ $(call detected_var,SRCARCH) ifneq ($(NO_SYSCALL_TABLE),1) NO_SYSCALL_TABLE := 1 - ifeq ($(SRCARCH),x86) - ifeq (${IS_64_BIT}, 1) - NO_SYSCALL_TABLE := 0 - endif - else - ifeq ($(SRCARCH),$(filter $(SRCARCH),powerpc arm64 s390 mips loongarch)) - NO_SYSCALL_TABLE := 0 - endif + ifeq ($(SRCARCH),$(filter $(SRCARCH),x86 powerpc arm64 s390 mips loongarch)) + NO_SYSCALL_TABLE := 0 endif ifneq ($(NO_SYSCALL_TABLE),1) @@ -55,8 +49,9 @@ endif # Additional ARCH settings for x86 ifeq ($(SRCARCH),x86) $(call detected,CONFIG_X86) + CFLAGS += -I$(OUTPUT)arch/x86/include/generated ifeq (${IS_64_BIT}, 1) - CFLAGS += -DHAVE_ARCH_X86_64_SUPPORT -I$(OUTPUT)arch/x86/include/generated + CFLAGS += -DHAVE_ARCH_X86_64_SUPPORT ARCH_INCLUDE = ../../arch/x86/lib/memcpy_64.S ../../arch/x86/lib/memset_64.S LIBUNWIND_LIBS = -lunwind-x86_64 -lunwind -llzma $(call detected,CONFIG_X86_64) diff --git a/tools/perf/arch/x86/Makefile b/tools/perf/arch/x86/Makefile index 8952e00f9b60..67b4969a6738 100644 --- a/tools/perf/arch/x86/Makefile +++ b/tools/perf/arch/x86/Makefile @@ -13,6 +13,7 @@ PERF_HAVE_JITDUMP := 1 generated := $(OUTPUT)arch/x86/include/generated out := $(generated)/asm header := $(out)/syscalls_64.c +header_32 := $(out)/syscalls_32.c sys := $(srctree)/tools/perf/arch/x86/entry/syscalls systbl := $(sys)/syscalltbl.sh @@ -22,7 +23,10 @@ $(shell [ -d '$(out)' ] || mkdir -p '$(out)') $(header): $(sys)/syscall_64.tbl $(systbl) $(Q)$(SHELL) '$(systbl)' $(sys)/syscall_64.tbl 'x86_64' > $@ +$(header_32): $(sys)/syscall_32.tbl $(systbl) + $(Q)$(SHELL) '$(systbl)' $(sys)/syscall_32.tbl 'x86' > $@ + clean:: $(call QUIET_CLEAN, x86) $(RM) -r $(header) $(generated) -archheaders: $(header) +archheaders: $(header) $(header_32) diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_32.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_32.tbl new file mode 100644 index 000000000000..534c74b14fab --- /dev/null +++ b/tools/perf/arch/x86/entry/syscalls/syscall_32.tbl @@ -0,0 +1,470 @@ +# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note +# +# 32-bit system call numbers and entry vectors +# +# The format is: +# <number> <abi> <name> <entry point> [<compat entry point> [noreturn]] +# +# The __ia32_sys and __ia32_compat_sys stubs are created on-the-fly for +# sys_*() system calls and compat_sys_*() compat system calls if +# IA32_EMULATION is defined, and expect struct pt_regs *regs as their only +# parameter. +# +# The abi is always "i386" for this file.
+# +0 i386 restart_syscall sys_restart_syscall +1 i386 exit sys_exit - noreturn +2 i386 fork sys_fork +3 i386 read sys_read +4 i386 write sys_write +5 i386 open sys_open compat_sys_open +6 i386 close sys_close +7 i386 waitpid sys_waitpid +8 i386 creat sys_creat +9 i386 link sys_link +10 i386 unlink sys_unlink +11 i386 execve sys_execve compat_sys_execve +12 i386 chdir sys_chdir +13 i386 time sys_time32 +14 i386 mknod sys_mknod +15 i386 chmod sys_chmod +16 i386 lchown sys_lchown16 +17 i386 break +18 i386 oldstat sys_stat +19 i386 lseek sys_lseek compat_sys_lseek +20 i386 getpid sys_getpid +21 i386 mount sys_mount +22 i386 umount sys_oldumount +23 i386 setuid sys_setuid16 +24 i386 getuid sys_getuid16 +25 i386 stime sys_stime32 +26 i386 ptrace sys_ptrace compat_sys_ptrace +27 i386 alarm sys_alarm +28 i386 oldfstat sys_fstat +29 i386 pause sys_pause +30 i386 utime sys_utime32 +31 i386 stty +32 i386 gtty +33 i386 access sys_access +34 i386 nice sys_nice +35 i386 ftime +36 i386 sync sys_sync +37 i386 kill sys_kill +38 i386 rename sys_rename +39 i386 mkdir sys_mkdir +40 i386 rmdir sys_rmdir +41 i386 dup sys_dup +42 i386 pipe sys_pipe +43 i386 times sys_times compat_sys_times +44 i386 prof +45 i386 brk sys_brk +46 i386 setgid sys_setgid16 +47 i386 getgid sys_getgid16 +48 i386 signal sys_signal +49 i386 geteuid sys_geteuid16 +50 i386 getegid sys_getegid16 +51 i386 acct sys_acct +52 i386 umount2 sys_umount +53 i386 lock +54 i386 ioctl sys_ioctl compat_sys_ioctl +55 i386 fcntl sys_fcntl compat_sys_fcntl64 +56 i386 mpx +57 i386 setpgid sys_setpgid +58 i386 ulimit +59 i386 oldolduname sys_olduname +60 i386 umask sys_umask +61 i386 chroot sys_chroot +62 i386 ustat sys_ustat compat_sys_ustat +63 i386 dup2 sys_dup2 +64 i386 getppid sys_getppid +65 i386 getpgrp sys_getpgrp +66 i386 setsid sys_setsid +67 i386 sigaction sys_sigaction compat_sys_sigaction +68 i386 sgetmask sys_sgetmask +69 i386 ssetmask sys_ssetmask +70 i386 setreuid sys_setreuid16 +71 i386 setregid sys_setregid16 +72 i386 sigsuspend sys_sigsuspend +73 i386 sigpending sys_sigpending compat_sys_sigpending +74 i386 sethostname sys_sethostname +75 i386 setrlimit sys_setrlimit compat_sys_setrlimit +76 i386 getrlimit sys_old_getrlimit compat_sys_old_getrlimit +77 i386 getrusage sys_getrusage compat_sys_getrusage +78 i386 gettimeofday sys_gettimeofday compat_sys_gettimeofday +79 i386 settimeofday sys_settimeofday compat_sys_settimeofday +80 i386 getgroups sys_getgroups16 +81 i386 setgroups sys_setgroups16 +82 i386 select sys_old_select compat_sys_old_select +83 i386 symlink sys_symlink +84 i386 oldlstat sys_lstat +85 i386 readlink sys_readlink +86 i386 uselib sys_uselib +87 i386 swapon sys_swapon +88 i386 reboot sys_reboot +89 i386 readdir sys_old_readdir compat_sys_old_readdir +90 i386 mmap sys_old_mmap compat_sys_ia32_mmap +91 i386 munmap sys_munmap +92 i386 truncate sys_truncate compat_sys_truncate +93 i386 ftruncate sys_ftruncate compat_sys_ftruncate +94 i386 fchmod sys_fchmod +95 i386 fchown sys_fchown16 +96 i386 getpriority sys_getpriority +97 i386 setpriority sys_setpriority +98 i386 profil +99 i386 statfs sys_statfs compat_sys_statfs +100 i386 fstatfs sys_fstatfs compat_sys_fstatfs +101 i386 ioperm sys_ioperm +102 i386 socketcall sys_socketcall compat_sys_socketcall +103 i386 syslog sys_syslog +104 i386 setitimer sys_setitimer compat_sys_setitimer +105 i386 getitimer sys_getitimer compat_sys_getitimer +106 i386 stat sys_newstat compat_sys_newstat +107 i386 lstat sys_newlstat compat_sys_newlstat +108 i386 fstat sys_newfstat 
compat_sys_newfstat +109 i386 olduname sys_uname +110 i386 iopl sys_iopl +111 i386 vhangup sys_vhangup +112 i386 idle +113 i386 vm86old sys_vm86old sys_ni_syscall +114 i386 wait4 sys_wait4 compat_sys_wait4 +115 i386 swapoff sys_swapoff +116 i386 sysinfo sys_sysinfo compat_sys_sysinfo +117 i386 ipc sys_ipc compat_sys_ipc +118 i386 fsync sys_fsync +119 i386 sigreturn sys_sigreturn compat_sys_sigreturn +120 i386 clone sys_clone compat_sys_ia32_clone +121 i386 setdomainname sys_setdomainname +122 i386 uname sys_newuname +123 i386 modify_ldt sys_modify_ldt +124 i386 adjtimex sys_adjtimex_time32 +125 i386 mprotect sys_mprotect +126 i386 sigprocmask sys_sigprocmask compat_sys_sigprocmask +127 i386 create_module +128 i386 init_module sys_init_module +129 i386 delete_module sys_delete_module +130 i386 get_kernel_syms +131 i386 quotactl sys_quotactl +132 i386 getpgid sys_getpgid +133 i386 fchdir sys_fchdir +134 i386 bdflush sys_ni_syscall +135 i386 sysfs sys_sysfs +136 i386 personality sys_personality +137 i386 afs_syscall +138 i386 setfsuid sys_setfsuid16 +139 i386 setfsgid sys_setfsgid16 +140 i386 _llseek sys_llseek +141 i386 getdents sys_getdents compat_sys_getdents +142 i386 _newselect sys_select compat_sys_select +143 i386 flock sys_flock +144 i386 msync sys_msync +145 i386 readv sys_readv +146 i386 writev sys_writev +147 i386 getsid sys_getsid +148 i386 fdatasync sys_fdatasync +149 i386 _sysctl sys_ni_syscall +150 i386 mlock sys_mlock +151 i386 munlock sys_munlock +152 i386 mlockall sys_mlockall +153 i386 munlockall sys_munlockall +154 i386 sched_setparam sys_sched_setparam +155 i386 sched_getparam sys_sched_getparam +156 i386 sched_setscheduler sys_sched_setscheduler +157 i386 sched_getscheduler sys_sched_getscheduler +158 i386 sched_yield sys_sched_yield +159 i386 sched_get_priority_max sys_sched_get_priority_max +160 i386 sched_get_priority_min sys_sched_get_priority_min +161 i386 sched_rr_get_interval sys_sched_rr_get_interval_time32 +162 i386 nanosleep sys_nanosleep_time32 +163 i386 mremap sys_mremap +164 i386 setresuid sys_setresuid16 +165 i386 getresuid sys_getresuid16 +166 i386 vm86 sys_vm86 sys_ni_syscall +167 i386 query_module +168 i386 poll sys_poll +169 i386 nfsservctl +170 i386 setresgid sys_setresgid16 +171 i386 getresgid sys_getresgid16 +172 i386 prctl sys_prctl +173 i386 rt_sigreturn sys_rt_sigreturn compat_sys_rt_sigreturn +174 i386 rt_sigaction sys_rt_sigaction compat_sys_rt_sigaction +175 i386 rt_sigprocmask sys_rt_sigprocmask compat_sys_rt_sigprocmask +176 i386 rt_sigpending sys_rt_sigpending compat_sys_rt_sigpending +177 i386 rt_sigtimedwait sys_rt_sigtimedwait_time32 compat_sys_rt_sigtimedwait_time32 +178 i386 rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo +179 i386 rt_sigsuspend sys_rt_sigsuspend compat_sys_rt_sigsuspend +180 i386 pread64 sys_ia32_pread64 +181 i386 pwrite64 sys_ia32_pwrite64 +182 i386 chown sys_chown16 +183 i386 getcwd sys_getcwd +184 i386 capget sys_capget +185 i386 capset sys_capset +186 i386 sigaltstack sys_sigaltstack compat_sys_sigaltstack +187 i386 sendfile sys_sendfile compat_sys_sendfile +188 i386 getpmsg +189 i386 putpmsg +190 i386 vfork sys_vfork +191 i386 ugetrlimit sys_getrlimit compat_sys_getrlimit +192 i386 mmap2 sys_mmap_pgoff +193 i386 truncate64 sys_ia32_truncate64 +194 i386 ftruncate64 sys_ia32_ftruncate64 +195 i386 stat64 sys_stat64 compat_sys_ia32_stat64 +196 i386 lstat64 sys_lstat64 compat_sys_ia32_lstat64 +197 i386 fstat64 sys_fstat64 compat_sys_ia32_fstat64 +198 i386 lchown32 sys_lchown +199 i386 getuid32 sys_getuid 
+200 i386 getgid32 sys_getgid +201 i386 geteuid32 sys_geteuid +202 i386 getegid32 sys_getegid +203 i386 setreuid32 sys_setreuid +204 i386 setregid32 sys_setregid +205 i386 getgroups32 sys_getgroups +206 i386 setgroups32 sys_setgroups +207 i386 fchown32 sys_fchown +208 i386 setresuid32 sys_setresuid +209 i386 getresuid32 sys_getresuid +210 i386 setresgid32 sys_setresgid +211 i386 getresgid32 sys_getresgid +212 i386 chown32 sys_chown +213 i386 setuid32 sys_setuid +214 i386 setgid32 sys_setgid +215 i386 setfsuid32 sys_setfsuid +216 i386 setfsgid32 sys_setfsgid +217 i386 pivot_root sys_pivot_root +218 i386 mincore sys_mincore +219 i386 madvise sys_madvise +220 i386 getdents64 sys_getdents64 +221 i386 fcntl64 sys_fcntl64 compat_sys_fcntl64 +# 222 is unused +# 223 is unused +224 i386 gettid sys_gettid +225 i386 readahead sys_ia32_readahead +226 i386 setxattr sys_setxattr +227 i386 lsetxattr sys_lsetxattr +228 i386 fsetxattr sys_fsetxattr +229 i386 getxattr sys_getxattr +230 i386 lgetxattr sys_lgetxattr +231 i386 fgetxattr sys_fgetxattr +232 i386 listxattr sys_listxattr +233 i386 llistxattr sys_llistxattr +234 i386 flistxattr sys_flistxattr +235 i386 removexattr sys_removexattr +236 i386 lremovexattr sys_lremovexattr +237 i386 fremovexattr sys_fremovexattr +238 i386 tkill sys_tkill +239 i386 sendfile64 sys_sendfile64 +240 i386 futex sys_futex_time32 +241 i386 sched_setaffinity sys_sched_setaffinity compat_sys_sched_setaffinity +242 i386 sched_getaffinity sys_sched_getaffinity compat_sys_sched_getaffinity +243 i386 set_thread_area sys_set_thread_area +244 i386 get_thread_area sys_get_thread_area +245 i386 io_setup sys_io_setup compat_sys_io_setup +246 i386 io_destroy sys_io_destroy +247 i386 io_getevents sys_io_getevents_time32 +248 i386 io_submit sys_io_submit compat_sys_io_submit +249 i386 io_cancel sys_io_cancel +250 i386 fadvise64 sys_ia32_fadvise64 +# 251 is available for reuse (was briefly sys_set_zone_reclaim) +252 i386 exit_group sys_exit_group - noreturn +253 i386 lookup_dcookie +254 i386 epoll_create sys_epoll_create +255 i386 epoll_ctl sys_epoll_ctl +256 i386 epoll_wait sys_epoll_wait +257 i386 remap_file_pages sys_remap_file_pages +258 i386 set_tid_address sys_set_tid_address +259 i386 timer_create sys_timer_create compat_sys_timer_create +260 i386 timer_settime sys_timer_settime32 +261 i386 timer_gettime sys_timer_gettime32 +262 i386 timer_getoverrun sys_timer_getoverrun +263 i386 timer_delete sys_timer_delete +264 i386 clock_settime sys_clock_settime32 +265 i386 clock_gettime sys_clock_gettime32 +266 i386 clock_getres sys_clock_getres_time32 +267 i386 clock_nanosleep sys_clock_nanosleep_time32 +268 i386 statfs64 sys_statfs64 compat_sys_statfs64 +269 i386 fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 +270 i386 tgkill sys_tgkill +271 i386 utimes sys_utimes_time32 +272 i386 fadvise64_64 sys_ia32_fadvise64_64 +273 i386 vserver +274 i386 mbind sys_mbind +275 i386 get_mempolicy sys_get_mempolicy +276 i386 set_mempolicy sys_set_mempolicy +277 i386 mq_open sys_mq_open compat_sys_mq_open +278 i386 mq_unlink sys_mq_unlink +279 i386 mq_timedsend sys_mq_timedsend_time32 +280 i386 mq_timedreceive sys_mq_timedreceive_time32 +281 i386 mq_notify sys_mq_notify compat_sys_mq_notify +282 i386 mq_getsetattr sys_mq_getsetattr compat_sys_mq_getsetattr +283 i386 kexec_load sys_kexec_load compat_sys_kexec_load +284 i386 waitid sys_waitid compat_sys_waitid +# 285 sys_setaltroot +286 i386 add_key sys_add_key +287 i386 request_key sys_request_key +288 i386 keyctl sys_keyctl compat_sys_keyctl +289 i386 
ioprio_set sys_ioprio_set +290 i386 ioprio_get sys_ioprio_get +291 i386 inotify_init sys_inotify_init +292 i386 inotify_add_watch sys_inotify_add_watch +293 i386 inotify_rm_watch sys_inotify_rm_watch +294 i386 migrate_pages sys_migrate_pages +295 i386 openat sys_openat compat_sys_openat +296 i386 mkdirat sys_mkdirat +297 i386 mknodat sys_mknodat +298 i386 fchownat sys_fchownat +299 i386 futimesat sys_futimesat_time32 +300 i386 fstatat64 sys_fstatat64 compat_sys_ia32_fstatat64 +301 i386 unlinkat sys_unlinkat +302 i386 renameat sys_renameat +303 i386 linkat sys_linkat +304 i386 symlinkat sys_symlinkat +305 i386 readlinkat sys_readlinkat +306 i386 fchmodat sys_fchmodat +307 i386 faccessat sys_faccessat +308 i386 pselect6 sys_pselect6_time32 compat_sys_pselect6_time32 +309 i386 ppoll sys_ppoll_time32 compat_sys_ppoll_time32 +310 i386 unshare sys_unshare +311 i386 set_robust_list sys_set_robust_list compat_sys_set_robust_list +312 i386 get_robust_list sys_get_robust_list compat_sys_get_robust_list +313 i386 splice sys_splice +314 i386 sync_file_range sys_ia32_sync_file_range +315 i386 tee sys_tee +316 i386 vmsplice sys_vmsplice +317 i386 move_pages sys_move_pages +318 i386 getcpu sys_getcpu +319 i386 epoll_pwait sys_epoll_pwait +320 i386 utimensat sys_utimensat_time32 +321 i386 signalfd sys_signalfd compat_sys_signalfd +322 i386 timerfd_create sys_timerfd_create +323 i386 eventfd sys_eventfd +324 i386 fallocate sys_ia32_fallocate +325 i386 timerfd_settime sys_timerfd_settime32 +326 i386 timerfd_gettime sys_timerfd_gettime32 +327 i386 signalfd4 sys_signalfd4 compat_sys_signalfd4 +328 i386 eventfd2 sys_eventfd2 +329 i386 epoll_create1 sys_epoll_create1 +330 i386 dup3 sys_dup3 +331 i386 pipe2 sys_pipe2 +332 i386 inotify_init1 sys_inotify_init1 +333 i386 preadv sys_preadv compat_sys_preadv +334 i386 pwritev sys_pwritev compat_sys_pwritev +335 i386 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo +336 i386 perf_event_open sys_perf_event_open +337 i386 recvmmsg sys_recvmmsg_time32 compat_sys_recvmmsg_time32 +338 i386 fanotify_init sys_fanotify_init +339 i386 fanotify_mark sys_fanotify_mark compat_sys_fanotify_mark +340 i386 prlimit64 sys_prlimit64 +341 i386 name_to_handle_at sys_name_to_handle_at +342 i386 open_by_handle_at sys_open_by_handle_at compat_sys_open_by_handle_at +343 i386 clock_adjtime sys_clock_adjtime32 +344 i386 syncfs sys_syncfs +345 i386 sendmmsg sys_sendmmsg compat_sys_sendmmsg +346 i386 setns sys_setns +347 i386 process_vm_readv sys_process_vm_readv +348 i386 process_vm_writev sys_process_vm_writev +349 i386 kcmp sys_kcmp +350 i386 finit_module sys_finit_module +351 i386 sched_setattr sys_sched_setattr +352 i386 sched_getattr sys_sched_getattr +353 i386 renameat2 sys_renameat2 +354 i386 seccomp sys_seccomp +355 i386 getrandom sys_getrandom +356 i386 memfd_create sys_memfd_create +357 i386 bpf sys_bpf +358 i386 execveat sys_execveat compat_sys_execveat +359 i386 socket sys_socket +360 i386 socketpair sys_socketpair +361 i386 bind sys_bind +362 i386 connect sys_connect +363 i386 listen sys_listen +364 i386 accept4 sys_accept4 +365 i386 getsockopt sys_getsockopt sys_getsockopt +366 i386 setsockopt sys_setsockopt sys_setsockopt +367 i386 getsockname sys_getsockname +368 i386 getpeername sys_getpeername +369 i386 sendto sys_sendto +370 i386 sendmsg sys_sendmsg compat_sys_sendmsg +371 i386 recvfrom sys_recvfrom compat_sys_recvfrom +372 i386 recvmsg sys_recvmsg compat_sys_recvmsg +373 i386 shutdown sys_shutdown +374 i386 userfaultfd sys_userfaultfd +375 i386 membarrier 
sys_membarrier +376 i386 mlock2 sys_mlock2 +377 i386 copy_file_range sys_copy_file_range +378 i386 preadv2 sys_preadv2 compat_sys_preadv2 +379 i386 pwritev2 sys_pwritev2 compat_sys_pwritev2 +380 i386 pkey_mprotect sys_pkey_mprotect +381 i386 pkey_alloc sys_pkey_alloc +382 i386 pkey_free sys_pkey_free +383 i386 statx sys_statx +384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl +385 i386 io_pgetevents sys_io_pgetevents_time32 compat_sys_io_pgetevents +386 i386 rseq sys_rseq +393 i386 semget sys_semget +394 i386 semctl sys_semctl compat_sys_semctl +395 i386 shmget sys_shmget +396 i386 shmctl sys_shmctl compat_sys_shmctl +397 i386 shmat sys_shmat compat_sys_shmat +398 i386 shmdt sys_shmdt +399 i386 msgget sys_msgget +400 i386 msgsnd sys_msgsnd compat_sys_msgsnd +401 i386 msgrcv sys_msgrcv compat_sys_msgrcv +402 i386 msgctl sys_msgctl compat_sys_msgctl +403 i386 clock_gettime64 sys_clock_gettime +404 i386 clock_settime64 sys_clock_settime +405 i386 clock_adjtime64 sys_clock_adjtime +406 i386 clock_getres_time64 sys_clock_getres +407 i386 clock_nanosleep_time64 sys_clock_nanosleep +408 i386 timer_gettime64 sys_timer_gettime +409 i386 timer_settime64 sys_timer_settime +410 i386 timerfd_gettime64 sys_timerfd_gettime +411 i386 timerfd_settime64 sys_timerfd_settime +412 i386 utimensat_time64 sys_utimensat +413 i386 pselect6_time64 sys_pselect6 compat_sys_pselect6_time64 +414 i386 ppoll_time64 sys_ppoll compat_sys_ppoll_time64 +416 i386 io_pgetevents_time64 sys_io_pgetevents compat_sys_io_pgetevents_time64 +417 i386 recvmmsg_time64 sys_recvmmsg compat_sys_recvmmsg_time64 +418 i386 mq_timedsend_time64 sys_mq_timedsend +419 i386 mq_timedreceive_time64 sys_mq_timedreceive +420 i386 semtimedop_time64 sys_semtimedop +421 i386 rt_sigtimedwait_time64 sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time64 +422 i386 futex_time64 sys_futex +423 i386 sched_rr_get_interval_time64 sys_sched_rr_get_interval +424 i386 pidfd_send_signal sys_pidfd_send_signal +425 i386 io_uring_setup sys_io_uring_setup +426 i386 io_uring_enter sys_io_uring_enter +427 i386 io_uring_register sys_io_uring_register +428 i386 open_tree sys_open_tree +429 i386 move_mount sys_move_mount +430 i386 fsopen sys_fsopen +431 i386 fsconfig sys_fsconfig +432 i386 fsmount sys_fsmount +433 i386 fspick sys_fspick +434 i386 pidfd_open sys_pidfd_open +435 i386 clone3 sys_clone3 +436 i386 close_range sys_close_range +437 i386 openat2 sys_openat2 +438 i386 pidfd_getfd sys_pidfd_getfd +439 i386 faccessat2 sys_faccessat2 +440 i386 process_madvise sys_process_madvise +441 i386 epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 +442 i386 mount_setattr sys_mount_setattr +443 i386 quotactl_fd sys_quotactl_fd +444 i386 landlock_create_ruleset sys_landlock_create_ruleset +445 i386 landlock_add_rule sys_landlock_add_rule +446 i386 landlock_restrict_self sys_landlock_restrict_self +447 i386 memfd_secret sys_memfd_secret +448 i386 process_mrelease sys_process_mrelease +449 i386 futex_waitv sys_futex_waitv +450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node +451 i386 cachestat sys_cachestat +452 i386 fchmodat2 sys_fchmodat2 +453 i386 map_shadow_stack sys_map_shadow_stack +454 i386 futex_wake sys_futex_wake +455 i386 futex_wait sys_futex_wait +456 i386 futex_requeue sys_futex_requeue +457 i386 statmount sys_statmount +458 i386 listmount sys_listmount +459 i386 lsm_get_self_attr sys_lsm_get_self_attr +460 i386 lsm_set_self_attr sys_lsm_set_self_attr +461 i386 lsm_list_modules sys_lsm_list_modules +462 i386 mseal sys_mseal diff --git 
a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index 672421b858ac..714c78e5da07 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -172,6 +172,7 @@ check lib/ctype.c '-I "^EXPORT_SYMBOL" -I "^#include " -B check lib/list_sort.c '-I "^#include "' # diff non-symmetric files +check_2 tools/perf/arch/x86/entry/syscalls/syscall_32.tbl arch/x86/entry/syscalls/syscall_32.tbl check_2 tools/perf/arch/x86/entry/syscalls/syscall_64.tbl arch/x86/entry/syscalls/syscall_64.tbl check_2 tools/perf/arch/powerpc/entry/syscalls/syscall.tbl arch/powerpc/kernel/syscalls/syscall.tbl check_2 tools/perf/arch/s390/entry/syscalls/syscall.tbl arch/s390/kernel/syscalls/syscall.tbl diff --git a/tools/perf/util/syscalltbl.c b/tools/perf/util/syscalltbl.c index 0dd26b991b3f..7c15dec6900d 100644 --- a/tools/perf/util/syscalltbl.c +++ b/tools/perf/util/syscalltbl.c @@ -18,6 +18,10 @@ #include const int syscalltbl_native_max_id = SYSCALLTBL_x86_64_MAX_ID; static const char *const *syscalltbl_native = syscalltbl_x86_64; +#elif defined(__i386__) +#include +const int syscalltbl_native_max_id = SYSCALLTBL_x86_MAX_ID; +static const char *const *syscalltbl_native = syscalltbl_x86; #elif defined(__s390x__) #include const int syscalltbl_native_max_id = SYSCALLTBL_S390_64_MAX_ID; -- cgit v1.2.3 From 5d2784e25d7a6c216c9a4cbed2026febfb0c094c Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Sun, 1 Sep 2024 22:07:42 +0100 Subject: bpftool: Add missing blank lines in bpftool-net doc example In bpftool-net documentation, two blank lines are missing in a recently added example, causing docutils to complain: $ cd tools/bpf/bpftool $ make doc DESCEND Documentation GEN bpftool-btf.8 GEN bpftool-cgroup.8 GEN bpftool-feature.8 GEN bpftool-gen.8 GEN bpftool-iter.8 GEN bpftool-link.8 GEN bpftool-map.8 GEN bpftool-net.8 :189: (INFO/1) Possible incomplete section title. Treating the overline as ordinary text because it's so short. :192: (INFO/1) Blank line missing before literal block (after the "::")? Interpreted as a definition list item. :199: (INFO/1) Possible incomplete section title. Treating the overline as ordinary text because it's so short. :201: (INFO/1) Blank line missing before literal block (after the "::")? Interpreted as a definition list item. GEN bpftool-perf.8 GEN bpftool-prog.8 GEN bpftool.8 GEN bpftool-struct_ops.8 Add the missing blank lines. Fixes: 0d7c06125cea ("bpftool: Add document for net attach/detach on tcx subcommand") Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240901210742.25758-1-qmo@kernel.org --- tools/bpf/bpftool/Documentation/bpftool-net.rst | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst index 4a8cb5e0d94b..a9ed8992800f 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-net.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst @@ -187,6 +187,7 @@ EXAMPLES | :: + tc: lo(1) tcx/ingress tc_prog prog_id 29 @@ -197,4 +198,5 @@ EXAMPLES | :: + tc: -- cgit v1.2.3 From 1c7fb536e899a2f66f9b1719a0234570dda2e634 Mon Sep 17 00:00:00 2001 From: Veronika Molnarova Date: Thu, 8 Aug 2024 12:37:49 +0200 Subject: perf test pmu: Set uninitialized PMU alias to null Commit 3e0bf9fde2984469 ("perf pmu: Restore full PMU name wildcard support") adds a test case "PMU cmdline match" that covers PMU name wildcard support provided by function perf_pmu__match(). 
The test works with a wide range of supported combinations of PMU name matching, but omits the case where perf_pmu__match() cannot match the PMU name to the wildcard and then tries to match the PMU's alias instead. However, this alias is not set up, causing the test case to fail when run with subprocesses or to segfault if run as a single process. ./perf test -vv 9 9: Sysfs PMU tests : 9.1: Parsing with PMU format directory : Ok 9.2: Parsing with PMU event : Ok 9.3: PMU event names : Ok 9.4: PMU name combining : Ok 9.5: PMU name comparison : Ok 9.6: PMU cmdline match : FAILED! ./perf test -F 9 9.1: Parsing with PMU format directory : Ok 9.2: Parsing with PMU event : Ok 9.3: PMU event names : Ok 9.4: PMU name combining : Ok 9.5: PMU name comparison : Ok Segmentation fault (core dumped) Initialize the PMU alias to null for all tests of perf_pmu__match(), as this functionality is not being tested and the alias matching works exactly the same as the matching of the PMU name. ./perf test -F 9 9.1: Parsing with PMU format directory : Ok 9.2: Parsing with PMU event : Ok 9.3: PMU event names : Ok 9.4: PMU name combining : Ok 9.5: PMU name comparison : Ok 9.6: PMU cmdline match : Ok Fixes: 3e0bf9fde2984469 ("perf pmu: Restore full PMU name wildcard support") Signed-off-by: Veronika Molnarova Cc: james.clark@arm.com Cc: mpetlan@redhat.com Cc: rstoyano@redhat.com Link: https://lore.kernel.org/r/20240808103749.9356-1-vmolnaro@redhat.com Signed-off-by: Namhyung Kim --- tools/perf/tests/pmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/tests/pmu.c b/tools/perf/tests/pmu.c index 40132655ccd1..c76f53a90a7b 100644 --- a/tools/perf/tests/pmu.c +++ b/tools/perf/tests/pmu.c @@ -456,11 +456,13 @@ static int test__name_cmp(struct test_suite *test __maybe_unused, int subtest __ /** * Test perf_pmu__match() that's used to search for a PMU given a name passed * on the command line. The name that's passed may also be a filename type glob - * match. + * match. If the name does not match, perf_pmu__match() attempts to match the + * alias of the PMU, if provided. */ static int test__pmu_match(struct test_suite *test __maybe_unused, int subtest __maybe_unused) { struct perf_pmu test_pmu; + test_pmu.alias_name = NULL; test_pmu.name = "pmuname"; TEST_ASSERT_EQUAL("Exact match", perf_pmu__match(&test_pmu, "pmuname"), true); -- cgit v1.2.3 From 287bd5cf06e0f2c02293ce942777ad1f18059ed3 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 27 Aug 2024 22:29:53 -0700 Subject: perf lock contention: Fix spinlock and rwlock accounting The spinlock and rwlock use a single-element per-cpu array to track current locks for performance reasons. But this means the key is always available, and the code cannot simply account lock stats in the array because some of the entries are invalid. In fact, the contention_end() BPF program invalidates an entry by setting its 'lock' value to 0 instead of deleting the entry, as it would for the hashmap. So account_end_timestamp() should skip entries with a lock value of 0.
Otherwise, it'd have spurious high contention on an idle machine: $ sudo perf lock con -ab -Y spinlock sleep 3 contended total wait max wait avg wait type caller 8 4.72 s 1.84 s 590.46 ms spinlock rcu_core+0xc7 8 1.87 s 1.87 s 233.48 ms spinlock process_one_work+0x1b5 2 1.87 s 1.87 s 933.92 ms spinlock worker_thread+0x1a2 3 1.81 s 1.81 s 603.93 ms spinlock tmigr_update_events+0x13c 2 1.72 s 1.72 s 861.98 ms spinlock tick_do_update_jiffies64+0x25 6 42.48 us 13.02 us 7.08 us spinlock futex_q_lock+0x2a 1 13.03 us 13.03 us 13.03 us spinlock futex_wake+0xce 1 11.61 us 11.61 us 11.61 us spinlock rcu_core+0xc7 I don't believe it has contention on a spinlock longer than 1 second. After this change, it only reports some small contentions. $ sudo perf lock con -ab -Y spinlock sleep 3 contended total wait max wait avg wait type caller 4 133.51 us 43.29 us 33.38 us spinlock tick_do_update_jiffies64+0x25 4 69.06 us 31.82 us 17.27 us spinlock process_one_work+0x1b5 2 50.66 us 25.77 us 25.33 us spinlock rcu_core+0xc7 1 28.45 us 28.45 us 28.45 us spinlock rcu_core+0xc7 1 24.77 us 24.77 us 24.77 us spinlock tmigr_update_events+0x13c 1 23.34 us 23.34 us 23.34 us spinlock raw_spin_rq_lock_nested+0x15 Fixes: b5711042a1c8 ("perf lock contention: Use per-cpu array map for spinlocks") Reported-by: Xi Wang Cc: Song Liu Cc: bpf@vger.kernel.org Link: https://lore.kernel.org/r/20240828052953.1445862-1-namhyung@kernel.org Signed-off-by: Namhyung Kim --- tools/perf/util/bpf_lock_contention.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c index b4cb3fe5cc25..bc4e92c0c08b 100644 --- a/tools/perf/util/bpf_lock_contention.c +++ b/tools/perf/util/bpf_lock_contention.c @@ -286,6 +286,9 @@ static void account_end_timestamp(struct lock_contention *con) goto next; for (int i = 0; i < total_cpus; i++) { + if (cpu_data[i].lock == 0) + continue; + update_lock_stat(stat_fd, -1, end_ts, aggr_mode, &cpu_data[i]); } -- cgit v1.2.3 From aee1d55922977bf9282398283a72d38fc5514540 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Mon, 19 Aug 2024 10:34:03 +0800 Subject: perf python: include "util/sample.h" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 32-bit arm build system will complain: tools/perf/util/python.c:75:28: error: field ‘sample’ has incomplete type 75 | struct perf_sample sample; However, the arm64 build system doesn't complain about this. The root cause is that arm64 defines "HAVE_KVM_STAT_SUPPORT := 1" in tools/perf/arch/arm64/Makefile, but the arm arch doesn't define this. This leads kvm-stat.h to pull in other header files on the arm64 build system, especially "util/sample.h" for util/python.c. Directly include "util/sample.h" in "util/python.c" to avoid such build issues on the arm platform.
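The error itself is plain C semantics: a struct may embed another struct by value only after the compiler has seen the embedded struct's full definition; a forward declaration leaves it an incomplete type of unknown size. A minimal sketch of the failure mode (a hypothetical reduction for illustration, not the actual python.c contents):

	struct perf_sample;			/* forward declaration only, size unknown */

	struct pyrf_event {
		struct perf_sample sample;	/* error: field 'sample' has incomplete type */
	};

Including "util/sample.h" before such a definition provides the complete struct perf_sample and makes the by-value field valid.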
Signed-off-by: Xu Yang Cc: imx@lists.linux.dev Link: https://lore.kernel.org/r/20240819023403.201324-1-xu.yang_2@nxp.com Signed-off-by: Namhyung Kim --- tools/perf/util/python.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 3be882b2e845..31a223eaf8e6 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -20,6 +20,7 @@ #include "util/env.h" #include "util/kvm-stat.h" #include "util/kwork.h" +#include "util/sample.h" #include "util/lock-contention.h" #include #include "../builtin.h" -- cgit v1.2.3 From e162cb25c410afc42051a582c46a47dde597f51c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 19 Aug 2024 21:43:01 -0300 Subject: perf daemon: Fix the build on more 32-bit architectures FYI: I'm carrying this on perf-tools-next. The previous attempt fixed the build on debian:experimental-x-mipsel, but when building on a larger set of containers I noticed it broke the build on some other 32-bit architectures such as: 42 7.87 ubuntu:18.04-x-arm : FAIL gcc version 7.5.0 (Ubuntu/Linaro 7.5.0-3ubuntu1~18.04) builtin-daemon.c: In function 'cmd_session_list': builtin-daemon.c:692:16: error: format '%llu' expects argument of type 'long long unsigned int', but argument 4 has type 'long int' [-Werror=format=] fprintf(out, "%c%" PRIu64, ^~~~~ builtin-daemon.c:694:13: csv_sep, (curr - daemon->start) / 60); ~~~~~~~~~~~~~~~~~~~~~~~~~~~ In file included from builtin-daemon.c:3:0: /usr/arm-linux-gnueabihf/include/inttypes.h:105:34: note: format string is defined here # define PRIu64 __PRI64_PREFIX "u" So lets cast that time_t (32-bit/64-bit) to uint64_t to make sure it builds everywhere. Fixes: 4bbe6002931954bb ("perf daemon: Fix the build on 32-bit architectures") Signed-off-by: Arnaldo Carvalho de Melo Link: https://lore.kernel.org/r/ZsPmldtJ0D9Cua9_@x1 Signed-off-by: Namhyung Kim --- tools/perf/builtin-daemon.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-daemon.c b/tools/perf/builtin-daemon.c index 5c9335fff2d3..9a95871afc95 100644 --- a/tools/perf/builtin-daemon.c +++ b/tools/perf/builtin-daemon.c @@ -691,7 +691,7 @@ static int cmd_session_list(struct daemon *daemon, union cmd *cmd, FILE *out) fprintf(out, "%c%" PRIu64, /* session up time */ - csv_sep, (curr - daemon->start) / 60); + csv_sep, (uint64_t)((curr - daemon->start) / 60)); fprintf(out, "\n"); } else { @@ -702,7 +702,7 @@ static int cmd_session_list(struct daemon *daemon, union cmd *cmd, FILE *out) fprintf(out, " lock: %s/lock\n", daemon->base); fprintf(out, " up: %" PRIu64 " minutes\n", - (curr - daemon->start) / 60); + (uint64_t)((curr - daemon->start) / 60)); } } @@ -730,7 +730,7 @@ static int cmd_session_list(struct daemon *daemon, union cmd *cmd, FILE *out) fprintf(out, "%c%" PRIu64, /* session up time */ - csv_sep, (curr - session->start) / 60); + csv_sep, (uint64_t)((curr - session->start) / 60)); fprintf(out, "\n"); } else { @@ -747,7 +747,7 @@ static int cmd_session_list(struct daemon *daemon, union cmd *cmd, FILE *out) fprintf(out, " ack: %s/%s\n", session->base, SESSION_ACK); fprintf(out, " up: %" PRIu64 " minutes\n", - (curr - session->start) / 60); + (uint64_t)((curr - session->start) / 60)); } } -- cgit v1.2.3 From b0222d1d9e6f8551a056b89b0bff38f515f3c9b5 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Mon, 2 Sep 2024 18:17:21 +0100 Subject: bpftool: Fix handling enum64 in btf dump sorting Wrong function is used to access the first enum64 
element. Substitute btf_enum(t) with btf_enum64(t) for BTF_KIND_ENUM64. Fixes: 94133cf24bb3 ("bpftool: Introduce btf c dump sorting") Signed-off-by: Mykyta Yatsenko Signed-off-by: Daniel Borkmann Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20240902171721.105253-1-mykyta.yatsenko5@gmail.com --- tools/bpf/bpftool/btf.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index 6789c7a4d5ca..3b57ba095ab6 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -561,9 +561,10 @@ static const char *btf_type_sort_name(const struct btf *btf, __u32 index, bool f case BTF_KIND_ENUM64: { int name_off = t->name_off; - /* Use name of the first element for anonymous enums if allowed */ - if (!from_ref && !t->name_off && btf_vlen(t)) - name_off = btf_enum(t)->name_off; + if (!from_ref && !name_off && btf_vlen(t)) + name_off = btf_kind(t) == BTF_KIND_ENUM64 ? + btf_enum64(t)->name_off : + btf_enum(t)->name_off; return btf__name_by_offset(btf, name_off); } -- cgit v1.2.3 From 673f0c3ffc75166317ca5edb66ff35eaa6a12149 Mon Sep 17 00:00:00 2001 From: Michael Grzeschik Date: Sun, 1 Sep 2024 21:11:18 +0200 Subject: tools: usb: p9_fwd: add usb gadget packet forwarder script This patch adds a small python tool to forward 9pfs requests from the USB gadget to an existing 9pfs TCP server. Since currently all 9pfs servers lack support for the usb transport, this tool is a useful helper to get started. Refer to the Documentation section "USBG Example" in Documentation/filesystems/9p.rst on how to use it. Signed-off-by: Jan Luebbe Signed-off-by: Michael Grzeschik Tested-by: Andrzej Pietrasiewicz Link: https://lore.kernel.org/r/20240116-ml-topic-u9p-v12-3-9a27de5160e0@pengutronix.de Signed-off-by: Greg Kroah-Hartman --- Documentation/filesystems/9p.rst | 41 +++++++ tools/usb/p9_fwd.py | 243 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 284 insertions(+) create mode 100755 tools/usb/p9_fwd.py (limited to 'tools') diff --git a/Documentation/filesystems/9p.rst b/Documentation/filesystems/9p.rst index 10cf79dc287f..2cc85f3e8659 100644 --- a/Documentation/filesystems/9p.rst +++ b/Documentation/filesystems/9p.rst @@ -67,6 +67,47 @@ To mount a 9p FS on a USB Host accessible via the gadget as root filesystem:: where is the tag associated by the usb gadget transport. It is defined by the configfs instance name. +USBG Example +============ + +The USB host exports a filesystem, while the gadget on the USB device +side makes it mountable. + +Diod (9pfs server) and the forwarder are on the development host, where +the root filesystem is actually stored. The gadget is initialized during +boot (or later) on the embedded board. Then the forwarder will find it +on the USB bus and start forwarding requests. + +In this case the 9p requests come from the device and are handled by the +host. The reason is that USB device ports are normally not available on +PCs, so a connection in the other direction would not work. + +When using the usbg transport, for now there is no native usb host +service capable to handle the requests from the gadget driver. For +this we have to use the extra python tool p9_fwd.py from tools/usb.
+ +Just start the 9pfs capable network server like diod/nfs-ganesha e.g.: + + $ diod -f -n -d 0 -S -l 0.0.0.0:9999 -e $PWD + +Optionaly scan your bus if there are more then one usbg gadgets to find their path: + + $ python $kernel_dir/tools/usb/p9_fwd.py list + + Bus | Addr | Manufacturer | Product | ID | Path + --- | ---- | ---------------- | ---------------- | --------- | ---- + 2 | 67 | unknown | unknown | 1d6b:0109 | 2-1.1.2 + 2 | 68 | unknown | unknown | 1d6b:0109 | 2-1.1.3 + +Then start the python transport: + + $ python $kernel_dir/tools/usb/p9_fwd.py --path 2-1.1.2 connect -p 9999 + +After that the gadget driver can be used as described above. + +One use-case is to use it as an alternative to NFS root booting during +the development of embedded Linux devices. + Options ======= diff --git a/tools/usb/p9_fwd.py b/tools/usb/p9_fwd.py new file mode 100755 index 000000000000..12c76cbb046b --- /dev/null +++ b/tools/usb/p9_fwd.py @@ -0,0 +1,243 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import argparse +import errno +import logging +import socket +import struct +import time + +import usb.core +import usb.util + + +def path_from_usb_dev(dev): + """Takes a pyUSB device as argument and returns a string. + The string is a Path representation of the position of the USB device on the USB bus tree. + + This path is used to find a USB device on the bus or all devices connected to a HUB. + The path is made up of the number of the USB controller followed be the ports of the HUB tree.""" + if dev.port_numbers: + dev_path = ".".join(str(i) for i in dev.port_numbers) + return f"{dev.bus}-{dev_path}" + return "" + + +HEXDUMP_FILTER = "".join(chr(x).isprintable() and chr(x) or "." for x in range(128)) + "." * 128 + + +class Forwarder: + @staticmethod + def _log_hexdump(data): + if not logging.root.isEnabledFor(logging.TRACE): + return + L = 16 + for c in range(0, len(data), L): + chars = data[c : c + L] + dump = " ".join(f"{x:02x}" for x in chars) + printable = "".join(HEXDUMP_FILTER[x] for x in chars) + line = f"{c:08x} {dump:{L*3}s} |{printable:{L}s}|" + logging.root.log(logging.TRACE, "%s", line) + + def __init__(self, server, vid, pid, path): + self.stats = { + "c2s packets": 0, + "c2s bytes": 0, + "s2c packets": 0, + "s2c bytes": 0, + } + self.stats_logged = time.monotonic() + + def find_filter(dev): + dev_path = path_from_usb_dev(dev) + if path is not None: + return dev_path == path + return True + + dev = usb.core.find(idVendor=vid, idProduct=pid, custom_match=find_filter) + if dev is None: + raise ValueError("Device not found") + + logging.info(f"found device: {dev.bus}/{dev.address} located at {path_from_usb_dev(dev)}") + + # dev.set_configuration() is not necessary since g_multi has only one + usb9pfs = None + # g_multi adds 9pfs as last interface + cfg = dev.get_active_configuration() + for intf in cfg: + # we have to detach the usb-storage driver from multi gadget since + # stall option could be set, which will lead to spontaneous port + # resets and our transfers will run dead + if intf.bInterfaceClass == 0x08: + if dev.is_kernel_driver_active(intf.bInterfaceNumber): + dev.detach_kernel_driver(intf.bInterfaceNumber) + + if intf.bInterfaceClass == 0xFF and intf.bInterfaceSubClass == 0xFF and intf.bInterfaceProtocol == 0x09: + usb9pfs = intf + if usb9pfs is None: + raise ValueError("Interface not found") + + logging.info(f"claiming interface:\n{usb9pfs}") + usb.util.claim_interface(dev, usb9pfs.bInterfaceNumber) + ep_out = usb.util.find_descriptor( + usb9pfs, + 
custom_match=lambda e: usb.util.endpoint_direction(e.bEndpointAddress) == usb.util.ENDPOINT_OUT, + ) + assert ep_out is not None + ep_in = usb.util.find_descriptor( + usb9pfs, + custom_match=lambda e: usb.util.endpoint_direction(e.bEndpointAddress) == usb.util.ENDPOINT_IN, + ) + assert ep_in is not None + logging.info("interface claimed") + + self.ep_out = ep_out + self.ep_in = ep_in + self.dev = dev + + # create and connect socket + self.s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + self.s.connect(server) + + logging.info("connected to server") + + def c2s(self): + """forward a request from the USB client to the TCP server""" + data = None + while data is None: + try: + logging.log(logging.TRACE, "c2s: reading") + data = self.ep_in.read(self.ep_in.wMaxPacketSize) + except usb.core.USBTimeoutError: + logging.log(logging.TRACE, "c2s: reading timed out") + continue + except usb.core.USBError as e: + if e.errno == errno.EIO: + logging.debug("c2s: reading failed with %s, retrying", repr(e)) + time.sleep(0.5) + continue + logging.error("c2s: reading failed with %s, aborting", repr(e)) + raise + size = struct.unpack("<I", data[:4])[0] + while len(data) < size: + data += self.ep_in.read(self.ep_in.wMaxPacketSize) + self._log_hexdump(data) + self.s.sendall(data) + logging.debug("c2s: forwarded %i bytes", size) + self.stats["c2s packets"] += 1 + self.stats["c2s bytes"] += size + + def s2c(self): + """forward a response from the TCP server to the USB client""" + data = b"" + while len(data) < 4: + data += self.s.recv(4 - len(data)) + size = struct.unpack("<I", data[:4])[0] + while len(data) < size: + data += self.s.recv(size - len(data)) + self._log_hexdump(data) + while data: + written = self.ep_out.write(data) + assert written > 0 + data = data[written:] + if size % self.ep_out.wMaxPacketSize == 0: + logging.log(logging.TRACE, "sending zero length packet") + self.ep_out.write(b"") + logging.debug("s2c: forwarded %i bytes", size) + self.stats["s2c packets"] += 1 + self.stats["s2c bytes"] += size + + def log_stats(self): + logging.info("statistics:") + for k, v in self.stats.items(): + logging.info(f" {k+':':14s} {v}") + + def log_stats_interval(self, interval=5): + if (time.monotonic() - self.stats_logged) < interval: + return + + self.log_stats() + self.stats_logged = time.monotonic() + + +def try_get_usb_str(dev, name): + try: + with open(f"/sys/bus/usb/devices/{dev.bus}-{dev.address}/{name}") as f: + return f.read().strip() + except FileNotFoundError: + return None + + +def list_usb(args): + vid, pid = [int(x, 16) for x in args.id.split(":", 1)] + + print("Bus | Addr | Manufacturer | Product | ID | Path") + print("--- | ---- | ---------------- | ---------------- | --------- | ----") + for dev in usb.core.find(find_all=True, idVendor=vid, idProduct=pid): + path = path_from_usb_dev(dev) or "" + manufacturer = try_get_usb_str(dev, "manufacturer") or "unknown" + product = try_get_usb_str(dev, "product") or "unknown" + print( + f"{dev.bus:3} | {dev.address:4} | {manufacturer:16} | {product:16} | {dev.idVendor:04x}:{dev.idProduct:04x} | {path:18}" + ) + + +def connect(args): + vid, pid = [int(x, 16) for x in args.id.split(":", 1)] + + f = Forwarder(server=(args.server, args.port), vid=vid, pid=pid, path=args.path) + + try: + while True: + f.c2s() + f.s2c() + f.log_stats_interval() + finally: + f.log_stats() + + +def main(): + parser = argparse.ArgumentParser( + description="Forward 9PFS requests from USB to TCP", + ) + + parser.add_argument("--id", type=str, default="1d6b:0109", help="vid:pid of target device") + parser.add_argument("--path", type=str, required=False, help="path of target device") + parser.add_argument("-v", "--verbose", action="count", default=0) + + subparsers = parser.add_subparsers() + subparsers.required = True + subparsers.dest = "command" + + parser_list = subparsers.add_parser("list", help="List all connected 9p gadgets") + parser_list.set_defaults(func=list_usb) + + parser_connect = subparsers.add_parser( + "connect", help="Forward messages between the usb9pfs gadget and the 9p server" + ) + parser_connect.set_defaults(func=connect) + connect_group = 
parser_connect.add_argument_group() + connect_group.required = True + parser_connect.add_argument("-s", "--server", type=str, default="127.0.0.1", help="server hostname") + parser_connect.add_argument("-p", "--port", type=int, default=564, help="server port") + + args = parser.parse_args() + + logging.TRACE = logging.DEBUG - 5 + logging.addLevelName(logging.TRACE, "TRACE") + + if args.verbose >= 2: + level = logging.TRACE + elif args.verbose: + level = logging.DEBUG + else: + level = logging.INFO + logging.basicConfig(level=level, format="%(asctime)-15s %(levelname)-8s %(message)s") + + args.func(args) + + +if __name__ == "__main__": + main() -- cgit v1.2.3 From 05c1280a2bcfca187fe7fa90bb240602cf54af0a Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 29 Aug 2024 14:33:38 +0200 Subject: netdev_features: convert NETIF_F_NETNS_LOCAL to dev->netns_local "Interface can't change network namespaces" is rather an attribute, not a feature, and it can't be changed via Ethtool. Make it a "cold" private flag instead of a netdev_feature and free one more bit. Signed-off-by: Alexander Lobakin Signed-off-by: Paolo Abeni --- Documentation/networking/net_cachelines/net_device.rst | 1 + Documentation/networking/netdev-features.rst | 7 ------- Documentation/networking/switchdev.rst | 4 ++-- drivers/net/amt.c | 2 +- drivers/net/bonding/bond_main.c | 6 +++--- drivers/net/ethernet/adi/adin1110.c | 2 +- drivers/net/ethernet/marvell/prestera/prestera_main.c | 3 ++- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 3 ++- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 5 +++-- drivers/net/ethernet/rocker/rocker_main.c | 3 ++- drivers/net/ethernet/ti/cpsw_new.c | 3 ++- drivers/net/loopback.c | 2 +- drivers/net/net_failover.c | 2 +- drivers/net/team/team_core.c | 6 +++--- drivers/net/vrf.c | 2 +- include/linux/netdev_features.h | 6 ++---- include/linux/netdevice.h | 2 ++ net/batman-adv/soft-interface.c | 3 ++- net/bridge/br_device.c | 5 +++-- net/core/dev.c | 4 ++-- net/ethtool/common.c | 1 - net/hsr/hsr_device.c | 8 ++++---- net/ieee802154/6lowpan/core.c | 2 +- net/ieee802154/core.c | 10 +++++----- net/ipv4/ip_tunnel.c | 2 +- net/ipv4/ipmr.c | 2 +- net/ipv6/ip6_gre.c | 3 +-- net/ipv6/ip6_tunnel.c | 2 +- net/ipv6/ip6mr.c | 2 +- net/ipv6/sit.c | 2 +- net/openvswitch/vport-internal_dev.c | 2 +- net/wireless/core.c | 10 +++++----- tools/testing/selftests/net/forwarding/README | 2 +- 34 files changed, 61 insertions(+), 62 deletions(-) (limited to 'tools') diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst index e55319277074..e649d8e9556f 100644 --- a/Documentation/networking/net_cachelines/net_device.rst +++ b/Documentation/networking/net_cachelines/net_device.rst @@ -166,6 +166,7 @@ unsigned:1 wol_enabled unsigned:1 threaded - - napi_poll(napi_enable,dev_set_threaded) unsigned_long:1 see_all_hwtstamp_requests unsigned_long:1 change_proto_down +unsigned_long:1 netns_local struct_list_head net_notifier_list struct_macsec_ops* macsec_ops struct_udp_tunnel_nic_info* udp_tunnel_nic_info diff --git a/Documentation/networking/netdev-features.rst b/Documentation/networking/netdev-features.rst index f29d982ebf5d..5014f7cc1398 100644 --- a/Documentation/networking/netdev-features.rst +++ b/Documentation/networking/netdev-features.rst @@ -139,13 +139,6 @@ chained skbs (skb->next/prev list). Features contained in NETIF_F_SOFT_FEATURES are features of networking stack. 
Driver should not change behaviour based on them. - * netns-local device - -NETIF_F_NETNS_LOCAL is set for devices that are not allowed to move between -network namespaces (e.g. loopback). - -Don't use it in drivers. - * VLAN challenged NETIF_F_VLAN_CHALLENGED should be set for devices which can't cope with VLAN diff --git a/Documentation/networking/switchdev.rst b/Documentation/networking/switchdev.rst index 758f1dae3fce..f355f0166f1b 100644 --- a/Documentation/networking/switchdev.rst +++ b/Documentation/networking/switchdev.rst @@ -137,10 +137,10 @@ would be sub-port 0 on port 1 on switch 1. Port Features ^^^^^^^^^^^^^ -NETIF_F_NETNS_LOCAL +dev->netns_local If the switchdev driver (and device) only supports offloading of the default -network namespace (netns), the driver should set this feature flag to prevent +network namespace (netns), the driver should set this private flag to prevent the port netdev from being moved out of the default netns. A netns-aware driver/device would not set this flag and be responsible for partitioning hardware to preserve netns containment. This means hardware cannot forward diff --git a/drivers/net/amt.c b/drivers/net/amt.c index 921bbfd72a38..0433a0f36d1b 100644 --- a/drivers/net/amt.c +++ b/drivers/net/amt.c @@ -3099,8 +3099,8 @@ static void amt_link_setup(struct net_device *dev) dev->addr_len = 0; dev->priv_flags |= IFF_NO_QUEUE; dev->lltx = true; + dev->netns_local = true; dev->features |= NETIF_F_GSO_SOFTWARE; - dev->features |= NETIF_F_NETNS_LOCAL; dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM; dev->hw_features |= NETIF_F_FRAGLIST | NETIF_F_RXCSUM; dev->hw_features |= NETIF_F_GSO_SOFTWARE; diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 8ae887cdc231..f13d413ad26c 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -5930,6 +5930,9 @@ void bond_setup(struct net_device *bond_dev) /* don't acquire bond device's netif_tx_lock when transmitting */ bond_dev->lltx = true; + /* Don't allow bond devices to change network namespaces. */ + bond_dev->netns_local = true; + /* By default, we declare the bond to be fully * VLAN hardware accelerated capable. Special * care is taken in the various xmit functions @@ -5937,9 +5940,6 @@ void bond_setup(struct net_device *bond_dev) * capable */ - /* Don't allow bond devices to change network namespaces. 
*/ - bond_dev->features |= NETIF_F_NETNS_LOCAL; - bond_dev->hw_features = BOND_VLAN_FEATURES | NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_CTAG_FILTER | diff --git a/drivers/net/ethernet/adi/adin1110.c b/drivers/net/ethernet/adi/adin1110.c index 0713f1e2c7f3..3431a7e62b0d 100644 --- a/drivers/net/ethernet/adi/adin1110.c +++ b/drivers/net/ethernet/adi/adin1110.c @@ -1599,7 +1599,7 @@ static int adin1110_probe_netdevs(struct adin1110_priv *priv) netdev->netdev_ops = &adin1110_netdev_ops; netdev->ethtool_ops = &adin1110_ethtool_ops; netdev->priv_flags |= IFF_UNICAST_FLT; - netdev->features |= NETIF_F_NETNS_LOCAL; + netdev->netns_local = true; port_priv->phydev = get_phy_device(priv->mii_bus, i + 1, false); if (IS_ERR(port_priv->phydev)) { diff --git a/drivers/net/ethernet/marvell/prestera/prestera_main.c b/drivers/net/ethernet/marvell/prestera/prestera_main.c index 63ae01954dfc..22ca6ee9665e 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_main.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_main.c @@ -633,7 +633,8 @@ static int prestera_port_create(struct prestera_switch *sw, u32 id) if (err) goto err_dl_port_register; - dev->features |= NETIF_F_NETNS_LOCAL | NETIF_F_HW_TC; + dev->features |= NETIF_F_HW_TC; + dev->netns_local = true; dev->netdev_ops = &prestera_netdev_ops; dev->ethtool_ops = &prestera_ethtool_ops; SET_NETDEV_DEV(dev, sw->dev->dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 16b67c457b60..47e7a80d221b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4414,9 +4414,9 @@ static netdev_features_t mlx5e_fix_features(struct net_device *netdev, if (mlx5e_is_uplink_rep(priv)) { features = mlx5e_fix_uplink_rep_features(netdev, features); - features |= NETIF_F_NETNS_LOCAL; + netdev->netns_local = true; } else { - features &= ~NETIF_F_NETNS_LOCAL; + netdev->netns_local = false; } mutex_unlock(&priv->state_lock); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index b885042eef14..92094bf60d59 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -898,7 +898,8 @@ static void mlx5e_build_rep_netdev(struct net_device *netdev, netdev->hw_features |= NETIF_F_RXCSUM; netdev->features |= netdev->hw_features; - netdev->features |= NETIF_F_NETNS_LOCAL; + + netdev->netns_local = true; } static int mlx5e_init_rep(struct mlx5_core_dev *mdev, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 44d6e125bd6f..b9ffd7236aff 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -1676,10 +1676,11 @@ static int mlxsw_sp_port_create(struct mlxsw_sp *mlxsw_sp, u16 local_port, netif_carrier_off(dev); - dev->features |= NETIF_F_NETNS_LOCAL | NETIF_F_SG | - NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_TC; + dev->features |= NETIF_F_SG | NETIF_F_HW_VLAN_CTAG_FILTER | + NETIF_F_HW_TC; dev->hw_features |= NETIF_F_HW_TC | NETIF_F_LOOPBACK; dev->lltx = true; + dev->netns_local = true; dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = MLXSW_PORT_MAX_MTU - MLXSW_PORT_ETH_FRAME_HDR; diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c index e097ce3e69ea..84fa911c78db 100644 --- a/drivers/net/ethernet/rocker/rocker_main.c +++ b/drivers/net/ethernet/rocker/rocker_main.c @@ 
-2575,7 +2575,8 @@ static int rocker_probe_port(struct rocker *rocker, unsigned int port_number) netif_napi_add(dev, &rocker_port->napi_rx, rocker_port_poll_rx); rocker_carrier_init(rocker_port); - dev->features |= NETIF_F_NETNS_LOCAL | NETIF_F_SG; + dev->features |= NETIF_F_SG; + dev->netns_local = true; /* MTU range: 68 - 9000 */ dev->min_mtu = ROCKER_PORT_MIN_MTU; diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c index 2baa198ebfa0..557cc71b9dd2 100644 --- a/drivers/net/ethernet/ti/cpsw_new.c +++ b/drivers/net/ethernet/ti/cpsw_new.c @@ -1407,7 +1407,8 @@ static int cpsw_create_ports(struct cpsw_common *cpsw) cpsw->slaves[i].ndev = ndev; ndev->features |= NETIF_F_HW_VLAN_CTAG_FILTER | - NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_NETNS_LOCAL | NETIF_F_HW_TC; + NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_TC; + ndev->netns_local = true; ndev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index bf857782be0f..1993b90b1a5f 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -172,6 +172,7 @@ static void gen_lo_setup(struct net_device *dev, dev->flags = IFF_LOOPBACK; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE; dev->lltx = true; + dev->netns_local = true; netif_keep_dst(dev); dev->hw_features = NETIF_F_GSO_SOFTWARE; dev->features = NETIF_F_SG | NETIF_F_FRAGLIST @@ -180,7 +181,6 @@ static void gen_lo_setup(struct net_device *dev, | NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA - | NETIF_F_NETNS_LOCAL | NETIF_F_VLAN_CHALLENGED | NETIF_F_LOOPBACK; dev->ethtool_ops = eth_ops; diff --git a/drivers/net/net_failover.c b/drivers/net/net_failover.c index 06728385a35f..54c8b9d5b5fc 100644 --- a/drivers/net/net_failover.c +++ b/drivers/net/net_failover.c @@ -734,7 +734,7 @@ struct failover *net_failover_create(struct net_device *standby_dev) failover_dev->lltx = true; /* Don't allow failover devices to change network namespaces. */ - failover_dev->features |= NETIF_F_NETNS_LOCAL; + failover_dev->netns_local = true; failover_dev->hw_features = FAILOVER_VLAN_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | diff --git a/drivers/net/team/team_core.c b/drivers/net/team/team_core.c index 1d1bad3cedc2..18191d5a8bd4 100644 --- a/drivers/net/team/team_core.c +++ b/drivers/net/team/team_core.c @@ -2191,10 +2191,10 @@ static void team_setup(struct net_device *dev) dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE; dev->lltx = true; - dev->features |= NETIF_F_GRO; - /* Don't allow team devices to change network namespaces. */ - dev->features |= NETIF_F_NETNS_LOCAL; + dev->netns_local = true; + + dev->features |= NETIF_F_GRO; dev->hw_features = TEAM_VLAN_FEATURES | NETIF_F_HW_VLAN_CTAG_RX | diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index b1d4ef538995..4d8ccaf9a2b4 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1638,7 +1638,7 @@ static void vrf_setup(struct net_device *dev) dev->lltx = true; /* don't allow vrf devices to change network namespaces. */ - dev->features |= NETIF_F_NETNS_LOCAL; + dev->netns_local = true; /* does not make sense for a VLAN to be added to a vrf device */ dev->features |= NETIF_F_VLAN_CHALLENGED; diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index a2e20b517584..d5a3836f4793 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -25,7 +25,7 @@ enum { NETIF_F_VLAN_CHALLENGED_BIT, /* Device cannot handle VLAN packets */ NETIF_F_GSO_BIT, /* Enable software GSO. 
*/ __UNUSED_NETIF_F_12, - NETIF_F_NETNS_LOCAL_BIT, /* Does not change network namespaces */ + __UNUSED_NETIF_F_13, NETIF_F_GRO_BIT, /* Generic receive offload */ NETIF_F_LRO_BIT, /* large receive offload */ @@ -121,7 +121,6 @@ enum { #define NETIF_F_IPV6_CSUM __NETIF_F(IPV6_CSUM) #define NETIF_F_LOOPBACK __NETIF_F(LOOPBACK) #define NETIF_F_LRO __NETIF_F(LRO) -#define NETIF_F_NETNS_LOCAL __NETIF_F(NETNS_LOCAL) #define NETIF_F_NOCACHE_COPY __NETIF_F(NOCACHE_COPY) #define NETIF_F_NTUPLE __NETIF_F(NTUPLE) #define NETIF_F_RXCSUM __NETIF_F(RXCSUM) @@ -190,8 +189,7 @@ static inline int find_next_netdev_feature(u64 feature, unsigned long start) /* Features valid for ethtool to change */ /* = all defined minus driver/device-class-related */ -#define NETIF_F_NEVER_CHANGE (NETIF_F_VLAN_CHALLENGED | \ - NETIF_F_NETNS_LOCAL) +#define NETIF_F_NEVER_CHANGE NETIF_F_VLAN_CHALLENGED /* remember that ((t)1 << t_BITS) is undefined in C99 */ #define NETIF_F_ETHTOOL_BITS ((__NETIF_F_BIT(NETDEV_FEATURE_COUNT - 1) | \ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e22ca5e07490..a698e2402420 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1968,6 +1968,7 @@ enum netdev_reg_state { * regardless of source, even if those aren't * HWTSTAMP_SOURCE_NETDEV * @change_proto_down: device supports setting carrier via IFLA_PROTO_DOWN + * @netns_local: interface can't change network namespaces * * @net_notifier_list: List of per-net netdev notifier block * that follow this device when it is moved @@ -2361,6 +2362,7 @@ struct net_device { /* priv_flags_slow, ungrouped to save space */ unsigned long see_all_hwtstamp_requests:1; unsigned long change_proto_down:1; + unsigned long netns_local:1; struct list_head net_notifier_list; diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index e791a73ef901..2758aba47a2f 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -1020,9 +1020,10 @@ static void batadv_softif_init_early(struct net_device *dev) dev->netdev_ops = &batadv_netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = batadv_softif_free; - dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_NETNS_LOCAL; + dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER; dev->priv_flags |= IFF_NO_QUEUE; dev->lltx = true; + dev->netns_local = true; /* can't call min_mtu, because the needed variables * have not been initialized yet diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index a6d25113dfb1..26b79feb385d 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -488,9 +488,10 @@ void br_dev_setup(struct net_device *dev) SET_NETDEV_DEVTYPE(dev, &br_type); dev->priv_flags = IFF_EBRIDGE | IFF_NO_QUEUE; dev->lltx = true; + dev->netns_local = true; - dev->features = COMMON_FEATURES | NETIF_F_NETNS_LOCAL | - NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; + dev->features = COMMON_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX; dev->hw_features = COMMON_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; dev->vlan_features = COMMON_FEATURES; diff --git a/net/core/dev.c b/net/core/dev.c index 56aed88ceadf..98bb5f890b88 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -11487,7 +11487,7 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, /* Don't allow namespace local devices to be moved. 
*/ err = -EINVAL; - if (dev->features & NETIF_F_NETNS_LOCAL) + if (dev->netns_local) goto out; /* Ensure the device has been registered */ @@ -11869,7 +11869,7 @@ static void __net_exit default_device_exit_net(struct net *net) char fb_name[IFNAMSIZ]; /* Ignore unmoveable devices (i.e. loopback) */ - if (dev->features & NETIF_F_NETNS_LOCAL) + if (dev->netns_local) continue; /* Leave virtual devices for the generic cleanup */ diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 447fc80e1b00..ca8e64162104 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -25,7 +25,6 @@ const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = { [NETIF_F_HW_VLAN_STAG_FILTER_BIT] = "rx-vlan-stag-filter", [NETIF_F_VLAN_CHALLENGED_BIT] = "vlan-challenged", [NETIF_F_GSO_BIT] = "tx-generic-segmentation", - [NETIF_F_NETNS_LOCAL_BIT] = "netns-local", [NETIF_F_GRO_BIT] = "rx-gro", [NETIF_F_GRO_HW_BIT] = "rx-gro-hw", [NETIF_F_LRO_BIT] = "rx-lro", diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index d4c783076662..a06e790042e2 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -556,6 +556,10 @@ void hsr_dev_setup(struct net_device *dev) dev->priv_flags |= IFF_NO_QUEUE | IFF_DISABLE_NETPOLL; /* Prevent recursive tx locking */ dev->lltx = true; + /* Not sure about this. Taken from bridge code. netdevice.h says + * it means "Does not change network namespaces". + */ + dev->netns_local = true; dev->needs_free_netdev = true; @@ -569,10 +573,6 @@ void hsr_dev_setup(struct net_device *dev) * hsr_header_create() etc. */ dev->features |= NETIF_F_VLAN_CHALLENGED; - /* Not sure about this. Taken from bridge code. netdev_features.h says - * it means "Does not change network namespaces". - */ - dev->features |= NETIF_F_NETNS_LOCAL; } /* Return true if dev is a HSR master; return false otherwise. 
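The conversion these hunks apply is mechanical and looks the same for any driver that must pin its device to the creating namespace; a minimal sketch (hypothetical "foo" names, not part of this series):

#include <linux/netdevice.h>

/* Sketch only: the foo driver is illustrative, not from these patches. */
static const struct net_device_ops foo_netdev_ops;

static void foo_setup(struct net_device *dev)
{
	dev->netdev_ops = &foo_netdev_ops;
	/* was: dev->features |= NETIF_F_NETNS_LOCAL; */
	dev->netns_local = true;
}

With the bit set, __dev_change_net_namespace() fails the move with -EINVAL and default_device_exit_net() leaves the device alone on namespace teardown, as the net/core/dev.c hunks above show.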
diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index 77b4e92027c5..175efd860f7b 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -116,7 +116,7 @@ static void lowpan_setup(struct net_device *ldev) ldev->netdev_ops = &lowpan_netdev_ops; ldev->header_ops = &lowpan_header_ops; ldev->needs_free_netdev = true; - ldev->features |= NETIF_F_NETNS_LOCAL; + ldev->netns_local = true; } static int lowpan_validate(struct nlattr *tb[], struct nlattr *data[], diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c index 60e8fff1347e..88adb04e4072 100644 --- a/net/ieee802154/core.c +++ b/net/ieee802154/core.c @@ -226,11 +226,11 @@ int cfg802154_switch_netns(struct cfg802154_registered_device *rdev, list_for_each_entry(wpan_dev, &rdev->wpan_dev_list, list) { if (!wpan_dev->netdev) continue; - wpan_dev->netdev->features &= ~NETIF_F_NETNS_LOCAL; + wpan_dev->netdev->netns_local = false; err = dev_change_net_namespace(wpan_dev->netdev, net, "wpan%d"); if (err) break; - wpan_dev->netdev->features |= NETIF_F_NETNS_LOCAL; + wpan_dev->netdev->netns_local = true; } if (err) { @@ -242,11 +242,11 @@ int cfg802154_switch_netns(struct cfg802154_registered_device *rdev, list) { if (!wpan_dev->netdev) continue; - wpan_dev->netdev->features &= ~NETIF_F_NETNS_LOCAL; + wpan_dev->netdev->netns_local = false; err = dev_change_net_namespace(wpan_dev->netdev, net, "wpan%d"); WARN_ON(err); - wpan_dev->netdev->features |= NETIF_F_NETNS_LOCAL; + wpan_dev->netdev->netns_local = true; } return err; @@ -291,7 +291,7 @@ static int cfg802154_netdev_notifier_call(struct notifier_block *nb, switch (state) { /* TODO NETDEV_DEVTYPE */ case NETDEV_REGISTER: - dev->features |= NETIF_F_NETNS_LOCAL; + dev->netns_local = true; wpan_dev->identifier = ++rdev->wpan_dev_id; list_add_rcu(&wpan_dev->list, &rdev->wpan_dev_list); rdev->devlist_generation++; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 0cd2f3de100c..18964394d6bd 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -1161,7 +1161,7 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, * Allowing to move it to another netns is clearly unsafe. */ if (!IS_ERR(itn->fb_tunnel_dev)) { - itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; + itn->fb_tunnel_dev->netns_local = true; itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev); ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev)); itn->type = itn->fb_tunnel_dev->type; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index b0eda745e3bc..f1a43199551b 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -537,7 +537,7 @@ static void reg_vif_setup(struct net_device *dev) dev->flags = IFF_NOARP; dev->netdev_ops = &reg_vif_netdev_ops; dev->needs_free_netdev = true; - dev->features |= NETIF_F_NETNS_LOCAL; + dev->netns_local = true; } static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 08beab638bda..235808cfec70 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1621,8 +1621,7 @@ static int __net_init ip6gre_init_net(struct net *net) /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. 
*/ - ign->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; - + ign->fb_tunnel_dev->netns_local = true; ip6gre_fb_tunnel_init(ign->fb_tunnel_dev); ign->fb_tunnel_dev->rtnl_link_ops = &ip6gre_link_ops; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index cbef0fcece71..ec51ab5063e8 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -2258,7 +2258,7 @@ static int __net_init ip6_tnl_init_net(struct net *net) /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. */ - ip6n->fb_tnl_dev->features |= NETIF_F_NETNS_LOCAL; + ip6n->fb_tnl_dev->netns_local = true; err = ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev); if (err < 0) diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index e3ee93c562c3..2ce4ae0d8dc3 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -640,7 +640,7 @@ static void reg_vif_setup(struct net_device *dev) dev->flags = IFF_NOARP; dev->netdev_ops = &reg_vif_netdev_ops; dev->needs_free_netdev = true; - dev->features |= NETIF_F_NETNS_LOCAL; + dev->netns_local = true; } static struct net_device *ip6mr_reg_vif(struct net *net, struct mr_table *mrt) diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 58306522fb1e..16b90a24c9ba 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1856,7 +1856,7 @@ static int __net_init sit_init_net(struct net *net) /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. */ - sitn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; + sitn->fb_tunnel_dev->netns_local = true; err = register_netdev(sitn->fb_tunnel_dev); if (err) diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 3a369a31c5cc..5858d65ea1a9 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -149,7 +149,7 @@ static struct vport *internal_dev_create(const struct vport_parms *parms) /* Restrict bridge port to current netns. 
*/ if (vport->port_no == OVSP_LOCAL) - vport->dev->features |= NETIF_F_NETNS_LOCAL; + vport->dev->netns_local = true; rtnl_lock(); err = register_netdevice(vport->dev); diff --git a/net/wireless/core.c b/net/wireless/core.c index 4d5d351bd0b5..661adfc77644 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -165,11 +165,11 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev, list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (!wdev->netdev) continue; - wdev->netdev->features &= ~NETIF_F_NETNS_LOCAL; + wdev->netdev->netns_local = false; err = dev_change_net_namespace(wdev->netdev, net, "wlan%d"); if (err) break; - wdev->netdev->features |= NETIF_F_NETNS_LOCAL; + wdev->netdev->netns_local = true; } if (err) { @@ -181,11 +181,11 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev, list) { if (!wdev->netdev) continue; - wdev->netdev->features &= ~NETIF_F_NETNS_LOCAL; + wdev->netdev->netns_local = false; err = dev_change_net_namespace(wdev->netdev, net, "wlan%d"); WARN_ON(err); - wdev->netdev->features |= NETIF_F_NETNS_LOCAL; + wdev->netdev->netns_local = true; } return err; @@ -1473,7 +1473,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, SET_NETDEV_DEVTYPE(dev, &wiphy_type); wdev->netdev = dev; /* can only change netns with wiphy */ - dev->features |= NETIF_F_NETNS_LOCAL; + dev->netns_local = true; cfg80211_init_wdev(wdev); break; diff --git a/tools/testing/selftests/net/forwarding/README b/tools/testing/selftests/net/forwarding/README index 7fdb6a9ca543..a652429bfd53 100644 --- a/tools/testing/selftests/net/forwarding/README +++ b/tools/testing/selftests/net/forwarding/README @@ -6,7 +6,7 @@ to easily create and test complex environments. Unfortunately, these namespaces can not be used with actual switching ASICs, as their ports can not be migrated to other network namespaces -(NETIF_F_NETNS_LOCAL) and most of them probably do not support the +(dev->netns_local) and most of them probably do not support the L1-separation provided by namespaces. However, a similar kind of flexibility can be achieved by using VRFs and -- cgit v1.2.3 From 5ceb87dc76ab269c940541ad9487cf0f3c0c793d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 30 Aug 2024 11:22:39 +0200 Subject: selftests: netfilter: nft_queue.sh: fix spurious timeout on debug kernel The sctp selftest is very slow on debug kernels. It's possible that the nf_queue listener program exits due to timeout before the first SCTP packet is processed. In this case socat hangs until the script times out. Fix this by removing the -t option where possible and killing the test program once the file transfer/socat has exited. -t sets SO_RCVTIMEO; it's intended for the 'ping' part of the selftest where we want to make sure that packets get reinjected properly without skipping a second queue request. While at it, add a helper to compare the (binary) files instead of diff. The 'diff' part was copied from another sub-test that compares text. Let the helper dump file sizes on error so we can see the progress made. Tested on an old 2010-ish box with a debug kernel and 100 iterations. This is a follow-up to the earlier filesize reduction change. 
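The resulting lifecycle is the same in every sub-test touched below: start nf_queue with no -t timeout, busy-wait until the queue is actually bound, run the transfer, then reap the listener explicitly. A condensed sketch (variable and helper names as used by the script; busywait() and nf_queue_wait() are its existing helpers):

ip netns exec "$nsrouter" ./nf_queue -q 2 &        # no -t: runs until killed
nfqpid=$!
timeout 5 ip netns exec "$ns2" socat -u TCP-LISTEN:12345 STDOUT >/dev/null &
rpid=$!
busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 2   # queue 2 bound?
ip netns exec "$ns1" socat -u STDIN TCP:10.0.2.99:12345 <"$TMPINPUT" >/dev/null
wait "$rpid" && echo "PASS: tcp and nfqueue in forward chain"
kill "$nfqpid"                                     # reap the listener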
Reported-by: Jakub Kicinski Closes: https://lore.kernel.org/netdev/20240829080109.GB30766@breakpoint.cc/ Fixes: 0a8b08c554da ("selftests: netfilter: nft_queue.sh: reduce test file size for debug build") Signed-off-by: Florian Westphal Link: https://patch.msgid.link/20240830092254.8029-1-fw@strlen.de Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/netfilter/nft_queue.sh | 62 ++++++++++++++-------- 1 file changed, 40 insertions(+), 22 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/netfilter/nft_queue.sh b/tools/testing/selftests/net/netfilter/nft_queue.sh index 9e5f423bff09..d66e3c4dfec6 100755 --- a/tools/testing/selftests/net/netfilter/nft_queue.sh +++ b/tools/testing/selftests/net/netfilter/nft_queue.sh @@ -8,7 +8,7 @@ source lib.sh ret=0 -timeout=2 +timeout=5 cleanup() { @@ -255,17 +255,19 @@ listener_ready() test_tcp_forward() { - ip netns exec "$nsrouter" ./nf_queue -q 2 -t "$timeout" & + ip netns exec "$nsrouter" ./nf_queue -q 2 & local nfqpid=$! timeout 5 ip netns exec "$ns2" socat -u TCP-LISTEN:12345 STDOUT >/dev/null & local rpid=$! busywait "$BUSYWAIT_TIMEOUT" listener_ready "$ns2" + busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 2 ip netns exec "$ns1" socat -u STDIN TCP:10.0.2.99:12345 <"$TMPINPUT" >/dev/null wait "$rpid" && echo "PASS: tcp and nfqueue in forward chain" + kill "$nfqpid" } test_tcp_localhost() @@ -273,26 +275,29 @@ test_tcp_localhost() timeout 5 ip netns exec "$nsrouter" socat -u TCP-LISTEN:12345 STDOUT >/dev/null & local rpid=$! - ip netns exec "$nsrouter" ./nf_queue -q 3 -t "$timeout" & + ip netns exec "$nsrouter" ./nf_queue -q 3 & local nfqpid=$! busywait "$BUSYWAIT_TIMEOUT" listener_ready "$nsrouter" + busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 3 ip netns exec "$nsrouter" socat -u STDIN TCP:127.0.0.1:12345 <"$TMPINPUT" >/dev/null wait "$rpid" && echo "PASS: tcp via loopback" - wait 2>/dev/null + kill "$nfqpid" } test_tcp_localhost_connectclose() { - ip netns exec "$nsrouter" ./connect_close -p 23456 -t "$timeout" & - ip netns exec "$nsrouter" ./nf_queue -q 3 -t "$timeout" & + ip netns exec "$nsrouter" ./nf_queue -q 3 & + local nfqpid=$! busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 3 + timeout 10 ip netns exec "$nsrouter" ./connect_close -p 23456 -t 3 + + kill "$nfqpid" wait && echo "PASS: tcp via loopback with connect/close" - wait 2>/dev/null } test_tcp_localhost_requeue() @@ -357,7 +362,7 @@ table inet filter { } } EOF - ip netns exec "$ns1" ./nf_queue -q 1 -t "$timeout" & + ip netns exec "$ns1" ./nf_queue -q 1 & local nfqpid=$! busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$ns1" 1 @@ -367,6 +372,7 @@ EOF for n in output post; do for d in tvrf eth0; do if ! ip netns exec "$ns1" nft list chain inet filter "$n" | grep -q "oifname \"$d\" icmp type echo-request counter packets 1"; then + kill "$nfqpid" echo "FAIL: chain $n: icmp packet counter mismatch for device $d" 1>&2 ip netns exec "$ns1" nft list ruleset ret=1 @@ -375,8 +381,8 @@ EOF done done - wait "$nfqpid" && echo "PASS: icmp+nfqueue via vrf" - wait 2>/dev/null + kill "$nfqpid" + echo "PASS: icmp+nfqueue via vrf" } sctp_listener_ready() @@ -384,6 +390,22 @@ sctp_listener_ready() ss -S -N "$1" -ln -o "sport = :12345" | grep -q 12345 } +check_output_files() +{ + local f1="$1" + local f2="$2" + local err="$3" + + if ! 
cmp "$f1" "$f2" ; then + echo "FAIL: $err: input and output file differ" 1>&2 + echo -n " Input file" 1>&2 + ls -l "$f1" 1>&2 + echo -n "Output file" 1>&2 + ls -l "$f2" 1>&2 + ret=1 + fi +} + test_sctp_forward() { ip netns exec "$nsrouter" nft -f /dev/stdin </dev/null @@ -411,11 +433,9 @@ EOF fi wait "$rpid" && echo "PASS: sctp and nfqueue in forward chain" + kill "$nfqpid" - if ! diff -u "$TMPINPUT" "$TMPFILE1" ; then - echo "FAIL: lost packets?!" 1>&2 - exit 1 - fi + check_output_files "$TMPINPUT" "$TMPFILE1" "sctp forward" } test_sctp_output() @@ -429,14 +449,14 @@ table inet sctpq { } EOF # reduce test file size, software segmentation causes sk wmem increase. - dd conv=sparse status=none if=/dev/zero bs=1M count=50 of="$TMPINPUT" + dd conv=sparse status=none if=/dev/zero bs=1M count=$((COUNT/2)) of="$TMPINPUT" timeout 60 ip netns exec "$ns2" socat -u SCTP-LISTEN:12345 STDOUT > "$TMPFILE1" & local rpid=$! busywait "$BUSYWAIT_TIMEOUT" sctp_listener_ready "$ns2" - ip netns exec "$ns1" ./nf_queue -q 11 -t "$timeout" & + ip netns exec "$ns1" ./nf_queue -q 11 & local nfqpid=$! ip netns exec "$ns1" socat -u STDIN SCTP:10.0.2.99:12345 <"$TMPINPUT" >/dev/null @@ -448,11 +468,9 @@ EOF # must wait before checking completeness of output file. wait "$rpid" && echo "PASS: sctp and nfqueue in output chain with GSO" + kill "$nfqpid" - if ! diff -u "$TMPINPUT" "$TMPFILE1" ; then - echo "FAIL: lost packets?!" 1>&2 - exit 1 - fi + check_output_files "$TMPINPUT" "$TMPFILE1" "sctp output" } test_queue_removal() @@ -468,7 +486,7 @@ table ip filter { } } EOF - ip netns exec "$ns1" ./nf_queue -q 0 -d 30000 -t "$timeout" & + ip netns exec "$ns1" ./nf_queue -q 0 -d 30000 & local nfqpid=$! busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$ns1" 0 -- cgit v1.2.3 From c3f8644c21df9b7db97eb70e08e2826368aaafa0 Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Sat, 3 Aug 2024 17:20:06 +0200 Subject: perf report: Support LLVM for addr2line() In addition to the existing support for libbfd and calling out to an external addr2line command, add support for using libllvm directly. This is both faster than libbfd, and can be enabled in distro builds (the LLVM license has an explicit provision for GPLv2 compatibility). Thus, it is set as the primary choice if available. As an example, running 'perf report' on a medium-size profile with DWARF-based backtraces took 58 seconds with LLVM, 78 seconds with libbfd, 153 seconds with external llvm-addr2line, and I got tired and aborted the test after waiting for 55 minutes with external bfd addr2line (which is the default for perf as compiled by distributions today). Evidently, for this case, the bfd addr2line process needs 18 seconds (on a 5.2 GHz Zen 3) to load the .debug ELF in question, hits the 1-second timeout and gets killed during initialization, getting restarted anew every time. Having an in-process addr2line makes this much more robust. As future extensions, libllvm can be used in many other places where we currently use libbfd or other libraries: - Symbol enumeration (in particular, for PE binaries). - Demangling (including non-Itanium demangling, e.g. Microsoft or Rust). - Disassembling (perf annotate). However, these are much less pressing; most people don't profile PE binaries, and perf has non-bfd paths for ELF. The same with demangling; the default _cxa_demangle path works fine for most users, and while bfd objdump can be slow on large binaries, it is possible to use --objdump=llvm-objdump to get the speed benefits. 
(It appears LLVM-based demangling is very simple, should we want that.) Tested with LLVM 14, 15, 16, 18 and 19. For some reason, LLVM 12 was not correctly detected using feature_check, and thus was not tested. Committer notes: Added the name and a __maybe_unused to address: 1 13.50 almalinux:8 : FAIL gcc version 8.5.0 20210514 (Red Hat 8.5.0-22) (GCC) util/srcline.c: In function 'dso__free_a2l': util/srcline.c:184:20: error: parameter name omitted void dso__free_a2l(struct dso *) ^~~~~~~~~~~~ make[3]: *** [/git/perf-6.11.0-rc3/tools/build/Makefile.build:158: util] Error 2 Signed-off-by: Steinar H. Gunderson Tested-by: Arnaldo Carvalho de Melo Cc: Ian Rogers Link: https://lore.kernel.org/r/20240803152008.2818485-1-sesse@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/build/Makefile.feature | 1 + tools/perf/Makefile.config | 17 +++++ tools/perf/builtin-version.c | 1 + tools/perf/tests/make | 2 + tools/perf/util/Build | 1 + tools/perf/util/llvm-c-helpers.cpp | 134 +++++++++++++++++++++++++++++++++++++ tools/perf/util/llvm-c-helpers.h | 49 ++++++++++++++ tools/perf/util/srcline.c | 59 +++++++++++++++- 8 files changed, 263 insertions(+), 1 deletion(-) create mode 100644 tools/perf/util/llvm-c-helpers.cpp create mode 100644 tools/perf/util/llvm-c-helpers.h (limited to 'tools') diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature index e1900abd44f6..0717e96d6a0e 100644 --- a/tools/build/Makefile.feature +++ b/tools/build/Makefile.feature @@ -136,6 +136,7 @@ FEATURE_DISPLAY ?= \ libunwind \ libdw-dwarf-unwind \ libcapstone \ + llvm \ zlib \ lzma \ get_cpuid \ diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 9998cd7a8792..7888c932b1b4 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -980,6 +980,23 @@ ifdef BUILD_NONDISTRO endif endif +ifndef NO_LIBLLVM + $(call feature_check,llvm) + ifeq ($(feature-llvm), 1) + CFLAGS += -DHAVE_LIBLLVM_SUPPORT + CFLAGS += $(shell $(LLVM_CONFIG) --cflags) + CXXFLAGS += -DHAVE_LIBLLVM_SUPPORT + CXXFLAGS += $(shell $(LLVM_CONFIG) --cxxflags) + LIBLLVM = $(shell $(LLVM_CONFIG) --libs all) $(shell $(LLVM_CONFIG) --system-libs) + EXTLIBS += -L$(shell $(LLVM_CONFIG) --libdir) $(LIBLLVM) + EXTLIBS += -lstdc++ + $(call detected,CONFIG_LIBLLVM) + else + $(warning No libllvm found, slower source file resolution, please install llvm-devel/llvm-dev) + NO_LIBLLVM := 1 + endif +endif + ifndef NO_DEMANGLE $(call feature_check,cxa-demangle) ifeq ($(feature-cxa-demangle), 1) diff --git a/tools/perf/builtin-version.c b/tools/perf/builtin-version.c index 398aa53e9e2e..4b252196de12 100644 --- a/tools/perf/builtin-version.c +++ b/tools/perf/builtin-version.c @@ -65,6 +65,7 @@ static void library_status(void) STATUS(HAVE_LIBBFD_SUPPORT, libbfd); STATUS(HAVE_DEBUGINFOD_SUPPORT, debuginfod); STATUS(HAVE_LIBELF_SUPPORT, libelf); + STATUS(HAVE_LIBLLVM_SUPPORT, libllvm); STATUS(HAVE_LIBNUMA_SUPPORT, libnuma); STATUS(HAVE_LIBNUMA_SUPPORT, numa_num_possible_cpus); STATUS(HAVE_LIBPERL_SUPPORT, libperl); diff --git a/tools/perf/tests/make b/tools/perf/tests/make index 8a9edf758f10..a5040772043f 100644 --- a/tools/perf/tests/make +++ b/tools/perf/tests/make @@ -93,6 +93,7 @@ make_no_libbpf := NO_LIBBPF=1 make_libbpf_dynamic := LIBBPF_DYNAMIC=1 make_no_libbpf_DEBUG := NO_LIBBPF=1 DEBUG=1 make_no_libcrypto := NO_LIBCRYPTO=1 +make_no_libllvm := NO_LIBLLVM=1 make_with_babeltrace:= LIBBABELTRACE=1 make_with_coresight := CORESIGHT=1 make_no_sdt := NO_SDT=1 @@ -163,6 +164,7 @@ run += make_no_auxtrace run += 
make_no_libbpf run += make_no_libbpf_DEBUG run += make_no_libcrypto +run += make_no_libllvm run += make_no_sdt run += make_no_syscall_tbl run += make_with_babeltrace diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 260cec2f6c0b..dc616292b2dd 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -229,6 +229,7 @@ perf-util-$(CONFIG_CXX_DEMANGLE) += demangle-cxx.o perf-util-y += demangle-ocaml.o perf-util-y += demangle-java.o perf-util-y += demangle-rust.o +perf-util-$(CONFIG_LIBLLVM) += llvm-c-helpers.o ifdef CONFIG_JITDUMP perf-util-$(CONFIG_LIBELF) += jitdump.o diff --git a/tools/perf/util/llvm-c-helpers.cpp b/tools/perf/util/llvm-c-helpers.cpp new file mode 100644 index 000000000000..3cc967ec6f28 --- /dev/null +++ b/tools/perf/util/llvm-c-helpers.cpp @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Must come before the linux/compiler.h include, which defines several + * macros (e.g. noinline) that conflict with compiler builtins used + * by LLVM. + */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" /* Needed for LLVM <= 15 */ +#include +#pragma GCC diagnostic pop + +#include +#include +#include +extern "C" { +#include +} +#include "symbol_conf.h" +#include "llvm-c-helpers.h" + +using namespace llvm; +using llvm::symbolize::LLVMSymbolizer; + +/* + * Allocate a static LLVMSymbolizer, which will live to the end of the program. + * Unlike the bfd paths, LLVMSymbolizer has its own cache, so we do not need + * to store anything in the dso struct. + */ +static LLVMSymbolizer *get_symbolizer() +{ + static LLVMSymbolizer *instance = nullptr; + if (instance == nullptr) { + LLVMSymbolizer::Options opts; + /* + * LLVM sometimes demangles slightly different from the rest + * of the code, and this mismatch can cause new_inline_sym() + * to get confused and mark non-inline symbol as inlined + * (since the name does not properly match up with base_sym). + * Thus, disable the demangling and let the rest of the code + * handle it. + */ + opts.Demangle = false; + instance = new LLVMSymbolizer(opts); + } + return instance; +} + +/* Returns 0 on error, 1 on success. */ +static int extract_file_and_line(const DILineInfo &line_info, char **file, + unsigned int *line) +{ + if (file) { + if (line_info.FileName == "") { + /* Match the convention of libbfd. */ + *file = nullptr; + } else { + /* The caller expects to get something it can free(). 
*/ + *file = strdup(line_info.FileName.c_str()); + if (*file == nullptr) + return 0; + } + } + if (line) + *line = line_info.Line; + return 1; +} + +extern "C" +int llvm_addr2line(const char *dso_name, u64 addr, + char **file, unsigned int *line, + bool unwind_inlines, + llvm_a2l_frame **inline_frames) +{ + LLVMSymbolizer *symbolizer = get_symbolizer(); + object::SectionedAddress sectioned_addr = { + addr, + object::SectionedAddress::UndefSection + }; + + if (unwind_inlines) { + Expected res_or_err = + symbolizer->symbolizeInlinedCode(dso_name, + sectioned_addr); + if (!res_or_err) + return 0; + unsigned num_frames = res_or_err->getNumberOfFrames(); + if (num_frames == 0) + return 0; + + if (extract_file_and_line(res_or_err->getFrame(0), + file, line) == 0) + return 0; + + *inline_frames = (llvm_a2l_frame *)calloc( + num_frames, sizeof(**inline_frames)); + if (*inline_frames == nullptr) + return 0; + + for (unsigned i = 0; i < num_frames; ++i) { + const DILineInfo &src = res_or_err->getFrame(i); + + llvm_a2l_frame &dst = (*inline_frames)[i]; + if (src.FileName == "") + /* Match the convention of libbfd. */ + dst.filename = nullptr; + else + dst.filename = strdup(src.FileName.c_str()); + dst.funcname = strdup(src.FunctionName.c_str()); + dst.line = src.Line; + + if (dst.filename == nullptr || + dst.funcname == nullptr) { + for (unsigned j = 0; j <= i; ++j) { + zfree(&(*inline_frames)[j].filename); + zfree(&(*inline_frames)[j].funcname); + } + zfree(inline_frames); + return 0; + } + } + + return num_frames; + } else { + if (inline_frames) + *inline_frames = nullptr; + + Expected res_or_err = + symbolizer->symbolizeCode(dso_name, sectioned_addr); + if (!res_or_err) + return 0; + return extract_file_and_line(*res_or_err, file, line); + } +} diff --git a/tools/perf/util/llvm-c-helpers.h b/tools/perf/util/llvm-c-helpers.h new file mode 100644 index 000000000000..19332dd98e14 --- /dev/null +++ b/tools/perf/util/llvm-c-helpers.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PERF_LLVM_C_HELPERS +#define __PERF_LLVM_C_HELPERS 1 + +/* + * Helpers to call into LLVM C++ code from C, for the parts that do not have + * C APIs. + */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct llvm_a2l_frame { + char* filename; + char* funcname; + unsigned int line; +}; + +/* + * Implement addr2line() using libLLVM. LLVM is a C++ API, and + * many of the linux/ headers cannot be included in a C++ compile unit, + * so we need to make a little bridge code here. llvm_addr2line() will + * convert the inline frame information from LLVM's internal structures + * and put them into a flat array given in inline_frames. The caller + * is then responsible for taking that array and convert it into perf's + * regular inline frame structures (which depend on e.g. struct list_head). + * + * If the address could not be resolved, or an error occurred (e.g. OOM), + * returns 0. Otherwise, returns the number of inline frames (which means 1 + * if the address was not part of an inlined function). If unwind_inlines + * is set and the return code is nonzero, inline_frames will be set to + * a newly allocated array with that length. The caller is then responsible + * for freeing both the strings and the array itself. 
+ */ +int llvm_addr2line(const char* dso_name, + u64 addr, + char** file, + unsigned int* line, + bool unwind_inlines, + struct llvm_a2l_frame** inline_frames); + +#ifdef __cplusplus +} +#endif + +#endif /* __PERF_LLVM_C_HELPERS */ diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c index 760742fd4a7d..f32d0d4f4bc9 100644 --- a/tools/perf/util/srcline.c +++ b/tools/perf/util/srcline.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -16,6 +17,9 @@ #include "util/debug.h" #include "util/callchain.h" #include "util/symbol_conf.h" +#ifdef HAVE_LIBLLVM_SUPPORT +#include "util/llvm-c-helpers.h" +#endif #include "srcline.h" #include "string2.h" #include "symbol.h" @@ -130,7 +134,60 @@ static struct symbol *new_inline_sym(struct dso *dso, #define MAX_INLINE_NEST 1024 -#ifdef HAVE_LIBBFD_SUPPORT +#ifdef HAVE_LIBLLVM_SUPPORT + +static void free_llvm_inline_frames(struct llvm_a2l_frame *inline_frames, + int num_frames) +{ + if (inline_frames != NULL) { + for (int i = 0; i < num_frames; ++i) { + zfree(&inline_frames[i].filename); + zfree(&inline_frames[i].funcname); + } + zfree(&inline_frames); + } +} + +static int addr2line(const char *dso_name, u64 addr, + char **file, unsigned int *line, struct dso *dso, + bool unwind_inlines, struct inline_node *node, + struct symbol *sym) +{ + struct llvm_a2l_frame *inline_frames = NULL; + int num_frames = llvm_addr2line(dso_name, addr, file, line, + node && unwind_inlines, &inline_frames); + + if (num_frames == 0 || !inline_frames) { + /* Error, or we didn't want inlines. */ + return num_frames; + } + + for (int i = 0; i < num_frames; ++i) { + struct symbol *inline_sym = + new_inline_sym(dso, sym, inline_frames[i].funcname); + char *srcline = NULL; + + if (inline_frames[i].filename) { + srcline = + srcline_from_fileline(inline_frames[i].filename, + inline_frames[i].line); + } + if (inline_list__append(inline_sym, srcline, node) != 0) { + free_llvm_inline_frames(inline_frames, num_frames); + return 0; + } + } + free_llvm_inline_frames(inline_frames, num_frames); + + return num_frames; +} + +void dso__free_a2l(struct dso *dso __maybe_unused) +{ + /* Nothing to free. */ +} + +#elif defined(HAVE_LIBBFD_SUPPORT) /* * Implement addr2line using libbfd. -- cgit v1.2.3 From 6eca7c5ac23effd552d6f03acc5ce0efc1ed1c1e Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Sat, 3 Aug 2024 17:20:07 +0200 Subject: perf annotate: Split out read_symbol() The Capstone disassembler code has a useful code snippet to read the bytes for a given code symbol into memory. Split it out into its own function, so that the LLVM disassembler can use it in the next patch. Signed-off-by: Steinar H. 
Gunderson Cc: Ian Rogers Link: https://lore.kernel.org/r/20240803152008.2818485-2-sesse@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/disasm.c | 90 ++++++++++++++++++++++++++++++------------------ 1 file changed, 56 insertions(+), 34 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index d11e75133b23..5e9be3487c6c 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -1372,6 +1372,53 @@ static int find_file_offset(u64 start, u64 len, u64 pgoff, void *arg) return 0; } +static u8 * +read_symbol(const char *filename, struct map *map, struct symbol *sym, + u64 *len, bool *is_64bit) +{ + struct dso *dso = map__dso(map); + struct nscookie nsc; + u64 start = map__rip_2objdump(map, sym->start); + u64 end = map__rip_2objdump(map, sym->end); + int fd, count; + u8 *buf = NULL; + struct find_file_offset_data data = { + .ip = start, + }; + + *is_64bit = false; + + nsinfo__mountns_enter(dso__nsinfo(dso), &nsc); + fd = open(filename, O_RDONLY); + nsinfo__mountns_exit(&nsc); + if (fd < 0) + return NULL; + + if (file__read_maps(fd, /*exe=*/true, find_file_offset, &data, + is_64bit) == 0) + goto err; + + *len = end - start; + buf = malloc(*len); + if (buf == NULL) + goto err; + + count = pread(fd, buf, *len, data.offset); + close(fd); + fd = -1; + + if ((u64)count != *len) + goto err; + + return buf; + +err: + if (fd >= 0) + close(fd); + free(buf); + return NULL; +} + static void print_capstone_detail(cs_insn *insn, char *buf, size_t len, struct annotate_args *args, u64 addr) { @@ -1572,19 +1619,13 @@ static int symbol__disassemble_capstone(char *filename, struct symbol *sym, { struct annotation *notes = symbol__annotation(sym); struct map *map = args->ms.map; - struct dso *dso = map__dso(map); - struct nscookie nsc; u64 start = map__rip_2objdump(map, sym->start); - u64 end = map__rip_2objdump(map, sym->end); - u64 len = end - start; + u64 len; u64 offset; - int i, fd, count; + int i, count; bool is_64bit = false; bool needs_cs_close = false; u8 *buf = NULL; - struct find_file_offset_data data = { - .ip = start, - }; csh handle; cs_insn *insn; char disasm_buf[512]; @@ -1593,31 +1634,9 @@ static int symbol__disassemble_capstone(char *filename, struct symbol *sym, if (args->options->objdump_path) return -1; - nsinfo__mountns_enter(dso__nsinfo(dso), &nsc); - fd = open(filename, O_RDONLY); - nsinfo__mountns_exit(&nsc); - if (fd < 0) - return -1; - - if (file__read_maps(fd, /*exe=*/true, find_file_offset, &data, - &is_64bit) == 0) - goto err; - - if (open_capstone_handle(args, is_64bit, &handle) < 0) - goto err; - - needs_cs_close = true; - - buf = malloc(len); + buf = read_symbol(filename, map, sym, &len, &is_64bit); if (buf == NULL) - goto err; - - count = pread(fd, buf, len, data.offset); - close(fd); - fd = -1; - - if ((u64)count != len) - goto err; + return -1; /* add the function address and name */ scnprintf(disasm_buf, sizeof(disasm_buf), "%#"PRIx64" <%s>:", @@ -1635,6 +1654,11 @@ static int symbol__disassemble_capstone(char *filename, struct symbol *sym, annotation_line__add(&dl->al, &notes->src->source); + if (open_capstone_handle(args, is_64bit, &handle) < 0) + goto err; + + needs_cs_close = true; + count = cs_disasm(handle, buf, len, start, len, &insn); for (i = 0, offset = 0; i < count; i++) { int printed; @@ -1679,8 +1703,6 @@ out: return count < 0 ? 
count : 0; err: - if (fd >= 0) - close(fd); if (needs_cs_close) { struct disasm_line *tmp; -- cgit v1.2.3 From 04885681788855bb266fd131f06c04952a717659 Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Sat, 3 Aug 2024 17:20:08 +0200 Subject: perf annotate: LLVM-based disassembler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Support using LLVM as a disassembler method, allowing helperless annotation in non-distro builds. (It is also much faster than using libbfd or bfd objdump on binaries with a lot of debug information.) This is nearly identical to the output of llvm-objdump; there are some very rare whitespace differences, some minor changes to demangling (since we use perf's regular demangling and not LLVM's own) and the occasional case where llvm-objdump makes a different choice when multiple symbols share the same address. It should work across all of LLVM's supported architectures, although I've only tested 64-bit x86, and finding the right triple from perf's idea of machine architecture can sometimes be a bit tricky. Ideally, we should have some way of finding the triplet just from the file itself. Committer notes: Address this on 32-bit systems by using PRIu64 from inttypes.h 3 17.58 almalinux:9-i386 : FAIL gcc version 11.4.1 20231218 (Red Hat 11.4.1-3) (GCC) util/llvm-c-helpers.cpp: In function ‘char* make_symbol_relative_string(dso*, const char*, u64, u64)’: util/llvm-c-helpers.cpp:150:52: error: format ‘%lx’ expects argument of type ‘long unsigned int’, but argument 5 has type ‘u64’ {aka +‘long long unsigned int’} [-Werror=format=] 150 | snprintf(buf, sizeof(buf), "%s+0x%lx", | ~~^ | | | long unsigned int | %llx 151 | demangled ? demangled : sym_name, addr - base_addr); | ~~~~~~~~~~~~~~~~ | | | u64 {aka long long unsigned int} cc1plus: all warnings being treated as errors Signed-off-by: Steinar H. 
Gunderson Cc: Ian Rogers Link: https://lore.kernel.org/r/20240803152008.2818485-3-sesse@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/disasm.c | 193 +++++++++++++++++++++++++++++++++++++ tools/perf/util/llvm-c-helpers.cpp | 63 ++++++++++++ tools/perf/util/llvm-c-helpers.h | 11 +++ 3 files changed, 267 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index 5e9be3487c6c..f05ba7739c1e 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -48,6 +48,7 @@ static int call__scnprintf(struct ins *ins, char *bf, size_t size, static void ins__sort(struct arch *arch); static int disasm_line__parse(char *line, const char **namep, char **rawp); static int disasm_line__parse_powerpc(struct disasm_line *dl); +static char *expand_tabs(char *line, char **storage, size_t *storage_len); static __attribute__((constructor)) void symbol__init_regexpr(void) { @@ -1354,7 +1355,9 @@ static int open_capstone_handle(struct annotate_args *args, bool is_64bit, return 0; } +#endif +#if defined(HAVE_LIBCAPSTONE_SUPPORT) || defined(HAVE_LIBLLVM_SUPPORT) struct find_file_offset_data { u64 ip; u64 offset; @@ -1418,7 +1421,9 @@ err: free(buf); return NULL; } +#endif +#ifdef HAVE_LIBCAPSTONE_SUPPORT static void print_capstone_detail(cs_insn *insn, char *buf, size_t len, struct annotate_args *args, u64 addr) { @@ -1805,6 +1810,189 @@ err: count = -1; goto out; } + +#ifdef HAVE_LIBLLVM_SUPPORT +#include +#include +#include "util/llvm-c-helpers.h" + +struct symbol_lookup_storage { + u64 branch_addr; + u64 pcrel_load_addr; +}; + +/* + * Whenever LLVM wants to resolve an address into a symbol, it calls this + * callback. We don't ever actually _return_ anything (in particular, because + * it puts quotation marks around what we return), but we use this as a hint + * that there is a branch or PC-relative address in the expression that we + * should add some textual annotation for after the instruction. The caller + * will use this information to add the actual annotation. 
+ */ +static const char * +symbol_lookup_callback(void *disinfo, uint64_t value, + uint64_t *ref_type, + uint64_t address __maybe_unused, + const char **ref __maybe_unused) +{ + struct symbol_lookup_storage *storage = disinfo; + + if (*ref_type == LLVMDisassembler_ReferenceType_In_Branch) + storage->branch_addr = value; + else if (*ref_type == LLVMDisassembler_ReferenceType_In_PCrel_Load) + storage->pcrel_load_addr = value; + *ref_type = LLVMDisassembler_ReferenceType_InOut_None; + return NULL; +} + +static int symbol__disassemble_llvm(char *filename, struct symbol *sym, + struct annotate_args *args) +{ + struct annotation *notes = symbol__annotation(sym); + struct map *map = args->ms.map; + struct dso *dso = map__dso(map); + u64 start = map__rip_2objdump(map, sym->start); + u8 *buf; + u64 len; + u64 pc; + bool is_64bit; + char triplet[64]; + char disasm_buf[2048]; + size_t disasm_len; + struct disasm_line *dl; + LLVMDisasmContextRef disasm = NULL; + struct symbol_lookup_storage storage; + char *line_storage = NULL; + size_t line_storage_len = 0; + int ret = -1; + + if (args->options->objdump_path) + return -1; + + LLVMInitializeAllTargetInfos(); + LLVMInitializeAllTargetMCs(); + LLVMInitializeAllDisassemblers(); + + buf = read_symbol(filename, map, sym, &len, &is_64bit); + if (buf == NULL) + return -1; + + if (arch__is(args->arch, "x86")) { + if (is_64bit) + scnprintf(triplet, sizeof(triplet), "x86_64-pc-linux"); + else + scnprintf(triplet, sizeof(triplet), "i686-pc-linux"); + } else { + scnprintf(triplet, sizeof(triplet), "%s-linux-gnu", + args->arch->name); + } + + disasm = LLVMCreateDisasm(triplet, &storage, 0, NULL, + symbol_lookup_callback); + if (disasm == NULL) + goto err; + + if (args->options->disassembler_style && + !strcmp(args->options->disassembler_style, "intel")) + LLVMSetDisasmOptions(disasm, + LLVMDisassembler_Option_AsmPrinterVariant); + + /* + * This needs to be set after AsmPrinterVariant, due to a bug in LLVM; + * setting AsmPrinterVariant makes a new instruction printer, making it + * forget about the PrintImmHex flag (which is applied before if both + * are given to the same call). 
+ */ + LLVMSetDisasmOptions(disasm, LLVMDisassembler_Option_PrintImmHex); + + /* add the function address and name */ + scnprintf(disasm_buf, sizeof(disasm_buf), "%#"PRIx64" <%s>:", + start, sym->name); + + args->offset = -1; + args->line = disasm_buf; + args->line_nr = 0; + args->fileloc = NULL; + args->ms.sym = sym; + + dl = disasm_line__new(args); + if (dl == NULL) + goto err; + + annotation_line__add(&dl->al, &notes->src->source); + + pc = start; + for (u64 offset = 0; offset < len; ) { + unsigned int ins_len; + + storage.branch_addr = 0; + storage.pcrel_load_addr = 0; + + ins_len = LLVMDisasmInstruction(disasm, buf + offset, + len - offset, pc, + disasm_buf, sizeof(disasm_buf)); + if (ins_len == 0) + goto err; + disasm_len = strlen(disasm_buf); + + if (storage.branch_addr != 0) { + char *name = llvm_name_for_code(dso, filename, + storage.branch_addr); + if (name != NULL) { + disasm_len += scnprintf(disasm_buf + disasm_len, + sizeof(disasm_buf) - + disasm_len, + " <%s>", name); + free(name); + } + } + if (storage.pcrel_load_addr != 0) { + char *name = llvm_name_for_data(dso, filename, + storage.pcrel_load_addr); + disasm_len += scnprintf(disasm_buf + disasm_len, + sizeof(disasm_buf) - disasm_len, + " # %#"PRIx64, + storage.pcrel_load_addr); + if (name) { + disasm_len += scnprintf(disasm_buf + disasm_len, + sizeof(disasm_buf) - + disasm_len, + " <%s>", name); + free(name); + } + } + + args->offset = offset; + args->line = expand_tabs(disasm_buf, &line_storage, + &line_storage_len); + args->line_nr = 0; + args->fileloc = NULL; + args->ms.sym = sym; + + llvm_addr2line(filename, pc, &args->fileloc, + (unsigned int *)&args->line_nr, false, NULL); + + dl = disasm_line__new(args); + if (dl == NULL) + goto err; + + annotation_line__add(&dl->al, &notes->src->source); + + free(args->fileloc); + pc += ins_len; + offset += ins_len; + } + + ret = 0; + +err: + LLVMDisasmDispose(disasm); + free(buf); + free(line_storage); + return ret; +} +#endif + /* * Possibly create a new version of line with tabs expanded. Returns the * existing or new line, storage is updated if a new line is allocated. 
If @@ -1951,6 +2139,11 @@ int symbol__disassemble(struct symbol *sym, struct annotate_args *args) } } +#ifdef HAVE_LIBLLVM_SUPPORT + err = symbol__disassemble_llvm(symfs_filename, sym, args); + if (err == 0) + goto out_remove_tmp; +#endif #ifdef HAVE_LIBCAPSTONE_SUPPORT err = symbol__disassemble_capstone(symfs_filename, sym, args); if (err == 0) diff --git a/tools/perf/util/llvm-c-helpers.cpp b/tools/perf/util/llvm-c-helpers.cpp index 3cc967ec6f28..663bcaba2041 100644 --- a/tools/perf/util/llvm-c-helpers.cpp +++ b/tools/perf/util/llvm-c-helpers.cpp @@ -8,8 +8,10 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-parameter" /* Needed for LLVM <= 15 */ #include +#include #pragma GCC diagnostic pop +#include #include #include #include @@ -19,6 +21,9 @@ extern "C" { #include "symbol_conf.h" #include "llvm-c-helpers.h" +extern "C" +char *dso__demangle_sym(struct dso *dso, int kmodule, const char *elf_name); + using namespace llvm; using llvm::symbolize::LLVMSymbolizer; @@ -132,3 +137,61 @@ int llvm_addr2line(const char *dso_name, u64 addr, return extract_file_and_line(*res_or_err, file, line); } } + +static char * +make_symbol_relative_string(struct dso *dso, const char *sym_name, + u64 addr, u64 base_addr) +{ + if (!strcmp(sym_name, "")) + return NULL; + + char *demangled = dso__demangle_sym(dso, 0, sym_name); + if (base_addr && base_addr != addr) { + char buf[256]; + snprintf(buf, sizeof(buf), "%s+0x%" PRIx64, + demangled ? demangled : sym_name, addr - base_addr); + free(demangled); + return strdup(buf); + } else { + if (demangled) + return demangled; + else + return strdup(sym_name); + } +} + +extern "C" +char *llvm_name_for_code(struct dso *dso, const char *dso_name, u64 addr) +{ + LLVMSymbolizer *symbolizer = get_symbolizer(); + object::SectionedAddress sectioned_addr = { + addr, + object::SectionedAddress::UndefSection + }; + Expected res_or_err = + symbolizer->symbolizeCode(dso_name, sectioned_addr); + if (!res_or_err) { + return NULL; + } + return make_symbol_relative_string( + dso, res_or_err->FunctionName.c_str(), + addr, res_or_err->StartAddress ? *res_or_err->StartAddress : 0); +} + +extern "C" +char *llvm_name_for_data(struct dso *dso, const char *dso_name, u64 addr) +{ + LLVMSymbolizer *symbolizer = get_symbolizer(); + object::SectionedAddress sectioned_addr = { + addr, + object::SectionedAddress::UndefSection + }; + Expected res_or_err = + symbolizer->symbolizeData(dso_name, sectioned_addr); + if (!res_or_err) { + return NULL; + } + return make_symbol_relative_string( + dso, res_or_err->Name.c_str(), + addr, res_or_err->Start); +} diff --git a/tools/perf/util/llvm-c-helpers.h b/tools/perf/util/llvm-c-helpers.h index 19332dd98e14..d2b99637a28a 100644 --- a/tools/perf/util/llvm-c-helpers.h +++ b/tools/perf/util/llvm-c-helpers.h @@ -13,6 +13,8 @@ extern "C" { #endif +struct dso; + struct llvm_a2l_frame { char* filename; char* funcname; @@ -42,6 +44,15 @@ int llvm_addr2line(const char* dso_name, bool unwind_inlines, struct llvm_a2l_frame** inline_frames); +/* + * Simple symbolizers for addresses; will convert something like + * 0x12345 to "func+0x123". Will return NULL if no symbol was found. + * + * The returned value must be freed by the caller, with free(). 
+ */ +char *llvm_name_for_code(struct dso *dso, const char *dso_name, u64 addr); +char *llvm_name_for_data(struct dso *dso, const char *dso_name, u64 addr); + #ifdef __cplusplus } #endif -- cgit v1.2.3 From 6c99903e084c68887f7bc52a6819117f26820667 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 31 Aug 2024 00:04:10 -0700 Subject: perf pmus: Fix name comparisons on 32-bit systems The hex PMU suffix may be 64-bit but the comparisons were "unsigned long" or 32-bit on 32-bit systems. This was causing the "PMU name comparison" test to fail in a 32-bit build. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Chaitanya S Prakash Cc: Colin Ian King Cc: David Ahern Cc: Dominique Martinet Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Junhao He Cc: Kan Liang Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Yang Jihong Link: https://lore.kernel.org/r/20240831070415.506194-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pmus.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/pmus.c b/tools/perf/util/pmus.c index 3fcabfd8fca1..769b920d9250 100644 --- a/tools/perf/util/pmus.c +++ b/tools/perf/util/pmus.c @@ -69,7 +69,7 @@ size_t pmu_name_len_no_suffix(const char *str) int pmu_name_cmp(const char *lhs_pmu_name, const char *rhs_pmu_name) { - unsigned long lhs_num = 0, rhs_num = 0; + unsigned long long lhs_num = 0, rhs_num = 0; size_t lhs_pmu_name_len = pmu_name_len_no_suffix(lhs_pmu_name); size_t rhs_pmu_name_len = pmu_name_len_no_suffix(rhs_pmu_name); int ret = strncmp(lhs_pmu_name, rhs_pmu_name, @@ -79,9 +79,9 @@ int pmu_name_cmp(const char *lhs_pmu_name, const char *rhs_pmu_name) return ret; if (lhs_pmu_name_len + 1 < strlen(lhs_pmu_name)) - lhs_num = strtoul(&lhs_pmu_name[lhs_pmu_name_len + 1], NULL, 16); + lhs_num = strtoull(&lhs_pmu_name[lhs_pmu_name_len + 1], NULL, 16); if (rhs_pmu_name_len + 1 < strlen(rhs_pmu_name)) - rhs_num = strtoul(&rhs_pmu_name[rhs_pmu_name_len + 1], NULL, 16); + rhs_num = strtoull(&rhs_pmu_name[rhs_pmu_name_len + 1], NULL, 16); return lhs_num < rhs_num ? -1 : (lhs_num > rhs_num ? 1 : 0); } -- cgit v1.2.3 From 38e2648a81204c9fc5b4c87a8ffce93a6ed91b65 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 31 Aug 2024 00:04:11 -0700 Subject: perf time-utils: Fix 32-bit nsec parsing The "time utils" test fails in 32-bit builds: ... parse_nsec_time("18446744073.709551615") Failed. ptime 4294967295709551615 expected 18446744073709551615 ... Switch strtoul to strtoull as an unsigned long in a 32-bit build isn't 64 bits. 
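Both this fix and the pmus one above share the same root cause: strtoul() returns unsigned long, which is 32 bits on ILP32 targets, so out-of-range input saturates at ULONG_MAX (4294967295, the prefix visible in the bad ptime above). A standalone demonstration (sketch, not part of the patch; build with e.g. gcc -m32 on x86):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *s = "18446744073709551615";	/* 2^64 - 1 */

	/* On a 32-bit build this prints 4294967295 (ULONG_MAX)... */
	printf("strtoul : %lu\n", strtoul(s, NULL, 10));
	/* ...while strtoull keeps the full 64-bit value. */
	printf("strtoull: %llu\n", strtoull(s, NULL, 10));
	return 0;
}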
Fixes: c284d669a20d408b ("perf tools: Move parse_nsec_time to time-utils.c") Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Chaitanya S Prakash Cc: Colin Ian King Cc: David Ahern Cc: Dominique Martinet Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Junhao He Cc: Kan Liang Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Yang Jihong Link: https://lore.kernel.org/r/20240831070415.506194-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/time-utils.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/time-utils.c b/tools/perf/util/time-utils.c index 302443921681..1b91ccd4d523 100644 --- a/tools/perf/util/time-utils.c +++ b/tools/perf/util/time-utils.c @@ -20,7 +20,7 @@ int parse_nsec_time(const char *str, u64 *ptime) u64 time_sec, time_nsec; char *end; - time_sec = strtoul(str, &end, 10); + time_sec = strtoull(str, &end, 10); if (*end != '.' && *end != '\0') return -1; @@ -38,7 +38,7 @@ int parse_nsec_time(const char *str, u64 *ptime) for (i = strlen(nsec_buf); i < 9; i++) nsec_buf[i] = '0'; - time_nsec = strtoul(nsec_buf, &end, 10); + time_nsec = strtoull(nsec_buf, &end, 10); if (*end != '\0') return -1; } else -- cgit v1.2.3 From 91235380e5c78dc02f18d6afa97dff82ab3a6887 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 31 Aug 2024 00:04:12 -0700 Subject: perf test: Skip uprobe test if probe command isn't present The probe command is dependent on libelf. Skip the test if the required probe command isn't present. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Chaitanya S Prakash Cc: Colin Ian King Cc: David Ahern Cc: Dominique Martinet Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Junhao He Cc: Kan Liang Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Yang Jihong Link: https://lore.kernel.org/r/20240831070415.506194-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/test_uprobe_from_different_cu.sh | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'tools') diff --git a/tools/perf/tests/shell/test_uprobe_from_different_cu.sh b/tools/perf/tests/shell/test_uprobe_from_different_cu.sh index 82bc774a078a..33387c329f92 100755 --- a/tools/perf/tests/shell/test_uprobe_from_different_cu.sh +++ b/tools/perf/tests/shell/test_uprobe_from_different_cu.sh @@ -4,6 +4,13 @@ set -e +# Skip if there's no probe command. +if ! perf | grep probe +then + echo "Skip: probe command isn't present" + exit 2 +fi + # skip if there's no gcc if ! [ -x "$(command -v gcc)" ]; then echo "failed: no gcc compiler" -- cgit v1.2.3 From 18f41f1ba5404cb4ca17590ba28258f8358f8695 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 31 Aug 2024 00:04:15 -0700 Subject: perf test: Make watchpoint data 32-bits on i386 i386 only supports watchpoints up to size 4; 8 bytes causes extra counts and test failures. 
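The limit is imposed by the hardware: x86 debug registers encode watchpoint lengths of 1, 2 and 4 bytes, with the 8-byte encoding only usable in 64-bit mode. For reference, a sketch of the widest write-watchpoint i386 accepts, set up through the perf_event_open() breakpoint interface (error handling omitted):

#include <linux/hw_breakpoint.h>
#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static volatile unsigned int data1;	/* u32, as the test now uses on i386 */

static int watch_data1(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_BREAKPOINT;
	attr.bp_type = HW_BREAKPOINT_W;
	attr.bp_addr = (unsigned long)&data1;
	attr.bp_len = HW_BREAKPOINT_LEN_4;	/* LEN_8 fails on i386 */
	/* args: attr, pid (0 = self), cpu (-1 = any), group_fd, flags */
	return syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
}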
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Chaitanya S Prakash Cc: Colin Ian King Cc: David Ahern Cc: Dominique Martinet Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Junhao He Cc: Kan Liang Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Yang Jihong Link: https://lore.kernel.org/r/20240831070415.506194-7-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/wp.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'tools') diff --git a/tools/perf/tests/wp.c b/tools/perf/tests/wp.c index cc8719609b19..6c178985e37f 100644 --- a/tools/perf/tests/wp.c +++ b/tools/perf/tests/wp.c @@ -20,7 +20,12 @@ do { \ TEST_ASSERT_VAL(text, count == val); \ } while (0) +#ifdef __i386__ +/* Only breakpoint length less-than 8 has hardware support on i386. */ +volatile u32 data1; +#else volatile u64 data1; +#endif volatile u8 data2[3]; #ifndef __s390x__ -- cgit v1.2.3 From 76d3685400944e63c2c6e791a72d5329c2843770 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 2 Sep 2024 13:05:11 -0700 Subject: perf stat: Constify control data for BPF The control knobs set before loading BPF programs should be declared as 'const volatile' so that it can be optimized by the BPF core. Committer testing: root@x1:~# perf stat --bpf-counters -e cpu_core/cycles/,cpu_core/instructions/ sleep 1 Performance counter stats for 'sleep 1': 2,442,583 cpu_core/cycles/ 2,494,425 cpu_core/instructions/ 1.002687372 seconds time elapsed 0.001126000 seconds user 0.001166000 seconds sys root@x1:~# perf trace -e bpf --max-events 10 perf stat --bpf-counters -e cpu_core/cycles/,cpu_core/instructions/ sleep 1 0.000 ( 0.019 ms): perf/2944119 bpf(cmd: OBJ_GET, uattr: 0x7fffdf5cdd40, size: 20) = 5 0.021 ( 0.002 ms): perf/2944119 bpf(cmd: OBJ_GET_INFO_BY_FD, uattr: 0x7fffdf5cdcd0, size: 16) = 0 0.030 ( 0.005 ms): perf/2944119 bpf(cmd: MAP_LOOKUP_ELEM, uattr: 0x7fffdf5ceda0, size: 32) = 0 0.037 ( 0.004 ms): perf/2944119 bpf(cmd: LINK_GET_FD_BY_ID, uattr: 0x7fffdf5ced80, size: 12) = -1 ENOENT (No such file or directory) 0.189 ( 0.004 ms): perf/2944119 bpf(cmd: 36, uattr: 0x7fffdf5cec10, size: 8) = -1 EOPNOTSUPP (Operation not supported) 0.201 ( 0.095 ms): perf/2944119 bpf(cmd: PROG_LOAD, uattr: 0x7fffdf5ce940, size: 148) = 10 0.305 ( 0.026 ms): perf/2944119 bpf(cmd: PROG_LOAD, uattr: 0x7fffdf5cea00, size: 148) = 10 0.347 ( 0.012 ms): perf/2944119 bpf(cmd: BTF_LOAD, uattr: 0x7fffdf5ce8e0, size: 40) = 10 0.364 ( 0.004 ms): perf/2944119 bpf(cmd: BTF_LOAD, uattr: 0x7fffdf5ce950, size: 40) = 10 0.376 ( 0.006 ms): perf/2944119 bpf(cmd: BTF_LOAD, uattr: 0x7fffdf5ce730, size: 40) = 10 root@x1:~# Performance counter stats for 'sleep 1': 271,221 cpu_core/cycles/ 139,150 cpu_core/instructions/ 1.002881677 seconds time elapsed 0.001318000 seconds user 0.001314000 seconds sys root@x1:~# Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Link: https://lore.kernel.org/r/20240902200515.2103769-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf_counter_cgroup.c | 6 +++--- tools/perf/util/bpf_skel/bperf_cgroup.bpf.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf_counter_cgroup.c b/tools/perf/util/bpf_counter_cgroup.c index ea29c372f339..6ff42619de12 100644 --- a/tools/perf/util/bpf_counter_cgroup.c +++ 
b/tools/perf/util/bpf_counter_cgroup.c @@ -61,6 +61,9 @@ static int bperf_load_program(struct evlist *evlist) skel->rodata->num_cpus = total_cpus; skel->rodata->num_events = evlist->core.nr_entries / nr_cgroups; + if (cgroup_is_v2("perf_event") > 0) + skel->rodata->use_cgroup_v2 = 1; + BUG_ON(evlist->core.nr_entries % nr_cgroups != 0); /* we need one copy of events per cpu for reading */ @@ -82,9 +85,6 @@ static int bperf_load_program(struct evlist *evlist) goto out; } - if (cgroup_is_v2("perf_event") > 0) - skel->bss->use_cgroup_v2 = 1; - err = -1; cgrp_switch = evsel__new(&cgrp_switch_attr); diff --git a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c index 6a438e0102c5..57cab7647a9a 100644 --- a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c +++ b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c @@ -57,9 +57,9 @@ struct cgroup___old { const volatile __u32 num_events = 1; const volatile __u32 num_cpus = 1; +const volatile int use_cgroup_v2 = 0; int enabled = 0; -int use_cgroup_v2 = 0; int perf_subsys_id = -1; static inline __u64 get_cgroup_v1_ancestor_id(struct cgroup *cgrp, int level) -- cgit v1.2.3 From ac5a23b2f2868c54fdd2fc84c1990ebeb1e06188 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 2 Sep 2024 13:05:12 -0700 Subject: perf ftrace latency: Constify control data for BPF The control knobs set before loading BPF programs should be declared as 'const volatile' so that it can be optimized by the BPF core. Committer testing: root@x1:~# perf ftrace latency --use-bpf -T schedule ^C# DURATION | COUNT | GRAPH | 0 - 1 us | 0 | | 1 - 2 us | 0 | | 2 - 4 us | 0 | | 4 - 8 us | 0 | | 8 - 16 us | 1 | | 16 - 32 us | 5 | | 32 - 64 us | 2 | | 64 - 128 us | 6 | | 128 - 256 us | 7 | | 256 - 512 us | 5 | | 512 - 1024 us | 22 | # | 1 - 2 ms | 36 | ## | 2 - 4 ms | 68 | ##### | 4 - 8 ms | 22 | # | 8 - 16 ms | 91 | ####### | 16 - 32 ms | 11 | | 32 - 64 ms | 26 | ## | 64 - 128 ms | 213 | ################# | 128 - 256 ms | 19 | # | 256 - 512 ms | 14 | # | 512 - 1024 ms | 5 | | 1 - ... 
s | 8 | | root@x1:~# root@x1:~# perf trace -e bpf perf ftrace latency --use-bpf -T schedule 0.000 ( 0.015 ms): perf/2944525 bpf(cmd: 36, uattr: 0x7ffe80de7b40, size: 8) = -1 EOPNOTSUPP (Operation not supported) 0.025 ( 0.102 ms): perf/2944525 bpf(cmd: PROG_LOAD, uattr: 0x7ffe80de7870, size: 148) = 8 0.136 ( 0.026 ms): perf/2944525 bpf(cmd: PROG_LOAD, uattr: 0x7ffe80de7930, size: 148) = 8 0.174 ( 0.026 ms): perf/2944525 bpf(cmd: PROG_LOAD, uattr: 0x7ffe80de77e0, size: 148) = 8 0.205 ( 0.010 ms): perf/2944525 bpf(uattr: 0x7ffe80de7990, size: 80) = 8 0.227 ( 0.011 ms): perf/2944525 bpf(cmd: BTF_LOAD, uattr: 0x7ffe80de7810, size: 40) = 8 0.244 ( 0.004 ms): perf/2944525 bpf(cmd: BTF_LOAD, uattr: 0x7ffe80de7880, size: 40) = 8 0.257 ( 0.006 ms): perf/2944525 bpf(cmd: BTF_LOAD, uattr: 0x7ffe80de7660, size: 40) = 8 0.265 ( 0.058 ms): perf/2944525 bpf(cmd: PROG_LOAD, uattr: 0x7ffe80de7730, size: 148) = 9 0.330 ( 0.004 ms): perf/2944525 bpf(cmd: BTF_LOAD, uattr: 0x7ffe80de78e0, size: 40) = 8 0.337 ( 0.003 ms): perf/2944525 bpf(cmd: BTF_LOAD, uattr: 0x7ffe80de7890, size: 40) = 8 0.343 ( 0.004 ms): perf/2944525 bpf(cmd: BTF_LOAD, uattr: 0x7ffe80de7880, size: 40) = 8 0.349 ( 0.003 ms): perf/2944525 bpf(cmd: BTF_LOAD, uattr: 0x7ffe80de78b0, size: 40) = 8 0.355 ( 0.004 ms): perf/2944525 bpf(cmd: BTF_LOAD, uattr: 0x7ffe80de7890, size: 40) = 8 0.361 ( 0.003 ms): perf/2944525 bpf(cmd: BTF_LOAD, uattr: 0x7ffe80de78b0, size: 40) = 8 0.367 ( 0.003 ms): perf/2944525 bpf(cmd: BTF_LOAD, uattr: 0x7ffe80de7880, size: 40) = 8 0.373 ( 0.014 ms): perf/2944525 bpf(cmd: BTF_LOAD, uattr: 0x7ffe80de7a00, size: 40) = 8 0.390 ( 0.358 ms): perf/2944525 bpf(uattr: 0x7ffe80de7950, size: 80) = 9 0.763 ( 0.014 ms): perf/2944525 bpf(uattr: 0x7ffe80de7950, size: 80) = 9 0.783 ( 0.011 ms): perf/2944525 bpf(uattr: 0x7ffe80de7950, size: 80) = 9 0.798 ( 0.017 ms): perf/2944525 bpf(uattr: 0x7ffe80de7950, size: 80) = 9 0.819 ( 0.003 ms): perf/2944525 bpf(uattr: 0x7ffe80de7700, size: 80) = 9 0.824 ( 0.047 ms): perf/2944525 bpf(cmd: PROG_LOAD, uattr: 0x7ffe80de76c0, size: 148) = 10 0.878 ( 0.008 ms): perf/2944525 bpf(uattr: 0x7ffe80de7950, size: 80) = 9 0.891 ( 0.014 ms): perf/2944525 bpf(cmd: MAP_UPDATE_ELEM, uattr: 0x7ffe80de79e0, size: 32) = 0 0.910 ( 0.103 ms): perf/2944525 bpf(cmd: PROG_LOAD, uattr: 0x7ffe80de7880, size: 148) = 9 1.016 ( 0.143 ms): perf/2944525 bpf(cmd: PROG_LOAD, uattr: 0x7ffe80de7880, size: 148) = 10 3.777 ( 0.068 ms): perf/2944525 bpf(cmd: PROG_LOAD, uattr: 0x7ffe80de7570, size: 148) = 12 3.848 ( 0.003 ms): perf/2944525 bpf(cmd: LINK_CREATE, uattr: 0x7ffe80de7550, size: 64) = -1 EBADF (Bad file descriptor) 3.859 ( 0.006 ms): perf/2944525 bpf(cmd: LINK_CREATE, uattr: 0x7ffe80de77c0, size: 64) = 12 6.504 ( 0.010 ms): perf/2944525 bpf(cmd: LINK_CREATE, uattr: 0x7ffe80de77c0, size: 64) = 14 ^C# DURATION | COUNT | GRAPH | 0 - 1 us | 0 | | 1 - 2 us | 0 | | 2 - 4 us | 1 | | 4 - 8 us | 3 | | 8 - 16 us | 3 | | 16 - 32 us | 11 | | 32 - 64 us | 9 | | 64 - 128 us | 17 | | 128 - 256 us | 30 | # | 256 - 512 us | 20 | | 512 - 1024 us | 42 | # | 1 - 2 ms | 151 | ###### | 2 - 4 ms | 106 | #### | 4 - 8 ms | 18 | | 8 - 16 ms | 149 | ###### | 16 - 32 ms | 30 | # | 32 - 64 ms | 17 | | 64 - 128 ms | 360 | ############### | 128 - 256 ms | 52 | ## | 256 - 512 ms | 18 | | 512 - 1024 ms | 28 | # | 1 - ... 
s | 5 | | root@x1:~# Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Link: https://lore.kernel.org/r/20240902200515.2103769-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf_ftrace.c | 8 ++++---- tools/perf/util/bpf_skel/func_latency.bpf.c | 7 ++++--- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf_ftrace.c b/tools/perf/util/bpf_ftrace.c index 7a4297d8fd2c..06d1c4018407 100644 --- a/tools/perf/util/bpf_ftrace.c +++ b/tools/perf/util/bpf_ftrace.c @@ -40,13 +40,17 @@ int perf_ftrace__latency_prepare_bpf(struct perf_ftrace *ftrace) if (ftrace->target.cpu_list) { ncpus = perf_cpu_map__nr(ftrace->evlist->core.user_requested_cpus); bpf_map__set_max_entries(skel->maps.cpu_filter, ncpus); + skel->rodata->has_cpu = 1; } if (target__has_task(&ftrace->target) || target__none(&ftrace->target)) { ntasks = perf_thread_map__nr(ftrace->evlist->core.threads); bpf_map__set_max_entries(skel->maps.task_filter, ntasks); + skel->rodata->has_task = 1; } + skel->rodata->use_nsec = ftrace->use_nsec; + set_max_rlimit(); err = func_latency_bpf__load(skel); @@ -59,7 +63,6 @@ int perf_ftrace__latency_prepare_bpf(struct perf_ftrace *ftrace) u32 cpu; u8 val = 1; - skel->bss->has_cpu = 1; fd = bpf_map__fd(skel->maps.cpu_filter); for (i = 0; i < ncpus; i++) { @@ -72,7 +75,6 @@ int perf_ftrace__latency_prepare_bpf(struct perf_ftrace *ftrace) u32 pid; u8 val = 1; - skel->bss->has_task = 1; fd = bpf_map__fd(skel->maps.task_filter); for (i = 0; i < ntasks; i++) { @@ -81,8 +83,6 @@ int perf_ftrace__latency_prepare_bpf(struct perf_ftrace *ftrace) } } - skel->bss->use_nsec = ftrace->use_nsec; - skel->links.func_begin = bpf_program__attach_kprobe(skel->progs.func_begin, false, func->name); if (IS_ERR(skel->links.func_begin)) { diff --git a/tools/perf/util/bpf_skel/func_latency.bpf.c b/tools/perf/util/bpf_skel/func_latency.bpf.c index 9d01e3af7479..f613dc9cb123 100644 --- a/tools/perf/util/bpf_skel/func_latency.bpf.c +++ b/tools/perf/util/bpf_skel/func_latency.bpf.c @@ -37,9 +37,10 @@ struct { int enabled = 0; -int has_cpu = 0; -int has_task = 0; -int use_nsec = 0; + +const volatile int has_cpu = 0; +const volatile int has_task = 0; +const volatile int use_nsec = 0; SEC("kprobe/func") int BPF_PROG(func_begin) -- cgit v1.2.3 From 066fd840873f2187deb4a646c5f531a8dba2fd36 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 2 Sep 2024 13:05:13 -0700 Subject: perf kwork: Constify control data for BPF The control knobs set before loading BPF programs should be declared as 'const volatile' so that it can be optimized by the BPF core. 
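The pattern this whole series applies is small enough to show in full. The sketch below is illustrative only (the knob name matches the kwork diff further down, but this is not the actual perf code): a control knob declared 'const volatile' lands in .rodata, its value is final by the time the program is verified, and the verifier can prune any branch that tests it.

	/* Minimal sketch of the BPF side, assuming the usual vmlinux.h +
	 * bpf_helpers.h setup used by tools/perf/util/bpf_skel/. */
	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>

	const volatile int has_cpu_filter; /* .rodata: fixed at load time */
	int enabled;                       /* .bss: writable at run time */

	SEC("tracepoint/irq/softirq_entry")
	int on_softirq(void *ctx)
	{
		if (!enabled)
			return 0;
		if (has_cpu_filter) {
			/* a filter-map lookup would go here; when the
			 * constant is 0 the verifier drops this branch
			 * before the program ever runs */
		}
		return 0;
	}

	char LICENSE[] SEC("license") = "Dual BSD/GPL";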
Committer testing: root@x1:~# perf kwork report --use-bpf Starting trace, Hit to stop and report ^C Kwork Name | Cpu | Total Runtime | Count | Max runtime | Max runtime start | Max runtime end | -------------------------------------------------------------------------------------------------------------------------------- (w)intel_atomic_commit_work [ | 0009 | 18.680 ms | 2 | 18.553 ms | 362410.681580 s | 362410.700133 s | (w)pm_runtime_work | 0007 | 13.300 ms | 1 | 13.300 ms | 362410.254996 s | 362410.268295 s | (w)intel_atomic_commit_work [ | 0009 | 9.846 ms | 2 | 9.717 ms | 362410.172352 s | 362410.182069 s | (w)acpi_ec_event_processor | 0002 | 8.106 ms | 1 | 8.106 ms | 362410.463187 s | 362410.471293 s | (s)SCHED:7 | 0000 | 1.351 ms | 106 | 0.063 ms | 362410.658017 s | 362410.658080 s | i915:157 | 0008 | 0.994 ms | 13 | 0.361 ms | 362411.222125 s | 362411.222486 s | (s)SCHED:7 | 0001 | 0.703 ms | 98 | 0.047 ms | 362410.245004 s | 362410.245051 s | (s)SCHED:7 | 0005 | 0.674 ms | 42 | 0.074 ms | 362411.483039 s | 362411.483113 s | (s)NET_RX:3 | 0001 | 0.556 ms | 10 | 0.079 ms | 362411.066388 s | 362411.066467 s | root@x1:~# perf trace -e bpf --max-events 5 perf kwork report --use-bpf 0.000 ( 0.016 ms): perf/2948007 bpf(cmd: 36, uattr: 0x7ffededa6660, size: 8) = -1 EOPNOTSUPP (Operation not supported) 0.026 ( 0.106 ms): perf/2948007 bpf(cmd: PROG_LOAD, uattr: 0x7ffededa6390, size: 148) = 12 0.152 ( 0.032 ms): perf/2948007 bpf(cmd: PROG_LOAD, uattr: 0x7ffededa6450, size: 148) = 12 26.247 ( 0.138 ms): perf/2948007 bpf(cmd: PROG_LOAD, uattr: 0x7ffededa6300, size: 148) = 12 26.396 ( 0.012 ms): perf/2948007 bpf(uattr: 0x7ffededa64b0, size: 80) = 12 Starting trace, Hit to stop and report root@x1:~# Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Cc: Yang Jihong Link: https://lore.kernel.org/r/20240902200515.2103769-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf_kwork.c | 9 +++++---- tools/perf/util/bpf_kwork_top.c | 7 ++++--- tools/perf/util/bpf_skel/kwork_top.bpf.c | 2 +- tools/perf/util/bpf_skel/kwork_trace.bpf.c | 5 +++-- 4 files changed, 13 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf_kwork.c b/tools/perf/util/bpf_kwork.c index 44f0f708a15d..6c7126b7670d 100644 --- a/tools/perf/util/bpf_kwork.c +++ b/tools/perf/util/bpf_kwork.c @@ -176,8 +176,6 @@ static int setup_filters(struct perf_kwork *kwork) bpf_map_update_elem(fd, &cpu.cpu, &val, BPF_ANY); } perf_cpu_map__put(map); - - skel->bss->has_cpu_filter = 1; } if (kwork->profile_name != NULL) { @@ -197,8 +195,6 @@ static int setup_filters(struct perf_kwork *kwork) key = 0; bpf_map_update_elem(fd, &key, kwork->profile_name, BPF_ANY); - - skel->bss->has_name_filter = 1; } return 0; @@ -239,6 +235,11 @@ int perf_kwork__trace_prepare_bpf(struct perf_kwork *kwork) class_bpf->load_prepare(kwork); } + if (kwork->cpu_list != NULL) + skel->rodata->has_cpu_filter = 1; + if (kwork->profile_name != NULL) + skel->rodata->has_name_filter = 1; + if (kwork_trace_bpf__load(skel)) { pr_debug("Failed to load kwork trace skeleton\n"); goto out; diff --git a/tools/perf/util/bpf_kwork_top.c b/tools/perf/util/bpf_kwork_top.c index 22a3b00a1e23..7261cad43468 100644 --- a/tools/perf/util/bpf_kwork_top.c +++ b/tools/perf/util/bpf_kwork_top.c @@ -151,14 +151,12 @@ static int setup_filters(struct perf_kwork *kwork) bpf_map_update_elem(fd, &cpu.cpu, &val, BPF_ANY); } 
perf_cpu_map__put(map); - - skel->bss->has_cpu_filter = 1; } return 0; } -int perf_kwork__top_prepare_bpf(struct perf_kwork *kwork __maybe_unused) +int perf_kwork__top_prepare_bpf(struct perf_kwork *kwork) { struct bpf_program *prog; struct kwork_class *class; @@ -193,6 +191,9 @@ int perf_kwork__top_prepare_bpf(struct perf_kwork *kwork __maybe_unused) class_bpf->load_prepare(); } + if (kwork->cpu_list) + skel->rodata->has_cpu_filter = 1; + if (kwork_top_bpf__load(skel)) { pr_debug("Failed to load kwork top skeleton\n"); goto out; diff --git a/tools/perf/util/bpf_skel/kwork_top.bpf.c b/tools/perf/util/bpf_skel/kwork_top.bpf.c index 84c15ccbab44..594da91965a2 100644 --- a/tools/perf/util/bpf_skel/kwork_top.bpf.c +++ b/tools/perf/util/bpf_skel/kwork_top.bpf.c @@ -84,7 +84,7 @@ struct { int enabled = 0; -int has_cpu_filter = 0; +const volatile int has_cpu_filter = 0; __u64 from_timestamp = 0; __u64 to_timestamp = 0; diff --git a/tools/perf/util/bpf_skel/kwork_trace.bpf.c b/tools/perf/util/bpf_skel/kwork_trace.bpf.c index 063c124e0999..cbd79bc4b330 100644 --- a/tools/perf/util/bpf_skel/kwork_trace.bpf.c +++ b/tools/perf/util/bpf_skel/kwork_trace.bpf.c @@ -68,8 +68,9 @@ struct { } perf_kwork_name_filter SEC(".maps"); int enabled = 0; -int has_cpu_filter = 0; -int has_name_filter = 0; + +const volatile int has_cpu_filter = 0; +const volatile int has_name_filter = 0; static __always_inline int local_strncmp(const char *s1, unsigned int sz, const char *s2) -- cgit v1.2.3 From 4afdc00c378f34943fb3a3cc08db4babdacb5c5b Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 2 Sep 2024 13:05:14 -0700 Subject: perf lock contention: Constify control data for BPF The control knobs set before loading BPF programs should be declared as 'const volatile' so that it can be optimized by the BPF core. 
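The ordering constraint this imposes on the loader is worth spelling out. Below is a hedged sketch with a made-up skeleton name ('example_bpf'), not code from this patch: .rodata values can only be written between the skeleton's open and load steps, so every knob has to be decided before load, while anything that must change afterwards (such as 'enabled') stays in .bss.

	#include <bpf/libbpf.h>
	#include "example.skel.h"   /* hypothetical bpftool-generated skeleton */

	static int setup(void)
	{
		struct example_bpf *skel = example_bpf__open();

		if (!skel)
			return -1;

		skel->rodata->has_cpu = 1;        /* OK: before load */

		if (example_bpf__load(skel)) {    /* constants baked in here */
			example_bpf__destroy(skel);
			return -1;
		}

		skel->bss->enabled = 1;           /* runtime toggles use .bss;
						   * .rodata is now frozen */
		return 0;
	}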
Committer testing: root@x1:~# perf lock contention --use-bpf contended total wait max wait avg wait type caller 5 31.57 us 14.93 us 6.31 us mutex btrfs_delayed_update_inode+0x43 1 16.91 us 16.91 us 16.91 us rwsem:R btrfs_tree_read_lock_nested+0x1b 1 15.13 us 15.13 us 15.13 us spinlock btrfs_getattr+0xd1 1 6.65 us 6.65 us 6.65 us rwsem:R btrfs_tree_read_lock_nested+0x1b 1 4.34 us 4.34 us 4.34 us spinlock process_one_work+0x1a9 root@x1:~# root@x1:~# perf trace -e bpf --max-events 10 perf lock contention --use-bpf 0.000 ( 0.013 ms): :2948281/2948281 bpf(cmd: 36, uattr: 0x7ffd5f12d730, size: 8) = -1 EOPNOTSUPP (Operation not supported) 0.024 ( 0.120 ms): :2948281/2948281 bpf(cmd: PROG_LOAD, uattr: 0x7ffd5f12d460, size: 148) = 16 0.158 ( 0.034 ms): :2948281/2948281 bpf(cmd: PROG_LOAD, uattr: 0x7ffd5f12d520, size: 148) = 16 26.653 ( 0.154 ms): perf/2948281 bpf(cmd: PROG_LOAD, uattr: 0x7ffd5f12d3d0, size: 148) = 16 26.825 ( 0.014 ms): perf/2948281 bpf(uattr: 0x7ffd5f12d580, size: 80) = 16 87.924 ( 0.038 ms): perf/2948281 bpf(cmd: BTF_LOAD, uattr: 0x7ffd5f12d400, size: 40) = 16 87.988 ( 0.006 ms): perf/2948281 bpf(cmd: BTF_LOAD, uattr: 0x7ffd5f12d470, size: 40) = 16 88.019 ( 0.006 ms): perf/2948281 bpf(cmd: BTF_LOAD, uattr: 0x7ffd5f12d250, size: 40) = 16 88.029 ( 0.172 ms): perf/2948281 bpf(cmd: PROG_LOAD, uattr: 0x7ffd5f12d320, size: 148) = 17 88.217 ( 0.005 ms): perf/2948281 bpf(cmd: BTF_LOAD, uattr: 0x7ffd5f12d4d0, size: 40) = 16 root@x1:~# Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Link: https://lore.kernel.org/r/20240902200515.2103769-5-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf_lock_contention.c | 45 ++++++++++++++------------ tools/perf/util/bpf_skel/lock_contention.bpf.c | 27 ++++++++-------- 2 files changed, 38 insertions(+), 34 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c index bc4e92c0c08b..41a1ad087895 100644 --- a/tools/perf/util/bpf_lock_contention.c +++ b/tools/perf/util/bpf_lock_contention.c @@ -46,14 +46,22 @@ int lock_contention_prepare(struct lock_contention *con) else bpf_map__set_max_entries(skel->maps.stacks, 1); - if (target__has_cpu(target)) + if (target__has_cpu(target)) { + skel->rodata->has_cpu = 1; ncpus = perf_cpu_map__nr(evlist->core.user_requested_cpus); - if (target__has_task(target)) + } + if (target__has_task(target)) { + skel->rodata->has_task = 1; ntasks = perf_thread_map__nr(evlist->core.threads); - if (con->filters->nr_types) + } + if (con->filters->nr_types) { + skel->rodata->has_type = 1; ntypes = con->filters->nr_types; - if (con->filters->nr_cgrps) + } + if (con->filters->nr_cgrps) { + skel->rodata->has_cgroup = 1; ncgrps = con->filters->nr_cgrps; + } /* resolve lock name filters to addr */ if (con->filters->nr_syms) { @@ -82,6 +90,7 @@ int lock_contention_prepare(struct lock_contention *con) con->filters->addrs = addrs; } naddrs = con->filters->nr_addrs; + skel->rodata->has_addr = 1; } bpf_map__set_max_entries(skel->maps.cpu_filter, ncpus); @@ -90,6 +99,16 @@ int lock_contention_prepare(struct lock_contention *con) bpf_map__set_max_entries(skel->maps.addr_filter, naddrs); bpf_map__set_max_entries(skel->maps.cgroup_filter, ncgrps); + skel->rodata->stack_skip = con->stack_skip; + skel->rodata->aggr_mode = con->aggr_mode; + skel->rodata->needs_callstack = con->save_callstack; + skel->rodata->lock_owner = 
con->owner; + + if (con->aggr_mode == LOCK_AGGR_CGROUP || con->filters->nr_cgrps) { + if (cgroup_is_v2("perf_event")) + skel->rodata->use_cgroup_v2 = 1; + } + if (lock_contention_bpf__load(skel) < 0) { pr_err("Failed to load lock-contention BPF skeleton\n"); return -1; @@ -99,7 +118,6 @@ int lock_contention_prepare(struct lock_contention *con) u32 cpu; u8 val = 1; - skel->bss->has_cpu = 1; fd = bpf_map__fd(skel->maps.cpu_filter); for (i = 0; i < ncpus; i++) { @@ -112,7 +130,6 @@ int lock_contention_prepare(struct lock_contention *con) u32 pid; u8 val = 1; - skel->bss->has_task = 1; fd = bpf_map__fd(skel->maps.task_filter); for (i = 0; i < ntasks; i++) { @@ -125,7 +142,6 @@ int lock_contention_prepare(struct lock_contention *con) u32 pid = evlist->workload.pid; u8 val = 1; - skel->bss->has_task = 1; fd = bpf_map__fd(skel->maps.task_filter); bpf_map_update_elem(fd, &pid, &val, BPF_ANY); } @@ -133,7 +149,6 @@ int lock_contention_prepare(struct lock_contention *con) if (con->filters->nr_types) { u8 val = 1; - skel->bss->has_type = 1; fd = bpf_map__fd(skel->maps.type_filter); for (i = 0; i < con->filters->nr_types; i++) @@ -143,7 +158,6 @@ int lock_contention_prepare(struct lock_contention *con) if (con->filters->nr_addrs) { u8 val = 1; - skel->bss->has_addr = 1; fd = bpf_map__fd(skel->maps.addr_filter); for (i = 0; i < con->filters->nr_addrs; i++) @@ -153,25 +167,14 @@ int lock_contention_prepare(struct lock_contention *con) if (con->filters->nr_cgrps) { u8 val = 1; - skel->bss->has_cgroup = 1; fd = bpf_map__fd(skel->maps.cgroup_filter); for (i = 0; i < con->filters->nr_cgrps; i++) bpf_map_update_elem(fd, &con->filters->cgrps[i], &val, BPF_ANY); } - /* these don't work well if in the rodata section */ - skel->bss->stack_skip = con->stack_skip; - skel->bss->aggr_mode = con->aggr_mode; - skel->bss->needs_callstack = con->save_callstack; - skel->bss->lock_owner = con->owner; - - if (con->aggr_mode == LOCK_AGGR_CGROUP) { - if (cgroup_is_v2("perf_event")) - skel->bss->use_cgroup_v2 = 1; - + if (con->aggr_mode == LOCK_AGGR_CGROUP) read_all_cgroups(&con->cgroups); - } bpf_program__set_autoload(skel->progs.collect_lock_syms, false); diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c index 52a876b42699..1069bda5d733 100644 --- a/tools/perf/util/bpf_skel/lock_contention.bpf.c +++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c @@ -117,21 +117,22 @@ struct mm_struct___new { } __attribute__((preserve_access_index)); /* control flags */ -int enabled; -int has_cpu; -int has_task; -int has_type; -int has_addr; -int has_cgroup; -int needs_callstack; -int stack_skip; -int lock_owner; - -int use_cgroup_v2; -int perf_subsys_id = -1; +const volatile int has_cpu; +const volatile int has_task; +const volatile int has_type; +const volatile int has_addr; +const volatile int has_cgroup; +const volatile int needs_callstack; +const volatile int stack_skip; +const volatile int lock_owner; +const volatile int use_cgroup_v2; /* determine the key of lock stat */ -int aggr_mode; +const volatile int aggr_mode; + +int enabled; + +int perf_subsys_id = -1; __u64 end_ts; -- cgit v1.2.3 From 8b3b1bb3ea1f930d44d79dfb0b8cb7d62db08ed6 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 2 Sep 2024 13:05:15 -0700 Subject: perf record offcpu: Constify control data for BPF The control knobs set before loading BPF programs should be declared as 'const volatile' so that it can be optimized by the BPF core. 
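Note the split this enforces, sketched below with hypothetical names (an illustration, not the perf code): whether to filter at all (has_task, uses_tgid) is now a load-time constant that user space can still read back, while the actual pid list continues to be delivered at run time through a BPF map, which stays writable after load.

	#include <bpf/bpf.h>
	#include <bpf/libbpf.h>

	/* 'struct example_bpf' stands for a bpftool-generated skeleton */
	static void add_workload_pid(struct example_bpf *skel, __u32 pid)
	{
		__u8 val = 1;

		/* the decision was frozen before load; only the map changes */
		if (skel->rodata->has_task && skel->rodata->uses_tgid)
			bpf_map_update_elem(bpf_map__fd(skel->maps.task_filter),
					    &pid, &val, BPF_ANY);
	}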
Committer testing: root@x1:~# perf record --off-cpu ^C[ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 1.807 MB perf.data (5645 samples) ] root@x1:~# perf evlist cpu_atom/cycles/P cpu_core/cycles/P offcpu-time dummy:u root@x1:~# perf evlist -v cpu_atom/cycles/P: type: 0 (PERF_TYPE_HARDWARE), size: 136, config: 0xa00000000, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CPU|PERIOD|IDENTIFIER, read_format: ID|LOST, disabled: 1, inherit: 1, freq: 1, precise_ip: 3, sample_id_all: 1 cpu_core/cycles/P: type: 0 (PERF_TYPE_HARDWARE), size: 136, config: 0x400000000, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CPU|PERIOD|IDENTIFIER, read_format: ID|LOST, disabled: 1, inherit: 1, freq: 1, precise_ip: 3, sample_id_all: 1 offcpu-time: type: 1 (software), size: 136, config: 0xa (PERF_COUNT_SW_BPF_OUTPUT), { sample_period, sample_freq }: 1, sample_type: IP|TID|TIME|CALLCHAIN|CPU|PERIOD|IDENTIFIER, read_format: ID|LOST, disabled: 1, inherit: 1, freq: 1, sample_id_all: 1 dummy:u: type: 1 (software), size: 136, config: 0x9 (PERF_COUNT_SW_DUMMY), { sample_period, sample_freq }: 1, sample_type: IP|TID|TIME|CPU|IDENTIFIER, read_format: ID|LOST, inherit: 1, exclude_kernel: 1, exclude_hv: 1, mmap: 1, comm: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1 root@x1:~# perf trace -e bpf --max-events 5 perf record --off-cpu 0.000 ( 0.015 ms): :2949124/2949124 bpf(cmd: 36, uattr: 0x7ffefc6dbe30, size: 8) = -1 EOPNOTSUPP (Operation not supported) 0.031 ( 0.115 ms): :2949124/2949124 bpf(cmd: PROG_LOAD, uattr: 0x7ffefc6dbb60, size: 148) = 14 0.159 ( 0.037 ms): :2949124/2949124 bpf(cmd: PROG_LOAD, uattr: 0x7ffefc6dbc20, size: 148) = 14 23.868 ( 0.144 ms): perf/2949124 bpf(cmd: PROG_LOAD, uattr: 0x7ffefc6dbad0, size: 148) = 14 24.027 ( 0.014 ms): perf/2949124 bpf(uattr: 0x7ffefc6dbc80, size: 80) = 14 root@x1:~# Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Song Liu Link: https://lore.kernel.org/r/20240902200515.2103769-6-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf_off_cpu.c | 16 ++++++++-------- tools/perf/util/bpf_skel/off_cpu.bpf.c | 9 +++++---- 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf_off_cpu.c b/tools/perf/util/bpf_off_cpu.c index 6af36142dc5a..a590a8ac1f9d 100644 --- a/tools/perf/util/bpf_off_cpu.c +++ b/tools/perf/util/bpf_off_cpu.c @@ -73,14 +73,12 @@ static void off_cpu_start(void *arg) struct evlist *evlist = arg; /* update task filter for the given workload */ - if (!skel->bss->has_cpu && !skel->bss->has_task && + if (skel->rodata->has_task && skel->rodata->uses_tgid && perf_thread_map__pid(evlist->core.threads, 0) != -1) { int fd; u32 pid; u8 val = 1; - skel->bss->has_task = 1; - skel->bss->uses_tgid = 1; fd = bpf_map__fd(skel->maps.task_filter); pid = perf_thread_map__pid(evlist->core.threads, 0); bpf_map_update_elem(fd, &pid, &val, BPF_ANY); @@ -148,6 +146,7 @@ int off_cpu_prepare(struct evlist *evlist, struct target *target, if (target->cpu_list) { ncpus = perf_cpu_map__nr(evlist->core.user_requested_cpus); bpf_map__set_max_entries(skel->maps.cpu_filter, ncpus); + skel->rodata->has_cpu = 1; } if (target->pid) { @@ -173,11 +172,16 @@ int off_cpu_prepare(struct evlist *evlist, struct target *target, ntasks = MAX_PROC; bpf_map__set_max_entries(skel->maps.task_filter, ntasks); + 
skel->rodata->has_task = 1; + skel->rodata->uses_tgid = 1; } else if (target__has_task(target)) { ntasks = perf_thread_map__nr(evlist->core.threads); bpf_map__set_max_entries(skel->maps.task_filter, ntasks); + skel->rodata->has_task = 1; } else if (target__none(target)) { bpf_map__set_max_entries(skel->maps.task_filter, MAX_PROC); + skel->rodata->has_task = 1; + skel->rodata->uses_tgid = 1; } if (evlist__first(evlist)->cgrp) { @@ -186,6 +190,7 @@ int off_cpu_prepare(struct evlist *evlist, struct target *target, if (!cgroup_is_v2("perf_event")) skel->rodata->uses_cgroup_v1 = true; + skel->rodata->has_cgroup = 1; } if (opts->record_cgroup) { @@ -208,7 +213,6 @@ int off_cpu_prepare(struct evlist *evlist, struct target *target, u32 cpu; u8 val = 1; - skel->bss->has_cpu = 1; fd = bpf_map__fd(skel->maps.cpu_filter); for (i = 0; i < ncpus; i++) { @@ -220,8 +224,6 @@ int off_cpu_prepare(struct evlist *evlist, struct target *target, if (target->pid) { u8 val = 1; - skel->bss->has_task = 1; - skel->bss->uses_tgid = 1; fd = bpf_map__fd(skel->maps.task_filter); strlist__for_each_entry(pos, pid_slist) { @@ -240,7 +242,6 @@ int off_cpu_prepare(struct evlist *evlist, struct target *target, u32 pid; u8 val = 1; - skel->bss->has_task = 1; fd = bpf_map__fd(skel->maps.task_filter); for (i = 0; i < ntasks; i++) { @@ -253,7 +254,6 @@ int off_cpu_prepare(struct evlist *evlist, struct target *target, struct evsel *evsel; u8 val = 1; - skel->bss->has_cgroup = 1; fd = bpf_map__fd(skel->maps.cgroup_filter); evlist__for_each_entry(evlist, evsel) { diff --git a/tools/perf/util/bpf_skel/off_cpu.bpf.c b/tools/perf/util/bpf_skel/off_cpu.bpf.c index d877a0a9731f..c152116df72f 100644 --- a/tools/perf/util/bpf_skel/off_cpu.bpf.c +++ b/tools/perf/util/bpf_skel/off_cpu.bpf.c @@ -85,10 +85,11 @@ struct task_struct___old { } __attribute__((preserve_access_index)); int enabled = 0; -int has_cpu = 0; -int has_task = 0; -int has_cgroup = 0; -int uses_tgid = 0; + +const volatile int has_cpu = 0; +const volatile int has_task = 0; +const volatile int has_cgroup = 0; +const volatile int uses_tgid = 0; const volatile bool has_prev_state = false; const volatile bool needs_cgroup = false; -- cgit v1.2.3 From bf0db8c759ba137cebfeda9eecc1f728cb14dab7 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 29 Feb 2024 08:18:28 -0800 Subject: perf script: Minimize "not reaching sample" for '-F +brstackinsn' In some situations 'perf script -F +brstackinsn' sees a lot of "not reaching sample" messages. This happens when the last LBR block before the sample contains a branch that is not in the LBR, and the instruction dumping stops. $ perf record -b emacs -Q --batch '()' [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.396 MB perf.data (443 samples) ] $ perf script -F +brstackinsn ... 00007f0ab2d171a4 insn: 41 0f 94 c0 00007f0ab2d171a8 insn: 83 fa 01 00007f0ab2d171ab insn: 74 d3 # PRED 6 cycles [313] 1.00 IPC 00007f0ab2d17180 insn: 45 84 c0 00007f0ab2d17183 insn: 74 28 ... not reaching sample ... $ perf script -F +brstackinsn | grep -c reach 136 $ This is a problem for further analysis that wants to see the full code upto the sample. There are two common cases where the message is bogus: - The LBR only logs taken branches, but the branch might be a conditional branch that is not taken (that is the most common case actually) - The LBR sampling uses a filter ignoring some branches, but the perf script check checks for all branches. 
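In other words, absence from the LBR is only conclusive under two conditions, condensed in the sketch below (an illustration assuming the helpers behave as in the diff further down, not the exact perf code): the LBR must have been recording every branch, and the instruction must be one that always produces an LBR entry.

	#include <stdbool.h>
	#include <linux/perf_event.h>

	int arch_is_uncond_branch(const unsigned char *buf, size_t len, int x86_64);

	/* A conditional branch may simply not have been taken, so only an
	 * unconditional (or indirect) branch missing from an unfiltered
	 * LBR proves the walk went wrong. */
	static bool lbr_walk_is_broken(const struct perf_event_attr *attr,
				       const unsigned char *insn, size_t len,
				       int is64)
	{
		bool records_all = attr->branch_sample_type == 0 ||
				   (attr->branch_sample_type & PERF_SAMPLE_BRANCH_ANY);

		return records_all && arch_is_uncond_branch(insn, len, is64) > 0;
	}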
This patch fixes these two conditions by stopping only on unconditional branches, and by checking the perf_event_attr's branch filter attributes. For the test case above it fixes all the messages: $ ./perf script -F +brstackinsn | grep -c reach 0 Note that there are still conditions when the message is hit -- sometimes there can be an unconditional branch that misses the LBR update before the sample -- but they are much rarer now. Signed-off-by: Andi Kleen Reviewed-by: Adrian Hunter Link: https://lore.kernel.org/r/20240229161828.386397-1-ak@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 6 ++++-- tools/perf/util/dump-insn.c | 2 +- tools/perf/util/dump-insn.h | 2 +- tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c | 5 +++-- 4 files changed, 9 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 206b08426555..dbe792b52c5c 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -1428,7 +1428,7 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, * Due to pipeline delays the LBRs might be missing a branch * or two, which can result in very large or negative blocks * between final branch and sample. When this happens just - * continue walking after the last TO until we hit a branch. + * continue walking after the last TO. */ start = entries[0].to; end = sample->ip; @@ -1463,7 +1463,9 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, printed += fprintf(fp, "\n"); if (ilen == 0) break; - if (arch_is_branch(buffer + off, len - off, x.is64bit) && start + off != sample->ip) { + if ((attr->branch_sample_type == 0 || attr->branch_sample_type & PERF_SAMPLE_BRANCH_ANY) + && arch_is_uncond_branch(buffer + off, len - off, x.is64bit) + && start + off != sample->ip) { /* * Hit a missing branch. Just stop.
*/ diff --git a/tools/perf/util/dump-insn.c b/tools/perf/util/dump-insn.c index 2bd8585db93c..c1cc0ade48d0 100644 --- a/tools/perf/util/dump-insn.c +++ b/tools/perf/util/dump-insn.c @@ -15,7 +15,7 @@ const char *dump_insn(struct perf_insn *x __maybe_unused, } __weak -int arch_is_branch(const unsigned char *buf __maybe_unused, +int arch_is_uncond_branch(const unsigned char *buf __maybe_unused, size_t len __maybe_unused, int x86_64 __maybe_unused) { diff --git a/tools/perf/util/dump-insn.h b/tools/perf/util/dump-insn.h index 4a7797dd6d09..20d4d7bb5275 100644 --- a/tools/perf/util/dump-insn.h +++ b/tools/perf/util/dump-insn.h @@ -21,6 +21,6 @@ struct perf_insn { const char *dump_insn(struct perf_insn *x, u64 ip, u8 *inbuf, int inlen, int *lenp); -int arch_is_branch(const unsigned char *buf, size_t len, int x86_64); +int arch_is_uncond_branch(const unsigned char *buf, size_t len, int x86_64); #endif diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c index 4407130d91f8..47cf35799a4d 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c +++ b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c @@ -209,12 +209,13 @@ int intel_pt_get_insn(const unsigned char *buf, size_t len, int x86_64, return 0; } -int arch_is_branch(const unsigned char *buf, size_t len, int x86_64) +int arch_is_uncond_branch(const unsigned char *buf, size_t len, int x86_64) { struct intel_pt_insn in; if (intel_pt_get_insn(buf, len, x86_64, &in) < 0) return -1; - return in.branch != INTEL_PT_BR_NO_BRANCH; + return in.branch == INTEL_PT_BR_UNCONDITIONAL || + in.branch == INTEL_PT_BR_INDIRECT; } const char *dump_insn(struct perf_insn *x, uint64_t ip __maybe_unused, -- cgit v1.2.3 From b4bcdff7e839237d89a2aa15b6042b96b0240ac4 Mon Sep 17 00:00:00 2001 From: Abhinav Jain Date: Sat, 10 Aug 2024 19:23:33 +0530 Subject: selftests: filesystems: fix warn_unused_result build warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add return value checks for read & write calls in test_listmount_ns function. This patch resolves below compilation warnings: ``` statmount_test_ns.c: In function ‘test_listmount_ns’: statmount_test_ns.c:322:17: warning: ignoring return value of ‘write’ declared with attribute ‘warn_unused_result’ [-Wunused-result] statmount_test_ns.c:323:17: warning: ignoring return value of ‘read’ declared with attribute ‘warn_unused_result’ [-Wunused-result] ``` Signed-off-by: Abhinav Jain Signed-off-by: Shuah Khan --- tools/testing/selftests/filesystems/statmount/statmount_test_ns.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c b/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c index e044f5fc57fd..70cb0c8b21cf 100644 --- a/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c +++ b/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c @@ -319,8 +319,11 @@ static void test_listmount_ns(void) * Tell our parent how many mounts we have, and then wait for it * to tell us we're done. 
*/ - write(child_ready_pipe[1], &nr_mounts, sizeof(nr_mounts)); - read(parent_ready_pipe[0], &cval, sizeof(cval)); + if (write(child_ready_pipe[1], &nr_mounts, sizeof(nr_mounts)) != + sizeof(nr_mounts)) + ret = NSID_ERROR; + if (read(parent_ready_pipe[0], &cval, sizeof(cval)) != sizeof(cval)) + ret = NSID_ERROR; exit(NSID_PASS); } -- cgit v1.2.3 From 3c6b818b097dd6932859bcc3d6722a74ec5931c1 Mon Sep 17 00:00:00 2001 From: Zhu Jun Date: Wed, 28 Aug 2024 02:31:29 -0700 Subject: tools/iio: Add memory allocation failure check for trigger_name Add a check to handle memory allocation failure for `trigger_name` and return `-ENOMEM`. Signed-off-by: Zhu Jun Link: https://patch.msgid.link/20240828093129.3040-1-zhujun2@cmss.chinamobile.com Signed-off-by: Jonathan Cameron --- tools/iio/iio_generic_buffer.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/iio/iio_generic_buffer.c b/tools/iio/iio_generic_buffer.c index 0d0a7a19d6f9..9ef5ee087eda 100644 --- a/tools/iio/iio_generic_buffer.c +++ b/tools/iio/iio_generic_buffer.c @@ -498,6 +498,10 @@ int main(int argc, char **argv) return -ENOMEM; } trigger_name = malloc(IIO_MAX_NAME_LENGTH); + if (!trigger_name) { + ret = -ENOMEM; + goto error; + } ret = read_sysfs_string("name", trig_dev_name, trigger_name); free(trig_dev_name); if (ret < 0) { -- cgit v1.2.3 From 575eec218059f90bf07a4e26b531fd6d54fe1769 Mon Sep 17 00:00:00 2001 From: Yang Jihong Date: Mon, 12 Aug 2024 21:26:05 +0800 Subject: perf sched timehist: Skip print non-idle task samples when only show idle events When only showing idle events, the runtime stats of non-idle tasks are not updated and their values are 0, so there is no need to print non-idle samples. Before: # perf sched timehist -I Samples of sched_switch event do not have callchains.
time cpu task name wait time sch delay run time [tid/pid] (msec) (msec) (msec) --------------- ------ ------------------------------ --------- --------- --------- 2090450.763235 [0000] migration/0[15] 0.000 0.000 0.000 2090450.763268 [0001] migration/1[21] 0.000 0.000 0.000 2090450.763309 [0002] migration/2[27] 0.000 0.000 0.000 2090450.763343 [0003] migration/3[33] 0.000 0.000 0.000 2090450.763469 [0004] migration/4[39] 0.000 0.000 0.000 2090450.763501 [0005] migration/5[45] 0.000 0.000 0.000 2090450.763622 [0006] migration/6[51] 0.000 0.000 0.000 2090450.763660 [0007] migration/7[57] 0.000 0.000 0.000 2090450.763741 [0009] migration/9[69] 0.000 0.000 0.000 2090450.763862 [0010] migration/10[75] 0.000 0.000 0.000 2090450.763894 [0011] migration/11[81] 0.000 0.000 0.000 2090450.764021 [0012] migration/12[87] 0.000 0.000 0.000 2090450.764056 [0013] migration/13[93] 0.000 0.000 0.000 2090450.764135 [0014] migration/14[99] 0.000 0.000 0.000 2090450.764163 [0015] migration/15[105] 0.000 0.000 0.000 2090450.764292 [0016] migration/16[111] 0.000 0.000 0.000 2090450.764371 [0017] migration/17[117] 0.000 0.000 0.000 2090450.764422 [0018] migration/18[123] 0.000 0.000 0.000 2090450.764490 [0000] 0.000 0.000 1.255 2090450.764505 [0000] s1-perf[8235/7168] 0.000 0.000 0.000 2090450.764571 [0016] 0.000 0.000 0.278 2090450.764588 [0010] 0.000 0.000 0.725 2090450.764590 [0016] s1-agent[7179/7162] 0.000 0.000 0.000 2090450.764635 [0000] 0.015 0.015 0.129 2090450.764637 [0017] 0.000 0.000 0.266 2090450.764639 [0000] s1-perf[8235/7168] 0.000 0.000 0.000 2090450.764668 [0017] s1-agent[7180/7162] 0.000 0.000 0.000 2090450.764669 [0000] 0.003 0.003 0.029 2090450.764672 [0000] s1-perf[8235/7168] 0.000 0.000 0.000 2090450.764683 [0000] 0.003 0.003 0.010 After: # perf sched timehist -I Samples of sched_switch event do not have callchains. 
time cpu task name wait time sch delay run time [tid/pid] (msec) (msec) (msec) --------------- ------ ------------------------------ --------- --------- --------- 2090450.764490 [0000] 0.000 0.000 1.255 2090450.764571 [0016] 0.000 0.000 0.278 2090450.764588 [0010] 0.000 0.000 0.725 2090450.764635 [0000] 0.015 0.015 0.129 2090450.764637 [0017] 0.000 0.000 0.266 2090450.764669 [0000] 0.003 0.003 0.029 2090450.764683 [0000] 0.003 0.003 0.010 2090450.764688 [0016] 0.019 0.019 0.097 2090450.764694 [0000] 0.001 0.001 0.009 2090450.764706 [0000] 0.001 0.001 0.010 2090450.764725 [0002] 0.000 0.000 1.415 2090450.764728 [0000] 0.002 0.002 0.019 2090450.764823 [0000] 0.003 0.003 0.091 2090450.764838 [0019] 0.000 0.000 0.154 2090450.764865 [0002] 0.109 0.109 0.029 2090450.764866 [0000] 0.012 0.012 0.030 2090450.764880 [0002] 0.013 0.013 0.001 2090450.764880 [0000] 0.002 0.002 0.011 2090450.764896 [0000] 0.001 0.001 0.013 2090450.764903 [0019] 0.063 0.063 0.002 2090450.764908 [0019] 0.003 0.003 0.001 Fixes: 07235f84ece6b66f ("perf sched timehist: Add -I/--idle-hist option") Signed-off-by: Yang Jihong Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240812132606.3126490-1-yangjihong@bytedance.com Reviewed-and-tested-by: Madadi Vineeth Reddy Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 1c386ebe4b98..72f0e7f97f56 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -2730,10 +2730,10 @@ static int timehist_sched_change_event(const struct perf_tool *tool, itr->last_thread = NULL; } - } - if (!sched->summary_only) - timehist_print_sample(sched, evsel, sample, &al, thread, t, state); + if (!sched->summary_only) + timehist_print_sample(sched, evsel, sample, &al, thread, t, state); + } out: if (sched->hist_time.start == 0 && t >= ptime->start) -- cgit v1.2.3 From b93fb9cf45a99042f4472678fc67afd1de47c32e Mon Sep 17 00:00:00 2001 From: Yang Jihong Date: Mon, 12 Aug 2024 21:26:06 +0800 Subject: perf sched timehist: Remove redundant BUG_ON in timehist_sched_change_event() The BUG_ON(thread__tid(thread) != 0) in timehist_sched_change_event() is redundant, remove it. No functional change. 
Fixes: 07235f84ece6b66f ("perf sched timehist: Add -I/--idle-hist option") Reviewed-by: Madadi Vineeth Reddy Signed-off-by: Yang Jihong Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240812132606.3126490-2-yangjihong@bytedance.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 72f0e7f97f56..dcae3f1f5f3c 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -2703,8 +2703,6 @@ static int timehist_sched_change_event(const struct perf_tool *tool, struct idle_thread_runtime *itr = (void *)tr; struct thread_runtime *last_tr; - BUG_ON(thread__tid(thread) != 0); - if (itr->last_thread == NULL) goto out; -- cgit v1.2.3 From 3fcd740990dee9404f4f6ebeb7a31f0066849e46 Mon Sep 17 00:00:00 2001 From: Yang Jihong Date: Mon, 19 Aug 2024 11:30:15 +0800 Subject: perf sched timehist: Add --show-prio option The --show-prio option is used to display the priority of task. It is disabled by default, which is consistent with original behavior. The display format is xxx (priority does not change during task running) or xxx->yyy (priority changes during task running) Testcase: # perf sched record nice -n 9 true [ perf record: Woken up 0 times to write data ] [ perf record: Captured and wrote 0.497 MB perf.data ] # perf sched timehist -h Usage: perf sched timehist [] -C, --cpu list of cpus to profile -D, --dump-raw-trace dump raw trace in ASCII -f, --force don't complain, do it -g, --call-graph Display call chains if present (default on) -I, --idle-hist Show idle events only -i, --input input file name -k, --vmlinux vmlinux pathname -M, --migrations Show migration events -n, --next Show next task -p, --pid analyze events only for given process id(s) -s, --summary Show only syscall summary with statistics -S, --with-summary Show all syscalls and summary with statistics -t, --tid analyze events only for given thread id(s) -V, --cpu-visual Add CPU visual -v, --verbose be more verbose (show symbol address, etc) -w, --wakeups Show wakeup events --kallsyms kallsyms pathname --max-stack Maximum number of functions to display backtrace. --show-prio Show task priority --state Show task state when sched-out --symfs Look for files with symbols relative to this directory --time Time span for analysis (start,stop) # perf sched timehist Samples of sched_switch event do not have callchains. time cpu task name wait time sch delay run time [tid/pid] (msec) (msec) (msec) --------------- ------ ------------------------------ --------- --------- --------- 23952.006537 [0000] perf[534] 0.000 0.000 0.000 23952.006593 [0000] migration/0[19] 0.000 0.014 0.056 23952.006899 [0001] perf[534] 0.000 0.000 0.000 23952.006947 [0001] migration/1[22] 0.000 0.015 0.047 23952.007138 [0002] perf[534] 0.000 0.000 0.000 # perf sched timehist --show-prio Samples of sched_switch event do not have callchains. 
time cpu task name prio wait time sch delay run time [tid/pid] (msec) (msec) (msec) --------------- ------ ------------------------------ -------- --------- --------- --------- 23952.006537 [0000] perf[534] 120 0.000 0.000 0.000 23952.006593 [0000] migration/0[19] 0 0.000 0.014 0.056 23952.006899 [0001] perf[534] 120 0.000 0.000 0.000 23952.034843 [0003] nice[535] 120->129 0.189 0.024 23.314 23952.053838 [0005] rcu_preempt[16] 120 3.993 0.000 0.023 23952.053990 [0005] 120 0.023 0.023 0.152 23952.054137 [0006] 120 1.427 1.427 17.855 23952.054278 [0007] 120 0.506 0.506 1.650 Signed-off-by: Yang Jihong Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240819033016.2427235-2-yangjihong@bytedance.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-sched.txt | 3 ++ tools/perf/builtin-sched.c | 91 ++++++++++++++++++++++++++++++--- 2 files changed, 87 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt index 84d49f9241b1..3efa5c58418d 100644 --- a/tools/perf/Documentation/perf-sched.txt +++ b/tools/perf/Documentation/perf-sched.txt @@ -212,6 +212,9 @@ OPTIONS for 'perf sched timehist' --state:: Show task state when it switched out. +--show-prio:: + Show task priority. + OPTIONS for 'perf sched replay' ------------------------------ diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index dcae3f1f5f3c..ba6563dc93d0 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -228,6 +228,7 @@ struct perf_sched { bool show_next; bool show_migrations; bool show_state; + bool show_prio; u64 skipped_samples; const char *time_str; struct perf_time_interval ptime; @@ -258,6 +259,8 @@ struct thread_runtime { bool comm_changed; u64 migrations; + + int prio; }; /* per event run time data */ @@ -920,6 +923,11 @@ struct sort_dimension { struct list_head list; }; +static inline void init_prio(struct thread_runtime *r) +{ + r->prio = -1; +} + /* * handle runtime stats saved per thread */ @@ -932,6 +940,7 @@ static struct thread_runtime *thread__init_runtime(struct thread *thread) return NULL; init_stats(&r->run_stats); + init_prio(r); thread__set_priv(thread, r); return r; @@ -2036,6 +2045,24 @@ static char *timehist_get_commstr(struct thread *thread) return str; } +/* prio field format: xxx or xxx->yyy */ +#define MAX_PRIO_STR_LEN 8 +static char *timehist_get_priostr(struct evsel *evsel, + struct thread *thread, + struct perf_sample *sample) +{ + static char prio_str[16]; + int prev_prio = (int)evsel__intval(evsel, sample, "prev_prio"); + struct thread_runtime *tr = thread__priv(thread); + + if (tr->prio != prev_prio && tr->prio != -1) + scnprintf(prio_str, sizeof(prio_str), "%d->%d", tr->prio, prev_prio); + else + scnprintf(prio_str, sizeof(prio_str), "%d", prev_prio); + + return prio_str; +} + static void timehist_header(struct perf_sched *sched) { u32 ncpus = sched->max_cpu.cpu + 1; @@ -2053,8 +2080,14 @@ static void timehist_header(struct perf_sched *sched) printf(" "); } - printf(" %-*s %9s %9s %9s", comm_width, - "task name", "wait time", "sch delay", "run time"); + if (sched->show_prio) { + printf(" %-*s %-*s %9s %9s %9s", + comm_width, "task name", MAX_PRIO_STR_LEN, "prio", + "wait time", "sch delay", "run time"); + } else { + printf(" %-*s %9s %9s %9s", comm_width, + "task name", "wait time", 
"sch delay", "run time"); + } if (sched->show_state) printf(" %s", "state"); @@ -2069,8 +2102,14 @@ static void timehist_header(struct perf_sched *sched) if (sched->show_cpu_visual) printf(" %*s ", ncpus, ""); - printf(" %-*s %9s %9s %9s", comm_width, - "[tid/pid]", "(msec)", "(msec)", "(msec)"); + if (sched->show_prio) { + printf(" %-*s %-*s %9s %9s %9s", + comm_width, "[tid/pid]", MAX_PRIO_STR_LEN, "", + "(msec)", "(msec)", "(msec)"); + } else { + printf(" %-*s %9s %9s %9s", comm_width, + "[tid/pid]", "(msec)", "(msec)", "(msec)"); + } if (sched->show_state) printf(" %5s", ""); @@ -2085,9 +2124,15 @@ static void timehist_header(struct perf_sched *sched) if (sched->show_cpu_visual) printf(" %.*s ", ncpus, graph_dotted_line); - printf(" %.*s %.9s %.9s %.9s", comm_width, - graph_dotted_line, graph_dotted_line, graph_dotted_line, - graph_dotted_line); + if (sched->show_prio) { + printf(" %.*s %.*s %.9s %.9s %.9s", + comm_width, graph_dotted_line, MAX_PRIO_STR_LEN, graph_dotted_line, + graph_dotted_line, graph_dotted_line, graph_dotted_line); + } else { + printf(" %.*s %.9s %.9s %.9s", comm_width, + graph_dotted_line, graph_dotted_line, graph_dotted_line, + graph_dotted_line); + } if (sched->show_state) printf(" %.5s", graph_dotted_line); @@ -2134,6 +2179,9 @@ static void timehist_print_sample(struct perf_sched *sched, printf(" %-*s ", comm_width, timehist_get_commstr(thread)); + if (sched->show_prio) + printf(" %-*s ", MAX_PRIO_STR_LEN, timehist_get_priostr(evsel, thread, sample)); + wait_time = tr->dt_sleep + tr->dt_iowait + tr->dt_preempt; print_sched_time(wait_time, 6); @@ -2301,6 +2349,7 @@ static int init_idle_thread(struct thread *thread) if (itr == NULL) return -ENOMEM; + init_prio(&itr->tr); init_stats(&itr->tr.run_stats); callchain_init(&itr->callchain); callchain_cursor_reset(&itr->cursor); @@ -2627,6 +2676,30 @@ static int timehist_migrate_task_event(const struct perf_tool *tool, return 0; } +static void timehist_update_task_prio(struct evsel *evsel, + struct perf_sample *sample, + struct machine *machine) +{ + struct thread *thread; + struct thread_runtime *tr = NULL; + const u32 next_pid = evsel__intval(evsel, sample, "next_pid"); + const u32 next_prio = evsel__intval(evsel, sample, "next_prio"); + + if (next_pid == 0) + thread = get_idle_thread(sample->cpu); + else + thread = machine__findnew_thread(machine, -1, next_pid); + + if (thread == NULL) + return; + + tr = thread__get_runtime(thread); + if (tr == NULL) + return; + + tr->prio = next_prio; +} + static int timehist_sched_change_event(const struct perf_tool *tool, union perf_event *event, struct evsel *evsel, @@ -2650,6 +2723,9 @@ static int timehist_sched_change_event(const struct perf_tool *tool, goto out; } + if (sched->show_prio) + timehist_update_task_prio(evsel, sample, machine); + thread = timehist_get_thread(sched, sample, machine, evsel); if (thread == NULL) { rc = -1; @@ -3684,6 +3760,7 @@ int cmd_sched(int argc, const char **argv) OPT_STRING('t', "tid", &symbol_conf.tid_list_str, "tid[,tid...]", "analyze events only for given thread id(s)"), OPT_STRING('C', "cpu", &cpu_list, "cpu", "list of cpus to profile"), + OPT_BOOLEAN(0, "show-prio", &sched.show_prio, "Show task priority"), OPT_PARENT(sched_options) }; -- cgit v1.2.3 From 9b3a48bbe20d9692e0d456488cd3a6244cac3e3b Mon Sep 17 00:00:00 2001 From: Yang Jihong Date: Mon, 19 Aug 2024 11:30:16 +0800 Subject: perf sched timehist: Add --prio option The --prio option is used to only show events for the given task priority(ies). 
The default is to show events for all priority tasks, which is consistent with the previous behavior. Testcase: # perf sched record nice -n 9 perf bench sched messaging -l 10000 # Running 'sched/messaging' benchmark: # 20 sender and receiver processes per group # 10 groups == 400 processes run Total time: 3.435 [sec] [ perf record: Woken up 270 times to write data ] [ perf record: Captured and wrote 618.688 MB perf.data (5729036 samples) ] # perf sched timehist -h Usage: perf sched timehist [] -C, --cpu list of cpus to profile -D, --dump-raw-trace dump raw trace in ASCII -f, --force don't complain, do it -g, --call-graph Display call chains if present (default on) -I, --idle-hist Show idle events only -i, --input input file name -k, --vmlinux vmlinux pathname -M, --migrations Show migration events -n, --next Show next task -p, --pid analyze events only for given process id(s) -s, --summary Show only syscall summary with statistics -S, --with-summary Show all syscalls and summary with statistics -t, --tid analyze events only for given thread id(s) -V, --cpu-visual Add CPU visual -v, --verbose be more verbose (show symbol address, etc) -w, --wakeups Show wakeup events --kallsyms kallsyms pathname --max-stack Maximum number of functions to display backtrace. --prio analyze events only for given task priority(ies) --show-prio Show task priority --state Show task state when sched-out --symfs Look for files with symbols relative to this directory --time Time span for analysis (start,stop) # perf sched timehist --prio 140 Samples of sched_switch event do not have callchains. Invalid prio string # perf sched timehist --show-prio --prio 129 Samples of sched_switch event do not have callchains. time cpu task name prio wait time sch delay run time [tid/pid] (msec) (msec) (msec) --------------- ------ ------------------------------ -------- --------- --------- --------- 2090450.765421 [0002] sched-messaging[1229618] 129 0.000 0.000 0.029 2090450.765445 [0007] sched-messaging[1229616] 129 0.000 0.062 0.043 2090450.765448 [0014] sched-messaging[1229619] 129 0.000 0.000 0.032 2090450.765478 [0013] sched-messaging[1229617] 129 0.000 0.065 0.048 2090450.765503 [0014] sched-messaging[1229622] 129 0.000 0.000 0.017 2090450.765550 [0002] sched-messaging[1229624] 129 0.000 0.000 0.021 2090450.765562 [0007] sched-messaging[1229621] 129 0.000 0.071 0.028 2090450.765570 [0005] sched-messaging[1229620] 129 0.000 0.064 0.066 2090450.765583 [0001] sched-messaging[1229625] 129 0.000 0.001 0.031 2090450.765595 [0013] sched-messaging[1229623] 129 0.000 0.060 0.028 2090450.765637 [0014] sched-messaging[1229628] 129 0.000 0.000 0.019 2090450.765665 [0007] sched-messaging[1229627] 129 0.000 0.038 0.030 # perf sched timehist --show-prio --prio 0,120-129 Samples of sched_switch event do not have callchains. 
time cpu task name prio wait time sch delay run time [tid/pid] (msec) (msec) (msec) --------------- ------ ------------------------------ -------- --------- --------- --------- 2090450.763231 [0000] perf[1229608] 120 0.000 0.000 0.000 2090450.763235 [0000] migration/0[15] 0 0.000 0.001 0.003 2090450.763263 [0001] perf[1229608] 120 0.000 0.000 0.000 2090450.763268 [0001] migration/1[21] 0 0.000 0.001 0.004 2090450.763302 [0002] perf[1229608] 120 0.000 0.000 0.000 2090450.763309 [0002] migration/2[27] 0 0.000 0.001 0.007 2090450.763338 [0003] perf[1229608] 120 0.000 0.000 0.000 2090450.763343 [0003] migration/3[33] 0 0.000 0.001 0.004 2090450.763459 [0004] perf[1229608] 120 0.000 0.000 0.000 2090450.763469 [0004] migration/4[39] 0 0.000 0.002 0.010 2090450.763496 [0005] perf[1229608] 120 0.000 0.000 0.000 2090450.763501 [0005] migration/5[45] 0 0.000 0.001 0.004 2090450.763613 [0006] perf[1229608] 120 0.000 0.000 0.000 2090450.763622 [0006] migration/6[51] 0 0.000 0.001 0.008 2090450.763652 [0007] perf[1229608] 120 0.000 0.000 0.000 2090450.763660 [0007] migration/7[57] 0 0.000 0.001 0.008 2090450.765665 [0001] 120 0.031 0.031 0.081 2090450.765665 [0007] sched-messaging[1229627] 129 0.000 0.038 0.030 2090450.765667 [0000] s1-perf[8235/7168] 120 0.008 0.000 0.004 2090450.765684 [0013] 120 0.028 0.028 0.088 2090450.765685 [0001] sched-messaging[1229630] 129 0.000 0.001 0.020 2090450.765688 [0000] 120 0.004 0.004 0.020 2090450.765689 [0002] 120 0.021 0.021 0.138 2090450.765691 [0005] sched-messaging[1229626] 129 0.000 0.085 0.029 Signed-off-by: Yang Jihong Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240819033016.2427235-3-yangjihong@bytedance.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-sched.txt | 6 +++ tools/perf/builtin-sched.c | 74 ++++++++++++++++++++++++++++++++- 2 files changed, 79 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt index 3efa5c58418d..3db64954a267 100644 --- a/tools/perf/Documentation/perf-sched.txt +++ b/tools/perf/Documentation/perf-sched.txt @@ -215,6 +215,12 @@ OPTIONS for 'perf sched timehist' --show-prio:: Show task priority. +--prio:: + Only show events for given task priority(ies). Multiple priorities can be + provided as a comma-separated list with no spaces: 0,120. Ranges of + priorities are specified with -: 120-129. A combination of both can also be + provided: 0,120-129. 
+ OPTIONS for 'perf sched replay' ------------------------------ diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index ba6563dc93d0..0a7b2b2acd56 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -51,6 +51,7 @@ #define COMM_LEN 20 #define SYM_LEN 129 #define MAX_PID 1024000 +#define MAX_PRIO 140 static const char *cpu_list; static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS); @@ -234,6 +235,8 @@ struct perf_sched { struct perf_time_interval ptime; struct perf_time_interval hist_time; volatile bool thread_funcs_exit; + const char *prio_str; + DECLARE_BITMAP(prio_bitmap, MAX_PRIO); }; /* per thread run time data */ @@ -2504,12 +2507,33 @@ static bool timehist_skip_sample(struct perf_sched *sched, struct perf_sample *sample) { bool rc = false; + int prio = -1; + struct thread_runtime *tr = NULL; if (thread__is_filtered(thread)) { rc = true; sched->skipped_samples++; } + if (sched->prio_str) { + /* + * Because priority may be changed during task execution, + * first read priority from prev sched_in event for current task. + * If prev sched_in event is not saved, then read priority from + * current task sched_out event. + */ + tr = thread__get_runtime(thread); + if (tr && tr->prio != -1) + prio = tr->prio; + else if (evsel__name_is(evsel, "sched:sched_switch")) + prio = evsel__intval(evsel, sample, "prev_prio"); + + if (prio != -1 && !test_bit(prio, sched->prio_bitmap)) { + rc = true; + sched->skipped_samples++; + } + } + if (sched->idle_hist) { if (!evsel__name_is(evsel, "sched:sched_switch")) rc = true; @@ -2723,7 +2747,7 @@ static int timehist_sched_change_event(const struct perf_tool *tool, goto out; } - if (sched->show_prio) + if (sched->show_prio || sched->prio_str) timehist_update_task_prio(evsel, sample, machine); thread = timehist_get_thread(sched, sample, machine, evsel); @@ -3143,6 +3167,47 @@ static int timehist_check_attr(struct perf_sched *sched, return 0; } +static int timehist_parse_prio_str(struct perf_sched *sched) +{ + char *p; + unsigned long start_prio, end_prio; + const char *str = sched->prio_str; + + if (!str) + return 0; + + while (isdigit(*str)) { + p = NULL; + start_prio = strtoul(str, &p, 0); + if (start_prio >= MAX_PRIO || (*p != '\0' && *p != ',' && *p != '-')) + return -1; + + if (*p == '-') { + str = ++p; + p = NULL; + end_prio = strtoul(str, &p, 0); + + if (end_prio >= MAX_PRIO || (*p != '\0' && *p != ',')) + return -1; + + if (end_prio < start_prio) + return -1; + } else { + end_prio = start_prio; + } + + for (; start_prio <= end_prio; start_prio++) + __set_bit(start_prio, sched->prio_bitmap); + + if (*p) + ++p; + + str = p; + } + + return 0; +} + static int perf_sched__timehist(struct perf_sched *sched) { struct evsel_str_handler handlers[] = { @@ -3204,6 +3269,11 @@ static int perf_sched__timehist(struct perf_sched *sched) if (timehist_check_attr(sched, evlist) != 0) goto out; + if (timehist_parse_prio_str(sched) != 0) { + pr_err("Invalid prio string\n"); + goto out; + } + setup_pager(); /* prefer sched_waking if it is captured */ @@ -3761,6 +3831,8 @@ int cmd_sched(int argc, const char **argv) "analyze events only for given thread id(s)"), OPT_STRING('C', "cpu", &cpu_list, "cpu", "list of cpus to profile"), OPT_BOOLEAN(0, "show-prio", &sched.show_prio, "Show task priority"), + OPT_STRING(0, "prio", &sched.prio_str, "prio", + "analyze events only for given task priority(ies)"), OPT_PARENT(sched_options) }; -- cgit v1.2.3 From beef8fb2af95ae8aa77f05a2663710d260741ddd Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 
17 Jul 2024 17:30:20 -0700 Subject: perf pmu: Merge boolean sysfs event option parsing Merge perf_pmu__parse_per_pkg() and perf_pmu__parse_snapshot() that do the same parsing except for the file suffix used. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ananth Narayan Cc: Bjorn Helgaas Cc: Dhananjay Ugwekar Cc: Dominique Martinet Cc: Gautham Shenoy Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Jonathan Corbet Cc: K Prateek Nayak Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Link: https://lore.kernel.org/r/20240718003025.1486232-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pmu.c | 47 +++++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 24 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 0b38c51bd6eb..ef8ab918eaeb 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -367,8 +367,8 @@ error: return -1; } -static int -perf_pmu__parse_per_pkg(struct perf_pmu *pmu, struct perf_pmu_alias *alias) +static bool perf_pmu__parse_event_source_bool(const char *pmu_name, const char *event_name, + const char *suffix) { char path[PATH_MAX]; size_t len; @@ -376,37 +376,36 @@ perf_pmu__parse_per_pkg(struct perf_pmu *pmu, struct perf_pmu_alias *alias) len = perf_pmu__event_source_devices_scnprintf(path, sizeof(path)); if (!len) - return 0; - scnprintf(path + len, sizeof(path) - len, "%s/events/%s.per-pkg", pmu->name, alias->name); + return false; + + scnprintf(path + len, sizeof(path) - len, "%s/events/%s.%s", pmu_name, event_name, suffix); fd = open(path, O_RDONLY); if (fd == -1) - return -1; + return false; - close(fd); +#ifndef NDEBUG + { + char buf[8]; - alias->per_pkg = true; - return 0; + len = read(fd, buf, sizeof(buf)); + assert(len == 1 || len == 2); + assert(buf[0] == '1'); + } +#endif + + close(fd); + return true; } -static int perf_pmu__parse_snapshot(struct perf_pmu *pmu, struct perf_pmu_alias *alias) +static void perf_pmu__parse_per_pkg(struct perf_pmu *pmu, struct perf_pmu_alias *alias) { - char path[PATH_MAX]; - size_t len; - int fd; - - len = perf_pmu__event_source_devices_scnprintf(path, sizeof(path)); - if (!len) - return 0; - scnprintf(path + len, sizeof(path) - len, "%s/events/%s.snapshot", pmu->name, alias->name); - - fd = open(path, O_RDONLY); - if (fd == -1) - return -1; + alias->per_pkg = perf_pmu__parse_event_source_bool(pmu->name, alias->name, "per-pkg"); +} - alias->snapshot = true; - close(fd); - return 0; +static void perf_pmu__parse_snapshot(struct perf_pmu *pmu, struct perf_pmu_alias *alias) +{ + alias->snapshot = perf_pmu__parse_event_source_bool(pmu->name, alias->name, "snapshot"); } /* Delete an alias entry. */ -- cgit v1.2.3 From f76e3525acf395f22712fc9120de555fb4b1a52f Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 17 Jul 2024 17:30:21 -0700 Subject: perf parse-events: Pass cpu_list as a perf_cpu_map in __add_event() Previously the cpu_list was a string and typically no cpu_list is passed to __add_event(). Wanting to make events have their cpus distinct from the PMU means that on more occasions we want to pass a cpu_list. If we're reading this from sysfs it is easier to read a perf_cpu_map than allocate and pass around strings that will later be parsed.
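For readers unfamiliar with libperf's CPU maps, the refcounting contract the new code relies on looks roughly like this (a usage sketch built only from the perf_cpu_map calls that appear in the diff below):

	#include <perf/cpumap.h>

	void cpu_map_example(void)
	{
		/* parse a CPU list string; the caller owns one reference */
		struct perf_cpu_map *cpus = perf_cpu_map__new("0");
		/* take an extra reference before handing the map to a co-owner */
		struct perf_cpu_map *shared = perf_cpu_map__get(cpus);

		/* ... store 'shared' somewhere that will put it later ... */

		perf_cpu_map__put(shared);  /* co-owner drops its reference */
		perf_cpu_map__put(cpus);    /* last put frees the map */
	}

This is why __add_event() can take its own reference on the map it is given, while the caller unconditionally puts the reference it created.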
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ananth Narayan Cc: Bjorn Helgaas Cc: Dhananjay Ugwekar Cc: Dominique Martinet Cc: Gautham Shenoy Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Jonathan Corbet Cc: K Prateek Nayak Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Link: https://lore.kernel.org/r/20240718003025.1486232-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index fab01ba54e34..9c0ce01684c3 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -227,12 +227,12 @@ __add_event(struct list_head *list, int *idx, bool init_attr, const char *name, const char *metric_id, struct perf_pmu *pmu, struct list_head *config_terms, bool auto_merge_stats, - const char *cpu_list) + struct perf_cpu_map *cpu_list) { struct evsel *evsel; - struct perf_cpu_map *cpus = pmu ? perf_cpu_map__get(pmu->cpus) : - cpu_list ? perf_cpu_map__new(cpu_list) : NULL; + struct perf_cpu_map *cpus = perf_cpu_map__is_empty(cpu_list) && pmu ? pmu->cpus : cpu_list; + cpus = perf_cpu_map__get(cpus); if (pmu) perf_pmu__warn_invalid_formats(pmu); @@ -305,16 +305,17 @@ static int add_event_tool(struct list_head *list, int *idx, .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_DUMMY, }; - const char *cpu_list = NULL; + struct perf_cpu_map *cpu_list = NULL; if (tool_event == PERF_TOOL_DURATION_TIME) { /* Duration time is gathered globally, pretend it is only on CPU0. */ - cpu_list = "0"; + cpu_list = perf_cpu_map__new("0"); } evsel = __add_event(list, idx, &attr, /*init_attr=*/true, /*name=*/NULL, /*metric_id=*/NULL, /*pmu=*/NULL, /*config_terms=*/NULL, /*auto_merge_stats=*/false, cpu_list); + perf_cpu_map__put(cpu_list); if (!evsel) return -ENOMEM; evsel->tool_event = tool_event; -- cgit v1.2.3 From 1c1d348b08ab7c106dd96bafb120f0888827174b Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Tue, 27 Aug 2024 17:51:23 +0800 Subject: tools/testing/cxl: Use dev_is_platform() Use dev_is_platform() instead of checking bus type directly. Signed-off-by: Kunwu Chan Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20240827095123.168696-1-kunwu.chan@linux.dev Signed-off-by: Dave Jiang --- tools/testing/cxl/mock_acpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/cxl/mock_acpi.c b/tools/testing/cxl/mock_acpi.c index 55813de26d46..8da94378ccec 100644 --- a/tools/testing/cxl/mock_acpi.c +++ b/tools/testing/cxl/mock_acpi.c @@ -18,7 +18,7 @@ struct acpi_device *to_cxl_host_bridge(struct device *host, struct device *dev) goto out; } - if (dev->bus == &platform_bus_type) + if (dev_is_platform(dev)) goto out; adev = to_acpi_device(dev); -- cgit v1.2.3 From 1b2965a8cd8d444cbea891e55083b5987d00cc66 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 2 Sep 2024 12:45:55 +0200 Subject: selftests: mptcp: join: reduce join_nr params chk_join_nr() currently takes 9 positional parameters, 6 of which are optional. That makes call sites hard to read: chk_join_nr 1 1 1 1 0 1 1 0 4 Naming these variables makes them easier to read: join_csum_ns1=1 join_csum_ns2=0 \ join_fail_nr=1 join_rst_nr=1 join_infi_nr=0 \ join_corrupted_pkts=4 \ chk_join_nr 1 1 1 It will then be easier to add new optional parameters.
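The shell mechanism behind this pattern is the per-command variable assignment: a VAR=value prefix (possibly continued across lines with backslashes) is visible inside the called function but does not leak into the rest of the script. A minimal sketch, with hypothetical names chk_example/example_opt:

    chk_example() {
        local required=$1
        local optional=${example_opt:-0}   # named parameter, defaults to 0
        echo "required=${required} optional=${optional}"
    }

    chk_example 1       # -> required=1 optional=0
    example_opt=4 \
    chk_example 1       # -> required=1 optional=4, only for this call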
Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240902-net-next-mptcp-mib-mpjtx-misc-v1-4-d3e0f3773b90@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 31 ++++++++++++++++++------- 1 file changed, 22 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index a4762c49a878..51b226784c6b 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -61,6 +61,12 @@ unset sflags unset fastclose unset fullmesh unset speed +unset join_csum_ns1 +unset join_csum_ns2 +unset join_fail_nr +unset join_rst_nr +unset join_infi_nr +unset join_corrupted_pkts # generated using "nfbpf_compile '(ip && (ip[54] & 0xf0) == 0x30) || # (ip6 && (ip6[74] & 0xf0) == 0x30)'" @@ -1319,12 +1325,12 @@ chk_join_nr() local syn_nr=$1 local syn_ack_nr=$2 local ack_nr=$3 - local csum_ns1=${4:-0} - local csum_ns2=${5:-0} - local fail_nr=${6:-0} - local rst_nr=${7:-0} - local infi_nr=${8:-0} - local corrupted_pkts=${9:-0} + local csum_ns1=${join_csum_ns1:-0} + local csum_ns2=${join_csum_ns2:-0} + local fail_nr=${join_fail_nr:-0} + local rst_nr=${join_rst_nr:-0} + local infi_nr=${join_infi_nr:-0} + local corrupted_pkts=${join_corrupted_pkts:-0} local count local with_cookie @@ -3164,7 +3170,8 @@ fastclose_tests() MPTCP_LIB_SUBTEST_FLAKY=1 test_linkfail=1024 fastclose=server \ run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 0 0 0 0 0 0 1 + join_rst_nr=1 \ + chk_join_nr 0 0 0 chk_fclose_nr 1 1 invert chk_rst_nr 1 1 fi @@ -3183,7 +3190,10 @@ fail_tests() MPTCP_LIB_SUBTEST_FLAKY=1 test_linkfail=128 \ run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 0 0 0 +1 +0 1 0 1 "$(pedit_action_pkts)" + join_csum_ns1=+1 join_csum_ns2=+0 \ + join_fail_nr=1 join_rst_nr=0 join_infi_nr=1 \ + join_corrupted_pkts="$(pedit_action_pkts)" \ + chk_join_nr 0 0 0 chk_fail_nr 1 -1 invert fi @@ -3196,7 +3206,10 @@ fail_tests() pm_nl_add_endpoint $ns2 10.0.2.2 dev ns2eth2 flags subflow test_linkfail=1024 \ run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 1 1 1 1 0 1 1 0 "$(pedit_action_pkts)" + join_csum_ns1=1 join_csum_ns2=0 \ + join_fail_nr=1 join_rst_nr=1 join_infi_nr=0 \ + join_corrupted_pkts="$(pedit_action_pkts)" \ + chk_join_nr 1 1 1 fi } -- cgit v1.2.3 From ba8a664004da84e4e04205429043a1bc19e00d23 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 2 Sep 2024 12:45:56 +0200 Subject: selftests: mptcp: join: one line for join check Most tests check whether the expected number of SYN/SYN+ACK/ACK JOINs have been received, each of them on its own line. More Join-related tests are going to be checked soon; there is no need to add 5 new lines per test in case of success, just one is enough. In case of issues, the errors will still be reported as before.
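The resulting pattern, condensed: each counter check may only downgrade a shared rc and print details on error, and a single print_results() call reports the combined outcome. A sketch of the shape used in the patch below:

    rc=${KSFT_PASS}
    count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPJoinSynRx")
    if [ -z "$count" ]; then
        rc=${KSFT_SKIP}                 # counter not available on this kernel
    elif [ "$count" != "$syn_nr" ]; then
        rc=${KSFT_FAIL}                 # details are only printed on failure
        fail_test "got $count JOIN[s] syn expected $syn_nr"
    fi
    # ... same shape for the synack and ack counters ...
    print_results "join Rx" ${rc}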
Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240902-net-next-mptcp-mib-mpjtx-misc-v1-5-d3e0f3773b90@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 45 ++++++++++++++++--------- 1 file changed, 30 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 51b226784c6b..63580a5810bf 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -202,6 +202,22 @@ print_skip() mptcp_lib_pr_skip "${@}" } +# $1: check name; $2: rc +print_results() +{ + local check="${1}" + local rc=${2} + + print_check "${check}" + if [ ${rc} = ${KSFT_PASS} ]; then + print_ok + elif [ ${rc} = ${KSFT_SKIP} ]; then + print_skip + else + fail_test "see above" + fi +} + # [ $1: fail msg ] mark_as_skipped() { @@ -1331,6 +1347,7 @@ chk_join_nr() local rst_nr=${join_rst_nr:-0} local infi_nr=${join_infi_nr:-0} local corrupted_pkts=${join_corrupted_pkts:-0} + local rc=${KSFT_PASS} local count local with_cookie @@ -1338,43 +1355,41 @@ chk_join_nr() print_info "${corrupted_pkts} corrupted pkts" fi - print_check "syn" count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPJoinSynRx") if [ -z "$count" ]; then - print_skip + rc=${KSFT_SKIP} elif [ "$count" != "$syn_nr" ]; then + rc=${KSFT_FAIL} + print_check "syn" fail_test "got $count JOIN[s] syn expected $syn_nr" - else - print_ok fi - print_check "synack" with_cookie=$(ip netns exec $ns2 sysctl -n net.ipv4.tcp_syncookies) count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinSynAckRx") if [ -z "$count" ]; then - print_skip + rc=${KSFT_SKIP} elif [ "$count" != "$syn_ack_nr" ]; then # simult connections exceeding the limit with cookie enabled could go up to # synack validation as the conn limit can be enforced reliably only after # the subflow creation - if [ "$with_cookie" = 2 ] && [ "$count" -gt "$syn_ack_nr" ] && [ "$count" -le "$syn_nr" ]; then - print_ok - else + if [ "$with_cookie" != 2 ] || [ "$count" -le "$syn_ack_nr" ] || [ "$count" -gt "$syn_nr" ]; then + rc=${KSFT_FAIL} + print_check "synack" fail_test "got $count JOIN[s] synack expected $syn_ack_nr" fi - else - print_ok fi - print_check "ack" count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPJoinAckRx") if [ -z "$count" ]; then - print_skip + rc=${KSFT_SKIP} elif [ "$count" != "$ack_nr" ]; then + rc=${KSFT_FAIL} + print_check "ack" fail_test "got $count JOIN[s] ack expected $ack_nr" - else - print_ok fi + + print_results "join Rx" ${rc} + if $validate_checksum; then chk_csum_nr $csum_ns1 $csum_ns2 chk_fail_nr $fail_nr $fail_nr -- cgit v1.2.3 From 004125c251a6826d080bb28feb48ebc5454ada81 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 2 Sep 2024 12:45:57 +0200 Subject: selftests: mptcp: join: validate MPJ SYN TX MIB counters A few new MPJoinSynTx MIB counters have been added in a previous commit. They are validated here in the mptcp_join.sh selftest, each time the number of received MPJs is checked. Most of the time, the number of sent SYN+MPJ is the same as the received ones, but sometimes there are more, because some are dropped or errors occur. While at it, the "no MPC reuse with single endpoint" subtest has been modified to force a bind() error.
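For a manual look at the same counters, assuming mptcp_lib_get_counter() is, as in mptcp_lib.sh, a wrapper around nstat run in the right namespace, something like this dumps the raw client-side values; the counter names are the ones added by the patch:

    ip netns exec "$ns2" nstat -asz MPTcpExtMPJoinSynTx \
        MPTcpExtMPJoinSynTxCreatSkErr \
        MPTcpExtMPJoinSynTxBindErr \
        MPTcpExtMPJoinSynTxConnectErr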
Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240902-net-next-mptcp-mib-mpjtx-misc-v1-6-d3e0f3773b90@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 89 ++++++++++++++++++++++--- 1 file changed, 78 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 63580a5810bf..23f8e2254064 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -67,6 +67,10 @@ unset join_fail_nr unset join_rst_nr unset join_infi_nr unset join_corrupted_pkts +unset join_syn_tx +unset join_create_err +unset join_bind_err +unset join_connect_err # generated using "nfbpf_compile '(ip && (ip[54] & 0xf0) == 0x30) || # (ip6 && (ip6[74] & 0xf0) == 0x30)'" @@ -1336,6 +1340,54 @@ chk_infi_nr() fi } +chk_join_tx_nr() +{ + local syn_tx=${join_syn_tx:-0} + local create=${join_create_err:-0} + local bind=${join_bind_err:-0} + local connect=${join_connect_err:-0} + local rc=${KSFT_PASS} + local count + + count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinSynTx") + if [ -z "$count" ]; then + rc=${KSFT_SKIP} + elif [ "$count" != "$syn_tx" ]; then + rc=${KSFT_FAIL} + print_check "syn tx" + fail_test "got $count JOIN[s] syn tx expected $syn_tx" + fi + + count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinSynTxCreatSkErr") + if [ -z "$count" ]; then + rc=${KSFT_SKIP} + elif [ "$count" != "$create" ]; then + rc=${KSFT_FAIL} + print_check "syn tx create socket error" + fail_test "got $count JOIN[s] syn tx create socket error expected $create" + fi + + count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinSynTxBindErr") + if [ -z "$count" ]; then + rc=${KSFT_SKIP} + elif [ "$count" != "$bind" ]; then + rc=${KSFT_FAIL} + print_check "syn tx bind error" + fail_test "got $count JOIN[s] syn tx bind error expected $bind" + fi + + count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinSynTxConnectErr") + if [ -z "$count" ]; then + rc=${KSFT_SKIP} + elif [ "$count" != "$connect" ]; then + rc=${KSFT_FAIL} + print_check "syn tx connect error" + fail_test "got $count JOIN[s] syn tx connect error expected $connect" + fi + + print_results "join Tx" ${rc} +} + chk_join_nr() { local syn_nr=$1 @@ -1390,6 +1442,9 @@ chk_join_nr() print_results "join Rx" ${rc} + join_syn_tx="${join_syn_tx:-${syn_nr}}" \ + chk_join_tx_nr + if $validate_checksum; then chk_csum_nr $csum_ns1 $csum_ns2 chk_fail_nr $fail_nr $fail_nr @@ -1930,9 +1985,11 @@ subflows_error_tests() pm_nl_set_limits $ns1 0 1 pm_nl_set_limits $ns2 0 1 pm_nl_add_endpoint $ns2 10.0.1.2 flags subflow + pm_nl_add_endpoint $ns2 10.0.12.2 flags subflow speed=slow \ run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 0 0 0 + join_bind_err=1 \ + chk_join_nr 0 0 0 fi # multiple subflows, with subflow creation error @@ -1944,7 +2001,8 @@ subflows_error_tests() pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow speed=slow \ run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 1 1 1 + join_syn_tx=2 \ + chk_join_nr 1 1 1 fi # multiple subflows, with subflow timeout on MPJ @@ -1956,7 +2014,8 @@ subflows_error_tests() pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow speed=slow \ run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 1 1 1 + join_syn_tx=2 \ + chk_join_nr 1 1 1 fi # multiple subflows, check that the endpoint corresponding to @@ -1977,7 +2036,8 @@ subflows_error_tests() # additional subflow could be created only if the PM select # the later endpoint, skipping the already 
used one - chk_join_nr 1 1 1 + join_syn_tx=2 \ + chk_join_nr 1 1 1 fi } @@ -2063,7 +2123,8 @@ signal_address_tests() pm_nl_add_endpoint $ns1 10.0.14.1 flags signal pm_nl_set_limits $ns2 3 3 run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 1 1 1 + join_syn_tx=3 \ + chk_join_nr 1 1 1 chk_add_nr 3 3 fi @@ -2231,7 +2292,8 @@ add_addr_timeout_tests() pm_nl_set_limits $ns2 2 2 speed=10 \ run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 1 1 1 + join_syn_tx=2 \ + chk_join_nr 1 1 1 chk_add_nr 8 0 fi } @@ -2331,7 +2393,8 @@ remove_tests() pm_nl_set_limits $ns2 2 2 addr_nr_ns1=-3 speed=10 \ run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 1 1 1 + join_syn_tx=2 join_connect_err=1 \ + chk_join_nr 1 1 1 chk_add_nr 3 3 chk_rm_nr 3 1 invert chk_rst_nr 0 0 @@ -2396,7 +2459,8 @@ remove_tests() pm_nl_set_limits $ns2 3 3 addr_nr_ns1=-8 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 1 1 1 + join_syn_tx=3 \ + chk_join_nr 1 1 1 chk_add_nr 3 3 chk_rm_nr 3 1 invert chk_rst_nr 0 0 @@ -3703,7 +3767,8 @@ endpoint_tests() chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_ESTABLISHED 6 chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_CLOSED 5 # one has been closed before estab - chk_join_nr 6 6 6 + join_syn_tx=7 \ + chk_join_nr 6 6 6 chk_rm_nr 4 4 fi @@ -3775,7 +3840,8 @@ endpoint_tests() chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_ESTABLISHED 5 chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_CLOSED 3 - chk_join_nr 5 5 5 + join_connect_err=1 \ + chk_join_nr 5 5 5 chk_add_nr 6 6 chk_rm_nr 4 3 invert fi @@ -3806,7 +3872,8 @@ endpoint_tests() wait_mpj $ns2 mptcp_lib_kill_wait $tests_pid - chk_join_nr 2 2 2 + join_syn_tx=3 join_connect_err=1 \ + chk_join_nr 2 2 2 chk_add_nr 2 2 chk_rm_nr 1 0 invert fi -- cgit v1.2.3 From 6ed495345be8113899986d21c6cbc7d8f550dedb Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 2 Sep 2024 12:45:58 +0200 Subject: selftests: mptcp: join: more explicit check name Before, the check names had to be very short. It is no longer the case now that these checks are printed on a dedicated line. Then, it looks better to have more explicit names. 
Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240902-net-next-mptcp-mib-mpjtx-misc-v1-7-d3e0f3773b90@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 65 +++++++++++++------------ 1 file changed, 33 insertions(+), 32 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 23f8e2254064..7993e0e0029e 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -865,7 +865,7 @@ chk_cestab_nr() local cestab=$2 local count - print_check "cestab $cestab" + print_check "currently established: $cestab" count=$(mptcp_lib_get_counter ${ns} "MPTcpExtMPCurrEstab") if [ -z "$count" ]; then print_skip @@ -1141,7 +1141,7 @@ chk_csum_nr() csum_ns2=${csum_ns2:1} fi - print_check "sum" + print_check "checksum server" count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtDataCsumErr") if [ -n "$count" ] && [ "$count" != "$csum_ns1" ]; then extra_msg+=" ns1=$count" @@ -1154,7 +1154,8 @@ chk_csum_nr() else print_ok fi - print_check "csum" + + print_check "checksum client" count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtDataCsumErr") if [ -n "$count" ] && [ "$count" != "$csum_ns2" ]; then extra_msg+=" ns2=$count" @@ -1198,7 +1199,7 @@ chk_fail_nr() fail_rx=${fail_rx:1} fi - print_check "ftx" + print_check "fail tx" count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMPFailTx") if [ -n "$count" ] && [ "$count" != "$fail_tx" ]; then extra_msg+=",tx=$count" @@ -1212,7 +1213,7 @@ chk_fail_nr() print_ok fi - print_check "failrx" + print_check "fail rx" count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtMPFailRx") if [ -n "$count" ] && [ "$count" != "$fail_rx" ]; then extra_msg+=",rx=$count" @@ -1245,7 +1246,7 @@ chk_fclose_nr() extra_msg="invert" fi - print_check "ctx" + print_check "fast close tx" count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMPFastcloseTx") if [ -z "$count" ]; then print_skip @@ -1256,7 +1257,7 @@ chk_fclose_nr() print_ok fi - print_check "fclzrx" + print_check "fast close rx" count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtMPFastcloseRx") if [ -z "$count" ]; then print_skip @@ -1286,7 +1287,7 @@ chk_rst_nr() extra_msg="invert" fi - print_check "rtx" + print_check "reset tx" count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMPRstTx") if [ -z "$count" ]; then print_skip @@ -1298,7 +1299,7 @@ chk_rst_nr() print_ok fi - print_check "rstrx" + print_check "reset rx" count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtMPRstRx") if [ -z "$count" ]; then print_skip @@ -1319,7 +1320,7 @@ chk_infi_nr() local infi_rx=$2 local count - print_check "itx" + print_check "infi tx" count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtInfiniteMapTx") if [ -z "$count" ]; then print_skip @@ -1329,7 +1330,7 @@ chk_infi_nr() print_ok fi - print_check "infirx" + print_check "infi rx" count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtInfiniteMapRx") if [ -z "$count" ]; then print_skip @@ -1412,8 +1413,8 @@ chk_join_nr() rc=${KSFT_SKIP} elif [ "$count" != "$syn_nr" ]; then rc=${KSFT_FAIL} - print_check "syn" - fail_test "got $count JOIN[s] syn expected $syn_nr" + print_check "syn rx" + fail_test "got $count JOIN[s] syn rx expected $syn_nr" fi with_cookie=$(ip netns exec $ns2 sysctl -n net.ipv4.tcp_syncookies) @@ -1426,8 +1427,8 @@ chk_join_nr() # the subflow creation if [ "$with_cookie" != 2 ] || [ "$count" -le "$syn_ack_nr" ] || [ "$count" -gt "$syn_nr" ]; then rc=${KSFT_FAIL} - print_check "synack" - 
fail_test "got $count JOIN[s] synack expected $syn_ack_nr" + print_check "synack rx" + fail_test "got $count JOIN[s] synack rx expected $syn_ack_nr" fi fi @@ -1436,8 +1437,8 @@ chk_join_nr() rc=${KSFT_SKIP} elif [ "$count" != "$ack_nr" ]; then rc=${KSFT_FAIL} - print_check "ack" - fail_test "got $count JOIN[s] ack expected $ack_nr" + print_check "ack rx" + fail_test "got $count JOIN[s] ack rx expected $ack_nr" fi print_results "join Rx" ${rc} @@ -1517,7 +1518,7 @@ chk_add_nr() timeout=$(ip netns exec ${ns_tx} sysctl -n net.mptcp.add_addr_timeout) - print_check "add" + print_check "add addr rx" count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtAddAddr") if [ -z "$count" ]; then print_skip @@ -1529,7 +1530,7 @@ chk_add_nr() print_ok fi - print_check "echo" + print_check "add addr echo rx" count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtEchoAdd") if [ -z "$count" ]; then print_skip @@ -1540,7 +1541,7 @@ chk_add_nr() fi if [ $port_nr -gt 0 ]; then - print_check "pt" + print_check "add addr rx with port" count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtPortAdd") if [ -z "$count" ]; then print_skip @@ -1550,7 +1551,7 @@ chk_add_nr() print_ok fi - print_check "syn" + print_check "syn rx port" count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMPJoinPortSynRx") if [ -z "$count" ]; then print_skip @@ -1561,7 +1562,7 @@ chk_add_nr() print_ok fi - print_check "synack" + print_check "synack rx port" count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtMPJoinPortSynAckRx") if [ -z "$count" ]; then print_skip @@ -1572,7 +1573,7 @@ chk_add_nr() print_ok fi - print_check "ack" + print_check "ack rx port" count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMPJoinPortAckRx") if [ -z "$count" ]; then print_skip @@ -1583,7 +1584,7 @@ chk_add_nr() print_ok fi - print_check "syn" + print_check "syn rx port mismatch" count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMismatchPortSynRx") if [ -z "$count" ]; then print_skip @@ -1594,7 +1595,7 @@ chk_add_nr() print_ok fi - print_check "ack" + print_check "ack rx port mismatch" count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMismatchPortAckRx") if [ -z "$count" ]; then print_skip @@ -1618,7 +1619,7 @@ chk_add_tx_nr() timeout=$(ip netns exec $ns1 sysctl -n net.mptcp.add_addr_timeout) - print_check "add TX" + print_check "add addr tx" count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtAddAddrTx") if [ -z "$count" ]; then print_skip @@ -1630,7 +1631,7 @@ chk_add_tx_nr() print_ok fi - print_check "echo TX" + print_check "add addr echo tx" count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtEchoAddTx") if [ -z "$count" ]; then print_skip @@ -1668,7 +1669,7 @@ chk_rm_nr() extra_msg="invert" fi - print_check "rm" + print_check "rm addr rx" count=$(mptcp_lib_get_counter ${addr_ns} "MPTcpExtRmAddr") if [ -z "$count" ]; then print_skip @@ -1678,7 +1679,7 @@ chk_rm_nr() print_ok fi - print_check "rmsf" + print_check "rm subflow" count=$(mptcp_lib_get_counter ${subflow_ns} "MPTcpExtRmSubflow") if [ -z "$count" ]; then print_skip @@ -1713,7 +1714,7 @@ chk_rm_tx_nr() { local rm_addr_tx_nr=$1 - print_check "rm TX" + print_check "rm addr tx" count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtRmAddrTx") if [ -z "$count" ]; then print_skip @@ -1732,7 +1733,7 @@ chk_prio_nr() local mpj_syn_ack=$4 local count - print_check "ptx" + print_check "mp_prio tx" count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPPrioTx") if [ -z "$count" ]; then print_skip @@ -1742,7 +1743,7 @@ chk_prio_nr() print_ok fi - print_check "prx" + print_check "mp_prio rx" count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPPrioRx") if [ -z "$count" ]; 
then print_skip -- cgit v1.2.3 From 8d328dbcf61b50ca51f8a930561fe6ae2c8bf5e9 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 2 Sep 2024 12:45:59 +0200 Subject: selftests: mptcp: join: specify host being checked Instead of displaying 'invert' when looking at some events like MP_FAIL, MP_FASTCLOSE, MP_RESET, RM_ADDR, which is a bit vague because they are not traditionally sent from one side, the host being checked is now printed. For the ADD_ADDR, only display the host when it is the client sending it, which is more unusual. Also before, the 'invert' message was printed after a few checks, but it was not clear which ones exactly. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240902-net-next-mptcp-mib-mpjtx-misc-v1-8-d3e0f3773b90@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 89 +++++++++++++------------ 1 file changed, 45 insertions(+), 44 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 7993e0e0029e..321197d8977e 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -1180,6 +1180,8 @@ chk_fail_nr() local count local ns_tx=$ns1 local ns_rx=$ns2 + local tx="server" + local rx="client" local extra_msg="" local allow_tx_lost=0 local allow_rx_lost=0 @@ -1187,7 +1189,8 @@ chk_fail_nr() if [[ $ns_invert = "invert" ]]; then ns_tx=$ns2 ns_rx=$ns1 - extra_msg="invert" + tx="client" + rx="server" fi if [[ "${fail_tx}" = "-"* ]]; then @@ -1199,10 +1202,10 @@ chk_fail_nr() fail_rx=${fail_rx:1} fi - print_check "fail tx" + print_check "fail tx ${tx}" count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMPFailTx") if [ -n "$count" ] && [ "$count" != "$fail_tx" ]; then - extra_msg+=",tx=$count" + extra_msg+=" tx=$count" fi if [ -z "$count" ]; then print_skip @@ -1213,10 +1216,10 @@ chk_fail_nr() print_ok fi - print_check "fail rx" + print_check "fail rx ${rx}" count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtMPFailRx") if [ -n "$count" ] && [ "$count" != "$fail_rx" ]; then - extra_msg+=",rx=$count" + extra_msg+=" rx=$count" fi if [ -z "$count" ]; then print_skip @@ -1238,37 +1241,35 @@ chk_fclose_nr() local count local ns_tx=$ns2 local ns_rx=$ns1 - local extra_msg="" + local tx="client" + local rx="server" if [[ $ns_invert = "invert" ]]; then ns_tx=$ns1 ns_rx=$ns2 - extra_msg="invert" + tx="server" + rx="client" fi - print_check "fast close tx" + print_check "fast close tx ${tx}" count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMPFastcloseTx") if [ -z "$count" ]; then print_skip elif [ "$count" != "$fclose_tx" ]; then - extra_msg+=",tx=$count" fail_test "got $count MP_FASTCLOSE[s] TX expected $fclose_tx" else print_ok fi - print_check "fast close rx" + print_check "fast close rx ${rx}" count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtMPFastcloseRx") if [ -z "$count" ]; then print_skip elif [ "$count" != "$fclose_rx" ]; then - extra_msg+=",rx=$count" fail_test "got $count MP_FASTCLOSE[s] RX expected $fclose_rx" else print_ok fi - - print_info "$extra_msg" } chk_rst_nr() @@ -1279,15 +1280,17 @@ chk_rst_nr() local count local ns_tx=$ns1 local ns_rx=$ns2 - local extra_msg="" + local tx="server" + local rx="client" if [[ $ns_invert = "invert" ]]; then ns_tx=$ns2 ns_rx=$ns1 - extra_msg="invert" + tx="client" + rx="server" fi - print_check "reset tx" + print_check "reset tx ${tx}" count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMPRstTx") if [
-z "$count" ]; then print_skip @@ -1299,7 +1302,7 @@ chk_rst_nr() print_ok fi - print_check "reset rx" + print_check "reset rx ${rx}" count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtMPRstRx") if [ -z "$count" ]; then print_skip @@ -1310,8 +1313,6 @@ chk_rst_nr() else print_ok fi - - print_info "$extra_msg" } chk_infi_nr() @@ -1320,7 +1321,7 @@ chk_infi_nr() local infi_rx=$2 local count - print_check "infi tx" + print_check "infi tx client" count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtInfiniteMapTx") if [ -z "$count" ]; then print_skip @@ -1330,7 +1331,7 @@ chk_infi_nr() print_ok fi - print_check "infi rx" + print_check "infi rx server" count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtInfiniteMapRx") if [ -z "$count" ]; then print_skip @@ -1506,19 +1507,21 @@ chk_add_nr() local mis_ack_nr=0 local ns_tx=$ns1 local ns_rx=$ns2 - local extra_msg="" + local tx="" + local rx="" local count local timeout if [[ $ns_invert = "invert" ]]; then ns_tx=$ns2 ns_rx=$ns1 - extra_msg="invert" + tx=" client" + rx=" server" fi timeout=$(ip netns exec ${ns_tx} sysctl -n net.mptcp.add_addr_timeout) - print_check "add addr rx" + print_check "add addr rx${rx}" count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtAddAddr") if [ -z "$count" ]; then print_skip @@ -1530,7 +1533,7 @@ chk_add_nr() print_ok fi - print_check "add addr echo rx" + print_check "add addr echo rx${tx}" count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtEchoAdd") if [ -z "$count" ]; then print_skip @@ -1541,7 +1544,7 @@ chk_add_nr() fi if [ $port_nr -gt 0 ]; then - print_check "add addr rx with port" + print_check "add addr rx with port${rx}" count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtPortAdd") if [ -z "$count" ]; then print_skip @@ -1551,7 +1554,7 @@ chk_add_nr() print_ok fi - print_check "syn rx port" + print_check "syn rx port${tx}" count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMPJoinPortSynRx") if [ -z "$count" ]; then print_skip @@ -1562,7 +1565,7 @@ chk_add_nr() print_ok fi - print_check "synack rx port" + print_check "synack rx port${rx}" count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtMPJoinPortSynAckRx") if [ -z "$count" ]; then print_skip @@ -1573,7 +1576,7 @@ chk_add_nr() print_ok fi - print_check "ack rx port" + print_check "ack rx port${tx}" count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMPJoinPortAckRx") if [ -z "$count" ]; then print_skip @@ -1584,7 +1587,7 @@ chk_add_nr() print_ok fi - print_check "syn rx port mismatch" + print_check "syn rx port mismatch${tx}" count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMismatchPortSynRx") if [ -z "$count" ]; then print_skip @@ -1595,7 +1598,7 @@ chk_add_nr() print_ok fi - print_check "ack rx port mismatch" + print_check "ack rx port mismatch${tx}" count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMismatchPortAckRx") if [ -z "$count" ]; then print_skip @@ -1606,8 +1609,6 @@ chk_add_nr() print_ok fi fi - - print_info "$extra_msg" } chk_add_tx_nr() @@ -1651,6 +1652,8 @@ chk_rm_nr() local count local addr_ns=$ns1 local subflow_ns=$ns2 + local addr="server" + local subflow="client" local extra_msg="" shift 2 @@ -1660,16 +1663,14 @@ chk_rm_nr() shift done - if [ -z $invert ]; then - addr_ns=$ns1 - subflow_ns=$ns2 - elif [ $invert = "true" ]; then + if [ "$invert" = "true" ]; then addr_ns=$ns2 subflow_ns=$ns1 - extra_msg="invert" + addr="client" + subflow="server" fi - print_check "rm addr rx" + print_check "rm addr rx ${addr}" count=$(mptcp_lib_get_counter ${addr_ns} "MPTcpExtRmAddr") if [ -z "$count" ]; then print_skip @@ -1679,7 +1680,7 @@ chk_rm_nr() print_ok fi - print_check "rm subflow" + 
print_check "rm subflow ${subflow}" count=$(mptcp_lib_get_counter ${subflow_ns} "MPTcpExtRmSubflow") if [ -z "$count" ]; then print_skip @@ -1693,7 +1694,7 @@ count=$((count + cnt)) if [ "$count" != "$rm_subflow_nr" ]; then suffix="$count in [$rm_subflow_nr:$((rm_subflow_nr*2))]" - extra_msg+=" simult" + extra_msg="simult" fi if [ $count -ge "$rm_subflow_nr" ] && \ [ "$count" -le "$((rm_subflow_nr *2 ))" ]; then @@ -1714,7 +1715,7 @@ chk_rm_tx_nr() { local rm_addr_tx_nr=$1 - print_check "rm addr tx" + print_check "rm addr tx client" count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtRmAddrTx") if [ -z "$count" ]; then print_skip @@ -1733,7 +1734,7 @@ chk_prio_nr() local mpj_syn_ack=$4 local count - print_check "mp_prio tx" + print_check "mp_prio tx server" count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPPrioTx") if [ -z "$count" ]; then print_skip @@ -1743,7 +1744,7 @@ chk_prio_nr() print_ok fi - print_check "mp_prio rx" + print_check "mp_prio rx client" count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPPrioRx") if [ -z "$count" ]; then print_skip -- cgit v1.2.3 From 08eecd7e7fe79669ccab44dde518b10a2f7e48a7 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 2 Sep 2024 12:46:00 +0200 Subject: selftests: mptcp: join: mute errors when run in the background The test is supposed to be killed before the end, which will likely cause "Connection reset by peer" errors. This is confusing, especially because in case of real transfer errors, the test will not be marked as failed. But that's OK, there are many other tests checking that.
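The construct used throughout the diff below deserves a note: the background job is wrapped in a brace group so that a single stderr redirection covers the whole multi-line command, including the variable-assignment prefix, and because braces (unlike parentheses) do not create a subshell, $! in the current shell still yields the PID that mptcp_lib_kill_wait() needs. A reduced sketch:

    { speed=5 \
        run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null
    local tests_pid=$!              # still valid: { } runs in this shell
    # ... later ...
    mptcp_lib_kill_wait $tests_pid  # reset-by-peer noise went to /dev/null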
wait_mpj $ns2 userspace_pm_add_sf $ns2 10.0.3.2 20 @@ -3648,8 +3648,8 @@ userspace_tests() continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then set_userspace_pm $ns1 pm_nl_set_limits $ns2 1 1 - speed=5 \ - run_tests $ns1 $ns2 10.0.1.1 & + { speed=5 \ + run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_mpj $ns1 userspace_pm_add_addr $ns1 10.0.2.1 10 @@ -3679,8 +3679,8 @@ endpoint_tests() pm_nl_set_limits $ns1 2 2 pm_nl_set_limits $ns2 2 2 pm_nl_add_endpoint $ns1 10.0.2.1 flags signal - speed=slow \ - run_tests $ns1 $ns2 10.0.1.1 & + { speed=slow \ + run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_mpj $ns1 @@ -3706,8 +3706,8 @@ endpoint_tests() pm_nl_set_limits $ns2 0 3 pm_nl_add_endpoint $ns2 10.0.1.2 id 1 dev ns2eth1 flags subflow pm_nl_add_endpoint $ns2 10.0.2.2 id 2 dev ns2eth2 flags subflow - test_linkfail=4 speed=5 \ - run_tests $ns1 $ns2 10.0.1.1 & + { test_linkfail=4 speed=5 \ + run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_mpj $ns2 @@ -3783,8 +3783,8 @@ endpoint_tests() # broadcast IP: no packet for this address will be received on ns1 pm_nl_add_endpoint $ns1 224.0.0.1 id 2 flags signal pm_nl_add_endpoint $ns1 10.0.1.1 id 42 flags signal - test_linkfail=4 speed=5 \ - run_tests $ns1 $ns2 10.0.1.1 & + { test_linkfail=4 speed=5 \ + run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_mpj $ns2 @@ -3856,8 +3856,8 @@ endpoint_tests() # broadcast IP: no packet for this address will be received on ns1 pm_nl_add_endpoint $ns1 224.0.0.1 id 2 flags signal pm_nl_add_endpoint $ns2 10.0.3.2 id 3 flags subflow - test_linkfail=4 speed=20 \ - run_tests $ns1 $ns2 10.0.1.1 & + { test_linkfail=4 speed=20 \ + run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_attempt_fail $ns2 -- cgit v1.2.3 From 0e2b4584d61abe8dc22db18e83c85429782793ee Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 2 Sep 2024 12:46:01 +0200 Subject: selftests: mptcp: join: simplify checksum_tests The four checksum tests are similar, only one line is different. So a for-loop can be used to simplify these tests. 
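One detail worth calling out in the loop below: each list element packs both sysctl values into one word, and the expansion is left unquoted on purpose so the shell splits it back into the two arguments reset_with_checksum() expects. Reduced to its essentials:

    for checksum_enable in "0 0" "1 1" "0 1" "1 0"; do
        # unquoted on purpose: "0 1" becomes $1=0 (ns1) and $2=1 (ns2)
        if reset_with_checksum ${checksum_enable}; then
            run_tests $ns1 $ns2 10.0.1.1
            chk_join_nr 0 0 0
        fi
    done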
Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240902-net-next-mptcp-mib-mpjtx-misc-v1-10-d3e0f3773b90@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 43 +++++++------------------ 1 file changed, 11 insertions(+), 32 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 5d164abc18e5..43f8a9bd84c4 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -363,7 +363,7 @@ reset_with_checksum() local ns1_enable=$1 local ns2_enable=$2 - reset "checksum test ${1} ${2}" || return 1 + reset "checksum test ${ns1_enable} ${ns2_enable}" || return 1 ip netns exec $ns1 sysctl -q net.mptcp.checksum_enabled=$ns1_enable ip netns exec $ns2 sysctl -q net.mptcp.checksum_enabled=$ns2_enable @@ -3032,37 +3032,16 @@ syncookies_tests() checksum_tests() { - # checksum test 0 0 - if reset_with_checksum 0 0; then - pm_nl_set_limits $ns1 0 1 - pm_nl_set_limits $ns2 0 1 - run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 0 0 0 - fi - - # checksum test 1 1 - if reset_with_checksum 1 1; then - pm_nl_set_limits $ns1 0 1 - pm_nl_set_limits $ns2 0 1 - run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 0 0 0 - fi - - # checksum test 0 1 - if reset_with_checksum 0 1; then - pm_nl_set_limits $ns1 0 1 - pm_nl_set_limits $ns2 0 1 - run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 0 0 0 - fi - - # checksum test 1 0 - if reset_with_checksum 1 0; then - pm_nl_set_limits $ns1 0 1 - pm_nl_set_limits $ns2 0 1 - run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 0 0 0 - fi + local checksum_enable + for checksum_enable in "0 0" "1 1" "0 1" "1 0"; do + # checksum test 0 0, 1 1, 0 1, 1 0 + if reset_with_checksum ${checksum_enable}; then + pm_nl_set_limits $ns1 0 1 + pm_nl_set_limits $ns2 0 1 + run_tests $ns1 $ns2 10.0.1.1 + chk_join_nr 0 0 0 + fi + done } deny_join_id0_tests() -- cgit v1.2.3 From 38dc0708bcc8d2ee9e0559a992fd6e91bf67e271 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 2 Sep 2024 12:46:02 +0200 Subject: selftests: mptcp: pm_nl_ctl: remove re-definition 'MPTCP_PM_NAME' is defined in 'linux/mptcp_pm.h', included in 'linux/mptcp.h', no need to re-define it. 'MPTCP_PM_EVENTS' is not defined in 'linux/mptcp.h', but 'MPTCP_PM_EV_GRP_NAME' is, with the same value. We can then use the latter, and drop the other one. 
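Both macros are plain string literals, which is why the adjacent-literal concatenation in the error message keeps working after the switch. The values below match the ones the tool used to re-define locally:

    /* from the uapi headers, <linux/mptcp_pm.h> via <linux/mptcp.h>: */
    #define MPTCP_PM_NAME        "mptcp_pm"
    #define MPTCP_PM_EV_GRP_NAME "mptcp_pm_events"

    /* adjacent string literals concatenate at compile time, so this ... */
    error(1, errno, "could not join the " MPTCP_PM_EV_GRP_NAME " mcast group");
    /* ... prints: could not join the mptcp_pm_events mcast group */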
Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240902-net-next-mptcp-mib-mpjtx-misc-v1-11-d3e0f3773b90@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/pm_nl_ctl.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c index 7ad5a59adff2..994a556f46c1 100644 --- a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c +++ b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c @@ -19,12 +19,6 @@ #include "linux/mptcp.h" -#ifndef MPTCP_PM_NAME -#define MPTCP_PM_NAME "mptcp_pm" -#endif -#ifndef MPTCP_PM_EVENTS -#define MPTCP_PM_EVENTS "mptcp_pm_events" -#endif #ifndef IPPROTO_MPTCP #define IPPROTO_MPTCP 262 #endif @@ -116,7 +110,7 @@ static int capture_events(int fd, int event_group) if (setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &event_group, sizeof(event_group)) < 0) - error(1, errno, "could not join the " MPTCP_PM_EVENTS " mcast group"); + error(1, errno, "could not join the " MPTCP_PM_EV_GRP_NAME " mcast group"); do { FD_ZERO(&rfds); @@ -288,7 +282,7 @@ static int genl_parse_getfamily(struct nlmsghdr *nlh, int *pm_family, if (grp->rta_type == CTRL_ATTR_MCAST_GRP_ID) *events_mcast_grp = *(__u32 *)RTA_DATA(grp); else if (grp->rta_type == CTRL_ATTR_MCAST_GRP_NAME && - !strcmp(RTA_DATA(grp), MPTCP_PM_EVENTS)) + !strcmp(RTA_DATA(grp), MPTCP_PM_EV_GRP_NAME)) got_events_grp = 1; grp = RTA_NEXT(grp, grp_len); -- cgit v1.2.3 From 577a67662ff529f617981fe9692ff277b5756402 Mon Sep 17 00:00:00 2001 From: Li Ming Date: Fri, 30 Aug 2024 06:13:06 +0000 Subject: cxl/pci: Rename cxl_setup_parent_dport() and cxl_dport_map_regs() The name of the cxl_setup_parent_dport() function is not clear: the function is used to initialize AER and RAS capabilities on a dport. Therefore, rename the function to cxl_dport_init_ras_reporting(), which makes it easier for users to understand what it does. Besides, adjust the order of the function parameters: the subject of cxl_dport_init_ras_reporting() is a cxl dport, so a struct cxl_dport is better suited as the first parameter. cxl_dport_map_regs() is used to map the CXL RAS capability on a cxl dport, so rename it to cxl_dport_map_ras().
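At a call site the rename plus swapped parameter order looks like this (the mem.c hunk below shows the real conversion; this is just the shape):

    /* before: the devm host came first */
    cxl_setup_parent_dport(host, dport);

    /* after: the dport, the subject of the operation, comes first */
    cxl_dport_init_ras_reporting(dport, host);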
Signed-off-by: Li Ming Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20240830061308.2327065-1-ming4.li@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/pci.c | 13 +++++++++---- drivers/cxl/cxl.h | 5 +++-- drivers/cxl/mem.c | 2 +- tools/testing/cxl/Kbuild | 2 +- tools/testing/cxl/test/mock.c | 6 +++--- 5 files changed, 17 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 4725e37d90fb..cca078068a05 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -787,7 +787,7 @@ static void cxl_dport_map_rch_aer(struct cxl_dport *dport) dport->regs.dport_aer = dport_aer; } -static void cxl_dport_map_regs(struct cxl_dport *dport) +static void cxl_dport_map_ras(struct cxl_dport *dport) { struct cxl_register_map *map = &dport->reg_map; struct device *dev = dport->dport_dev; @@ -831,7 +831,12 @@ static void cxl_disable_rch_root_ints(struct cxl_dport *dport) } } -void cxl_setup_parent_dport(struct device *host, struct cxl_dport *dport) +/** + * cxl_dport_init_ras_reporting - Setup CXL RAS report on this dport + * @dport: the cxl_dport that needs to be initialized + * @host: host device for devm operations + */ +void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host) { struct device *dport_dev = dport->dport_dev; @@ -843,12 +848,12 @@ void cxl_setup_parent_dport(struct device *host, struct cxl_dport *dport) } dport->reg_map.host = host; - cxl_dport_map_regs(dport); + cxl_dport_map_ras(dport); if (dport->rch) cxl_disable_rch_root_ints(dport); } -EXPORT_SYMBOL_NS_GPL(cxl_setup_parent_dport, CXL); +EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, CXL); static void cxl_handle_rdport_cor_ras(struct cxl_dev_state *cxlds, struct cxl_dport *dport) diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 84cf3b4d60a1..2890f7e2aad5 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -763,9 +763,10 @@ struct cxl_dport *devm_cxl_add_rch_dport(struct cxl_port *port, #ifdef CONFIG_PCIEAER_CXL void cxl_setup_parent_dport(struct device *host, struct cxl_dport *dport); +void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host); #else -static inline void cxl_setup_parent_dport(struct device *host, - struct cxl_dport *dport) { } +static inline void cxl_dport_init_ras_reporting(struct cxl_dport *dport, + struct device *host) { } #endif struct cxl_decoder *to_cxl_decoder(struct device *dev); diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c index ae94018a01bd..a9fd5cd5a0d2 100644 --- a/drivers/cxl/mem.c +++ b/drivers/cxl/mem.c @@ -166,7 +166,7 @@ static int cxl_mem_probe(struct device *dev) else endpoint_parent = &parent_port->dev; - cxl_setup_parent_dport(dev, dport); + cxl_dport_init_ras_reporting(dport, dev); scoped_guard(device, endpoint_parent) { if (!endpoint_parent->driver) { diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index 3d1ca9e38b1f..b1256fee3567 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -14,7 +14,7 @@ ldflags-y += --wrap=cxl_dvsec_rr_decode ldflags-y += --wrap=devm_cxl_add_rch_dport ldflags-y += --wrap=cxl_rcd_component_reg_phys ldflags-y += --wrap=cxl_endpoint_parse_cdat -ldflags-y += --wrap=cxl_setup_parent_dport +ldflags-y += --wrap=cxl_dport_init_ras_reporting DRIVERS := ../../../drivers CXL_SRC := $(DRIVERS)/cxl diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c index d619672faa49..bbd7d938156d 100644 --- a/tools/testing/cxl/test/mock.c +++ b/tools/testing/cxl/test/mock.c @@ 
-299,17 +299,17 @@ void __wrap_cxl_endpoint_parse_cdat(struct cxl_port *port) } EXPORT_SYMBOL_NS_GPL(__wrap_cxl_endpoint_parse_cdat, CXL); -void __wrap_cxl_setup_parent_dport(struct device *host, struct cxl_dport *dport) +void __wrap_cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host) { int index; struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); if (!ops || !ops->is_mock_port(dport->dport_dev)) - cxl_setup_parent_dport(host, dport); + cxl_dport_init_ras_reporting(dport, host); put_cxl_mock_ops(index); } -EXPORT_SYMBOL_NS_GPL(__wrap_cxl_setup_parent_dport, CXL); +EXPORT_SYMBOL_NS_GPL(__wrap_cxl_dport_init_ras_reporting, CXL); MODULE_LICENSE("GPL v2"); MODULE_IMPORT_NS(ACPI); -- cgit v1.2.3 From 2f4db2861013fcfb4be11d0dc176271ae566241c Mon Sep 17 00:00:00 2001 From: Jinjiang Tu Date: Mon, 19 Aug 2024 21:06:09 +0800 Subject: selftests/mm: remove unnecessary ia64 code and comment IA64 has gone with commit cf8e8658100d ("arch: Remove Itanium (IA-64) architecture"), so remove unnecessary ia64 special mm code and comment in selftests too. Link: https://lkml.kernel.org/r/20240819130609.3386195-1-tujinjiang@huawei.com Signed-off-by: Jinjiang Tu Cc: Kefeng Wang Cc: Mike Rapoport Cc: Nanyong Sun Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 2 +- tools/testing/selftests/mm/hugepage-mmap.c | 18 +----------------- tools/testing/selftests/mm/hugepage-shm.c | 18 +----------------- tools/testing/selftests/mm/hugepage-vmemmap.c | 17 ++--------------- tools/testing/selftests/mm/map_hugetlb.c | 18 ++---------------- tools/testing/selftests/mm/run_vmtests.sh | 2 +- 6 files changed, 8 insertions(+), 67 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index cfad627e8d94..0533c5ee0b3e 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -112,7 +112,7 @@ endif endif -ifneq (,$(filter $(ARCH),arm64 ia64 mips64 parisc64 powerpc riscv64 s390x sparc64 x86_64 s390)) +ifneq (,$(filter $(ARCH),arm64 mips64 parisc64 powerpc riscv64 s390x sparc64 x86_64 s390)) TEST_GEN_FILES += va_high_addr_switch TEST_GEN_FILES += virtual_address_range TEST_GEN_FILES += write_to_hugetlbfs diff --git a/tools/testing/selftests/mm/hugepage-mmap.c b/tools/testing/selftests/mm/hugepage-mmap.c index 267eea2e0e0b..3b1b532f1cbb 100644 --- a/tools/testing/selftests/mm/hugepage-mmap.c +++ b/tools/testing/selftests/mm/hugepage-mmap.c @@ -8,13 +8,6 @@ * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this * example, the app is requesting memory of size 256MB that is backed by * huge pages. - * - * For the ia64 architecture, the Linux kernel reserves Region number 4 for - * huge pages. That means that if one requires a fixed address, a huge page - * aligned address starting with 0x800000... will be required. If a fixed - * address is not required, the kernel will select an address in the proper - * range. - * Other architectures, such as ppc64, i386 or x86_64 are not so constrained. 
*/ #define _GNU_SOURCE #include @@ -27,15 +20,6 @@ #define LENGTH (256UL*1024*1024) #define PROTECTION (PROT_READ | PROT_WRITE) -/* Only ia64 requires this */ -#ifdef __ia64__ -#define ADDR (void *)(0x8000000000000000UL) -#define FLAGS (MAP_SHARED | MAP_FIXED) -#else -#define ADDR (void *)(0x0UL) -#define FLAGS (MAP_SHARED) -#endif - static void check_bytes(char *addr) { ksft_print_msg("First hex is %x\n", *((unsigned int *)addr)); @@ -74,7 +58,7 @@ int main(void) if (fd < 0) ksft_exit_fail_msg("memfd_create() failed: %s\n", strerror(errno)); - addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, fd, 0); + addr = mmap(NULL, LENGTH, PROTECTION, MAP_SHARED, fd, 0); if (addr == MAP_FAILED) { close(fd); ksft_exit_fail_msg("mmap(): %s\n", strerror(errno)); diff --git a/tools/testing/selftests/mm/hugepage-shm.c b/tools/testing/selftests/mm/hugepage-shm.c index 478bb1e989e9..ef06260802b5 100644 --- a/tools/testing/selftests/mm/hugepage-shm.c +++ b/tools/testing/selftests/mm/hugepage-shm.c @@ -8,13 +8,6 @@ * SHM_HUGETLB in the shmget system call to inform the kernel that it is * requesting huge pages. * - * For the ia64 architecture, the Linux kernel reserves Region number 4 for - * huge pages. That means that if one requires a fixed address, a huge page - * aligned address starting with 0x800000... will be required. If a fixed - * address is not required, the kernel will select an address in the proper - * range. - * Other architectures, such as ppc64, i386 or x86_64 are not so constrained. - * * Note: The default shared memory limit is quite low on many kernels, * you may need to increase it via: * @@ -39,15 +32,6 @@ #define dprintf(x) printf(x) -/* Only ia64 requires this */ -#ifdef __ia64__ -#define ADDR (void *)(0x8000000000000000UL) -#define SHMAT_FLAGS (SHM_RND) -#else -#define ADDR (void *)(0x0UL) -#define SHMAT_FLAGS (0) -#endif - int main(void) { int shmid; @@ -61,7 +45,7 @@ int main(void) } printf("shmid: 0x%x\n", shmid); - shmaddr = shmat(shmid, ADDR, SHMAT_FLAGS); + shmaddr = shmat(shmid, NULL, 0); if (shmaddr == (char *)-1) { perror("Shared memory attach failure"); shmctl(shmid, IPC_RMID, NULL); diff --git a/tools/testing/selftests/mm/hugepage-vmemmap.c b/tools/testing/selftests/mm/hugepage-vmemmap.c index 894d28c3dd47..df366a4d1b92 100644 --- a/tools/testing/selftests/mm/hugepage-vmemmap.c +++ b/tools/testing/selftests/mm/hugepage-vmemmap.c @@ -22,20 +22,6 @@ #define PM_PFRAME_BITS 55 #define PM_PFRAME_MASK ~((1UL << PM_PFRAME_BITS) - 1) -/* - * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages. - * That means the addresses starting with 0x800000... will need to be - * specified. Specifying a fixed address is not required on ppc64, i386 - * or x86_64. 
- */ -#ifdef __ia64__ -#define MAP_ADDR (void *)(0x8000000000000000UL) -#define MAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED) -#else -#define MAP_ADDR NULL -#define MAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB) -#endif - static size_t pagesize; static size_t maplength; @@ -113,7 +99,8 @@ int main(int argc, char **argv) exit(1); } - addr = mmap(MAP_ADDR, maplength, PROT_READ | PROT_WRITE, MAP_FLAGS, -1, 0); + addr = mmap(NULL, maplength, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); if (addr == MAP_FAILED) { perror("mmap"); exit(1); diff --git a/tools/testing/selftests/mm/map_hugetlb.c b/tools/testing/selftests/mm/map_hugetlb.c index a1f005a90a4f..b47399feab53 100644 --- a/tools/testing/selftests/mm/map_hugetlb.c +++ b/tools/testing/selftests/mm/map_hugetlb.c @@ -4,11 +4,6 @@ * system call with MAP_HUGETLB flag. Before running this program make * sure the administrator has allocated enough default sized huge pages * to cover the 256 MB allocation. - * - * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages. - * That means the addresses starting with 0x800000... will need to be - * specified. Specifying a fixed address is not required on ppc64, i386 - * or x86_64. */ #include #include @@ -21,15 +16,6 @@ #define LENGTH (256UL*1024*1024) #define PROTECTION (PROT_READ | PROT_WRITE) -/* Only ia64 requires this */ -#ifdef __ia64__ -#define ADDR (void *)(0x8000000000000000UL) -#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED) -#else -#define ADDR (void *)(0x0UL) -#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB) -#endif - static void check_bytes(char *addr) { ksft_print_msg("First hex is %x\n", *((unsigned int *)addr)); @@ -60,7 +46,7 @@ int main(int argc, char **argv) void *addr; size_t hugepage_size; size_t length = LENGTH; - int flags = FLAGS; + int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB; int shift = 0; hugepage_size = default_huge_page_size(); @@ -85,7 +71,7 @@ int main(int argc, char **argv) ksft_print_msg("Default size hugepages\n"); ksft_print_msg("Mapping %lu Mbytes\n", (unsigned long)length >> 20); - addr = mmap(ADDR, length, PROTECTION, flags, -1, 0); + addr = mmap(NULL, length, PROTECTION, flags, -1, 0); if (addr == MAP_FAILED) ksft_exit_fail_msg("mmap: %s\n", strerror(errno)); diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 36045edb10de..c5797ad1d37b 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -189,7 +189,7 @@ else fi # filter 64bit architectures -ARCH64STR="arm64 ia64 mips64 parisc64 ppc64 ppc64le riscv64 s390x sparc64 x86_64" +ARCH64STR="arm64 mips64 parisc64 ppc64 ppc64le riscv64 s390x sparc64 x86_64" if [ -z "$ARCH" ]; then ARCH=$(uname -m 2>/dev/null | sed -e 's/aarch64.*/arm64/') fi -- cgit v1.2.3 From 2e6d88e9d455fae9c4ac95c893362258f9540dfa Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 20 Aug 2024 17:49:17 +0800 Subject: selftests: mm: support shmem mTHP collapse testing Add shmem mTHP collapse testing. As with anonymous pages, users can use the '-s' parameter to specify the shmem mTHP size for testing.
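The per-size shmem policy lives in sysfs knobs of the form seen in the thp_settings.c hunks below. Purely as an illustration, on a system with 4K base pages the order-4 (64kB) knob that '-s 4' would exercise can be pointed at the global policy with:

    # hypothetical manual equivalent of SHMEM_INHERIT for order 4:
    echo inherit > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/shmem_enabled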
Link: https://lkml.kernel.org/r/fa44bfa20ca5b9fd6f9163a048f3d3c1e53cd0a8.1724140601.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: Barry Song <21cnbao@gmail.com> Cc: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Ryan Roberts Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/khugepaged.c | 4 ++- tools/testing/selftests/mm/thp_settings.c | 46 +++++++++++++++++++++++++++---- tools/testing/selftests/mm/thp_settings.h | 9 +++++- 3 files changed, 51 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/khugepaged.c b/tools/testing/selftests/mm/khugepaged.c index 829320a519e7..56d4480e8d3c 100644 --- a/tools/testing/selftests/mm/khugepaged.c +++ b/tools/testing/selftests/mm/khugepaged.c @@ -1095,7 +1095,7 @@ static void usage(void) fprintf(stderr, "\n\tSupported Options:\n"); fprintf(stderr, "\t\t-h: This help message.\n"); fprintf(stderr, "\t\t-s: mTHP size, expressed as page order.\n"); - fprintf(stderr, "\t\t Defaults to 0. Use this size for anon allocations.\n"); + fprintf(stderr, "\t\t Defaults to 0. Use this size for anon or shmem allocations.\n"); exit(1); } @@ -1209,6 +1209,8 @@ int main(int argc, char **argv) default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8; default_settings.hugepages[hpage_pmd_order].enabled = THP_INHERIT; default_settings.hugepages[anon_order].enabled = THP_ALWAYS; + default_settings.shmem_hugepages[hpage_pmd_order].enabled = SHMEM_INHERIT; + default_settings.shmem_hugepages[anon_order].enabled = SHMEM_ALWAYS; save_settings(); thp_push_settings(&default_settings); diff --git a/tools/testing/selftests/mm/thp_settings.c b/tools/testing/selftests/mm/thp_settings.c index a4163438108e..577eaab6266f 100644 --- a/tools/testing/selftests/mm/thp_settings.c +++ b/tools/testing/selftests/mm/thp_settings.c @@ -33,10 +33,11 @@ static const char * const thp_defrag_strings[] = { }; static const char * const shmem_enabled_strings[] = { + "never", "always", "within_size", "advise", - "never", + "inherit", "deny", "force", NULL @@ -200,6 +201,7 @@ void thp_write_num(const char *name, unsigned long num) void thp_read_settings(struct thp_settings *settings) { unsigned long orders = thp_supported_orders(); + unsigned long shmem_orders = thp_shmem_supported_orders(); char path[PATH_MAX]; int i; @@ -234,12 +236,24 @@ void thp_read_settings(struct thp_settings *settings) settings->hugepages[i].enabled = thp_read_string(path, thp_enabled_strings); } + + for (i = 0; i < NR_ORDERS; i++) { + if (!((1 << i) & shmem_orders)) { + settings->shmem_hugepages[i].enabled = SHMEM_NEVER; + continue; + } + snprintf(path, PATH_MAX, "hugepages-%ukB/shmem_enabled", + (getpagesize() >> 10) << i); + settings->shmem_hugepages[i].enabled = + thp_read_string(path, shmem_enabled_strings); + } } void thp_write_settings(struct thp_settings *settings) { struct khugepaged_settings *khugepaged = &settings->khugepaged; unsigned long orders = thp_supported_orders(); + unsigned long shmem_orders = thp_shmem_supported_orders(); char path[PATH_MAX]; int enabled; int i; @@ -271,6 +285,15 @@ void thp_write_settings(struct thp_settings *settings) enabled = settings->hugepages[i].enabled; thp_write_string(path, thp_enabled_strings[enabled]); } + + for (i = 0; i < NR_ORDERS; i++) { + if (!((1 << i) & shmem_orders)) + continue; + snprintf(path, PATH_MAX, "hugepages-%ukB/shmem_enabled", + (getpagesize() >> 10) << i); + enabled = settings->shmem_hugepages[i].enabled; + thp_write_string(path, 
shmem_enabled_strings[enabled]); + } } struct thp_settings *thp_current_settings(void) @@ -324,17 +347,18 @@ void thp_set_read_ahead_path(char *path) dev_queue_read_ahead_path[sizeof(dev_queue_read_ahead_path) - 1] = '\0'; } -unsigned long thp_supported_orders(void) +static unsigned long __thp_supported_orders(bool is_shmem) { unsigned long orders = 0; char path[PATH_MAX]; char buf[256]; - int ret; - int i; + int ret, i; + char anon_dir[] = "enabled"; + char shmem_dir[] = "shmem_enabled"; for (i = 0; i < NR_ORDERS; i++) { - ret = snprintf(path, PATH_MAX, THP_SYSFS "hugepages-%ukB/enabled", - (getpagesize() >> 10) << i); + ret = snprintf(path, PATH_MAX, THP_SYSFS "hugepages-%ukB/%s", + (getpagesize() >> 10) << i, is_shmem ? shmem_dir : anon_dir); if (ret >= PATH_MAX) { printf("%s: Pathname is too long\n", __func__); exit(EXIT_FAILURE); @@ -347,3 +371,13 @@ unsigned long thp_supported_orders(void) return orders; } + +unsigned long thp_supported_orders(void) +{ + return __thp_supported_orders(false); +} + +unsigned long thp_shmem_supported_orders(void) +{ + return __thp_supported_orders(true); +} diff --git a/tools/testing/selftests/mm/thp_settings.h b/tools/testing/selftests/mm/thp_settings.h index 71cbff05f4c7..876235a23460 100644 --- a/tools/testing/selftests/mm/thp_settings.h +++ b/tools/testing/selftests/mm/thp_settings.h @@ -22,10 +22,11 @@ enum thp_defrag { }; enum shmem_enabled { + SHMEM_NEVER, SHMEM_ALWAYS, SHMEM_WITHIN_SIZE, SHMEM_ADVISE, - SHMEM_NEVER, + SHMEM_INHERIT, SHMEM_DENY, SHMEM_FORCE, }; @@ -46,6 +47,10 @@ struct khugepaged_settings { unsigned long pages_to_scan; }; +struct shmem_hugepages_settings { + enum shmem_enabled enabled; +}; + struct thp_settings { enum thp_enabled thp_enabled; enum thp_defrag thp_defrag; @@ -54,6 +59,7 @@ struct thp_settings { struct khugepaged_settings khugepaged; unsigned long read_ahead_kb; struct hugepages_settings hugepages[NR_ORDERS]; + struct shmem_hugepages_settings shmem_hugepages[NR_ORDERS]; }; int read_file(const char *path, char *buf, size_t buflen); @@ -76,5 +82,6 @@ void thp_save_settings(void); void thp_set_read_ahead_path(char *path); unsigned long thp_supported_orders(void); +unsigned long thp_shmem_supported_orders(void); #endif /* __THP_SETTINGS_H__ */ -- cgit v1.2.3 From f28bdd1b17ec187eaa34845814afaaff99832762 Mon Sep 17 00:00:00 2001 From: Pedro Falcato Date: Sat, 17 Aug 2024 01:18:34 +0100 Subject: selftests/mm: add more mseal traversal tests Add more mseal traversal tests across VMAs, where we could possibly screw up sealing checks. These test more across-vma traversal for mprotect, munmap and madvise. Particularly, we test for the case where a regular VMA is followed by a sealed VMA. [akpm@linux-foundation.org: remove incorrect comment, per review] [akpm@linux-foundation.org: remove the correct comment, per Pedro] [pedro.falcato@gmail.com: fix mseal's length] Link: https://lkml.kernel.org/r/vc4czyuemmu3kylqb4ctaga6y5yvondlyabimx6jvljlw2fkea@djawlllf45xa Link: https://lkml.kernel.org/r/20240817-mseal-depessimize-v3-7-d8d2e037df30@gmail.com Signed-off-by: Pedro Falcato Reviewed-by: Liam R. 
Howlett Reviewed-by: Lorenzo Stoakes Cc: Jeff Xu Cc: Kees Cook Cc: Linus Torvalds Cc: Michael Ellerman Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mseal_test.c | 106 +++++++++++++++++++++++++++++++- 1 file changed, 105 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/mseal_test.c b/tools/testing/selftests/mm/mseal_test.c index bd0bfda7aae7..e7ed720e9677 100644 --- a/tools/testing/selftests/mm/mseal_test.c +++ b/tools/testing/selftests/mm/mseal_test.c @@ -777,6 +777,42 @@ static void test_seal_mprotect_partial_mprotect(bool seal) REPORT_TEST_PASS(); } +static void test_seal_mprotect_partial_mprotect_tail(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 2 * page_size; + int ret; + int prot; + + /* + * Check if a partial mseal (that results in two vmas) works correctly. + * It might mprotect the first, but it'll never touch the second (msealed) vma. + */ + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = sys_mseal(ptr + page_size, page_size); + FAIL_TEST_IF_FALSE(!ret); + } + + ret = sys_mprotect(ptr, size, PROT_EXEC); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + if (seal) { + FAIL_TEST_IF_FALSE(get_vma_size(ptr + page_size, &prot) > 0); + FAIL_TEST_IF_FALSE(prot == 0x4); + } + + REPORT_TEST_PASS(); +} + + static void test_seal_mprotect_two_vma_with_gap(bool seal) { void *ptr; @@ -994,6 +1030,36 @@ static void test_seal_munmap_vma_with_gap(bool seal) REPORT_TEST_PASS(); } +static void test_seal_munmap_partial_across_vmas(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 2 * page_size; + int ret; + int prot; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = sys_mseal(ptr + page_size, page_size); + FAIL_TEST_IF_FALSE(!ret); + } + + ret = sys_munmap(ptr, size); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + if (seal) { + FAIL_TEST_IF_FALSE(get_vma_size(ptr + page_size, &prot) > 0); + FAIL_TEST_IF_FALSE(prot == 0x4); + } + + REPORT_TEST_PASS(); +} + static void test_munmap_start_freed(bool seal) { void *ptr; @@ -1746,6 +1812,37 @@ static void test_seal_discard_ro_anon(bool seal) REPORT_TEST_PASS(); } +static void test_seal_discard_across_vmas(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 2 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = seal_single_address(ptr + page_size, page_size); + FAIL_TEST_IF_FALSE(!ret); + } + + ret = sys_madvise(ptr, size, MADV_DONTNEED); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + ret = sys_munmap(ptr, size); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + REPORT_TEST_PASS(); +} + + static void test_seal_madvise_nodiscard(bool seal) { void *ptr; @@ -1790,7 +1887,7 @@ int main(int argc, char **argv) if (!pkey_supported()) ksft_print_msg("PKEY not supported\n"); - ksft_set_plan(82); + ksft_set_plan(88); test_seal_addseal(); test_seal_unmapped_start(); @@ -1836,12 +1933,17 @@ int main(int argc, char **argv) test_seal_mprotect_split(false); test_seal_mprotect_split(true); + test_seal_mprotect_partial_mprotect_tail(false); + test_seal_mprotect_partial_mprotect_tail(true); + test_seal_munmap(false); test_seal_munmap(true); 
test_seal_munmap_two_vma(false); test_seal_munmap_two_vma(true); test_seal_munmap_vma_with_gap(false); test_seal_munmap_vma_with_gap(true); + test_seal_munmap_partial_across_vmas(false); + test_seal_munmap_partial_across_vmas(true); test_munmap_start_freed(false); test_munmap_start_freed(true); @@ -1873,6 +1975,8 @@ int main(int argc, char **argv) test_seal_madvise_nodiscard(true); test_seal_discard_ro_anon(false); test_seal_discard_ro_anon(true); + test_seal_discard_across_vmas(false); + test_seal_discard_across_vmas(true); test_seal_discard_ro_anon_on_rw(false); test_seal_discard_ro_anon_on_rw(true); test_seal_discard_ro_anon_on_shared(false); -- cgit v1.2.3 From 02e1960aafac33721401dcd92e915325fdb524b2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 21 Aug 2024 20:34:42 +0100 Subject: mm: rename PG_mappedtodisk to PG_owner_2 This flag has similar constraints to PG_owner_priv_1 -- it is ignored by core code, and is entirely for the use of the code which allocated the folio. Since the pagecache does not use it, individual filesystems can use it. The bufferhead code does use it, so filesystems which use the buffer cache must not use it for another purpose. Link: https://lkml.kernel.org/r/20240821193445.2294269-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- fs/proc/page.c | 2 +- include/linux/kernel-page-flags.h | 2 +- include/linux/page-flags.h | 24 ++++++++++++++++-------- include/trace/events/mmflags.h | 2 +- tools/mm/page-types.c | 10 +++++----- 5 files changed, 24 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/fs/proc/page.c b/fs/proc/page.c index 73a0f872d97f..e74e639893be 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -211,7 +211,7 @@ u64 stable_page_flags(const struct page *page) #endif u |= kpf_copy_bit(k, KPF_RESERVED, PG_reserved); - u |= kpf_copy_bit(k, KPF_MAPPEDTODISK, PG_mappedtodisk); + u |= kpf_copy_bit(k, KPF_OWNER_2, PG_owner_2); u |= kpf_copy_bit(k, KPF_PRIVATE, PG_private); u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2); u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1); diff --git a/include/linux/kernel-page-flags.h b/include/linux/kernel-page-flags.h index 859f4b0c1b2b..7c587a711be1 100644 --- a/include/linux/kernel-page-flags.h +++ b/include/linux/kernel-page-flags.h @@ -10,7 +10,7 @@ */ #define KPF_RESERVED 32 #define KPF_MLOCKED 33 -#define KPF_MAPPEDTODISK 34 +#define KPF_OWNER_2 34 #define KPF_PRIVATE 35 #define KPF_PRIVATE_2 36 #define KPF_OWNER_PRIVATE 37 diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 017205048cab..1ff3d172c22c 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -101,12 +101,12 @@ enum pageflags { PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */ PG_active, PG_workingset, - PG_owner_priv_1, /* Owner use. If pagecache, fs may use*/ + PG_owner_priv_1, /* Owner use. If pagecache, fs may use */ + PG_owner_2, /* Owner use. 
If pagecache, fs may use */ PG_arch_1, PG_reserved, PG_private, /* If pagecache, has fs-private data */ PG_private_2, /* If pagecache, has fs aux data */ - PG_mappedtodisk, /* Has blocks allocated on-disk */ PG_reclaim, /* To be reclaimed asap */ PG_swapbacked, /* Page is backed by RAM/swap */ PG_unevictable, /* Page is "unevictable" */ @@ -131,6 +131,11 @@ enum pageflags { PG_readahead = PG_reclaim, + /* Anonymous memory (and shmem) */ + PG_swapcache = PG_owner_priv_1, /* Swap page: swp_entry_t in private */ + /* Some filesystems */ + PG_checked = PG_owner_priv_1, + /* * Depending on the way an anonymous folio can be mapped into a page * table (e.g., single PMD/PUD/CONT of the head page vs. PTE-mapped @@ -138,13 +143,13 @@ enum pageflags { * tail pages of an anonymous folio. For now, we only expect it to be * set on tail pages for PTE-mapped THP. */ - PG_anon_exclusive = PG_mappedtodisk, - - /* Filesystems */ - PG_checked = PG_owner_priv_1, + PG_anon_exclusive = PG_owner_2, - /* SwapBacked */ - PG_swapcache = PG_owner_priv_1, /* Swap page: swp_entry_t in private */ + /* + * Set if all buffer heads in the folio are mapped. + * Filesystems which do not use BHs can use it for their own purpose. + */ + PG_mappedtodisk = PG_owner_2, /* Two page bits are conscripted by FS-Cache to maintain local caching * state. These bits are set on pages belonging to the netfs's inodes @@ -540,6 +545,9 @@ FOLIO_FLAG(swapbacked, FOLIO_HEAD_PAGE) PAGEFLAG(Private, private, PF_ANY) PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY) +/* owner_2 can be set on tail pages for anon memory */ +FOLIO_FLAG(owner_2, FOLIO_HEAD_PAGE) + /* * Only test-and-set exist for PG_writeback. The unconditional operators are * risky: they bypass page accounting. diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index c151cc21d367..3b51558cdc9b 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -107,13 +107,13 @@ DEF_PAGEFLAG_NAME(active), \ DEF_PAGEFLAG_NAME(workingset), \ DEF_PAGEFLAG_NAME(owner_priv_1), \ + DEF_PAGEFLAG_NAME(owner_2), \ DEF_PAGEFLAG_NAME(arch_1), \ DEF_PAGEFLAG_NAME(reserved), \ DEF_PAGEFLAG_NAME(private), \ DEF_PAGEFLAG_NAME(private_2), \ DEF_PAGEFLAG_NAME(writeback), \ DEF_PAGEFLAG_NAME(head), \ - DEF_PAGEFLAG_NAME(mappedtodisk), \ DEF_PAGEFLAG_NAME(reclaim), \ DEF_PAGEFLAG_NAME(swapbacked), \ DEF_PAGEFLAG_NAME(unevictable) \ diff --git a/tools/mm/page-types.c b/tools/mm/page-types.c index 8d5595b6c59f..8ca41c41105e 100644 --- a/tools/mm/page-types.c +++ b/tools/mm/page-types.c @@ -71,7 +71,7 @@ /* [32-] kernel hacking assistances */ #define KPF_RESERVED 32 #define KPF_MLOCKED 33 -#define KPF_MAPPEDTODISK 34 +#define KPF_OWNER_2 34 #define KPF_PRIVATE 35 #define KPF_PRIVATE_2 36 #define KPF_OWNER_PRIVATE 37 @@ -129,7 +129,7 @@ static const char * const page_flag_names[] = { [KPF_RESERVED] = "r:reserved", [KPF_MLOCKED] = "m:mlocked", - [KPF_MAPPEDTODISK] = "d:mappedtodisk", + [KPF_OWNER_2] = "d:owner_2", [KPF_PRIVATE] = "P:private", [KPF_PRIVATE_2] = "p:private_2", [KPF_OWNER_PRIVATE] = "O:owner_private", @@ -472,9 +472,9 @@ static int bit_mask_ok(uint64_t flags) static uint64_t expand_overloaded_flags(uint64_t flags, uint64_t pme) { - /* Anonymous pages overload PG_mappedtodisk */ - if ((flags & BIT(ANON)) && (flags & BIT(MAPPEDTODISK))) - flags ^= BIT(MAPPEDTODISK) | BIT(ANON_EXCLUSIVE); + /* Anonymous pages use PG_owner_2 for anon_exclusive */ + if ((flags & BIT(ANON)) && (flags & BIT(OWNER_2))) + flags ^= BIT(OWNER_2) | 
BIT(ANON_EXCLUSIVE); /* SLUB overloads several page flags */ if (flags & BIT(SLAB)) { -- cgit v1.2.3 From 7a87225ae2c6c317c7b80cf599e5cf0eee699196 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 21 Aug 2024 20:34:43 +0100 Subject: x86: remove PG_uncached Convert x86 to use PG_arch_2 instead of PG_uncached and remove PG_uncached. Link: https://lkml.kernel.org/r/20240821193445.2294269-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- .../features/vm/PG_uncached/arch-support.txt | 30 ---------------------- arch/arm64/Kconfig | 3 ++- arch/x86/Kconfig | 5 +--- arch/x86/mm/pat/memtype.c | 8 +++--- fs/proc/page.c | 8 +++--- include/linux/kernel-page-flags.h | 1 - include/linux/page-flags.h | 13 +++------- include/trace/events/mmflags.h | 23 ++++++++--------- mm/Kconfig | 9 +++---- mm/huge_memory.c | 4 ++- tools/mm/page-types.c | 3 +-- 11 files changed, 31 insertions(+), 76 deletions(-) delete mode 100644 Documentation/features/vm/PG_uncached/arch-support.txt (limited to 'tools') diff --git a/Documentation/features/vm/PG_uncached/arch-support.txt b/Documentation/features/vm/PG_uncached/arch-support.txt deleted file mode 100644 index 5a7508b8c967..000000000000 --- a/Documentation/features/vm/PG_uncached/arch-support.txt +++ /dev/null @@ -1,30 +0,0 @@ -# -# Feature name: PG_uncached -# Kconfig: ARCH_USES_PG_UNCACHED -# description: arch supports the PG_uncached page flag -# - ----------------------- - | arch |status| - ----------------------- - | alpha: | TODO | - | arc: | TODO | - | arm: | TODO | - | arm64: | TODO | - | csky: | TODO | - | hexagon: | TODO | - | loongarch: | TODO | - | m68k: | TODO | - | microblaze: | TODO | - | mips: | TODO | - | nios2: | TODO | - | openrisc: | TODO | - | parisc: | TODO | - | powerpc: | TODO | - | riscv: | TODO | - | s390: | TODO | - | sh: | TODO | - | sparc: | TODO | - | um: | TODO | - | x86: | ok | - | xtensa: | TODO | - ----------------------- diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a2f8ff354ca6..6494848019a0 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2100,7 +2100,8 @@ config ARM64_MTE depends on ARM64_PAN select ARCH_HAS_SUBPAGE_FAULTS select ARCH_USES_HIGH_VMA_FLAGS - select ARCH_USES_PG_ARCH_X + select ARCH_USES_PG_ARCH_2 + select ARCH_USES_PG_ARCH_3 help Memory Tagging (part of the ARMv8.5 Extensions) provides architectural support for run-time, always-on detection of diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index acd9745bf2ae..b74b9ee484da 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1799,6 +1799,7 @@ config X86_PAT def_bool y prompt "x86 PAT support" if EXPERT depends on MTRR + select ARCH_USES_PG_ARCH_2 help Use PAT attributes to setup page level cache control. @@ -1810,10 +1811,6 @@ config X86_PAT If unsure, say Y. -config ARCH_USES_PG_UNCACHED - def_bool y - depends on X86_PAT - config X86_UMIP def_bool y prompt "User Mode Instruction Prevention" if EXPERT diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index bdc2a240c2aa..1fa0bf6ed295 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -104,7 +104,7 @@ __setup("debugpat", pat_debug_setup); #ifdef CONFIG_X86_PAT /* - * X86 PAT uses page flags arch_1 and uncached together to keep track of + * X86 PAT uses page flags arch_1 and arch_2 together to keep track of * memory type of pages that have backing page struct. 
* * X86 PAT supports 4 different memory types: @@ -118,9 +118,9 @@ __setup("debugpat", pat_debug_setup); #define _PGMT_WB 0 #define _PGMT_WC (1UL << PG_arch_1) -#define _PGMT_UC_MINUS (1UL << PG_uncached) -#define _PGMT_WT (1UL << PG_uncached | 1UL << PG_arch_1) -#define _PGMT_MASK (1UL << PG_uncached | 1UL << PG_arch_1) +#define _PGMT_UC_MINUS (1UL << PG_arch_2) +#define _PGMT_WT (1UL << PG_arch_2 | 1UL << PG_arch_1) +#define _PGMT_MASK (1UL << PG_arch_2 | 1UL << PG_arch_1) #define _PGMT_CLEAR_MASK (~_PGMT_MASK) static inline enum page_cache_mode get_page_memtype(struct page *pg) diff --git a/fs/proc/page.c b/fs/proc/page.c index e74e639893be..a55f5acefa97 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -206,18 +206,16 @@ u64 stable_page_flags(const struct page *page) u |= kpf_copy_bit(page->flags, KPF_HWPOISON, PG_hwpoison); #endif -#ifdef CONFIG_ARCH_USES_PG_UNCACHED - u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached); -#endif - u |= kpf_copy_bit(k, KPF_RESERVED, PG_reserved); u |= kpf_copy_bit(k, KPF_OWNER_2, PG_owner_2); u |= kpf_copy_bit(k, KPF_PRIVATE, PG_private); u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2); u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1); u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1); -#ifdef CONFIG_ARCH_USES_PG_ARCH_X +#ifdef CONFIG_ARCH_USES_PG_ARCH_2 u |= kpf_copy_bit(k, KPF_ARCH_2, PG_arch_2); +#endif +#ifdef CONFIG_ARCH_USES_PG_ARCH_3 u |= kpf_copy_bit(k, KPF_ARCH_3, PG_arch_3); #endif diff --git a/include/linux/kernel-page-flags.h b/include/linux/kernel-page-flags.h index 7c587a711be1..196778a087c4 100644 --- a/include/linux/kernel-page-flags.h +++ b/include/linux/kernel-page-flags.h @@ -15,7 +15,6 @@ #define KPF_PRIVATE_2 36 #define KPF_OWNER_PRIVATE 37 #define KPF_ARCH 38 -#define KPF_UNCACHED 39 #define KPF_SOFTDIRTY 40 #define KPF_ARCH_2 41 #define KPF_ARCH_3 42 diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 1ff3d172c22c..2175ebceb41c 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -113,9 +113,6 @@ enum pageflags { #ifdef CONFIG_MMU PG_mlocked, /* Page is vma mlocked */ #endif -#ifdef CONFIG_ARCH_USES_PG_UNCACHED - PG_uncached, /* Page has been mapped as uncached */ -#endif #ifdef CONFIG_MEMORY_FAILURE PG_hwpoison, /* hardware poisoned page. 
Don't touch */ #endif @@ -123,8 +120,10 @@ enum pageflags { PG_young, PG_idle, #endif -#ifdef CONFIG_ARCH_USES_PG_ARCH_X +#ifdef CONFIG_ARCH_USES_PG_ARCH_2 PG_arch_2, +#endif +#ifdef CONFIG_ARCH_USES_PG_ARCH_3 PG_arch_3, #endif __NR_PAGEFLAGS, @@ -602,12 +601,6 @@ FOLIO_FLAG_FALSE(mlocked) FOLIO_TEST_SET_FLAG_FALSE(mlocked) #endif -#ifdef CONFIG_ARCH_USES_PG_UNCACHED -PAGEFLAG(Uncached, uncached, PF_NO_COMPOUND) -#else -PAGEFLAG_FALSE(Uncached, uncached) -#endif - #ifdef CONFIG_MEMORY_FAILURE PAGEFLAG(HWPoison, hwpoison, PF_ANY) TESTSCFLAG(HWPoison, hwpoison, PF_ANY) diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 3b51558cdc9b..58f2699331b6 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -71,12 +71,6 @@ #define IF_HAVE_PG_MLOCK(_name) #endif -#ifdef CONFIG_ARCH_USES_PG_UNCACHED -#define IF_HAVE_PG_UNCACHED(_name) ,{1UL << PG_##_name, __stringify(_name)} -#else -#define IF_HAVE_PG_UNCACHED(_name) -#endif - #ifdef CONFIG_MEMORY_FAILURE #define IF_HAVE_PG_HWPOISON(_name) ,{1UL << PG_##_name, __stringify(_name)} #else @@ -89,10 +83,16 @@ #define IF_HAVE_PG_IDLE(_name) #endif -#ifdef CONFIG_ARCH_USES_PG_ARCH_X -#define IF_HAVE_PG_ARCH_X(_name) ,{1UL << PG_##_name, __stringify(_name)} +#ifdef CONFIG_ARCH_USES_PG_ARCH_2 +#define IF_HAVE_PG_ARCH_2(_name) ,{1UL << PG_##_name, __stringify(_name)} +#else +#define IF_HAVE_PG_ARCH_2(_name) +#endif + +#ifdef CONFIG_ARCH_USES_PG_ARCH_3 +#define IF_HAVE_PG_ARCH_3(_name) ,{1UL << PG_##_name, __stringify(_name)} #else -#define IF_HAVE_PG_ARCH_X(_name) +#define IF_HAVE_PG_ARCH_3(_name) #endif #define DEF_PAGEFLAG_NAME(_name) { 1UL << PG_##_name, __stringify(_name) } @@ -118,12 +118,11 @@ DEF_PAGEFLAG_NAME(swapbacked), \ DEF_PAGEFLAG_NAME(unevictable) \ IF_HAVE_PG_MLOCK(mlocked) \ -IF_HAVE_PG_UNCACHED(uncached) \ IF_HAVE_PG_HWPOISON(hwpoison) \ IF_HAVE_PG_IDLE(idle) \ IF_HAVE_PG_IDLE(young) \ -IF_HAVE_PG_ARCH_X(arch_2) \ -IF_HAVE_PG_ARCH_X(arch_3) +IF_HAVE_PG_ARCH_2(arch_2) \ +IF_HAVE_PG_ARCH_3(arch_3) #define show_page_flags(flags) \ (flags) ? __print_flags(flags, "|", \ diff --git a/mm/Kconfig b/mm/Kconfig index 5946dcdcaeda..8078a4b3c509 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1079,13 +1079,10 @@ config ARCH_USES_HIGH_VMA_FLAGS config ARCH_HAS_PKEYS bool -config ARCH_USES_PG_ARCH_X +config ARCH_USES_PG_ARCH_2 + bool +config ARCH_USES_PG_ARCH_3 bool - help - Enable the definition of PG_arch_x page flags with x > 1. Only - suitable for 64-bit architectures with CONFIG_FLATMEM or - CONFIG_SPARSEMEM_VMEMMAP enabled, otherwise there may not be - enough room for additional bits in page->flags. 
config VM_EVENT_COUNTERS default y diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 8dfb912ebdaa..c4b45ad576f6 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2993,8 +2993,10 @@ static void __split_huge_page_tail(struct folio *folio, int tail, (1L << PG_workingset) | (1L << PG_locked) | (1L << PG_unevictable) | -#ifdef CONFIG_ARCH_USES_PG_ARCH_X +#ifdef CONFIG_ARCH_USES_PG_ARCH_2 (1L << PG_arch_2) | +#endif +#ifdef CONFIG_ARCH_USES_PG_ARCH_3 (1L << PG_arch_3) | #endif (1L << PG_dirty) | diff --git a/tools/mm/page-types.c b/tools/mm/page-types.c index 8ca41c41105e..fa050d5a48cd 100644 --- a/tools/mm/page-types.c +++ b/tools/mm/page-types.c @@ -76,7 +76,7 @@ #define KPF_PRIVATE_2 36 #define KPF_OWNER_PRIVATE 37 #define KPF_ARCH 38 -#define KPF_UNCACHED 39 +#define KPF_UNCACHED 39 /* unused */ #define KPF_SOFTDIRTY 40 #define KPF_ARCH_2 41 @@ -134,7 +134,6 @@ static const char * const page_flag_names[] = { [KPF_PRIVATE_2] = "p:private_2", [KPF_OWNER_PRIVATE] = "O:owner_private", [KPF_ARCH] = "h:arch", - [KPF_UNCACHED] = "c:uncached", [KPF_SOFTDIRTY] = "f:softdirty", [KPF_ARCH_2] = "H:arch_2", -- cgit v1.2.3 From c41a701d18efe6b8aa402efab16edbaba50c9548 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 21 Aug 2024 14:31:15 +0200 Subject: selftests/mm: fix charge_reserved_hugetlb.sh test Currently, running the charge_reserved_hugetlb.sh selftest we can sometimes observe something like: $ ./charge_reserved_hugetlb.sh -cgroup-v2 ... write_result is 0 After write: hugetlb_usage=0 reserved_usage=10485760 killing write_to_hugetlbfs Received 2. Deleting the memory Detach failure: Invalid argument umount: /mnt/huge: target is busy. Both cases are issues in the test. While the unmount error seems to be racy, it will make the test fail: $ ./run_vmtests.sh -t hugetlb ... # [FAIL] not ok 10 charge_reserved_hugetlb.sh -cgroup-v2 # exit=32 The issue is that we are not waiting for the write_to_hugetlbfs process to quit. So it might still have a hugetlbfs file open, about which umount is not happy. Fix that by making "killall" wait for the process to quit. The other error ("Detach failure: Invalid argument") does not seem to result in a test error, but is misleading. Turns out write_to_hugetlbfs.c unconditionally tries to cleanup using shmdt(), even when we only mmap()'ed a hugetlb file. Even worse, shmaddr is never even set for the SHM case. Fix that as well. With this change it seems to work as expected. 
Link: https://lkml.kernel.org/r/20240821123115.2068812-1-david@redhat.com Fixes: 29750f71a9b4 ("hugetlb_cgroup: add hugetlb_cgroup reservation tests") Signed-off-by: David Hildenbrand Reported-by: Mario Casquero Reviewed-by: Mina Almasry Tested-by: Mario Casquero Cc: Shuah Khan Cc: Muchun Song Signed-off-by: Andrew Morton --- .../testing/selftests/mm/charge_reserved_hugetlb.sh | 2 +- tools/testing/selftests/mm/write_to_hugetlbfs.c | 21 ++++++++++++--------- 2 files changed, 13 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh index d680c00d2853..67df7b47087f 100755 --- a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh +++ b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh @@ -254,7 +254,7 @@ function cleanup_hugetlb_memory() { local cgroup="$1" if [[ "$(pgrep -f write_to_hugetlbfs)" != "" ]]; then echo killing write_to_hugetlbfs - killall -2 write_to_hugetlbfs + killall -2 --wait write_to_hugetlbfs wait_for_hugetlb_memory_to_get_depleted $cgroup fi set -e diff --git a/tools/testing/selftests/mm/write_to_hugetlbfs.c b/tools/testing/selftests/mm/write_to_hugetlbfs.c index 6a2caba19ee1..1289d311efd7 100644 --- a/tools/testing/selftests/mm/write_to_hugetlbfs.c +++ b/tools/testing/selftests/mm/write_to_hugetlbfs.c @@ -28,7 +28,7 @@ enum method { /* Global variables. */ static const char *self; -static char *shmaddr; +static int *shmaddr; static int shmid; /* @@ -47,15 +47,17 @@ void sig_handler(int signo) { printf("Received %d.\n", signo); if (signo == SIGINT) { - printf("Deleting the memory\n"); - if (shmdt((const void *)shmaddr) != 0) { - perror("Detach failure"); + if (shmaddr) { + printf("Deleting the memory\n"); + if (shmdt((const void *)shmaddr) != 0) { + perror("Detach failure"); + shmctl(shmid, IPC_RMID, NULL); + exit(4); + } + shmctl(shmid, IPC_RMID, NULL); - exit(4); + printf("Done deleting the memory\n"); } - - shmctl(shmid, IPC_RMID, NULL); - printf("Done deleting the memory\n"); } exit(2); } @@ -211,7 +213,8 @@ int main(int argc, char **argv) shmctl(shmid, IPC_RMID, NULL); exit(2); } - printf("shmaddr: %p\n", ptr); + shmaddr = ptr; + printf("shmaddr: %p\n", shmaddr); break; default: -- cgit v1.2.3 From fd06ce2ce4c1f93e3d5d7d2640672ccb1a94cce3 Mon Sep 17 00:00:00 2001 From: Mike Yuan Date: Fri, 23 Aug 2024 16:27:09 +0000 Subject: selftests: test_zswap: add test for hierarchical zswap.writeback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ensure that zswap.writeback check goes up the cgroup tree, i.e. is hierarchical. Create a subcgroup which has zswap.writeback set to 1, and the upper hierarchy's restrictions shall apply. 
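A minimal standalone sketch of the hierarchy the new test constructs (the cgroup paths are illustrative and write_str() is a helper local to this sketch; the real test builds names with cg_name() and writes through the selftest's cg_write() helpers, as in the diff below):

	#include <stdio.h>
	#include <sys/stat.h>
	#include <sys/types.h>

	static int write_str(const char *path, const char *val)
	{
		FILE *f = fopen(path, "w");

		if (!f)
			return -1;
		fputs(val, f);
		return fclose(f);
	}

	int main(void)
	{
		/* The parent forbids zswap writeback... */
		mkdir("/sys/fs/cgroup/wb_test", 0755);
		write_str("/sys/fs/cgroup/wb_test/memory.zswap.writeback", "0");
		write_str("/sys/fs/cgroup/wb_test/cgroup.subtree_control", "+memory");

		/* ...the child re-enables it, but the hierarchical check
		 * means the parent's "0" still governs the child's pages. */
		mkdir("/sys/fs/cgroup/wb_test/child", 0755);
		write_str("/sys/fs/cgroup/wb_test/child/memory.zswap.writeback", "1");
		return 0;
	}

With that layout, a memory.reclaim request in the child that can only be satisfied by writing zswap pages back to swap is expected to fail with EAGAIN, exactly as if the child itself had writeback disabled.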
Link: https://lkml.kernel.org/r/20240823162506.12117-2-me@yhndnzj.com Signed-off-by: Mike Yuan Cc: Johannes Weiner Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_zswap.c | 75 +++++++++++++++++++++-------- 1 file changed, 54 insertions(+), 21 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index 190096017f80..40de679248b8 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -263,15 +263,13 @@ out: static int attempt_writeback(const char *cgroup, void *arg) { long pagesize = sysconf(_SC_PAGESIZE); - char *test_group = arg; size_t memsize = MB(4); char buf[pagesize]; long zswap_usage; - bool wb_enabled; + bool wb_enabled = *(bool *) arg; int ret = -1; char *mem; - wb_enabled = cg_read_long(test_group, "memory.zswap.writeback"); mem = (char *)malloc(memsize); if (!mem) return ret; @@ -288,12 +286,12 @@ static int attempt_writeback(const char *cgroup, void *arg) memcpy(&mem[i], buf, pagesize); /* Try and reclaim allocated memory */ - if (cg_write_numeric(test_group, "memory.reclaim", memsize)) { + if (cg_write_numeric(cgroup, "memory.reclaim", memsize)) { ksft_print_msg("Failed to reclaim all of the requested memory\n"); goto out; } - zswap_usage = cg_read_long(test_group, "memory.zswap.current"); + zswap_usage = cg_read_long(cgroup, "memory.zswap.current"); /* zswpin */ for (int i = 0; i < memsize; i += pagesize) { @@ -303,7 +301,7 @@ static int attempt_writeback(const char *cgroup, void *arg) } } - if (cg_write_numeric(test_group, "memory.zswap.max", zswap_usage/2)) + if (cg_write_numeric(cgroup, "memory.zswap.max", zswap_usage/2)) goto out; /* @@ -312,7 +310,7 @@ static int attempt_writeback(const char *cgroup, void *arg) * If writeback is disabled, memory reclaim will fail as zswap is limited and * it can't writeback to swap. */ - ret = cg_write_numeric(test_group, "memory.reclaim", memsize); + ret = cg_write_numeric(cgroup, "memory.reclaim", memsize); if (!wb_enabled) ret = (ret == -EAGAIN) ? 0 : -1; @@ -321,12 +319,41 @@ out: return ret; } +static int test_zswap_writeback_one(const char *cgroup, bool wb) +{ + long zswpwb_before, zswpwb_after; + + zswpwb_before = get_cg_wb_count(cgroup); + if (zswpwb_before != 0) { + ksft_print_msg("zswpwb_before = %ld instead of 0\n", zswpwb_before); + return -1; + } + + if (cg_run(cgroup, attempt_writeback, (void *) &wb)) + return -1; + + /* Verify that zswap writeback occurred only if writeback was enabled */ + zswpwb_after = get_cg_wb_count(cgroup); + if (zswpwb_after < 0) + return -1; + + if (wb != !!zswpwb_after) { + ksft_print_msg("zswpwb_after is %ld while wb is %s", + zswpwb_after, wb ? "enabled" : "disabled"); + return -1; + } + + return 0; +} + /* Test to verify the zswap writeback path */ static int test_zswap_writeback(const char *root, bool wb) { - long zswpwb_before, zswpwb_after; int ret = KSFT_FAIL; - char *test_group; + char *test_group, *test_group_child = NULL; + + if (cg_read_strcmp(root, "memory.zswap.writeback", "1")) + return KSFT_SKIP; test_group = cg_name(root, "zswap_writeback_test"); if (!test_group) @@ -336,29 +363,35 @@ static int test_zswap_writeback(const char *root, bool wb) if (cg_write(test_group, "memory.zswap.writeback", wb ? 
"1" : "0")) goto out; - zswpwb_before = get_cg_wb_count(test_group); - if (zswpwb_before != 0) { - ksft_print_msg("zswpwb_before = %ld instead of 0\n", zswpwb_before); + if (test_zswap_writeback_one(test_group, wb)) goto out; - } - if (cg_run(test_group, attempt_writeback, (void *) test_group)) + /* Reset memory.zswap.max to max (modified by attempt_writeback), and + * set up child cgroup, whose memory.zswap.writeback is hardcoded to 1. + * Thus, the parent's setting shall be what's in effect. */ + if (cg_write(test_group, "memory.zswap.max", "max")) + goto out; + if (cg_write(test_group, "cgroup.subtree_control", "+memory")) goto out; - /* Verify that zswap writeback occurred only if writeback was enabled */ - zswpwb_after = get_cg_wb_count(test_group); - if (zswpwb_after < 0) + test_group_child = cg_name(test_group, "zswap_writeback_test_child"); + if (!test_group_child) + goto out; + if (cg_create(test_group_child)) + goto out; + if (cg_write(test_group_child, "memory.zswap.writeback", "1")) goto out; - if (wb != !!zswpwb_after) { - ksft_print_msg("zswpwb_after is %ld while wb is %s", - zswpwb_after, wb ? "enabled" : "disabled"); + if (test_zswap_writeback_one(test_group_child, wb)) goto out; - } ret = KSFT_PASS; out: + if (test_group_child) { + cg_destroy(test_group_child); + free(test_group_child); + } cg_destroy(test_group); free(test_group); return ret; -- cgit v1.2.3 From 4e52a60ac5c079000add146ca39b0edcc30f74cd Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 30 Aug 2024 19:10:13 +0100 Subject: tools: improve vma test Makefile Patch series "mm: remove vma_merge()", v3. The infamous vma_merge() function has been the cause of a great deal of pain, bugs and confusion for a very long time. It is subtle, contains many corner cases, tries to do far too much and is as a result very fragile. The fact that the function requires there to be a numbering system to cover each possible eventuality with references to each in the many branches of its implementation as to which case you are looking at speaks to all this. Some of this complexity is inherent - unfortunately there is no getting away from the need to figure out precisely how to execute the merge, whether we need to remove VMAs, whether it is safe to do so, what constitutes a mergeable VMA and so on. However, a lot of the complexity is not inherent but instead a product of the function's 'organic' development. Liam has gone to great lengths to improve the situation as a part of his maple tree implementation, greatly improving the readability of the code, and Vlastimil and myself have additionally gone to lengths to try to improve things further. However, with the availability of userland VMA testing, it now becomes possible to perform a rather more significant refactoring while maintaining confidence in its correct operation. An attempt was previously made by Vlastimil [0] to eliminate vma_merge(), however it was rather - brutal - and an astute reader might refer to the date of that patch for insight as to its intent. This series instead divides merge operations into two natural kinds - merges which occur when a NEW vma is being added to the address space, and merges which occur when a vma is being MODIFIED. Happily, the vma_expand() function introduced by Liam, which has the capacity for also deleting a subsequent VMA, covers each of the NEW vma cases. 
By abstracting the actual final commit of changes to a VMA to its own function, commit_merge() and writing a wrapper around vma_expand() for new VMA cases vma_merge_new_range(), we can avoid having to use vma_merge() for these instances altogether. By doing so we are also able to then de-duplicate all existing merge logic in mmap_region() and do_brk_flags() and have everything invoke this new function, so we universally take the same approach to merging new VMAs. Having done so, we can then completely rework vma_merge() into vma_merge_existing_range() and use this for the instances where a merge is proposed for a region of an existing VMA. This eliminates vma_merge() and its numbered cases and instead divides things into logical cases - merge both, merge left, merge right (the latter 2 being either partial or full merges). The code is heavily annotated with ASCII diagrams and greatly simplified in comparison to the existing vma_merge() function. Having made this change, we take the opportunity to address an issue with merging VMAs possessing a vm_ops->close() hook - commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be removed in mergeability test") and commit fc0c8f9089c2 ("mm, mmap: fix vma_merge() case 7 with vma_ops->close") make efforts to relax how we handle these, making assumptions about which VMAs might end up deleted (and thus, if possessing a vm_ops->close() hook, cannot be). This refactor means we do not need to guess, so instead explicitly only disallow merge in instances where a VMA with a vm_ops->close() hook would be deleted (and try a smaller merge in cases where this is possible). In addition to these changes, we introduce a new vma_merge_struct abstraction to allow VMA merge state to be threaded through the operation neatly. There is heavy unit testing provided for all merge functionality, added prior to the refactoring, allowing for before/after testing. The vm_ops->close() change also introduces exhaustive testing to demonstrate that this functions as expected, and in addition to this the reproduction code from commit fc0c8f9089c2 ("mm, mmap: fix vma_merge() case 7 with vma_ops->close") was tested and confirmed passing. [0]:https://lore.kernel.org/linux-mm/20240401192623.18575-2-vbabka@suse.cz/ This patch (of 10): Have vma.o depend on its source dependencies explicitly, as previously these were simply being ignored as existing object files were up to date. This now correctly re-triggers the build if mm/ source is changed as well as local source code. Also set clean as a phony rule. Link: https://lkml.kernel.org/r/cover.1725040657.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/e3ea58f08364ae5432c9a074de0195a7c7e0b04a.1725040657.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Cc: Mark Brown Cc: Vlastimil Babka Cc: Bert Karwatzki Cc: Jeff Xu Cc: Jiri Olsa Cc: Kees Cook Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: "Paul E. 
McKenney" Cc: Paul Moore Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- tools/testing/vma/Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/vma/Makefile b/tools/testing/vma/Makefile index bfc905d222cf..860fd2311dcc 100644 --- a/tools/testing/vma/Makefile +++ b/tools/testing/vma/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-or-later -.PHONY: default +.PHONY: default clean default: vma @@ -9,7 +9,9 @@ include ../shared/shared.mk OFILES = $(SHARED_OFILES) vma.o maple-shim.o TARGETS = vma -vma: $(OFILES) vma_internal.h ../../../mm/vma.c ../../../mm/vma.h +vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma.h + +vma: $(OFILES) $(CC) $(CFLAGS) -o $@ $(OFILES) $(LDLIBS) clean: -- cgit v1.2.3 From 955db39676b6de84283b370d03683171b67dceb3 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 30 Aug 2024 19:10:14 +0100 Subject: tools: add VMA merge tests Add a variety of VMA merge unit tests to assert that the behaviour of VMA merge is correct at an abstract level and VMAs are merged or not merged as expected. These are intentionally added _before_ we start refactoring vma_merge() in order that we can continually assert correctness throughout the rest of the series. In order to reduce churn going forward, we backport the vma_merge_struct data type to the test code which we introduce and use in a future commit, and add wrappers around the merge new and existing VMA cases. Link: https://lkml.kernel.org/r/1c7a0b43cfad2c511a6b1b52f3507696478ff51a.1725040657.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Cc: Mark Brown Cc: Vlastimil Babka Cc: Bert Karwatzki Cc: Jeff Xu Cc: Jiri Olsa Cc: Kees Cook Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: "Paul E. McKenney" Cc: Paul Moore Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- tools/testing/vma/vma.c | 1282 +++++++++++++++++++++++++++++++++++++- tools/testing/vma/vma_internal.h | 45 +- 2 files changed, 1317 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c index 48e033c60d87..71bd30d5da81 100644 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/vma.c @@ -7,13 +7,43 @@ #include "maple-shared.h" #include "vma_internal.h" +/* Include so header guard set. */ +#include "../../../mm/vma.h" + +static bool fail_prealloc; + +/* Then override vma_iter_prealloc() so we can choose to fail it. */ +#define vma_iter_prealloc(vmi, vma) \ + (fail_prealloc ? -ENOMEM : mas_preallocate(&(vmi)->mas, (vma), GFP_KERNEL)) + /* * Directly import the VMA implementation here. Our vma_internal.h wrapper * provides userland-equivalent functionality for everything vma.c uses. */ #include "../../../mm/vma.c" +/* + * Temporarily forward-ported from a future in which vmg's are used for merging. + */ +struct vma_merge_struct { + struct mm_struct *mm; + struct vma_iterator *vmi; + pgoff_t pgoff; + struct vm_area_struct *prev; + struct vm_area_struct *next; /* Modified by vma_merge(). */ + struct vm_area_struct *vma; /* Either a new VMA or the one being modified. 
*/ + unsigned long start; + unsigned long end; + unsigned long flags; + struct file *file; + struct anon_vma *anon_vma; + struct mempolicy *policy; + struct vm_userfaultfd_ctx uffd_ctx; + struct anon_vma_name *anon_name; +}; + const struct vm_operations_struct vma_dummy_vm_ops; +static struct anon_vma dummy_anon_vma; #define ASSERT_TRUE(_expr) \ do { \ @@ -28,6 +58,14 @@ const struct vm_operations_struct vma_dummy_vm_ops; #define ASSERT_EQ(_val1, _val2) ASSERT_TRUE((_val1) == (_val2)) #define ASSERT_NE(_val1, _val2) ASSERT_TRUE((_val1) != (_val2)) +static struct task_struct __current; + +struct task_struct *get_current(void) +{ + return &__current; +} + +/* Helper function to simply allocate a VMA. */ static struct vm_area_struct *alloc_vma(struct mm_struct *mm, unsigned long start, unsigned long end, @@ -47,22 +85,201 @@ static struct vm_area_struct *alloc_vma(struct mm_struct *mm, return ret; } +/* Helper function to allocate a VMA and link it to the tree. */ +static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm, + unsigned long start, + unsigned long end, + pgoff_t pgoff, + vm_flags_t flags) +{ + struct vm_area_struct *vma = alloc_vma(mm, start, end, pgoff, flags); + + if (vma == NULL) + return NULL; + + if (vma_link(mm, vma)) { + vm_area_free(vma); + return NULL; + } + + /* + * Reset this counter which we use to track whether writes have + * begun. Linking to the tree will have caused this to be incremented, + * which means we will get a false positive otherwise. + */ + vma->vm_lock_seq = -1; + + return vma; +} + +/* Helper function which provides a wrapper around a merge new VMA operation. */ +static struct vm_area_struct *merge_new(struct vma_merge_struct *vmg) +{ + /* vma_merge() needs a VMA to determine mm, anon_vma, and file. */ + struct vm_area_struct dummy = { + .vm_mm = vmg->mm, + .vm_flags = vmg->flags, + .anon_vma = vmg->anon_vma, + .vm_file = vmg->file, + }; + + /* + * For convenience, get prev and next VMAs. Which the new VMA operation + * requires. + */ + vmg->next = vma_next(vmg->vmi); + vmg->prev = vma_prev(vmg->vmi); + + vma_iter_set(vmg->vmi, vmg->start); + return vma_merge_new_vma(vmg->vmi, vmg->prev, &dummy, vmg->start, + vmg->end, vmg->pgoff); +} + +/* + * Helper function which provides a wrapper around a merge existing VMA + * operation. + */ +static struct vm_area_struct *merge_existing(struct vma_merge_struct *vmg) +{ + /* vma_merge() needs a VMA to determine mm, anon_vma, and file. */ + struct vm_area_struct dummy = { + .vm_mm = vmg->mm, + .vm_flags = vmg->flags, + .anon_vma = vmg->anon_vma, + .vm_file = vmg->file, + }; + + return vma_merge(vmg->vmi, vmg->prev, &dummy, vmg->start, vmg->end, + vmg->flags, vmg->pgoff, vmg->policy, vmg->uffd_ctx, + vmg->anon_name); +} + +/* + * Helper function which provides a wrapper around the expansion of an existing + * VMA. + */ +static int expand_existing(struct vma_merge_struct *vmg) +{ + return vma_expand(vmg->vmi, vmg->vma, vmg->start, vmg->end, vmg->pgoff, + vmg->next); +} + +/* + * Helper function to reset merge state the associated VMA iterator to a + * specified new range. + */ +static void vmg_set_range(struct vma_merge_struct *vmg, unsigned long start, + unsigned long end, pgoff_t pgoff, vm_flags_t flags) +{ + vma_iter_set(vmg->vmi, start); + + vmg->prev = NULL; + vmg->next = NULL; + vmg->vma = NULL; + + vmg->start = start; + vmg->end = end; + vmg->pgoff = pgoff; + vmg->flags = flags; +} + +/* + * Helper function to try to merge a new VMA. 
+ * + * Update vmg and the iterator for it and try to merge, otherwise allocate a new + * VMA, link it to the maple tree and return it. + */ +static struct vm_area_struct *try_merge_new_vma(struct mm_struct *mm, + struct vma_merge_struct *vmg, + unsigned long start, unsigned long end, + pgoff_t pgoff, vm_flags_t flags, + bool *was_merged) +{ + struct vm_area_struct *merged; + + vmg_set_range(vmg, start, end, pgoff, flags); + + merged = merge_new(vmg); + if (merged) { + *was_merged = true; + return merged; + } + + *was_merged = false; + return alloc_and_link_vma(mm, start, end, pgoff, flags); +} + +/* + * Helper function to reset the dummy anon_vma to indicate it has not been + * duplicated. + */ +static void reset_dummy_anon_vma(void) +{ + dummy_anon_vma.was_cloned = false; + dummy_anon_vma.was_unlinked = false; +} + +/* + * Helper function to remove all VMAs and destroy the maple tree associated with + * a virtual address space. Returns a count of VMAs in the tree. + */ +static int cleanup_mm(struct mm_struct *mm, struct vma_iterator *vmi) +{ + struct vm_area_struct *vma; + int count = 0; + + fail_prealloc = false; + reset_dummy_anon_vma(); + + vma_iter_set(vmi, 0); + for_each_vma(*vmi, vma) { + vm_area_free(vma); + count++; + } + + mtree_destroy(&mm->mm_mt); + mm->map_count = 0; + return count; +} + +/* Helper function to determine if VMA has had vma_start_write() performed. */ +static bool vma_write_started(struct vm_area_struct *vma) +{ + int seq = vma->vm_lock_seq; + + /* We reset after each check. */ + vma->vm_lock_seq = -1; + + /* The vma_start_write() stub simply increments this value. */ + return seq > -1; +} + +/* Helper function providing a dummy vm_ops->close() method.*/ +static void dummy_close(struct vm_area_struct *) +{ +} + static bool test_simple_merge(void) { struct vm_area_struct *vma; unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; struct mm_struct mm = {}; struct vm_area_struct *vma_left = alloc_vma(&mm, 0, 0x1000, 0, flags); - struct vm_area_struct *vma_middle = alloc_vma(&mm, 0x1000, 0x2000, 1, flags); struct vm_area_struct *vma_right = alloc_vma(&mm, 0x2000, 0x3000, 2, flags); VMA_ITERATOR(vmi, &mm, 0x1000); + struct vma_merge_struct vmg = { + .mm = &mm, + .vmi = &vmi, + .start = 0x1000, + .end = 0x2000, + .flags = flags, + .pgoff = 1, + }; ASSERT_FALSE(vma_link(&mm, vma_left)); - ASSERT_FALSE(vma_link(&mm, vma_middle)); ASSERT_FALSE(vma_link(&mm, vma_right)); - vma = vma_merge_new_vma(&vmi, vma_left, vma_middle, 0x1000, - 0x2000, 1); + vma = merge_new(&vmg); ASSERT_NE(vma, NULL); ASSERT_EQ(vma->vm_start, 0); @@ -142,10 +359,17 @@ static bool test_simple_expand(void) struct mm_struct mm = {}; struct vm_area_struct *vma = alloc_vma(&mm, 0, 0x1000, 0, flags); VMA_ITERATOR(vmi, &mm, 0); + struct vma_merge_struct vmg = { + .vmi = &vmi, + .vma = vma, + .start = 0, + .end = 0x3000, + .pgoff = 0, + }; ASSERT_FALSE(vma_link(&mm, vma)); - ASSERT_FALSE(vma_expand(&vmi, vma, 0, 0x3000, 0, NULL)); + ASSERT_FALSE(expand_existing(&vmg)); ASSERT_EQ(vma->vm_start, 0); ASSERT_EQ(vma->vm_end, 0x3000); @@ -178,6 +402,1042 @@ static bool test_simple_shrink(void) return true; } +static bool test_merge_new(void) +{ + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + VMA_ITERATOR(vmi, &mm, 0); + struct vma_merge_struct vmg = { + .mm = &mm, + .vmi = &vmi, + }; + struct anon_vma_chain dummy_anon_vma_chain_a = { + .anon_vma = &dummy_anon_vma, + }; + struct anon_vma_chain dummy_anon_vma_chain_b = { + .anon_vma = 
&dummy_anon_vma, + }; + struct anon_vma_chain dummy_anon_vma_chain_c = { + .anon_vma = &dummy_anon_vma, + }; + struct anon_vma_chain dummy_anon_vma_chain_d = { + .anon_vma = &dummy_anon_vma, + }; + int count; + struct vm_area_struct *vma, *vma_a, *vma_b, *vma_c, *vma_d; + bool merged; + + /* + * 0123456789abc + * AA B CC + */ + vma_a = alloc_and_link_vma(&mm, 0, 0x2000, 0, flags); + ASSERT_NE(vma_a, NULL); + /* We give each VMA a single avc so we can test anon_vma duplication. */ + INIT_LIST_HEAD(&vma_a->anon_vma_chain); + list_add(&dummy_anon_vma_chain_a.same_vma, &vma_a->anon_vma_chain); + + vma_b = alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, flags); + ASSERT_NE(vma_b, NULL); + INIT_LIST_HEAD(&vma_b->anon_vma_chain); + list_add(&dummy_anon_vma_chain_b.same_vma, &vma_b->anon_vma_chain); + + vma_c = alloc_and_link_vma(&mm, 0xb000, 0xc000, 0xb, flags); + ASSERT_NE(vma_c, NULL); + INIT_LIST_HEAD(&vma_c->anon_vma_chain); + list_add(&dummy_anon_vma_chain_c.same_vma, &vma_c->anon_vma_chain); + + /* + * NO merge. + * + * 0123456789abc + * AA B ** CC + */ + vma_d = try_merge_new_vma(&mm, &vmg, 0x7000, 0x9000, 7, flags, &merged); + ASSERT_NE(vma_d, NULL); + INIT_LIST_HEAD(&vma_d->anon_vma_chain); + list_add(&dummy_anon_vma_chain_d.same_vma, &vma_d->anon_vma_chain); + ASSERT_FALSE(merged); + ASSERT_EQ(mm.map_count, 4); + + /* + * Merge BOTH sides. + * + * 0123456789abc + * AA*B DD CC + */ + vma_b->anon_vma = &dummy_anon_vma; + vma = try_merge_new_vma(&mm, &vmg, 0x2000, 0x3000, 2, flags, &merged); + ASSERT_EQ(vma, vma_a); + /* Merge with A, delete B. */ + ASSERT_TRUE(merged); + ASSERT_EQ(vma->vm_start, 0); + ASSERT_EQ(vma->vm_end, 0x4000); + ASSERT_EQ(vma->vm_pgoff, 0); + ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_write_started(vma)); + ASSERT_EQ(mm.map_count, 3); + + /* + * Merge to PREVIOUS VMA. + * + * 0123456789abc + * AAAA* DD CC + */ + vma = try_merge_new_vma(&mm, &vmg, 0x4000, 0x5000, 4, flags, &merged); + ASSERT_EQ(vma, vma_a); + /* Extend A. */ + ASSERT_TRUE(merged); + ASSERT_EQ(vma->vm_start, 0); + ASSERT_EQ(vma->vm_end, 0x5000); + ASSERT_EQ(vma->vm_pgoff, 0); + ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_write_started(vma)); + ASSERT_EQ(mm.map_count, 3); + + /* + * Merge to NEXT VMA. + * + * 0123456789abc + * AAAAA *DD CC + */ + vma_d->anon_vma = &dummy_anon_vma; + vma = try_merge_new_vma(&mm, &vmg, 0x6000, 0x7000, 6, flags, &merged); + ASSERT_EQ(vma, vma_d); + /* Prepend. */ + ASSERT_TRUE(merged); + ASSERT_EQ(vma->vm_start, 0x6000); + ASSERT_EQ(vma->vm_end, 0x9000); + ASSERT_EQ(vma->vm_pgoff, 6); + ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_write_started(vma)); + ASSERT_EQ(mm.map_count, 3); + + /* + * Merge BOTH sides. + * + * 0123456789abc + * AAAAA*DDD CC + */ + vma = try_merge_new_vma(&mm, &vmg, 0x5000, 0x6000, 5, flags, &merged); + ASSERT_EQ(vma, vma_a); + /* Merge with A, delete D. */ + ASSERT_TRUE(merged); + ASSERT_EQ(vma->vm_start, 0); + ASSERT_EQ(vma->vm_end, 0x9000); + ASSERT_EQ(vma->vm_pgoff, 0); + ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_write_started(vma)); + ASSERT_EQ(mm.map_count, 2); + + /* + * Merge to NEXT VMA. + * + * 0123456789abc + * AAAAAAAAA *CC + */ + vma_c->anon_vma = &dummy_anon_vma; + vma = try_merge_new_vma(&mm, &vmg, 0xa000, 0xb000, 0xa, flags, &merged); + ASSERT_EQ(vma, vma_c); + /* Prepend C. 
*/ + ASSERT_TRUE(merged); + ASSERT_EQ(vma->vm_start, 0xa000); + ASSERT_EQ(vma->vm_end, 0xc000); + ASSERT_EQ(vma->vm_pgoff, 0xa); + ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_write_started(vma)); + ASSERT_EQ(mm.map_count, 2); + + /* + * Merge BOTH sides. + * + * 0123456789abc + * AAAAAAAAA*CCC + */ + vma = try_merge_new_vma(&mm, &vmg, 0x9000, 0xa000, 0x9, flags, &merged); + ASSERT_EQ(vma, vma_a); + /* Extend A and delete C. */ + ASSERT_TRUE(merged); + ASSERT_EQ(vma->vm_start, 0); + ASSERT_EQ(vma->vm_end, 0xc000); + ASSERT_EQ(vma->vm_pgoff, 0); + ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_write_started(vma)); + ASSERT_EQ(mm.map_count, 1); + + /* + * Final state. + * + * 0123456789abc + * AAAAAAAAAAAAA + */ + + count = 0; + vma_iter_set(&vmi, 0); + for_each_vma(vmi, vma) { + ASSERT_NE(vma, NULL); + ASSERT_EQ(vma->vm_start, 0); + ASSERT_EQ(vma->vm_end, 0xc000); + ASSERT_EQ(vma->vm_pgoff, 0); + ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); + + vm_area_free(vma); + count++; + } + + /* Should only have one VMA left (though freed) after all is done.*/ + ASSERT_EQ(count, 1); + + mtree_destroy(&mm.mm_mt); + return true; +} + +static bool test_vma_merge_special_flags(void) +{ + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + VMA_ITERATOR(vmi, &mm, 0); + struct vma_merge_struct vmg = { + .mm = &mm, + .vmi = &vmi, + }; + vm_flags_t special_flags[] = { VM_IO, VM_DONTEXPAND, VM_PFNMAP, VM_MIXEDMAP }; + vm_flags_t all_special_flags = 0; + int i; + struct vm_area_struct *vma_left, *vma; + + /* Make sure there aren't new VM_SPECIAL flags. */ + for (i = 0; i < ARRAY_SIZE(special_flags); i++) { + all_special_flags |= special_flags[i]; + } + ASSERT_EQ(all_special_flags, VM_SPECIAL); + + /* + * 01234 + * AAA + */ + vma_left = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + ASSERT_NE(vma_left, NULL); + + /* 1. Set up new VMA with special flag that would otherwise merge. */ + + /* + * 01234 + * AAA* + * + * This should merge if not for the VM_SPECIAL flag. + */ + vmg_set_range(&vmg, 0x3000, 0x4000, 3, flags); + for (i = 0; i < ARRAY_SIZE(special_flags); i++) { + vm_flags_t special_flag = special_flags[i]; + + vma_left->__vm_flags = flags | special_flag; + vmg.flags = flags | special_flag; + vma = merge_new(&vmg); + ASSERT_EQ(vma, NULL); + } + + /* 2. Modify VMA with special flag that would otherwise merge. */ + + /* + * 01234 + * AAAB + * + * Create a VMA to modify. + */ + vma = alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, flags); + ASSERT_NE(vma, NULL); + vmg.vma = vma; + + for (i = 0; i < ARRAY_SIZE(special_flags); i++) { + vm_flags_t special_flag = special_flags[i]; + + vma_left->__vm_flags = flags | special_flag; + vmg.flags = flags | special_flag; + vma = merge_existing(&vmg); + ASSERT_EQ(vma, NULL); + } + + cleanup_mm(&mm, &vmi); + return true; +} + +static bool test_vma_merge_with_close(void) +{ + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + VMA_ITERATOR(vmi, &mm, 0); + struct vma_merge_struct vmg = { + .mm = &mm, + .vmi = &vmi, + }; + const struct vm_operations_struct vm_ops = { + .close = dummy_close, + }; + struct vm_area_struct *vma_next = + alloc_and_link_vma(&mm, 0x2000, 0x3000, 2, flags); + struct vm_area_struct *vma; + + /* + * When we merge VMAs we sometimes have to delete others as part of the + * operation. 
+ * + * Considering the two possible adjacent VMAs to which a VMA can be + * merged: + * + * [ prev ][ vma ][ next ] + * + * In no case will we need to delete prev. If the operation is + * mergeable, then prev will be extended with one or both of vma and + * next deleted. + * + * As a result, during initial mergeability checks, only + * can_vma_merge_before() (which implies the VMA being merged with is + * 'next' as shown above) bothers to check to see whether the next VMA + * has a vm_ops->close() callback that will need to be called when + * removed. + * + * If it does, then we cannot merge as the resources that the close() + * operation potentially clears down are tied only to the existing VMA + * range and we have no way of extending those to the nearly merged one. + * + * We must consider two scenarios: + * + * A. + * + * vm_ops->close: - - !NULL + * [ prev ][ vma ][ next ] + * + * Where prev may or may not be present/mergeable. + * + * This is picked up by a specific check in can_vma_merge_before(). + * + * B. + * + * vm_ops->close: - !NULL + * [ prev ][ vma ] + * + * Where prev and vma are present and mergeable. + * + * This is picked up by a specific check in the modified VMA merge. + * + * IMPORTANT NOTE: We make the assumption that the following case: + * + * - !NULL NULL + * [ prev ][ vma ][ next ] + * + * Cannot occur, because vma->vm_ops being the same implies the same + * vma->vm_file, and therefore this would mean that next->vm_ops->close + * would be set too, and thus scenario A would pick this up. + */ + + ASSERT_NE(vma_next, NULL); + + /* + * SCENARIO A + * + * 0123 + * *N + */ + + /* Make the next VMA have a close() callback. */ + vma_next->vm_ops = &vm_ops; + + /* Our proposed VMA has characteristics that would otherwise be merged. */ + vmg_set_range(&vmg, 0x1000, 0x2000, 1, flags); + + /* The next VMA having a close() operator should cause the merge to fail.*/ + ASSERT_EQ(merge_new(&vmg), NULL); + + /* Now create the VMA so we can merge via modified flags */ + vmg_set_range(&vmg, 0x1000, 0x2000, 1, flags); + vma = alloc_and_link_vma(&mm, 0x1000, 0x2000, 1, flags); + vmg.vma = vma; + + /* + * The VMA being modified in a way that would otherwise merge should + * also fail. + */ + ASSERT_EQ(merge_existing(&vmg), NULL); + + /* SCENARIO B + * + * 0123 + * P* + * + * In order for this scenario to trigger, the VMA currently being + * modified must also have a .close(). + */ + + /* Reset VMG state. */ + vmg_set_range(&vmg, 0x1000, 0x2000, 1, flags); + /* + * Make next unmergeable, and don't let the scenario A check pick this + * up, we want to reproduce scenario B only. + */ + vma_next->vm_ops = NULL; + vma_next->__vm_flags &= ~VM_MAYWRITE; + /* Allocate prev. */ + vmg.prev = alloc_and_link_vma(&mm, 0, 0x1000, 0, flags); + /* Assign a vm_ops->close() function to VMA explicitly. */ + vma->vm_ops = &vm_ops; + vmg.vma = vma; + /* Make sure merge does not occur. 
*/ + ASSERT_EQ(merge_existing(&vmg), NULL); + + cleanup_mm(&mm, &vmi); + return true; +} + +static bool test_vma_merge_new_with_close(void) +{ + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + VMA_ITERATOR(vmi, &mm, 0); + struct vma_merge_struct vmg = { + .mm = &mm, + .vmi = &vmi, + }; + struct vm_area_struct *vma_prev = alloc_and_link_vma(&mm, 0, 0x2000, 0, flags); + struct vm_area_struct *vma_next = alloc_and_link_vma(&mm, 0x5000, 0x7000, 5, flags); + const struct vm_operations_struct vm_ops = { + .close = dummy_close, + }; + struct vm_area_struct *vma; + + /* + * We should allow the partial merge of a proposed new VMA if the + * surrounding VMAs have vm_ops->close() hooks (but are otherwise + * compatible), e.g.: + * + * New VMA + * A v-------v B + * |-----| |-----| + * close close + * + * Since the rule is to not DELETE a VMA with a close operation, this + * should be permitted, only rather than expanding A and deleting B, we + * should simply expand A and leave B intact, e.g.: + * + * New VMA + * A B + * |------------||-----| + * close close + */ + + /* Have prev and next have a vm_ops->close() hook. */ + vma_prev->vm_ops = &vm_ops; + vma_next->vm_ops = &vm_ops; + + vmg_set_range(&vmg, 0x2000, 0x5000, 2, flags); + vma = merge_new(&vmg); + ASSERT_NE(vma, NULL); + ASSERT_EQ(vma->vm_start, 0); + ASSERT_EQ(vma->vm_end, 0x5000); + ASSERT_EQ(vma->vm_pgoff, 0); + ASSERT_EQ(vma->vm_ops, &vm_ops); + ASSERT_TRUE(vma_write_started(vma)); + ASSERT_EQ(mm.map_count, 2); + + cleanup_mm(&mm, &vmi); + return true; +} + +static bool test_merge_existing(void) +{ + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + VMA_ITERATOR(vmi, &mm, 0); + struct vm_area_struct *vma, *vma_prev, *vma_next; + struct vma_merge_struct vmg = { + .mm = &mm, + .vmi = &vmi, + }; + + /* + * Merge right case - partial span. + * + * <-> + * 0123456789 + * VVVVNNN + * -> + * 0123456789 + * VNNNNNN + */ + vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, flags); + vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, flags); + vmg_set_range(&vmg, 0x3000, 0x6000, 3, flags); + vmg.vma = vma; + vmg.prev = vma; + vma->anon_vma = &dummy_anon_vma; + ASSERT_EQ(merge_existing(&vmg), vma_next); + ASSERT_EQ(vma_next->vm_start, 0x3000); + ASSERT_EQ(vma_next->vm_end, 0x9000); + ASSERT_EQ(vma_next->vm_pgoff, 3); + ASSERT_EQ(vma_next->anon_vma, &dummy_anon_vma); + ASSERT_EQ(vma->vm_start, 0x2000); + ASSERT_EQ(vma->vm_end, 0x3000); + ASSERT_EQ(vma->vm_pgoff, 2); + ASSERT_TRUE(vma_write_started(vma)); + ASSERT_TRUE(vma_write_started(vma_next)); + ASSERT_EQ(mm.map_count, 2); + + /* Clear down and reset. */ + ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); + + /* + * Merge right case - full span. + * + * <--> + * 0123456789 + * VVVVNNN + * -> + * 0123456789 + * NNNNNNN + */ + vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, flags); + vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, flags); + vmg_set_range(&vmg, 0x2000, 0x6000, 2, flags); + vmg.vma = vma; + vma->anon_vma = &dummy_anon_vma; + ASSERT_EQ(merge_existing(&vmg), vma_next); + ASSERT_EQ(vma_next->vm_start, 0x2000); + ASSERT_EQ(vma_next->vm_end, 0x9000); + ASSERT_EQ(vma_next->vm_pgoff, 2); + ASSERT_EQ(vma_next->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_write_started(vma_next)); + ASSERT_EQ(mm.map_count, 1); + + /* Clear down and reset. We should have deleted vma. */ + ASSERT_EQ(cleanup_mm(&mm, &vmi), 1); + + /* + * Merge left case - partial span. 
+ * + * <-> + * 0123456789 + * PPPVVVV + * -> + * 0123456789 + * PPPPPPV + */ + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags); + vmg_set_range(&vmg, 0x3000, 0x6000, 3, flags); + vmg.prev = vma_prev; + vmg.vma = vma; + vma->anon_vma = &dummy_anon_vma; + + ASSERT_EQ(merge_existing(&vmg), vma_prev); + ASSERT_EQ(vma_prev->vm_start, 0); + ASSERT_EQ(vma_prev->vm_end, 0x6000); + ASSERT_EQ(vma_prev->vm_pgoff, 0); + ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); + ASSERT_EQ(vma->vm_start, 0x6000); + ASSERT_EQ(vma->vm_end, 0x7000); + ASSERT_EQ(vma->vm_pgoff, 6); + ASSERT_TRUE(vma_write_started(vma_prev)); + ASSERT_TRUE(vma_write_started(vma)); + ASSERT_EQ(mm.map_count, 2); + + /* Clear down and reset. */ + ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); + + /* + * Merge left case - full span. + * + * <--> + * 0123456789 + * PPPVVVV + * -> + * 0123456789 + * PPPPPPP + */ + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags); + vmg_set_range(&vmg, 0x3000, 0x7000, 3, flags); + vmg.prev = vma_prev; + vmg.vma = vma; + vma->anon_vma = &dummy_anon_vma; + ASSERT_EQ(merge_existing(&vmg), vma_prev); + ASSERT_EQ(vma_prev->vm_start, 0); + ASSERT_EQ(vma_prev->vm_end, 0x7000); + ASSERT_EQ(vma_prev->vm_pgoff, 0); + ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_write_started(vma_prev)); + ASSERT_EQ(mm.map_count, 1); + + /* Clear down and reset. We should have deleted vma. */ + ASSERT_EQ(cleanup_mm(&mm, &vmi), 1); + + /* + * Merge both case. + * + * <--> + * 0123456789 + * PPPVVVVNNN + * -> + * 0123456789 + * PPPPPPPPPP + */ + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags); + vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, flags); + vmg_set_range(&vmg, 0x3000, 0x7000, 3, flags); + vmg.prev = vma_prev; + vmg.vma = vma; + vma->anon_vma = &dummy_anon_vma; + ASSERT_EQ(merge_existing(&vmg), vma_prev); + ASSERT_EQ(vma_prev->vm_start, 0); + ASSERT_EQ(vma_prev->vm_end, 0x9000); + ASSERT_EQ(vma_prev->vm_pgoff, 0); + ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_write_started(vma_prev)); + ASSERT_EQ(mm.map_count, 1); + + /* Clear down and reset. We should have deleted prev and next. */ + ASSERT_EQ(cleanup_mm(&mm, &vmi), 1); + + /* + * Non-merge ranges. the modified VMA merge operation assumes that the + * caller always specifies ranges within the input VMA so we need only + * examine these cases. 
+ * + * - + * - + * - + * <-> + * <> + * <> + * 0123456789a + * PPPVVVVVNNN + */ + + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x8000, 3, flags); + vma_next = alloc_and_link_vma(&mm, 0x8000, 0xa000, 8, flags); + + vmg_set_range(&vmg, 0x4000, 0x5000, 4, flags); + vmg.prev = vma; + vmg.vma = vma; + ASSERT_EQ(merge_existing(&vmg), NULL); + + vmg_set_range(&vmg, 0x5000, 0x6000, 5, flags); + vmg.prev = vma; + vmg.vma = vma; + ASSERT_EQ(merge_existing(&vmg), NULL); + + vmg_set_range(&vmg, 0x6000, 0x7000, 6, flags); + vmg.prev = vma; + vmg.vma = vma; + ASSERT_EQ(merge_existing(&vmg), NULL); + + vmg_set_range(&vmg, 0x4000, 0x7000, 4, flags); + vmg.prev = vma; + vmg.vma = vma; + ASSERT_EQ(merge_existing(&vmg), NULL); + + vmg_set_range(&vmg, 0x4000, 0x6000, 4, flags); + vmg.prev = vma; + vmg.vma = vma; + ASSERT_EQ(merge_existing(&vmg), NULL); + + vmg_set_range(&vmg, 0x5000, 0x7000, 5, flags); + vmg.prev = vma; + vmg.vma = vma; + ASSERT_EQ(merge_existing(&vmg), NULL); + + ASSERT_EQ(cleanup_mm(&mm, &vmi), 3); + + return true; +} + +static bool test_anon_vma_non_mergeable(void) +{ + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + VMA_ITERATOR(vmi, &mm, 0); + struct vm_area_struct *vma, *vma_prev, *vma_next; + struct vma_merge_struct vmg = { + .mm = &mm, + .vmi = &vmi, + }; + struct anon_vma_chain dummy_anon_vma_chain1 = { + .anon_vma = &dummy_anon_vma, + }; + struct anon_vma_chain dummy_anon_vma_chain2 = { + .anon_vma = &dummy_anon_vma, + }; + + /* + * In the case of modified VMA merge, merging both left and right VMAs + * but where prev and next have incompatible anon_vma objects, we revert + * to a merge of prev and VMA: + * + * <--> + * 0123456789 + * PPPVVVVNNN + * -> + * 0123456789 + * PPPPPPPNNN + */ + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags); + vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, flags); + + /* + * Give prev and next a single anon_vma_chain entry each, so they will + * merge with the NULL vmg->anon_vma. + * + * However, when prev is compared to next, the merge should fail. + */ + + INIT_LIST_HEAD(&vma_prev->anon_vma_chain); + list_add(&dummy_anon_vma_chain1.same_vma, &vma_prev->anon_vma_chain); + ASSERT_TRUE(list_is_singular(&vma_prev->anon_vma_chain)); + vma_prev->anon_vma = &dummy_anon_vma; + ASSERT_TRUE(is_mergeable_anon_vma(NULL, vma_prev->anon_vma, vma_prev)); + + INIT_LIST_HEAD(&vma_next->anon_vma_chain); + list_add(&dummy_anon_vma_chain2.same_vma, &vma_next->anon_vma_chain); + ASSERT_TRUE(list_is_singular(&vma_next->anon_vma_chain)); + vma_next->anon_vma = (struct anon_vma *)2; + ASSERT_TRUE(is_mergeable_anon_vma(NULL, vma_next->anon_vma, vma_next)); + + ASSERT_FALSE(is_mergeable_anon_vma(vma_prev->anon_vma, vma_next->anon_vma, NULL)); + + vmg_set_range(&vmg, 0x3000, 0x7000, 3, flags); + vmg.prev = vma_prev; + vmg.vma = vma; + + ASSERT_EQ(merge_existing(&vmg), vma_prev); + ASSERT_EQ(vma_prev->vm_start, 0); + ASSERT_EQ(vma_prev->vm_end, 0x7000); + ASSERT_EQ(vma_prev->vm_pgoff, 0); + ASSERT_TRUE(vma_write_started(vma_prev)); + ASSERT_FALSE(vma_write_started(vma_next)); + + /* Clear down and reset. */ + ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); + + /* + * Now consider the new VMA case. This is equivalent, only adding a new + * VMA in a gap between prev and next.
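A note on the mechanism before the diagram: (struct anon_vma *)1 and (struct anon_vma *)2 used in these tests are opaque sentinel pointers that are only ever compared, never dereferenced. The predicate being exercised reduces, roughly, to the following simplified sketch of is_mergeable_anon_vma() (the real version additionally requires a singular anon_vma_chain before treating a NULL side as a wildcard):

        /* Simplified: a missing anon_vma is compatible with anything;
         * otherwise both sides must share the same anon_vma object. */
        static bool anon_vmas_compatible(struct anon_vma *a, struct anon_vma *b)
        {
                return !a || !b || a == b;
        }

The layout for the new-VMA variant of the test is: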
+ * + * <--> + * 0123456789 + * PPP****NNN + * -> + * 0123456789 + * PPPPPPPNNN + */ + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, flags); + + INIT_LIST_HEAD(&vma_prev->anon_vma_chain); + list_add(&dummy_anon_vma_chain1.same_vma, &vma_prev->anon_vma_chain); + vma_prev->anon_vma = (struct anon_vma *)1; + + INIT_LIST_HEAD(&vma_next->anon_vma_chain); + list_add(&dummy_anon_vma_chain2.same_vma, &vma_next->anon_vma_chain); + vma_next->anon_vma = (struct anon_vma *)2; + + vmg_set_range(&vmg, 0x3000, 0x7000, 3, flags); + vmg.prev = vma_prev; + + ASSERT_EQ(merge_new(&vmg), vma_prev); + ASSERT_EQ(vma_prev->vm_start, 0); + ASSERT_EQ(vma_prev->vm_end, 0x7000); + ASSERT_EQ(vma_prev->vm_pgoff, 0); + ASSERT_TRUE(vma_write_started(vma_prev)); + ASSERT_FALSE(vma_write_started(vma_next)); + + /* Final cleanup. */ + ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); + + return true; +} + +static bool test_dup_anon_vma(void) +{ + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + VMA_ITERATOR(vmi, &mm, 0); + struct vma_merge_struct vmg = { + .mm = &mm, + .vmi = &vmi, + }; + struct anon_vma_chain dummy_anon_vma_chain = { + .anon_vma = &dummy_anon_vma, + }; + struct vm_area_struct *vma_prev, *vma_next, *vma; + + reset_dummy_anon_vma(); + + /* + * Expanding a VMA to delete the next one duplicates next's anon_vma and + * assigns it to the expanded VMA. + * + * This covers new VMA merging, as these operations amount to a VMA + * expand. + */ + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma_next = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); + vma_next->anon_vma = &dummy_anon_vma; + + vmg_set_range(&vmg, 0, 0x5000, 0, flags); + vmg.vma = vma_prev; + vmg.next = vma_next; + + ASSERT_EQ(expand_existing(&vmg), 0); + + /* Will have been cloned. */ + ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_prev->anon_vma->was_cloned); + + /* Cleanup ready for next run. */ + cleanup_mm(&mm, &vmi); + + /* + * next has anon_vma, we assign to prev. + * + * |<----->| + * |-------*********-------| + * prev vma next + * extend delete delete + */ + + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, flags); + + /* Initialise avc so mergeability check passes. */ + INIT_LIST_HEAD(&vma_next->anon_vma_chain); + list_add(&dummy_anon_vma_chain.same_vma, &vma_next->anon_vma_chain); + + vma_next->anon_vma = &dummy_anon_vma; + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + vmg.prev = vma_prev; + vmg.vma = vma; + + ASSERT_EQ(merge_existing(&vmg), vma_prev); + + ASSERT_EQ(vma_prev->vm_start, 0); + ASSERT_EQ(vma_prev->vm_end, 0x8000); + + ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_prev->anon_vma->was_cloned); + + cleanup_mm(&mm, &vmi); + + /* + * vma has anon_vma, we assign to prev.
+ * + * |<----->| + * |-------*********-------| + * prev vma next + * extend delete delete + */ + + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, flags); + + vma->anon_vma = &dummy_anon_vma; + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + vmg.prev = vma_prev; + vmg.vma = vma; + + ASSERT_EQ(merge_existing(&vmg), vma_prev); + + ASSERT_EQ(vma_prev->vm_start, 0); + ASSERT_EQ(vma_prev->vm_end, 0x8000); + + ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_prev->anon_vma->was_cloned); + + cleanup_mm(&mm, &vmi); + + /* + * vma has anon_vma, we assign to prev. + * + * |<----->| + * |-------************* + * prev vma + * extend shrink/delete + */ + + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x8000, 3, flags); + + vma->anon_vma = &dummy_anon_vma; + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + vmg.prev = vma_prev; + vmg.vma = vma; + + ASSERT_EQ(merge_existing(&vmg), vma_prev); + + ASSERT_EQ(vma_prev->vm_start, 0); + ASSERT_EQ(vma_prev->vm_end, 0x5000); + + ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_prev->anon_vma->was_cloned); + + cleanup_mm(&mm, &vmi); + + /* + * vma has anon_vma, we assign to next. + * + * |<----->| + * *************-------| + * vma next + * shrink/delete extend + */ + + vma = alloc_and_link_vma(&mm, 0, 0x5000, 0, flags); + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, flags); + + vma->anon_vma = &dummy_anon_vma; + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + vmg.prev = vma; + vmg.vma = vma; + + ASSERT_EQ(merge_existing(&vmg), vma_next); + + ASSERT_EQ(vma_next->vm_start, 0x3000); + ASSERT_EQ(vma_next->vm_end, 0x8000); + + ASSERT_EQ(vma_next->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(vma_next->anon_vma->was_cloned); + + cleanup_mm(&mm, &vmi); + return true; +} + +static bool test_vmi_prealloc_fail(void) +{ + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + VMA_ITERATOR(vmi, &mm, 0); + struct vma_merge_struct vmg = { + .mm = &mm, + .vmi = &vmi, + }; + struct vm_area_struct *vma_prev, *vma; + + /* + * We are merging vma into prev, with vma possessing an anon_vma, which + * will be duplicated. We cause the vmi preallocation to fail and assert + * the duplicated anon_vma is unlinked. + */ + + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); + vma->anon_vma = &dummy_anon_vma; + + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + vmg.prev = vma_prev; + vmg.vma = vma; + + fail_prealloc = true; + + /* This will cause the merge to fail. */ + ASSERT_EQ(merge_existing(&vmg), NULL); + /* We will already have assigned the anon_vma. */ + ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); + /* And it was both cloned and unlinked. */ + ASSERT_TRUE(dummy_anon_vma.was_cloned); + ASSERT_TRUE(dummy_anon_vma.was_unlinked); + + cleanup_mm(&mm, &vmi); /* Resets fail_prealloc too. */ + + /* + * We repeat the same operation for expanding a VMA, which is what new + * VMA merging ultimately uses too. This asserts that unlinking is + * performed in this case too. 
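Both failure cases depend on the harness interposing on the iterator preallocation. As set up near the top of this file (before mm/vma.c is included), the override is approximately:

        static bool fail_prealloc;

        /* Harness override: optionally fail maple tree preallocation so
         * that merge/expand error paths can be exercised. */
        #define vma_iter_prealloc(vmi, vma) \
                (fail_prealloc ? -ENOMEM : mas_preallocate(&(vmi)->mas, (vma), GFP_KERNEL))

cleanup_mm() resets fail_prealloc, which is why each case re-arms it explicitly.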
+ */ + + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); + vma->anon_vma = &dummy_anon_vma; + + vmg_set_range(&vmg, 0, 0x5000, 3, flags); + vmg.vma = vma_prev; + vmg.next = vma; + + fail_prealloc = true; + ASSERT_EQ(expand_existing(&vmg), -ENOMEM); + + ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); + ASSERT_TRUE(dummy_anon_vma.was_cloned); + ASSERT_TRUE(dummy_anon_vma.was_unlinked); + + cleanup_mm(&mm, &vmi); + return true; +} + +static bool test_merge_extend(void) +{ + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + VMA_ITERATOR(vmi, &mm, 0x1000); + struct vm_area_struct *vma; + + vma = alloc_and_link_vma(&mm, 0, 0x1000, 0, flags); + alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, flags); + + /* + * Extend a VMA into the gap between itself and the following VMA. + * This should result in a merge. + * + * <-> + * * * + * + */ + + ASSERT_EQ(vma_merge_extend(&vmi, vma, 0x2000), vma); + ASSERT_EQ(vma->vm_start, 0); + ASSERT_EQ(vma->vm_end, 0x4000); + ASSERT_EQ(vma->vm_pgoff, 0); + ASSERT_TRUE(vma_write_started(vma)); + ASSERT_EQ(mm.map_count, 1); + + cleanup_mm(&mm, &vmi); + return true; +} + +static bool test_copy_vma(void) +{ + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + bool need_locks = false; + VMA_ITERATOR(vmi, &mm, 0); + struct vm_area_struct *vma, *vma_new, *vma_next; + + /* Move backwards and do not merge. */ + + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); + vma_new = copy_vma(&vma, 0, 0x2000, 0, &need_locks); + + ASSERT_NE(vma_new, vma); + ASSERT_EQ(vma_new->vm_start, 0); + ASSERT_EQ(vma_new->vm_end, 0x2000); + ASSERT_EQ(vma_new->vm_pgoff, 0); + + cleanup_mm(&mm, &vmi); + + /* Move a VMA into position next to another and merge the two. */ + + vma = alloc_and_link_vma(&mm, 0, 0x2000, 0, flags); + vma_next = alloc_and_link_vma(&mm, 0x6000, 0x8000, 6, flags); + vma_new = copy_vma(&vma, 0x4000, 0x2000, 4, &need_locks); + + ASSERT_EQ(vma_new, vma_next); + + cleanup_mm(&mm, &vmi); + return true; +} + int main(void) { int num_tests = 0, num_fail = 0; @@ -193,11 +1453,23 @@ int main(void) } \ } while (0) + /* Very simple tests to kick the tyres. */ TEST(simple_merge); TEST(simple_modify); TEST(simple_expand); TEST(simple_shrink); + TEST(merge_new); + TEST(vma_merge_special_flags); + TEST(vma_merge_with_close); + TEST(vma_merge_new_with_close); + TEST(merge_existing); + TEST(anon_vma_non_mergeable); + TEST(dup_anon_vma); + TEST(vmi_prealloc_fail); + TEST(merge_extend); + TEST(copy_vma); + #undef TEST printf("%d tests run, %d passed, %d failed.\n", diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 093560e5b2ac..a3c262c6eb73 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -81,8 +81,6 @@ #define AS_MM_ALL_LOCKS 2 -#define current NULL - /* We hardcode this for now. */ #define sysctl_max_map_count 0x1000000UL @@ -92,6 +90,12 @@ typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; typedef unsigned long vm_flags_t; typedef __bitwise unsigned int vm_fault_t; +/* + * The shared stubs do not implement this, it amounts to an fprintf(STDERR,...) + * either way :) + */ +#define pr_warn_once pr_err + typedef struct refcount_struct { atomic_t refs; } refcount_t; @@ -100,9 +104,30 @@ struct kref { refcount_t refcount; }; +/* + * Define the task command name length as enum, then it can be visible to + * BPF programs. 
*/ +enum { + TASK_COMM_LEN = 16, +}; + +struct task_struct { + char comm[TASK_COMM_LEN]; + pid_t pid; + struct mm_struct *mm; +}; + +struct task_struct *get_current(void); +#define current get_current() + struct anon_vma { struct anon_vma *root; struct rb_root_cached rb_root; + + /* Test fields. */ + bool was_cloned; + bool was_unlinked; }; struct anon_vma_chain { @@ -682,13 +707,21 @@ static inline int vma_dup_policy(struct vm_area_struct *, struct vm_area_struct return 0; } -static inline int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *) +static inline int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) { + /* For testing purposes, we indicate that an anon_vma has been cloned. */ + if (src->anon_vma != NULL) { + dst->anon_vma = src->anon_vma; + dst->anon_vma->was_cloned = true; + } + return 0; } -static inline void vma_start_write(struct vm_area_struct *) +static inline void vma_start_write(struct vm_area_struct *vma) { + /* Used to indicate to tests that a write operation has begun. */ + vma->vm_lock_seq++; } static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, @@ -759,8 +792,10 @@ static inline void vma_assert_write_locked(struct vm_area_struct *) { } -static inline void unlink_anon_vmas(struct vm_area_struct *) +static inline void unlink_anon_vmas(struct vm_area_struct *vma) { + /* For testing purposes, indicate that the anon_vma was unlinked. */ + vma->anon_vma->was_unlinked = true; } static inline void anon_vma_unlock_write(struct anon_vma *) -- cgit v1.2.3 From 2f1c6611b0a89afcb8641471af5f223c9caa01e0 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 30 Aug 2024 19:10:15 +0100 Subject: mm: introduce vma_merge_struct and abstract vma_merge(),vma_modify() Rather than passing around huge numbers of parameters to numerous helper functions, abstract them into a single struct that we thread through the operation, the vma_merge_struct ('vmg'). Adjust vma_merge() and vma_modify() to accept this parameter, as well as the predicate functions can_vma_merge_before(), can_vma_merge_after(), and the vma_modify_...() helper functions. Also introduce VMG_STATE() and VMG_VMA_STATE() helper macros to allow for easy vmg declaration. We additionally remove the requirement that vma_merge() is passed a VMA object representing the candidate new VMA. Previously it used this to obtain the mm_struct, file and anon_vma properties of the proposed range (a rather confusing state of affairs), which are now provided by the vmg directly. We also remove the pgoff calculation previously performed in vma_modify(), and instead calculate this in VMG_VMA_STATE() via the vma_pgoff_offset() helper. Link: https://lkml.kernel.org/r/a955aad09d81329f6fbeb636b2dd10cde7b73dab.1725040657.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Cc: Mark Brown Cc: Vlastimil Babka Cc: Bert Karwatzki Cc: Jeff Xu Cc: Jiri Olsa Cc: Kees Cook Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: "Paul E.
McKenney" Cc: Paul Moore Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/mmap.c | 76 ++++++++++-------- mm/vma.c | 207 +++++++++++++++++++++++++++++------------------- mm/vma.h | 127 +++++++++++++++++------------ tools/testing/vma/vma.c | 43 +--------- 4 files changed, 246 insertions(+), 207 deletions(-) (limited to 'tools') diff --git a/mm/mmap.c b/mm/mmap.c index fc726c4e98be..ca9c6939638b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1373,10 +1373,11 @@ unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long end = addr + len; unsigned long merge_start = addr, merge_end = end; bool writable_file_mapping = false; - pgoff_t vm_pgoff; int error = -ENOMEM; VMA_ITERATOR(vmi, mm, addr); + VMG_STATE(vmg, mm, &vmi, addr, end, vm_flags, pgoff); + vmg.file = file; /* Find the first overlapping VMA */ vma = vma_find(&vmi, end); init_vma_munmap(&vms, &vmi, vma, addr, end, uf, /* unlock = */ false); @@ -1389,12 +1390,12 @@ unsigned long mmap_region(struct file *file, unsigned long addr, if (error) goto gather_failed; - next = vms.next; - prev = vms.prev; + next = vmg.next = vms.next; + prev = vmg.prev = vms.prev; vma = NULL; } else { - next = vma_next(&vmi); - prev = vma_prev(&vmi); + next = vmg.next = vma_next(&vmi); + prev = vmg.prev = vma_prev(&vmi); if (prev) vma_iter_next_range(&vmi); } @@ -1414,6 +1415,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vms.nr_accounted = 0; vm_flags |= VM_ACCOUNT; + vmg.flags = vm_flags; } if (vm_flags & VM_SPECIAL) @@ -1422,28 +1424,31 @@ unsigned long mmap_region(struct file *file, unsigned long addr, /* Attempt to expand an old mapping */ /* Check next */ if (next && next->vm_start == end && !vma_policy(next) && - can_vma_merge_before(next, vm_flags, NULL, file, pgoff+pglen, - NULL_VM_UFFD_CTX, NULL)) { + can_vma_merge_before(&vmg)) { merge_end = next->vm_end; vma = next; - vm_pgoff = next->vm_pgoff - pglen; + vmg.pgoff = next->vm_pgoff - pglen; + /* + * We set this here so if we will merge with the previous VMA in + * the code below, can_vma_merge_after() ensures anon_vma + * compatibility between prev and next. + */ + vmg.anon_vma = vma->anon_vma; + vmg.uffd_ctx = vma->vm_userfaultfd_ctx; } /* Check prev */ if (prev && prev->vm_end == addr && !vma_policy(prev) && - (vma ? can_vma_merge_after(prev, vm_flags, vma->anon_vma, file, - pgoff, vma->vm_userfaultfd_ctx, NULL) : - can_vma_merge_after(prev, vm_flags, NULL, file, pgoff, - NULL_VM_UFFD_CTX, NULL))) { + can_vma_merge_after(&vmg)) { merge_start = prev->vm_start; vma = prev; - vm_pgoff = prev->vm_pgoff; + vmg.pgoff = prev->vm_pgoff; vma_prev(&vmi); /* Equivalent to going to the previous range */ } if (vma) { /* Actually expand, if possible */ - if (!vma_expand(&vmi, vma, merge_start, merge_end, vm_pgoff, next)) { + if (!vma_expand(&vmi, vma, merge_start, merge_end, vmg.pgoff, next)) { khugepaged_enter_vma(vma, vm_flags); goto expanded; } @@ -1774,26 +1779,29 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, * Expand the existing vma if possible; Note that singular lists do not * occur after forking, so the expand will only happen on new VMAs. 
*/ - if (vma && vma->vm_end == addr && !vma_policy(vma) && - can_vma_merge_after(vma, flags, NULL, NULL, - addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) { - vma_iter_config(vmi, vma->vm_start, addr + len); - if (vma_iter_prealloc(vmi, vma)) - goto unacct_fail; - - vma_start_write(vma); - - init_vma_prep(&vp, vma); - vma_prepare(&vp); - vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); - vma->vm_end = addr + len; - vm_flags_set(vma, VM_SOFTDIRTY); - vma_iter_store(vmi, vma); - - vma_complete(&vp, vmi, mm); - validate_mm(mm); - khugepaged_enter_vma(vma, flags); - goto out; + if (vma && vma->vm_end == addr && !vma_policy(vma)) { + VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr)); + + vmg.prev = vma; + if (can_vma_merge_after(&vmg)) { + vma_iter_config(vmi, vma->vm_start, addr + len); + if (vma_iter_prealloc(vmi, vma)) + goto unacct_fail; + + vma_start_write(vma); + + init_vma_prep(&vp, vma); + vma_prepare(&vp); + vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); + vma->vm_end = addr + len; + vm_flags_set(vma, VM_SOFTDIRTY); + vma_iter_store(vmi, vma); + + vma_complete(&vp, vmi, mm); + validate_mm(mm); + khugepaged_enter_vma(vma, flags); + goto out; + } } if (vma) diff --git a/mm/vma.c b/mm/vma.c index 1736bb237b2c..6be645240f07 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -7,16 +7,18 @@ #include "vma_internal.h" #include "vma.h" -/* - * If the vma has a ->close operation then the driver probably needs to release - * per-vma resources, so we don't attempt to merge those if the caller indicates - * the current vma may be removed as part of the merge. - */ -static inline bool is_mergeable_vma(struct vm_area_struct *vma, - struct file *file, unsigned long vm_flags, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - struct anon_vma_name *anon_name, bool may_remove_vma) +static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next) { + struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev; + /* + * If the vma has a ->close operation then the driver probably needs to + * release per-vma resources, so we don't attempt to merge those if the + * caller indicates the current vma may be removed as part of the merge, + * which is the case if we are attempting to merge the next VMA into + * this one. + */ + bool may_remove_vma = merge_next; + /* * VM_SOFTDIRTY should not prevent from VMA merging, if we * match the flags but dirty bit -- the caller should mark @@ -25,15 +27,15 @@ static inline bool is_mergeable_vma(struct vm_area_struct *vma, * the kernel to generate new VMAs when old one could be * extended instead. */ - if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY) + if ((vma->vm_flags ^ vmg->flags) & ~VM_SOFTDIRTY) return false; - if (vma->vm_file != file) + if (vma->vm_file != vmg->file) return false; if (may_remove_vma && vma->vm_ops && vma->vm_ops->close) return false; - if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx)) + if (!is_mergeable_vm_userfaultfd_ctx(vma, vmg->uffd_ctx)) return false; - if (!anon_vma_name_eq(anon_vma_name(vma), anon_name)) + if (!anon_vma_name_eq(anon_vma_name(vma), vmg->anon_name)) return false; return true; } @@ -94,16 +96,16 @@ static void init_multi_vma_prep(struct vma_prepare *vp, * We assume the vma may be removed as part of the merge. 
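Both predicates now amount to an attribute compatibility check (flags modulo VM_SOFTDIRTY, backing file, uffd context, anon name, anon_vma) plus a pgoff adjacency check. The adjacency conditions, stated on their own (an illustrative restatement of the logic in the two functions below, not part of the patch):

        /* For a proposed range [start, end) at file offset pgoff: */
        static bool pgoff_adjacent_to_next(struct vm_area_struct *next,
                                           pgoff_t pgoff, pgoff_t pglen)
        {
                return next->vm_pgoff == pgoff + pglen; /* next continues the range */
        }

        static bool pgoff_adjacent_to_prev(struct vm_area_struct *prev,
                                           pgoff_t pgoff)
        {
                return prev->vm_pgoff + vma_pages(prev) == pgoff; /* range continues prev */
        }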
*/ bool -can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - struct anon_vma_name *anon_name) +can_vma_merge_before(struct vma_merge_struct *vmg) { - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, true) && - is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { - if (vma->vm_pgoff == vm_pgoff) + pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start); + + if (is_mergeable_vma(vmg, /* merge_next = */ true) && + is_mergeable_anon_vma(vmg->anon_vma, vmg->next->anon_vma, vmg->next)) { + if (vmg->next->vm_pgoff == vmg->pgoff + pglen) return true; } + return false; } @@ -116,18 +118,11 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, * * We assume that vma is not removed as part of the merge. */ -bool -can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - struct anon_vma_name *anon_name) +bool can_vma_merge_after(struct vma_merge_struct *vmg) { - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, false) && - is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { - pgoff_t vm_pglen; - - vm_pglen = vma_pages(vma); - if (vma->vm_pgoff + vm_pglen == vm_pgoff) + if (is_mergeable_vma(vmg, /* merge_next = */ false) && + is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) { + if (vmg->prev->vm_pgoff + vma_pages(vmg->prev) == vmg->pgoff) return true; } return false; @@ -1017,16 +1012,10 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, * **** is not represented - it will be merged and the vma containing the * area is returned, or the function will return NULL */ -static struct vm_area_struct -*vma_merge(struct vma_iterator *vmi, struct vm_area_struct *prev, - struct vm_area_struct *src, unsigned long addr, unsigned long end, - unsigned long vm_flags, pgoff_t pgoff, struct mempolicy *policy, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - struct anon_vma_name *anon_name) +static struct vm_area_struct *vma_merge(struct vma_merge_struct *vmg) { - struct mm_struct *mm = src->vm_mm; - struct anon_vma *anon_vma = src->anon_vma; - struct file *file = src->vm_file; + struct mm_struct *mm = vmg->mm; + struct vm_area_struct *prev = vmg->prev; struct vm_area_struct *curr, *next, *res; struct vm_area_struct *vma, *adjust, *remove, *remove2; struct vm_area_struct *anon_dup = NULL; @@ -1036,16 +1025,18 @@ static struct vm_area_struct bool merge_prev = false; bool merge_next = false; bool vma_expanded = false; + unsigned long addr = vmg->start; + unsigned long end = vmg->end; unsigned long vma_start = addr; unsigned long vma_end = end; - pgoff_t pglen = (end - addr) >> PAGE_SHIFT; + pgoff_t pglen = PHYS_PFN(end - addr); long adj_start = 0; /* * We later require that vma->vm_flags == vm_flags, * so this tests vma->vm_flags & VM_SPECIAL, too. */ - if (vm_flags & VM_SPECIAL) + if (vmg->flags & VM_SPECIAL) return NULL; /* Does the input range span an existing VMA? 
(cases 5 - 8) */ @@ -1053,27 +1044,26 @@ static struct vm_area_struct if (!curr || /* cases 1 - 4 */ end == curr->vm_end) /* cases 6 - 8, adjacent VMA */ - next = vma_lookup(mm, end); + next = vmg->next = vma_lookup(mm, end); else - next = NULL; /* case 5 */ + next = vmg->next = NULL; /* case 5 */ if (prev) { vma_start = prev->vm_start; vma_pgoff = prev->vm_pgoff; /* Can we merge the predecessor? */ - if (addr == prev->vm_end && mpol_equal(vma_policy(prev), policy) - && can_vma_merge_after(prev, vm_flags, anon_vma, file, - pgoff, vm_userfaultfd_ctx, anon_name)) { + if (addr == prev->vm_end && mpol_equal(vma_policy(prev), vmg->policy) + && can_vma_merge_after(vmg)) { + merge_prev = true; - vma_prev(vmi); + vma_prev(vmg->vmi); } } /* Can we merge the successor? */ - if (next && mpol_equal(policy, vma_policy(next)) && - can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, - vm_userfaultfd_ctx, anon_name)) { + if (next && mpol_equal(vmg->policy, vma_policy(next)) && + can_vma_merge_before(vmg)) { merge_next = true; } @@ -1164,13 +1154,13 @@ static struct vm_area_struct vma_expanded = true; if (vma_expanded) { - vma_iter_config(vmi, vma_start, vma_end); + vma_iter_config(vmg->vmi, vma_start, vma_end); } else { - vma_iter_config(vmi, adjust->vm_start + adj_start, + vma_iter_config(vmg->vmi, adjust->vm_start + adj_start, adjust->vm_end); } - if (vma_iter_prealloc(vmi, vma)) + if (vma_iter_prealloc(vmg->vmi, vma)) goto prealloc_fail; init_multi_vma_prep(&vp, vma, adjust, remove, remove2); @@ -1182,20 +1172,20 @@ static struct vm_area_struct vma_set_range(vma, vma_start, vma_end, vma_pgoff); if (vma_expanded) - vma_iter_store(vmi, vma); + vma_iter_store(vmg->vmi, vma); if (adj_start) { adjust->vm_start += adj_start; adjust->vm_pgoff += adj_start >> PAGE_SHIFT; if (adj_start < 0) { WARN_ON(vma_expanded); - vma_iter_store(vmi, next); + vma_iter_store(vmg->vmi, next); } } - vma_complete(&vp, vmi, mm); + vma_complete(&vp, vmg->vmi, mm); validate_mm(mm); - khugepaged_enter_vma(res, vm_flags); + khugepaged_enter_vma(res, vmg->flags); return res; prealloc_fail: @@ -1203,8 +1193,8 @@ prealloc_fail: unlink_anon_vmas(anon_dup); anon_vma_fail: - vma_iter_set(vmi, addr); - vma_iter_load(vmi); + vma_iter_set(vmg->vmi, addr); + vma_iter_load(vmg->vmi); return NULL; } @@ -1221,32 +1211,27 @@ anon_vma_fail: * The function returns either the merged VMA, the original VMA if a split was * required instead, or an error if the split failed. */ -struct vm_area_struct *vma_modify(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, unsigned long end, - unsigned long vm_flags, - struct mempolicy *policy, - struct vm_userfaultfd_ctx uffd_ctx, - struct anon_vma_name *anon_name) +static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg) { - pgoff_t pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); + struct vm_area_struct *vma = vmg->vma; struct vm_area_struct *merged; - merged = vma_merge(vmi, prev, vma, start, end, vm_flags, - pgoff, policy, uffd_ctx, anon_name); + /* First, try to merge. */ + merged = vma_merge(vmg); if (merged) return merged; - if (vma->vm_start < start) { - int err = split_vma(vmi, vma, start, 1); + /* Split any preceding portion of the VMA. */ + if (vma->vm_start < vmg->start) { + int err = split_vma(vmg->vmi, vma, vmg->start, 1); if (err) return ERR_PTR(err); } - if (vma->vm_end > end) { - int err = split_vma(vmi, vma, end, 0); + /* Split any trailing portion of the VMA. 
*/ + if (vma->vm_end > vmg->end) { + int err = split_vma(vmg->vmi, vma, vmg->end, 0); if (err) return ERR_PTR(err); @@ -1255,6 +1240,65 @@ struct vm_area_struct *vma_modify(struct vma_iterator *vmi, return vma; } +struct vm_area_struct *vma_modify_flags( + struct vma_iterator *vmi, struct vm_area_struct *prev, + struct vm_area_struct *vma, unsigned long start, unsigned long end, + unsigned long new_flags) +{ + VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); + + vmg.flags = new_flags; + + return vma_modify(&vmg); +} + +struct vm_area_struct +*vma_modify_flags_name(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + unsigned long new_flags, + struct anon_vma_name *new_name) +{ + VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); + + vmg.flags = new_flags; + vmg.anon_name = new_name; + + return vma_modify(&vmg); +} + +struct vm_area_struct +*vma_modify_policy(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct mempolicy *new_pol) +{ + VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); + + vmg.policy = new_pol; + + return vma_modify(&vmg); +} + +struct vm_area_struct +*vma_modify_flags_uffd(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, + unsigned long new_flags, + struct vm_userfaultfd_ctx new_ctx) +{ + VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); + + vmg.flags = new_flags; + vmg.uffd_ctx = new_ctx; + + return vma_modify(&vmg); +} + /* * Attempt to merge a newly mapped VMA with those adjacent to it. The caller * must ensure that [start, end) does not overlap any existing VMA. @@ -1264,8 +1308,11 @@ struct vm_area_struct struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff) { - return vma_merge(vmi, prev, vma, start, end, vma->vm_flags, pgoff, - vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); + + vmg.pgoff = pgoff; + + return vma_merge(&vmg); } /* @@ -1276,12 +1323,10 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long delta) { - pgoff_t pgoff = vma->vm_pgoff + vma_pages(vma); + VMG_VMA_STATE(vmg, vmi, vma, vma, vma->vm_end, vma->vm_end + delta); /* vma is specified as prev, so case 1 or 2 will apply. */ - return vma_merge(vmi, vma, vma, vma->vm_end, vma->vm_end + delta, - vma->vm_flags, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + return vma_merge(&vmg); } void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb) diff --git a/mm/vma.h b/mm/vma.h index 2a3a5c89a33b..e761713f855f 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -52,6 +52,59 @@ struct vma_munmap_struct { unsigned long data_vm; }; +/* Represents a VMA merge operation. */ +struct vma_merge_struct { + struct mm_struct *mm; + struct vma_iterator *vmi; + pgoff_t pgoff; + struct vm_area_struct *prev; + struct vm_area_struct *next; /* Modified by vma_merge(). */ + struct vm_area_struct *vma; /* Either a new VMA or the one being modified. */ + unsigned long start; + unsigned long end; + unsigned long flags; + struct file *file; + struct anon_vma *anon_vma; + struct mempolicy *policy; + struct vm_userfaultfd_ctx uffd_ctx; + struct anon_vma_name *anon_name; +}; + +/* Assumes addr >= vma->vm_start. 
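For the helper this comment introduces (vma_pgoff_offset(), defined immediately after), a worked example: given a VMA spanning [0x3000, 0x7000) with vm_pgoff == 3,

        /* Illustrative only: the pgoff of the page containing 0x5000. */
        pgoff_t pg = vma->vm_pgoff + PHYS_PFN(0x5000 - vma->vm_start);
        /* PHYS_PFN(0x2000) == 2, so pg == 5 */

which is exactly the value VMG_VMA_STATE() seeds into vmg->pgoff for a modification starting at 0x5000, matching the offsets asserted throughout the tests above.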
*/ +static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, + unsigned long addr) +{ + return vma->vm_pgoff + PHYS_PFN(addr - vma->vm_start); +} + +#define VMG_STATE(name, mm_, vmi_, start_, end_, flags_, pgoff_) \ + struct vma_merge_struct name = { \ + .mm = mm_, \ + .vmi = vmi_, \ + .start = start_, \ + .end = end_, \ + .flags = flags_, \ + .pgoff = pgoff_, \ + } + +#define VMG_VMA_STATE(name, vmi_, prev_, vma_, start_, end_) \ + struct vma_merge_struct name = { \ + .mm = vma_->vm_mm, \ + .vmi = vmi_, \ + .prev = prev_, \ + .next = NULL, \ + .vma = vma_, \ + .start = start_, \ + .end = end_, \ + .flags = vma_->vm_flags, \ + .pgoff = vma_pgoff_offset(vma_, start_), \ + .file = vma_->vm_file, \ + .anon_vma = vma_->anon_vma, \ + .policy = vma_policy(vma_), \ + .uffd_ctx = vma_->vm_userfaultfd_ctx, \ + .anon_name = anon_vma_name(vma_), \ + } + #ifdef CONFIG_DEBUG_VM_MAPLE_TREE void validate_mm(struct mm_struct *mm); #else @@ -212,80 +265,52 @@ void remove_vma(struct vm_area_struct *vma, bool unreachable, bool closed); void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, struct vm_area_struct *prev, struct vm_area_struct *next); -/* Required by mmap_region(). */ -bool -can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - struct anon_vma_name *anon_name); - -/* Required by mmap_region() and do_brk_flags(). */ -bool -can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - struct anon_vma_name *anon_name); - -struct vm_area_struct *vma_modify(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, unsigned long end, - unsigned long vm_flags, - struct mempolicy *policy, - struct vm_userfaultfd_ctx uffd_ctx, - struct anon_vma_name *anon_name); +/* + * Can we merge the VMA described by vmg into the following VMA vmg->next? + * + * Required by mmap_region(). + */ +bool can_vma_merge_before(struct vma_merge_struct *vmg); + +/* + * Can we merge the VMA described by vmg into the preceding VMA vmg->prev? + * + * Required by mmap_region() and do_brk_flags(). + */ +bool can_vma_merge_after(struct vma_merge_struct *vmg); /* We are about to modify the VMA's flags. */ -static inline struct vm_area_struct -*vma_modify_flags(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, unsigned long end, - unsigned long new_flags) -{ - return vma_modify(vmi, prev, vma, start, end, new_flags, - vma_policy(vma), vma->vm_userfaultfd_ctx, - anon_vma_name(vma)); -} +struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, + struct vm_area_struct *prev, struct vm_area_struct *vma, + unsigned long start, unsigned long end, + unsigned long new_flags); /* We are about to modify the VMA's flags and/or anon_name. */ -static inline struct vm_area_struct +struct vm_area_struct *vma_modify_flags_name(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long new_flags, - struct anon_vma_name *new_name) -{ - return vma_modify(vmi, prev, vma, start, end, new_flags, - vma_policy(vma), vma->vm_userfaultfd_ctx, new_name); -} + struct anon_vma_name *new_name); /* We are about to modify the VMA's memory policy. 
*/ -static inline struct vm_area_struct +struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, - struct mempolicy *new_pol) -{ - return vma_modify(vmi, prev, vma, start, end, vma->vm_flags, - new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma)); -} + struct mempolicy *new_pol); /* We are about to modify the VMA's flags and/or uffd context. */ -static inline struct vm_area_struct +struct vm_area_struct *vma_modify_flags_uffd(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long new_flags, - struct vm_userfaultfd_ctx new_ctx) -{ - return vma_modify(vmi, prev, vma, start, end, new_flags, - vma_policy(vma), new_ctx, anon_vma_name(vma)); -} + struct vm_userfaultfd_ctx new_ctx); struct vm_area_struct *vma_merge_new_vma(struct vma_iterator *vmi, struct vm_area_struct *prev, diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c index 71bd30d5da81..7a3f59186464 100644 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/vma.c @@ -22,26 +22,6 @@ static bool fail_prealloc; */ #include "../../../mm/vma.c" -/* - * Temporarily forward-ported from a future in which vmg's are used for merging. - */ -struct vma_merge_struct { - struct mm_struct *mm; - struct vma_iterator *vmi; - pgoff_t pgoff; - struct vm_area_struct *prev; - struct vm_area_struct *next; /* Modified by vma_merge(). */ - struct vm_area_struct *vma; /* Either a new VMA or the one being modified. */ - unsigned long start; - unsigned long end; - unsigned long flags; - struct file *file; - struct anon_vma *anon_vma; - struct mempolicy *policy; - struct vm_userfaultfd_ctx uffd_ctx; - struct anon_vma_name *anon_name; -}; - const struct vm_operations_struct vma_dummy_vm_ops; static struct anon_vma dummy_anon_vma; @@ -115,14 +95,6 @@ static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm, /* Helper function which provides a wrapper around a merge new VMA operation. */ static struct vm_area_struct *merge_new(struct vma_merge_struct *vmg) { - /* vma_merge() needs a VMA to determine mm, anon_vma, and file. */ - struct vm_area_struct dummy = { - .vm_mm = vmg->mm, - .vm_flags = vmg->flags, - .anon_vma = vmg->anon_vma, - .vm_file = vmg->file, - }; - /* * For convenience, get prev and next VMAs. Which the new VMA operation * requires. @@ -131,8 +103,7 @@ static struct vm_area_struct *merge_new(struct vma_merge_struct *vmg) vmg->prev = vma_prev(vmg->vmi); vma_iter_set(vmg->vmi, vmg->start); - return vma_merge_new_vma(vmg->vmi, vmg->prev, &dummy, vmg->start, - vmg->end, vmg->pgoff); + return vma_merge(vmg); } /* @@ -141,17 +112,7 @@ static struct vm_area_struct *merge_new(struct vma_merge_struct *vmg) */ static struct vm_area_struct *merge_existing(struct vma_merge_struct *vmg) { - /* vma_merge() needs a VMA to determine mm, anon_vma, and file. 
*/ - struct vm_area_struct dummy = { - .vm_mm = vmg->mm, - .vm_flags = vmg->flags, - .anon_vma = vmg->anon_vma, - .vm_file = vmg->file, - }; - - return vma_merge(vmg->vmi, vmg->prev, &dummy, vmg->start, vmg->end, - vmg->flags, vmg->pgoff, vmg->policy, vmg->uffd_ctx, - vmg->anon_name); + return vma_merge(vmg); } /* -- cgit v1.2.3 From fc21959f74bc1138b28e90a02ec224ab8626111e Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 30 Aug 2024 19:10:17 +0100 Subject: mm: abstract vma_expand() to use vma_merge_struct The purpose of the vmg is to thread merge state through functions and avoid egregious parameter lists. We expand this to vma_expand(), which is used for a number of merge cases. Accordingly, adjust its callers, mmap_region() and relocate_vma_down(), to use a vmg. An added purpose of this change is the ability in a future commit to perform all new VMA range merging using vma_expand(). Link: https://lkml.kernel.org/r/4bc8c9dbc9ca52452ef8e587b28fe555854ceb38.1725040657.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Cc: Mark Brown Cc: Vlastimil Babka Cc: Bert Karwatzki Cc: Jeff Xu Cc: Jiri Olsa Cc: Kees Cook Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: "Paul E. McKenney" Cc: Paul Moore Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/mmap.c | 15 ++++++++------- mm/vma.c | 39 +++++++++++++++++---------------------- mm/vma.h | 5 +---- tools/testing/vma/vma.c | 3 +-- 4 files changed, 27 insertions(+), 35 deletions(-) (limited to 'tools') diff --git a/mm/mmap.c b/mm/mmap.c index 3af8459e4e88..2b3006efd3fb 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1371,7 +1371,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr, struct ma_state mas_detach; struct maple_tree mt_detach; unsigned long end = addr + len; - unsigned long merge_start = addr, merge_end = end; bool writable_file_mapping = false; int error = -ENOMEM; VMA_ITERATOR(vmi, mm, addr); @@ -1424,8 +1423,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, /* Attempt to expand an old mapping */ /* Check next */ if (next && next->vm_start == end && can_vma_merge_before(&vmg)) { - merge_end = next->vm_end; - vma = next; + vmg.end = next->vm_end; + vma = vmg.vma = next; vmg.pgoff = next->vm_pgoff - pglen; /* * We set this here so if we will merge with the previous VMA in @@ -1438,15 +1437,15 @@ unsigned long mmap_region(struct file *file, unsigned long addr, /* Check prev */ if (prev && prev->vm_end == addr && can_vma_merge_after(&vmg)) { - merge_start = prev->vm_start; - vma = prev; + vmg.start = prev->vm_start; + vma = vmg.vma = prev; vmg.pgoff = prev->vm_pgoff; vma_prev(&vmi); /* Equivalent to going to the previous range */ } if (vma) { /* Actually expand, if possible */ - if (!vma_expand(&vmi, vma, merge_start, merge_end, vmg.pgoff, next)) { + if (!vma_expand(&vmg)) { khugepaged_enter_vma(vma, vm_flags); goto expanded; } @@ -2320,6 +2319,7 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift) unsigned long new_start = old_start - shift; unsigned long new_end = old_end - shift; VMA_ITERATOR(vmi, mm, new_start); + VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff); struct vm_area_struct *next; struct mmu_gather tlb; @@ -2336,7 +2336,8 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift) /* * cover the whole range: [new_start, old_end) */ - if (vma_expand(&vmi, vma, new_start, old_end, vma->vm_pgoff, NULL)) + vmg.vma = vma; + if (vma_expand(&vmg)) return -ENOMEM; /* diff --git 
a/mm/vma.c b/mm/vma.c index 3284bb778c3d..d1033dade70e 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -467,30 +467,25 @@ void validate_mm(struct mm_struct *mm) /* * vma_expand - Expand an existing VMA * - * @vmi: The vma iterator - * @vma: The vma to expand - * @start: The start of the vma - * @end: The exclusive end of the vma - * @pgoff: The page offset of vma - * @next: The current of next vma. + * @vmg: Describes a VMA expansion operation. * - * Expand @vma to @start and @end. Can expand off the start and end. Will - * expand over @next if it's different from @vma and @end == @next->vm_end. - * Checking if the @vma can expand and merge with @next needs to be handled by - * the caller. + * Expand @vma to vmg->start and vmg->end. Can expand off the start and end. + * Will expand over vmg->next if it's different from vmg->vma and vmg->end == + * vmg->next->vm_end. Checking if the vmg->vma can expand and merge with + * vmg->next needs to be handled by the caller. * * Returns: 0 on success */ -int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, pgoff_t pgoff, - struct vm_area_struct *next) +int vma_expand(struct vma_merge_struct *vmg) { struct vm_area_struct *anon_dup = NULL; bool remove_next = false; + struct vm_area_struct *vma = vmg->vma; + struct vm_area_struct *next = vmg->next; struct vma_prepare vp; vma_start_write(vma); - if (next && (vma != next) && (end == next->vm_end)) { + if (next && (vma != next) && (vmg->end == next->vm_end)) { int ret; remove_next = true; @@ -503,21 +498,21 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, init_multi_vma_prep(&vp, vma, NULL, remove_next ? next : NULL, NULL); /* Not merging but overwriting any part of next is not handled. */ VM_WARN_ON(next && !vp.remove && - next != vma && end > next->vm_start); + next != vma && vmg->end > next->vm_start); /* Only handles expanding */ - VM_WARN_ON(vma->vm_start < start || vma->vm_end > end); + VM_WARN_ON(vma->vm_start < vmg->start || vma->vm_end > vmg->end); /* Note: vma iterator must be pointing to 'start' */ - vma_iter_config(vmi, start, end); - if (vma_iter_prealloc(vmi, vma)) + vma_iter_config(vmg->vmi, vmg->start, vmg->end); + if (vma_iter_prealloc(vmg->vmi, vma)) goto nomem; vma_prepare(&vp); - vma_adjust_trans_huge(vma, start, end, 0); - vma_set_range(vma, start, end, pgoff); - vma_iter_store(vmi, vma); + vma_adjust_trans_huge(vma, vmg->start, vmg->end, 0); + vma_set_range(vma, vmg->start, vmg->end, vmg->pgoff); + vma_iter_store(vmg->vmi, vma); - vma_complete(&vp, vmi, vma->vm_mm); + vma_complete(&vp, vmg->vmi, vma->vm_mm); return 0; nomem: diff --git a/mm/vma.h b/mm/vma.h index e761713f855f..218d884ff5ff 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -128,10 +128,7 @@ void init_vma_prep(struct vma_prepare *vp, void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi, struct mm_struct *mm); -int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, pgoff_t pgoff, - struct vm_area_struct *next); - +int vma_expand(struct vma_merge_struct *vmg); int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff); diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c index 7a3f59186464..f6c4706a861f 100644 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/vma.c @@ -121,8 +121,7 @@ static struct vm_area_struct *merge_existing(struct vma_merge_struct *vmg) */ static int expand_existing(struct vma_merge_struct 
*vmg) { - return vma_expand(vmg->vmi, vmg->vma, vmg->start, vmg->end, vmg->pgoff, - vmg->next); + return vma_expand(vmg); } /* -- cgit v1.2.3 From cacded5e42b9609b07b22d80c10f0076d439f7d1 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 30 Aug 2024 19:10:18 +0100 Subject: mm: avoid using vma_merge() for new VMAs Abstract vma_merge_new_vma() to use vma_merge_struct and rename the resultant function to vma_merge_new_range(), making clear what the purpose of this function is - a new VMA is desired in the specified range, and we wish to see if it is possible to 'merge' surrounding VMAs into this range rather than having to allocate a new VMA. Note that this function uses vma_expand() exclusively, and so adopts its requirement that the iterator point at or before the gap. We add an assert to this effect. This is as opposed to vma_merge_existing_range(), which will be introduced in a subsequent commit and will provide the same functionality for cases in which we are modifying an existing VMA. In mmap_region() and do_brk_flags() we open code scenarios where we prefer to use vma_expand() rather than invoke a full vma_merge() operation. Abstract this logic and eliminate all of the open-coding, and also use the same logic for all cases where we add new VMAs so that, rather than ultimately using vma_merge(), we use vma_expand(). Doing so removes duplication and simplifies VMA merging in all such cases, laying the ground for us to eliminate the merging of new VMAs in vma_merge() altogether. Also add the ability for the vmg to track state and to report errors, allowing us to differentiate a failed merge from an inability to allocate memory in callers. This makes it far easier to understand what is happening in these cases, avoiding confusion and bugs, and allowing for future optimisation. Also introduce vma_iter_next_rewind() to allow for retrieval of the next, and (optionally) the prev VMA, rewinding to the start of the previous gap. Introduce are_anon_vmas_compatible() to abstract individual VMA anon_vma comparison for the case of merging on both sides where the anon_vma of the VMA being merged may be compatible with prev and next, but prev's and next's anon_vmas may not be compatible with each other. Finally, also introduce can_vma_merge_left() / can_vma_merge_right() to check adjacent VMA compatibility and that they are indeed adjacent. Link: https://lkml.kernel.org/r/49d37c0769b6b9dc03b27fe4d059173832556392.1725040657.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Tested-by: Mark Brown Cc: Liam R. Howlett Cc: Vlastimil Babka Cc: Bert Karwatzki Cc: Jeff Xu Cc: Jiri Olsa Cc: Kees Cook Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: "Paul E.
McKenney" Cc: Paul Moore Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/mmap.c | 92 ++++-------------- mm/vma.c | 200 ++++++++++++++++++++++++++++++++++----- mm/vma.h | 48 +++++++++- tools/testing/vma/vma.c | 33 ++++++- tools/testing/vma/vma_internal.h | 6 ++ 5 files changed, 279 insertions(+), 100 deletions(-) (limited to 'tools') diff --git a/mm/mmap.c b/mm/mmap.c index 2b3006efd3fb..02f7b45c3076 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1364,8 +1364,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; - struct vm_area_struct *next, *prev, *merge; pgoff_t pglen = PHYS_PFN(len); + struct vm_area_struct *merge; unsigned long charged = 0; struct vma_munmap_struct vms; struct ma_state mas_detach; @@ -1389,14 +1389,11 @@ unsigned long mmap_region(struct file *file, unsigned long addr, if (error) goto gather_failed; - next = vmg.next = vms.next; - prev = vmg.prev = vms.prev; + vmg.next = vms.next; + vmg.prev = vms.prev; vma = NULL; } else { - next = vmg.next = vma_next(&vmi); - prev = vmg.prev = vma_prev(&vmi); - if (prev) - vma_iter_next_range(&vmi); + vmg.next = vma_iter_next_rewind(&vmi, &vmg.prev); } /* Check against address space limit. */ @@ -1417,46 +1414,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vmg.flags = vm_flags; } - if (vm_flags & VM_SPECIAL) - goto cannot_expand; - - /* Attempt to expand an old mapping */ - /* Check next */ - if (next && next->vm_start == end && can_vma_merge_before(&vmg)) { - vmg.end = next->vm_end; - vma = vmg.vma = next; - vmg.pgoff = next->vm_pgoff - pglen; - /* - * We set this here so if we will merge with the previous VMA in - * the code below, can_vma_merge_after() ensures anon_vma - * compatibility between prev and next. - */ - vmg.anon_vma = vma->anon_vma; - vmg.uffd_ctx = vma->vm_userfaultfd_ctx; - } - - /* Check prev */ - if (prev && prev->vm_end == addr && can_vma_merge_after(&vmg)) { - vmg.start = prev->vm_start; - vma = vmg.vma = prev; - vmg.pgoff = prev->vm_pgoff; - vma_prev(&vmi); /* Equivalent to going to the previous range */ - } - - if (vma) { - /* Actually expand, if possible */ - if (!vma_expand(&vmg)) { - khugepaged_enter_vma(vma, vm_flags); - goto expanded; - } - - /* If the expand fails, then reposition the vma iterator */ - if (unlikely(vma == prev)) - vma_iter_set(&vmi, addr); - } - -cannot_expand: - + vma = vma_merge_new_range(&vmg); + if (vma) + goto expanded; /* * Determine the object being mapped and call the appropriate * specific mapper. the address has already been validated, but @@ -1503,10 +1463,11 @@ cannot_expand: * If vm_flags changed after call_mmap(), we should try merge * vma again as we may succeed this time. */ - if (unlikely(vm_flags != vma->vm_flags && prev)) { - merge = vma_merge_new_vma(&vmi, prev, vma, - vma->vm_start, vma->vm_end, - vma->vm_pgoff); + if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) { + vmg.flags = vma->vm_flags; + /* If this fails, state is reset ready for a reattempt. */ + merge = vma_merge_new_range(&vmg); + if (merge) { /* * ->mmap() can change vma->vm_file and fput @@ -1522,6 +1483,7 @@ cannot_expand: vm_flags = vma->vm_flags; goto unmap_writable; } + vma_iter_config(&vmi, addr, end); } vm_flags = vma->vm_flags; @@ -1554,7 +1516,7 @@ cannot_expand: vma_link_file(vma); /* - * vma_merge() calls khugepaged_enter_vma() either, the below + * vma_merge_new_range() calls khugepaged_enter_vma() too, the below * call covers the non-merge case. 
*/ khugepaged_enter_vma(vma, vma->vm_flags); @@ -1609,7 +1571,7 @@ unmap_and_free_vma: vma_iter_set(&vmi, vma->vm_end); /* Undo any partial mapping done by a device driver. */ - unmap_region(&vmi.mas, vma, prev, next); + unmap_region(&vmi.mas, vma, vmg.prev, vmg.next); } if (writable_file_mapping) mapping_unmap_writable(file->f_mapping); @@ -1756,7 +1718,6 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, unsigned long len, unsigned long flags) { struct mm_struct *mm = current->mm; - struct vma_prepare vp; /* * Check against address space limits by the changed size @@ -1780,25 +1741,12 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr)); vmg.prev = vma; - if (can_vma_merge_after(&vmg)) { - vma_iter_config(vmi, vma->vm_start, addr + len); - if (vma_iter_prealloc(vmi, vma)) - goto unacct_fail; - - vma_start_write(vma); - - init_vma_prep(&vp, vma); - vma_prepare(&vp); - vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); - vma->vm_end = addr + len; - vm_flags_set(vma, VM_SOFTDIRTY); - vma_iter_store(vmi, vma); - - vma_complete(&vp, vmi, mm); - validate_mm(mm); - khugepaged_enter_vma(vma, flags); + vma_iter_next_range(vmi); + + if (vma_merge_new_range(&vmg)) goto out; - } + else if (vmg_nomem(&vmg)) + goto unacct_fail; } if (vma) diff --git a/mm/vma.c b/mm/vma.c index d1033dade70e..749c4881fd60 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -55,6 +55,13 @@ static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1, return anon_vma1 == anon_vma2; } +/* Are the anon_vma's belonging to each VMA compatible with one another? */ +static inline bool are_anon_vmas_compatible(struct vm_area_struct *vma1, + struct vm_area_struct *vma2) +{ + return is_mergeable_anon_vma(vma1->anon_vma, vma2->anon_vma, NULL); +} + /* * init_multi_vma_prep() - Initializer for struct vma_prepare * @vp: The vma_prepare struct @@ -130,6 +137,44 @@ bool can_vma_merge_after(struct vma_merge_struct *vmg) return false; } +/* + * Can the proposed VMA be merged with the left (previous) VMA taking into + * account the start position of the proposed range. + */ +static bool can_vma_merge_left(struct vma_merge_struct *vmg) + +{ + return vmg->prev && vmg->prev->vm_end == vmg->start && + can_vma_merge_after(vmg); +} + +/* + * Can the proposed VMA be merged with the right (next) VMA taking into + * account the end position of the proposed range. + * + * In addition, if we can merge with the left VMA, ensure that left and right + * anon_vma's are also compatible. + */ +static bool can_vma_merge_right(struct vma_merge_struct *vmg, + bool can_merge_left) +{ + if (!vmg->next || vmg->end != vmg->next->vm_start || + !can_vma_merge_before(vmg)) + return false; + + if (!can_merge_left) + return true; + + /* + * If we can merge with prev (left) and next (right), indicating that + * each VMA's anon_vma is compatible with the proposed anon_vma, this + * does not mean prev and next are compatible with EACH OTHER. + * + * We therefore check this in addition to mergeability to either side. + */ + return are_anon_vmas_compatible(vmg->prev, vmg->next); +} + /* * Close a vm structure and free it. 
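Stepping back to the do_brk_flags() hunk above: it is the first consumer of the new state tracking, since a NULL return alone cannot distinguish "no merge possible" from allocation failure. The consuming pattern, restating that hunk with vmg_nomem() as defined in the vma.h hunk further down:

        if (vma_merge_new_range(&vmg))
                goto out;               /* merged into an adjacent VMA */
        else if (vmg_nomem(&vmg))
                goto unacct_fail;       /* -ENOMEM, not a mere incompatibility */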
*/ @@ -464,6 +509,111 @@ void validate_mm(struct mm_struct *mm) } #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ +/* + * vma_merge_new_range - Attempt to merge a new VMA into address space + * + * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end + * (exclusive), which we try to merge with any adjacent VMAs if possible. + * + * We are about to add a VMA to the address space starting at @vmg->start and + * ending at @vmg->end. There are three different possible scenarios: + * + * 1. There is a VMA with identical properties immediately adjacent to the + * proposed new VMA [@vmg->start, @vmg->end) either before or after it - + * EXPAND that VMA: + * + * Proposed: |-----| or |-----| + * Existing: |----| |----| + * + * 2. There are VMAs with identical properties immediately adjacent to the + * proposed new VMA [@vmg->start, @vmg->end) both before AND after it - + * EXPAND the former and REMOVE the latter: + * + * Proposed: |-----| + * Existing: |----| |----| + * + * 3. There are no VMAs immediately adjacent to the proposed new VMA or those + * VMAs do not have identical attributes - NO MERGE POSSIBLE. + * + * In instances where we can merge, this function returns the expanded VMA which + * will have its range adjusted accordingly and the underlying maple tree also + * adjusted. + * + * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer + * to the VMA we expanded. + * + * This function adjusts @vmg to provide @vmg->next if not already specified, + * and adjusts [@vmg->start, @vmg->end) to span the expanded range. + * + * ASSUMPTIONS: + * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. + * - The caller must have determined that [@vmg->start, @vmg->end) is empty, + other than VMAs that will be unmapped should the operation succeed. + * - The caller must have specified the previous vma in @vmg->prev. + * - The caller must have specified the next vma in @vmg->next. + * - The caller must have positioned the vmi at or before the gap. + */ +struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) +{ + struct vm_area_struct *prev = vmg->prev; + struct vm_area_struct *next = vmg->next; + unsigned long start = vmg->start; + unsigned long end = vmg->end; + pgoff_t pgoff = vmg->pgoff; + pgoff_t pglen = PHYS_PFN(end - start); + bool can_merge_left, can_merge_right; + + mmap_assert_write_locked(vmg->mm); + VM_WARN_ON(vmg->vma); + /* vmi must point at or before the gap. */ + VM_WARN_ON(vma_iter_addr(vmg->vmi) > end); + + vmg->state = VMA_MERGE_NOMERGE; + + /* Special VMAs are unmergeable, also if no prev/next. */ + if ((vmg->flags & VM_SPECIAL) || (!prev && !next)) + return NULL; + + can_merge_left = can_vma_merge_left(vmg); + can_merge_right = can_vma_merge_right(vmg, can_merge_left); + + /* If we can merge with the next VMA, adjust vmg accordingly. */ + if (can_merge_right) { + vmg->end = next->vm_end; + vmg->vma = next; + vmg->pgoff = next->vm_pgoff - pglen; + } + + /* If we can merge with the previous VMA, adjust vmg accordingly. */ + if (can_merge_left) { + vmg->start = prev->vm_start; + vmg->vma = prev; + vmg->pgoff = prev->vm_pgoff; + + vma_prev(vmg->vmi); /* Equivalent to going to the previous range */ + } + + /* + * Now try to expand adjacent VMA(s). This takes care of removing the + * following VMA if we have VMAs on both sides. + */ + if (vmg->vma && !vma_expand(vmg)) { + khugepaged_enter_vma(vmg->vma, vmg->flags); + vmg->state = VMA_MERGE_SUCCESS; + return vmg->vma; + } + + /* If expansion failed, reset state. 
Allows us to retry merge later. */ + vmg->vma = NULL; + vmg->start = start; + vmg->end = end; + vmg->pgoff = pgoff; + if (vmg->vma == prev) + vma_iter_set(vmg->vmi, start); + + return NULL; +} + /* * vma_expand - Expand an existing VMA * @@ -474,7 +624,11 @@ void validate_mm(struct mm_struct *mm) * vmg->next->vm_end. Checking if the vmg->vma can expand and merge with * vmg->next needs to be handled by the caller. * - * Returns: 0 on success + * Returns: 0 on success. + * + * ASSUMPTIONS: + * - The caller must hold a WRITE lock on vmg->vma->mm->mmap_lock. + * - The caller must have set @vmg->vma and @vmg->next. */ int vma_expand(struct vma_merge_struct *vmg) { @@ -484,6 +638,8 @@ int vma_expand(struct vma_merge_struct *vmg) struct vm_area_struct *next = vmg->next; struct vma_prepare vp; + mmap_assert_write_locked(vmg->mm); + vma_start_write(vma); if (next && (vma != next) && (vmg->end == next->vm_end)) { int ret; @@ -516,6 +672,7 @@ int vma_expand(struct vma_merge_struct *vmg) return 0; nomem: + vmg->state = VMA_MERGE_ERROR_NOMEM; if (anon_dup) unlink_anon_vmas(anon_dup); return -ENOMEM; @@ -1029,6 +1186,8 @@ static struct vm_area_struct *vma_merge(struct vma_merge_struct *vmg) pgoff_t pglen = PHYS_PFN(end - addr); long adj_start = 0; + vmg->state = VMA_MERGE_NOMERGE; + /* * We later require that vma->vm_flags == vm_flags, * so this tests vma->vm_flags & VM_SPECIAL, too. @@ -1180,13 +1339,19 @@ static struct vm_area_struct *vma_merge(struct vma_merge_struct *vmg) vma_complete(&vp, vmg->vmi, mm); validate_mm(mm); khugepaged_enter_vma(res, vmg->flags); + + vmg->state = VMA_MERGE_SUCCESS; return res; prealloc_fail: + vmg->state = VMA_MERGE_ERROR_NOMEM; if (anon_dup) unlink_anon_vmas(anon_dup); anon_vma_fail: + if (err == -ENOMEM) + vmg->state = VMA_MERGE_ERROR_NOMEM; + vma_iter_set(vmg->vmi, addr); vma_iter_load(vmg->vmi); return NULL; @@ -1293,22 +1458,6 @@ struct vm_area_struct return vma_modify(&vmg); } -/* - * Attempt to merge a newly mapped VMA with those adjacent to it. The caller - * must ensure that [start, end) does not overlap any existing VMA. - */ -struct vm_area_struct -*vma_merge_new_vma(struct vma_iterator *vmi, struct vm_area_struct *prev, - struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgoff_t pgoff) -{ - VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); - - vmg.pgoff = pgoff; - - return vma_merge(&vmg); -} - /* * Expand vma by delta bytes, potentially merging with an immediately adjacent * VMA with identical properties. @@ -1319,8 +1468,10 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, { VMG_VMA_STATE(vmg, vmi, vma, vma, vma->vm_end, vma->vm_end + delta); - /* vma is specified as prev, so case 1 or 2 will apply. */ - return vma_merge(&vmg); + vmg.next = vma_iter_next_rewind(vmi, NULL); + vmg.vma = NULL; /* We use the VMA to populate VMG fields only. 
*/ + + return vma_merge_new_range(&vmg); } void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb) @@ -1421,9 +1572,10 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, struct vm_area_struct *vma = *vmap; unsigned long vma_start = vma->vm_start; struct mm_struct *mm = vma->vm_mm; - struct vm_area_struct *new_vma, *prev; + struct vm_area_struct *new_vma; bool faulted_in_anon_vma = true; VMA_ITERATOR(vmi, mm, addr); + VMG_VMA_STATE(vmg, &vmi, NULL, vma, addr, addr + len); /* * If anonymous vma has not yet been faulted, update new pgoff @@ -1434,11 +1586,15 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, faulted_in_anon_vma = false; } - new_vma = find_vma_prev(mm, addr, &prev); + new_vma = find_vma_prev(mm, addr, &vmg.prev); if (new_vma && new_vma->vm_start < addr + len) return NULL; /* should never get here */ - new_vma = vma_merge_new_vma(&vmi, prev, vma, addr, addr + len, pgoff); + vmg.vma = NULL; /* New VMA range. */ + vmg.pgoff = pgoff; + vmg.next = vma_iter_next_rewind(&vmi, NULL); + new_vma = vma_merge_new_range(&vmg); + if (new_vma) { /* * Source vma may have been merged into new_vma diff --git a/mm/vma.h b/mm/vma.h index 218d884ff5ff..82354fe5edd0 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -52,6 +52,13 @@ struct vma_munmap_struct { unsigned long data_vm; }; +enum vma_merge_state { + VMA_MERGE_START, + VMA_MERGE_ERROR_NOMEM, + VMA_MERGE_NOMERGE, + VMA_MERGE_SUCCESS, +}; + /* Represents a VMA merge operation. */ struct vma_merge_struct { struct mm_struct *mm; @@ -68,8 +75,14 @@ struct vma_merge_struct { struct mempolicy *policy; struct vm_userfaultfd_ctx uffd_ctx; struct anon_vma_name *anon_name; + enum vma_merge_state state; }; +static inline bool vmg_nomem(struct vma_merge_struct *vmg) +{ + return vmg->state == VMA_MERGE_ERROR_NOMEM; +} + /* Assumes addr >= vma->vm_start. */ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, unsigned long addr) @@ -85,6 +98,7 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, .end = end_, \ .flags = flags_, \ .pgoff = pgoff_, \ + .state = VMA_MERGE_START, \ } #define VMG_VMA_STATE(name, vmi_, prev_, vma_, start_, end_) \ @@ -103,6 +117,7 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, .policy = vma_policy(vma_), \ .uffd_ctx = vma_->vm_userfaultfd_ctx, \ .anon_name = anon_vma_name(vma_), \ + .state = VMA_MERGE_START, \ } #ifdef CONFIG_DEBUG_VM_MAPLE_TREE @@ -309,10 +324,7 @@ struct vm_area_struct unsigned long new_flags, struct vm_userfaultfd_ctx new_ctx); -struct vm_area_struct -*vma_merge_new_vma(struct vma_iterator *vmi, struct vm_area_struct *prev, - struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgoff_t pgoff); +struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg); struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, struct vm_area_struct *vma, @@ -505,6 +517,34 @@ struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi) return mas_prev_range(&vmi->mas, 0); } +/* + * Retrieve the next VMA and rewind the iterator to end of the previous VMA, or + * if no previous VMA, to index 0. + */ +static inline +struct vm_area_struct *vma_iter_next_rewind(struct vma_iterator *vmi, + struct vm_area_struct **pprev) +{ + struct vm_area_struct *next = vma_next(vmi); + struct vm_area_struct *prev = vma_prev(vmi); + + /* + * Consider the case where no previous VMA exists. We advance to the + * next VMA, skipping any gap, then rewind to the start of the range. 
+ * + * If we were to unconditionally advance to the next range we'd wind up + * at the next VMA again, so we check to ensure there is a previous VMA + * to skip over. + */ + if (prev) + vma_iter_next_range(vmi); + + if (pprev) + *pprev = prev; + + return next; +} + #ifdef CONFIG_64BIT static inline bool vma_is_sealed(struct vm_area_struct *vma) diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c index f6c4706a861f..b7cdafec09af 100644 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/vma.c @@ -101,9 +101,9 @@ static struct vm_area_struct *merge_new(struct vma_merge_struct *vmg) */ vmg->next = vma_next(vmg->vmi); vmg->prev = vma_prev(vmg->vmi); + vma_iter_next_range(vmg->vmi); - vma_iter_set(vmg->vmi, vmg->start); - return vma_merge(vmg); + return vma_merge_new_range(vmg); } /* @@ -162,10 +162,14 @@ static struct vm_area_struct *try_merge_new_vma(struct mm_struct *mm, merged = merge_new(vmg); if (merged) { *was_merged = true; + ASSERT_EQ(vmg->state, VMA_MERGE_SUCCESS); return merged; } *was_merged = false; + + ASSERT_EQ(vmg->state, VMA_MERGE_NOMERGE); + return alloc_and_link_vma(mm, start, end, pgoff, flags); } @@ -595,6 +599,7 @@ static bool test_vma_merge_special_flags(void) vmg.flags = flags | special_flag; vma = merge_new(&vmg); ASSERT_EQ(vma, NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); } /* 2. Modify VMA with special flag that would otherwise merge. */ @@ -616,6 +621,7 @@ static bool test_vma_merge_special_flags(void) vmg.flags = flags | special_flag; vma = merge_existing(&vmg); ASSERT_EQ(vma, NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); } cleanup_mm(&mm, &vmi); @@ -708,6 +714,7 @@ static bool test_vma_merge_with_close(void) /* The next VMA having a close() operator should cause the merge to fail.*/ ASSERT_EQ(merge_new(&vmg), NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); /* Now create the VMA so we can merge via modified flags */ vmg_set_range(&vmg, 0x1000, 0x2000, 1, flags); @@ -719,6 +726,7 @@ static bool test_vma_merge_with_close(void) * also fail. */ ASSERT_EQ(merge_existing(&vmg), NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); /* SCENARIO B * @@ -744,6 +752,7 @@ static bool test_vma_merge_with_close(void) vmg.vma = vma; /* Make sure merge does not occur. 
*/ ASSERT_EQ(merge_existing(&vmg), NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); cleanup_mm(&mm, &vmi); return true; @@ -792,6 +801,7 @@ static bool test_vma_merge_new_with_close(void) vmg_set_range(&vmg, 0x2000, 0x5000, 2, flags); vma = merge_new(&vmg); ASSERT_NE(vma, NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma->vm_start, 0); ASSERT_EQ(vma->vm_end, 0x5000); ASSERT_EQ(vma->vm_pgoff, 0); @@ -831,6 +841,7 @@ static bool test_merge_existing(void) vmg.prev = vma; vma->anon_vma = &dummy_anon_vma; ASSERT_EQ(merge_existing(&vmg), vma_next); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma_next->vm_start, 0x3000); ASSERT_EQ(vma_next->vm_end, 0x9000); ASSERT_EQ(vma_next->vm_pgoff, 3); @@ -861,6 +872,7 @@ static bool test_merge_existing(void) vmg.vma = vma; vma->anon_vma = &dummy_anon_vma; ASSERT_EQ(merge_existing(&vmg), vma_next); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma_next->vm_start, 0x2000); ASSERT_EQ(vma_next->vm_end, 0x9000); ASSERT_EQ(vma_next->vm_pgoff, 2); @@ -889,6 +901,7 @@ static bool test_merge_existing(void) vma->anon_vma = &dummy_anon_vma; ASSERT_EQ(merge_existing(&vmg), vma_prev); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma_prev->vm_start, 0); ASSERT_EQ(vma_prev->vm_end, 0x6000); ASSERT_EQ(vma_prev->vm_pgoff, 0); @@ -920,6 +933,7 @@ static bool test_merge_existing(void) vmg.vma = vma; vma->anon_vma = &dummy_anon_vma; ASSERT_EQ(merge_existing(&vmg), vma_prev); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma_prev->vm_start, 0); ASSERT_EQ(vma_prev->vm_end, 0x7000); ASSERT_EQ(vma_prev->vm_pgoff, 0); @@ -948,6 +962,7 @@ static bool test_merge_existing(void) vmg.vma = vma; vma->anon_vma = &dummy_anon_vma; ASSERT_EQ(merge_existing(&vmg), vma_prev); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma_prev->vm_start, 0); ASSERT_EQ(vma_prev->vm_end, 0x9000); ASSERT_EQ(vma_prev->vm_pgoff, 0); @@ -981,31 +996,37 @@ static bool test_merge_existing(void) vmg.prev = vma; vmg.vma = vma; ASSERT_EQ(merge_existing(&vmg), NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); vmg_set_range(&vmg, 0x5000, 0x6000, 5, flags); vmg.prev = vma; vmg.vma = vma; ASSERT_EQ(merge_existing(&vmg), NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); vmg_set_range(&vmg, 0x6000, 0x7000, 6, flags); vmg.prev = vma; vmg.vma = vma; ASSERT_EQ(merge_existing(&vmg), NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); vmg_set_range(&vmg, 0x4000, 0x7000, 4, flags); vmg.prev = vma; vmg.vma = vma; ASSERT_EQ(merge_existing(&vmg), NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); vmg_set_range(&vmg, 0x4000, 0x6000, 4, flags); vmg.prev = vma; vmg.vma = vma; ASSERT_EQ(merge_existing(&vmg), NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); vmg_set_range(&vmg, 0x5000, 0x6000, 5, flags); vmg.prev = vma; vmg.vma = vma; ASSERT_EQ(merge_existing(&vmg), NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); ASSERT_EQ(cleanup_mm(&mm, &vmi), 3); @@ -1071,6 +1092,7 @@ static bool test_anon_vma_non_mergeable(void) vmg.vma = vma; ASSERT_EQ(merge_existing(&vmg), vma_prev); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma_prev->vm_start, 0); ASSERT_EQ(vma_prev->vm_end, 0x7000); ASSERT_EQ(vma_prev->vm_pgoff, 0); @@ -1106,6 +1128,7 @@ static bool test_anon_vma_non_mergeable(void) vmg.prev = vma_prev; ASSERT_EQ(merge_new(&vmg), vma_prev); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma_prev->vm_start, 0); ASSERT_EQ(vma_prev->vm_end, 0x7000); ASSERT_EQ(vma_prev->vm_pgoff, 0); @@ -1181,6 +1204,7 @@ static bool test_dup_anon_vma(void) vmg.vma = vma; 
ASSERT_EQ(merge_existing(&vmg), vma_prev); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma_prev->vm_start, 0); ASSERT_EQ(vma_prev->vm_end, 0x8000); @@ -1209,6 +1233,7 @@ static bool test_dup_anon_vma(void) vmg.vma = vma; ASSERT_EQ(merge_existing(&vmg), vma_prev); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma_prev->vm_start, 0); ASSERT_EQ(vma_prev->vm_end, 0x8000); @@ -1236,6 +1261,7 @@ static bool test_dup_anon_vma(void) vmg.vma = vma; ASSERT_EQ(merge_existing(&vmg), vma_prev); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma_prev->vm_start, 0); ASSERT_EQ(vma_prev->vm_end, 0x5000); @@ -1263,6 +1289,7 @@ static bool test_dup_anon_vma(void) vmg.vma = vma; ASSERT_EQ(merge_existing(&vmg), vma_next); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma_next->vm_start, 0x3000); ASSERT_EQ(vma_next->vm_end, 0x8000); @@ -1303,6 +1330,7 @@ static bool test_vmi_prealloc_fail(void) /* This will cause the merge to fail. */ ASSERT_EQ(merge_existing(&vmg), NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_ERROR_NOMEM); /* We will already have assigned the anon_vma. */ ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); /* And it was both cloned and unlinked. */ @@ -1327,6 +1355,7 @@ static bool test_vmi_prealloc_fail(void) fail_prealloc = true; ASSERT_EQ(expand_existing(&vmg), -ENOMEM); + ASSERT_EQ(vmg.state, VMA_MERGE_ERROR_NOMEM); ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); ASSERT_TRUE(dummy_anon_vma.was_cloned); diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index a3c262c6eb73..c5b9da034511 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -740,6 +740,12 @@ static inline void vma_iter_free(struct vma_iterator *vmi) mas_destroy(&vmi->mas); } +static inline +struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi) +{ + return mas_next_range(&vmi->mas, ULONG_MAX); +} + static inline void vm_acct_memory(long pages) { } -- cgit v1.2.3 From cc8cb3697a8d8eabe1fb9acb8768b11c1ab607d8 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 30 Aug 2024 19:10:21 +0100 Subject: mm: refactor vma_merge() into modify-only vma_merge_existing_range() The existing vma_merge() function is no longer required to handle what were previously referred to as cases 1-3 (i.e. the merging of a new VMA), as this is now handled by vma_merge_new_vma(). Additionally, simplify the convoluted control flow of the original, maintaining identical logic only expressed more clearly and doing away with a complicated set of cases, rather logically examining each possible outcome - merging of both the previous and subsequent VMA, merging of the previous VMA and merging of the subsequent VMA alone. We now utilise the previously implemented commit_merge() function to share logic with vma_expand() de-duplicating code and providing less surface area for bugs and confusion. In order to do so, we adjust this function to accept parameters specific to merging existing ranges. Link: https://lkml.kernel.org/r/2cf6016b7bfcc4965fc3cde10827560c42e4f12c.1725040657.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Liam R. Howlett Cc: Mark Brown Cc: Vlastimil Babka Cc: Bert Karwatzki Cc: Jeff Xu Cc: Jiri Olsa Cc: Kees Cook Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: "Paul E. 
McKenney" Cc: Paul Moore Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/vma.c | 508 ++++++++++++++++++++++++------------------------ tools/testing/vma/vma.c | 9 +- 2 files changed, 264 insertions(+), 253 deletions(-) (limited to 'tools') diff --git a/mm/vma.c b/mm/vma.c index 566cad2338dd..393bef832604 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -587,29 +587,278 @@ void validate_mm(struct mm_struct *mm) /* Actually perform the VMA merge operation. */ static int commit_merge(struct vma_merge_struct *vmg, - struct vm_area_struct *remove) + struct vm_area_struct *adjust, + struct vm_area_struct *remove, + struct vm_area_struct *remove2, + long adj_start, + bool expanded) { struct vma_prepare vp; - init_multi_vma_prep(&vp, vmg->vma, NULL, remove, NULL); + init_multi_vma_prep(&vp, vmg->vma, adjust, remove, remove2); - /* Note: vma iterator must be pointing to 'start'. */ - vma_iter_config(vmg->vmi, vmg->start, vmg->end); + VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma && + vp.anon_vma != adjust->anon_vma); + + if (expanded) { + /* Note: vma iterator must be pointing to 'start'. */ + vma_iter_config(vmg->vmi, vmg->start, vmg->end); + } else { + vma_iter_config(vmg->vmi, adjust->vm_start + adj_start, + adjust->vm_end); + } if (vma_iter_prealloc(vmg->vmi, vmg->vma)) return -ENOMEM; vma_prepare(&vp); - vma_adjust_trans_huge(vmg->vma, vmg->start, vmg->end, 0); + vma_adjust_trans_huge(vmg->vma, vmg->start, vmg->end, adj_start); vma_set_range(vmg->vma, vmg->start, vmg->end, vmg->pgoff); - vma_iter_store(vmg->vmi, vmg->vma); + if (expanded) + vma_iter_store(vmg->vmi, vmg->vma); + + if (adj_start) { + adjust->vm_start += adj_start; + adjust->vm_pgoff += PHYS_PFN(adj_start); + if (adj_start < 0) { + WARN_ON(expanded); + vma_iter_store(vmg->vmi, adjust); + } + } vma_complete(&vp, vmg->vmi, vmg->vma->vm_mm); return 0; } +/* + * vma_merge_existing_range - Attempt to merge VMAs based on a VMA having its + * attributes modified. + * + * @vmg: Describes the modifications being made to a VMA and associated + * metadata. + * + * When the attributes of a range within a VMA change, then it might be possible + * for immediately adjacent VMAs to be merged into that VMA due to having + * identical properties. + * + * This function checks for the existence of any such mergeable VMAs and updates + * the maple tree describing the @vmg->vma->vm_mm address space to account for + * this, as well as any VMAs shrunk/expanded/deleted as a result of this merge. + * + * As part of this operation, if a merge occurs, the @vmg object will have its + * vma, start, end, and pgoff fields modified to execute the merge. Subsequent + * calls to this function should reset these fields. + * + * Returns: The merged VMA if merge succeeds, or NULL otherwise. + * + * ASSUMPTIONS: + * - The caller must assign the VMA to be modifed to @vmg->vma. + * - The caller must have set @vmg->prev to the previous VMA, if there is one. + * - The caller must not set @vmg->next, as we determine this. + * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. + * - vmi must be positioned within [@vmg->vma->vm_start, @vmg->vma->vm_end). 
+ */ +static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct *vmg) +{ + struct vm_area_struct *vma = vmg->vma; + struct vm_area_struct *prev = vmg->prev; + struct vm_area_struct *next, *res; + struct vm_area_struct *anon_dup = NULL; + struct vm_area_struct *adjust = NULL; + unsigned long start = vmg->start; + unsigned long end = vmg->end; + bool left_side = vma && start == vma->vm_start; + bool right_side = vma && end == vma->vm_end; + int err = 0; + long adj_start = 0; + bool merge_will_delete_vma, merge_will_delete_next; + bool merge_left, merge_right, merge_both; + bool expanded; + + mmap_assert_write_locked(vmg->mm); + VM_WARN_ON(!vma); /* We are modifying a VMA, so caller must specify. */ + VM_WARN_ON(vmg->next); /* We set this. */ + VM_WARN_ON(prev && start <= prev->vm_start); + VM_WARN_ON(start >= end); + /* + * If vma == prev, then we are offset into a VMA. Otherwise, if we are + * not, we must span a portion of the VMA. + */ + VM_WARN_ON(vma && ((vma != prev && vmg->start != vma->vm_start) || + vmg->end > vma->vm_end)); + /* The vmi must be positioned within vmg->vma. */ + VM_WARN_ON(vma && !(vma_iter_addr(vmg->vmi) >= vma->vm_start && + vma_iter_addr(vmg->vmi) < vma->vm_end)); + + vmg->state = VMA_MERGE_NOMERGE; + + /* + * If a special mapping or if the range being modified is neither at the + * furthermost left or right side of the VMA, then we have no chance of + * merging and should abort. + */ + if (vmg->flags & VM_SPECIAL || (!left_side && !right_side)) + return NULL; + + if (left_side) + merge_left = can_vma_merge_left(vmg); + else + merge_left = false; + + if (right_side) { + next = vmg->next = vma_iter_next_range(vmg->vmi); + vma_iter_prev_range(vmg->vmi); + + merge_right = can_vma_merge_right(vmg, merge_left); + } else { + merge_right = false; + next = NULL; + } + + if (merge_left) /* If merging prev, position iterator there. */ + vma_prev(vmg->vmi); + else if (!merge_right) /* If we have nothing to merge, abort. */ + return NULL; + + merge_both = merge_left && merge_right; + /* If we span the entire VMA, a merge implies it will be deleted. */ + merge_will_delete_vma = left_side && right_side; + /* + * If we merge both VMAs, then next is also deleted. This implies + * merge_will_delete_vma also. + */ + merge_will_delete_next = merge_both; + + /* No matter what happens, we will be adjusting vma. */ + vma_start_write(vma); + + if (merge_left) + vma_start_write(prev); + + if (merge_right) + vma_start_write(next); + + if (merge_both) { + /* + * |<----->| + * |-------*********-------| + * prev vma next + * extend delete delete + */ + + vmg->vma = prev; + vmg->start = prev->vm_start; + vmg->end = next->vm_end; + vmg->pgoff = prev->vm_pgoff; + + /* + * We already ensured anon_vma compatibility above, so now it's + * simply a case of, if prev has no anon_vma object, which of + * next or vma contains the anon_vma we must duplicate. + */ + err = dup_anon_vma(prev, next->anon_vma ? next : vma, &anon_dup); + } else if (merge_left) { + /* + * |<----->| OR + * |<--------->| + * |-------************* + * prev vma + * extend shrink/delete + */ + + vmg->vma = prev; + vmg->start = prev->vm_start; + vmg->pgoff = prev->vm_pgoff; + + if (merge_will_delete_vma) { + /* + * can_vma_merge_after() assumed we would not be + * removing vma, so it skipped the check for + * vm_ops->close, but we are removing vma. 
+ */ + if (vma->vm_ops && vma->vm_ops->close) + err = -EINVAL; + } else { + adjust = vma; + adj_start = vmg->end - vma->vm_start; + } + + if (!err) + err = dup_anon_vma(prev, vma, &anon_dup); + } else { /* merge_right */ + /* + * |<----->| OR + * |<--------->| + * *************-------| + * vma next + * shrink/delete extend + */ + + pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start); + + VM_WARN_ON(!merge_right); + /* If we are offset into a VMA, then prev must be vma. */ + VM_WARN_ON(vmg->start > vma->vm_start && prev && vma != prev); + + if (merge_will_delete_vma) { + vmg->vma = next; + vmg->end = next->vm_end; + vmg->pgoff = next->vm_pgoff - pglen; + } else { + /* + * We shrink vma and expand next. + * + * IMPORTANT: This is the ONLY case where the final + * merged VMA is NOT vmg->vma, but rather vmg->next. + */ + + vmg->start = vma->vm_start; + vmg->end = start; + vmg->pgoff = vma->vm_pgoff; + + adjust = next; + adj_start = -(vma->vm_end - start); + } + + err = dup_anon_vma(next, vma, &anon_dup); + } + + if (err) + goto abort; + + /* + * In nearly all cases, we expand vmg->vma. There is one exception - + * merge_right where we partially span the VMA. In this case we shrink + * the end of vmg->vma and adjust the start of vmg->next accordingly. + */ + expanded = !merge_right || merge_will_delete_vma; + + if (commit_merge(vmg, adjust, + merge_will_delete_vma ? vma : NULL, + merge_will_delete_next ? next : NULL, + adj_start, expanded)) { + if (anon_dup) + unlink_anon_vmas(anon_dup); + + vmg->state = VMA_MERGE_ERROR_NOMEM; + return NULL; + } + + res = merge_left ? prev : next; + khugepaged_enter_vma(res, vmg->flags); + + vmg->state = VMA_MERGE_SUCCESS; + return res; + +abort: + vma_iter_set(vmg->vmi, start); + vma_iter_load(vmg->vmi); + vmg->state = VMA_MERGE_ERROR_NOMEM; + return NULL; +} + /* * vma_merge_new_range - Attempt to merge a new VMA into address space * @@ -757,7 +1006,7 @@ int vma_expand(struct vma_merge_struct *vmg) /* Only handles expanding */ VM_WARN_ON(vma->vm_start < vmg->start || vma->vm_end > vmg->end); - if (commit_merge(vmg, remove_next ? next : NULL)) + if (commit_merge(vmg, NULL, remove_next ? next : NULL, NULL, 0, true)) goto nomem; return 0; @@ -1127,249 +1376,6 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock); } -/* - * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name), - * figure out whether that can be merged with its predecessor or its - * successor. Or both (it neatly fills a hole). - * - * In most cases - when called for mmap, brk or mremap - [addr,end) is - * certain not to be mapped by the time vma_merge is called; but when - * called for mprotect, it is certain to be already mapped (either at - * an offset within prev, or at the start of next), and the flags of - * this area are about to be changed to vm_flags - and the no-change - * case has already been eliminated. 
- * - * The following mprotect cases have to be considered, where **** is - * the area passed down from mprotect_fixup, never extending beyond one - * vma, PPPP is the previous vma, CCCC is a concurrent vma that starts - * at the same address as **** and is of the same or larger span, and - * NNNN the next vma after ****: - * - * **** **** **** - * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPCCCCCC - * cannot merge might become might become - * PPNNNNNNNNNN PPPPPPPPPPCC - * mmap, brk or case 4 below case 5 below - * mremap move: - * **** **** - * PPPP NNNN PPPPCCCCNNNN - * might become might become - * PPPPPPPPPPPP 1 or PPPPPPPPPPPP 6 or - * PPPPPPPPNNNN 2 or PPPPPPPPNNNN 7 or - * PPPPNNNNNNNN 3 PPPPNNNNNNNN 8 - * - * It is important for case 8 that the vma CCCC overlapping the - * region **** is never going to extended over NNNN. Instead NNNN must - * be extended in region **** and CCCC must be removed. This way in - * all cases where vma_merge succeeds, the moment vma_merge drops the - * rmap_locks, the properties of the merged vma will be already - * correct for the whole merged range. Some of those properties like - * vm_page_prot/vm_flags may be accessed by rmap_walks and they must - * be correct for the whole merged range immediately after the - * rmap_locks are released. Otherwise if NNNN would be removed and - * CCCC would be extended over the NNNN range, remove_migration_ptes - * or other rmap walkers (if working on addresses beyond the "end" - * parameter) may establish ptes with the wrong permissions of CCCC - * instead of the right permissions of NNNN. - * - * In the code below: - * PPPP is represented by *prev - * CCCC is represented by *curr or not represented at all (NULL) - * NNNN is represented by *next or not represented at all (NULL) - * **** is not represented - it will be merged and the vma containing the - * area is returned, or the function will return NULL - */ -static struct vm_area_struct *vma_merge(struct vma_merge_struct *vmg) -{ - struct mm_struct *mm = vmg->mm; - struct vm_area_struct *prev = vmg->prev; - struct vm_area_struct *curr, *next, *res; - struct vm_area_struct *vma, *adjust, *remove, *remove2; - struct vm_area_struct *anon_dup = NULL; - struct vma_prepare vp; - pgoff_t vma_pgoff; - int err = 0; - bool merge_prev = false; - bool merge_next = false; - bool vma_expanded = false; - unsigned long addr = vmg->start; - unsigned long end = vmg->end; - unsigned long vma_start = addr; - unsigned long vma_end = end; - pgoff_t pglen = PHYS_PFN(end - addr); - long adj_start = 0; - - vmg->state = VMA_MERGE_NOMERGE; - - /* - * We later require that vma->vm_flags == vm_flags, - * so this tests vma->vm_flags & VM_SPECIAL, too. - */ - if (vmg->flags & VM_SPECIAL) - return NULL; - - /* Does the input range span an existing VMA? (cases 5 - 8) */ - curr = find_vma_intersection(mm, prev ? prev->vm_end : 0, end); - - if (!curr || /* cases 1 - 4 */ - end == curr->vm_end) /* cases 6 - 8, adjacent VMA */ - next = vmg->next = vma_lookup(mm, end); - else - next = vmg->next = NULL; /* case 5 */ - - if (prev) { - vma_start = prev->vm_start; - vma_pgoff = prev->vm_pgoff; - - /* Can we merge the predecessor? */ - if (addr == prev->vm_end && can_vma_merge_after(vmg)) { - merge_prev = true; - vma_prev(vmg->vmi); - } - } - - /* Can we merge the successor? */ - if (next && can_vma_merge_before(vmg)) { - merge_next = true; - } - - /* Verify some invariant that must be enforced by the caller. 
*/ - VM_WARN_ON(prev && addr <= prev->vm_start); - VM_WARN_ON(curr && (addr != curr->vm_start || end > curr->vm_end)); - VM_WARN_ON(addr >= end); - - if (!merge_prev && !merge_next) - return NULL; /* Not mergeable. */ - - if (merge_prev) - vma_start_write(prev); - - res = vma = prev; - remove = remove2 = adjust = NULL; - - /* Can we merge both the predecessor and the successor? */ - if (merge_prev && merge_next && - is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { - vma_start_write(next); - remove = next; /* case 1 */ - vma_end = next->vm_end; - err = dup_anon_vma(prev, next, &anon_dup); - if (curr) { /* case 6 */ - vma_start_write(curr); - remove = curr; - remove2 = next; - /* - * Note that the dup_anon_vma below cannot overwrite err - * since the first caller would do nothing unless next - * has an anon_vma. - */ - if (!next->anon_vma) - err = dup_anon_vma(prev, curr, &anon_dup); - } - } else if (merge_prev) { /* case 2 */ - if (curr) { - vma_start_write(curr); - if (end == curr->vm_end) { /* case 7 */ - /* - * can_vma_merge_after() assumed we would not be - * removing prev vma, so it skipped the check - * for vm_ops->close, but we are removing curr - */ - if (curr->vm_ops && curr->vm_ops->close) - err = -EINVAL; - remove = curr; - } else { /* case 5 */ - adjust = curr; - adj_start = (end - curr->vm_start); - } - if (!err) - err = dup_anon_vma(prev, curr, &anon_dup); - } - } else { /* merge_next */ - vma_start_write(next); - res = next; - if (prev && addr < prev->vm_end) { /* case 4 */ - vma_start_write(prev); - vma_end = addr; - adjust = next; - adj_start = -(prev->vm_end - addr); - err = dup_anon_vma(next, prev, &anon_dup); - } else { - /* - * Note that cases 3 and 8 are the ONLY ones where prev - * is permitted to be (but is not necessarily) NULL. - */ - vma = next; /* case 3 */ - vma_start = addr; - vma_end = next->vm_end; - vma_pgoff = next->vm_pgoff - pglen; - if (curr) { /* case 8 */ - vma_pgoff = curr->vm_pgoff; - vma_start_write(curr); - remove = curr; - err = dup_anon_vma(next, curr, &anon_dup); - } - } - } - - /* Error in anon_vma clone. */ - if (err) - goto anon_vma_fail; - - if (vma_start < vma->vm_start || vma_end > vma->vm_end) - vma_expanded = true; - - if (vma_expanded) { - vma_iter_config(vmg->vmi, vma_start, vma_end); - } else { - vma_iter_config(vmg->vmi, adjust->vm_start + adj_start, - adjust->vm_end); - } - - if (vma_iter_prealloc(vmg->vmi, vma)) - goto prealloc_fail; - - init_multi_vma_prep(&vp, vma, adjust, remove, remove2); - VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma && - vp.anon_vma != adjust->anon_vma); - - vma_prepare(&vp); - vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start); - vma_set_range(vma, vma_start, vma_end, vma_pgoff); - - if (vma_expanded) - vma_iter_store(vmg->vmi, vma); - - if (adj_start) { - adjust->vm_start += adj_start; - adjust->vm_pgoff += adj_start >> PAGE_SHIFT; - if (adj_start < 0) { - WARN_ON(vma_expanded); - vma_iter_store(vmg->vmi, next); - } - } - - vma_complete(&vp, vmg->vmi, mm); - validate_mm(mm); - khugepaged_enter_vma(res, vmg->flags); - - vmg->state = VMA_MERGE_SUCCESS; - return res; - -prealloc_fail: - vmg->state = VMA_MERGE_ERROR_NOMEM; - if (anon_dup) - unlink_anon_vmas(anon_dup); - -anon_vma_fail: - if (err == -ENOMEM) - vmg->state = VMA_MERGE_ERROR_NOMEM; - - vma_iter_set(vmg->vmi, addr); - vma_iter_load(vmg->vmi); - return NULL; -} - /* * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd * context and anonymous VMA name within the range [start, end). 
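The shape of the replacement logic is easier to see in a compact sketch. What follows is an illustrative userspace model only, not the kernel implementation: struct demo_vma, struct demo_vmg and the mergeable flag are simplified stand-ins for the real types and for the can_vma_merge_left()/can_vma_merge_right() checks.

	#include <stdbool.h>
	#include <stdio.h>

	struct demo_vma { unsigned long start, end; bool mergeable; };

	struct demo_vmg {
		struct demo_vma *prev, *vma, *next;
		unsigned long start, end;	/* the modified span within vma */
	};

	static void classify(const struct demo_vmg *vmg)
	{
		bool left_side = vmg->start == vmg->vma->start;
		bool right_side = vmg->end == vmg->vma->end;
		bool merge_left = left_side && vmg->prev && vmg->prev->mergeable;
		bool merge_right = right_side && vmg->next && vmg->next->mergeable;
		bool merge_both = merge_left && merge_right;
		/* A merge spanning the whole VMA deletes that VMA ... */
		bool will_delete_vma = left_side && right_side &&
				       (merge_left || merge_right);
		/* ... and merging to both sides additionally deletes next. */
		bool will_delete_next = merge_both;

		printf("left=%d right=%d del_vma=%d del_next=%d\n",
		       merge_left, merge_right, will_delete_vma, will_delete_next);
	}

	int main(void)
	{
		struct demo_vma prev = { 0x0000, 0x3000, true };
		struct demo_vma vma  = { 0x3000, 0x5000, true };
		struct demo_vma next = { 0x5000, 0x9000, true };
		struct demo_vmg vmg  = { &prev, &vma, &next, 0x3000, 0x5000 };

		classify(&vmg);		/* left=1 right=1 del_vma=1 del_next=1 */
		return 0;
	}

Once the outcome (left, right or both) is known, the set of VMAs a merge will delete follows directly, rather than having to be inferred from the eight positional cases above.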
@@ -1389,7 +1395,7 @@ static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg) struct vm_area_struct *merged; /* First, try to merge. */ - merged = vma_merge(vmg); + merged = vma_merge_existing_range(vmg); if (merged) return merged; diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c index b7cdafec09af..25a95d9901ea 100644 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/vma.c @@ -112,7 +112,7 @@ static struct vm_area_struct *merge_new(struct vma_merge_struct *vmg) */ static struct vm_area_struct *merge_existing(struct vma_merge_struct *vmg) { - return vma_merge(vmg); + return vma_merge_existing_range(vmg); } /* @@ -752,7 +752,12 @@ static bool test_vma_merge_with_close(void) vmg.vma = vma; /* Make sure merge does not occur. */ ASSERT_EQ(merge_existing(&vmg), NULL); - ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); + /* + * Initially this is misapprehended as an out of memory report, as the + * close() check is handled in the same way as anon_vma duplication + * failures, however a subsequent patch resolves this. + */ + ASSERT_EQ(vmg.state, VMA_MERGE_ERROR_NOMEM); cleanup_mm(&mm, &vmi); return true; -- cgit v1.2.3 From 01c373e9a5ce2273812eaf83036c5357829fb3f7 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 30 Aug 2024 19:10:22 +0100 Subject: mm: rework vm_ops->close() handling on VMA merge In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be removed in mergeability test") we relaxed the VMA merge rules for VMAs possessing a vm_ops->close() hook, permitting this operation in instances where we wouldn't delete the VMA as part of the merge operation. This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix vma_merge() case 7 with vma_ops->close") to account for a subtle case that the previous commit had not taken into account. In both instances, we first rely on is_mergeable_vma() to determine whether we might be dealing with a VMA that might be removed, taking advantage of the fact that a 'previous' VMA will never be deleted, only VMAs that follow it. The second patch corrects the instance where a merge of the previous VMA into a subsequent one did not correctly check whether the subsequent VMA had a vm_ops->close() handler. Both changes prevent merge cases that are actually permissible (for instance a merge of a VMA into a following VMA with a vm_ops->close(), but with no previous VMA, which would result in the next VMA being extended, not deleted). In addition, both changes fail to consider the case where a VMA that would otherwise be merged with the previous and next VMA might have vm_ops->close(), on the assumption that for this to be the case, all three would have to have the same vma->vm_file to be mergeable and thus the same vm_ops. And in addition both changes operate at 50,000 feet, trying to guess whether a VMA will be deleted. As we have majorly refactored the VMA merge operation and de-duplicated code to the point where we know precisely where deletions will occur, this patch removes the aforementioned checks altogether and instead explicitly checks whether a VMA will be deleted. In cases where a reduced merge is still possible (where we merge both previous and next VMA but the next VMA has a vm_ops->close hook, meaning we could just merge the previous and current VMA), we do so, otherwise the merge is not permitted. We take advantage of our userland testing to assert that this functions correctly - replacing the previous limited vm_ops->close() tests with tests for every single case where we delete a VMA. 
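The resulting policy is small enough to sketch before the diff. This is a hedged userspace model, not the kernel code: can_merge_remove_vma() mirrors the helper introduced below, while struct demo_vma and the outcome enum are simplified assumptions.

	#include <stdbool.h>

	struct demo_vma { bool has_close; };	/* stand-in for vm_ops && vm_ops->close */

	/* Mirrors the new helper: a merge may only remove a VMA without a close hook. */
	static bool can_merge_remove_vma(const struct demo_vma *vma)
	{
		return !vma->has_close;
	}

	enum merge_outcome { MERGE_ABORT, MERGE_FULL, MERGE_REDUCED };

	/* Decide the outcome once we know exactly which VMAs a merge would delete. */
	static enum merge_outcome close_hook_policy(const struct demo_vma *vma,
						    const struct demo_vma *next,
						    bool will_delete_vma,
						    bool will_delete_next)
	{
		/* If the modified VMA itself must go but can't, nothing can be done. */
		if (will_delete_vma && !can_merge_remove_vma(vma))
			return MERGE_ABORT;
		/* If only next is unremovable, fall back to merging prev and vma. */
		if (will_delete_next && !can_merge_remove_vma(next))
			return MERGE_REDUCED;
		return MERGE_FULL;
	}

Because the decision is taken at the one point where deletions are certain, the speculative vm_ops->close() checks in is_mergeable_vma() can simply be dropped.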
We also update all testing for both new and modified VMAs to set vma->vm_ops->close() in every single instance where this would not prevent the merge, to assert that we never do so. Link: https://lkml.kernel.org/r/9f96b8cfeef3d14afabddac3d6144afdfbef2e22.1725040657.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Cc: Liam R. Howlett Cc: Mark Brown Cc: Bert Karwatzki Cc: Jeff Xu Cc: Jiri Olsa Cc: Kees Cook Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: "Paul E. McKenney" Cc: Paul Moore Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/vma.c | 57 +++++++++++------ tools/testing/vma/vma.c | 166 +++++++++++++++++++++++++++++++++++++----------- 2 files changed, 164 insertions(+), 59 deletions(-) (limited to 'tools') diff --git a/mm/vma.c b/mm/vma.c index 393bef832604..8d1686fc8d5a 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -10,14 +10,6 @@ static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next) { struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev; - /* - * If the vma has a ->close operation then the driver probably needs to - * release per-vma resources, so we don't attempt to merge those if the - * caller indicates the current vma may be removed as part of the merge, - * which is the case if we are attempting to merge the next VMA into - * this one. - */ - bool may_remove_vma = merge_next; if (!mpol_equal(vmg->policy, vma_policy(vma))) return false; @@ -33,8 +25,6 @@ static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_nex return false; if (vma->vm_file != vmg->file) return false; - if (may_remove_vma && vma->vm_ops && vma->vm_ops->close) - return false; if (!is_mergeable_vm_userfaultfd_ctx(vma, vmg->uffd_ctx)) return false; if (!anon_vma_name_eq(anon_vma_name(vma), vmg->anon_name)) @@ -632,6 +622,12 @@ static int commit_merge(struct vma_merge_struct *vmg, return 0; } +/* We can only remove VMAs when merging if they do not have a close hook. */ +static bool can_merge_remove_vma(struct vm_area_struct *vma) +{ + return !vma->vm_ops || !vma->vm_ops->close; +} + /* * vma_merge_existing_range - Attempt to merge VMAs based on a VMA having its * attributes modified. @@ -725,12 +721,30 @@ static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct * merge_both = merge_left && merge_right; /* If we span the entire VMA, a merge implies it will be deleted. */ merge_will_delete_vma = left_side && right_side; + + /* + * If we need to remove vma in its entirety but are unable to do so, + * we have no sensible recourse but to abort the merge. + */ + if (merge_will_delete_vma && !can_merge_remove_vma(vma)) + return NULL; + /* * If we merge both VMAs, then next is also deleted. This implies * merge_will_delete_vma also. */ merge_will_delete_next = merge_both; + /* + * If we cannot delete next, then we can reduce the operation to merging + * prev and vma (thereby deleting vma). + */ + if (merge_will_delete_next && !can_merge_remove_vma(next)) { + merge_will_delete_next = false; + merge_right = false; + merge_both = false; + } + /* No matter what happens, we will be adjusting vma. */ vma_start_write(vma); @@ -772,21 +786,12 @@ static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct * vmg->start = prev->vm_start; vmg->pgoff = prev->vm_pgoff; - if (merge_will_delete_vma) { - /* - * can_vma_merge_after() assumed we would not be - * removing vma, so it skipped the check for - * vm_ops->close, but we are removing vma. 
- */ - if (vma->vm_ops && vma->vm_ops->close) - err = -EINVAL; - } else { + if (!merge_will_delete_vma) { adjust = vma; adj_start = vmg->end - vma->vm_start; } - if (!err) - err = dup_anon_vma(prev, vma, &anon_dup); + err = dup_anon_vma(prev, vma, &anon_dup); } else { /* merge_right */ /* * |<----->| OR @@ -940,6 +945,14 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) vmg->vma = prev; vmg->pgoff = prev->vm_pgoff; + /* + * If this merge would result in removal of the next VMA but we + * are not permitted to do so, reduce the operation to merging + * prev and vma. + */ + if (can_merge_right && !can_merge_remove_vma(next)) + vmg->end = end; + vma_prev(vmg->vmi); /* Equivalent to going to the previous range */ } @@ -994,6 +1007,8 @@ int vma_expand(struct vma_merge_struct *vmg) int ret; remove_next = true; + /* This should already have been checked by this point. */ + VM_WARN_ON(!can_merge_remove_vma(next)); vma_start_write(next); ret = dup_anon_vma(vma, next, &anon_dup); if (ret) diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c index 25a95d9901ea..c53f220eb6cc 100644 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/vma.c @@ -387,6 +387,9 @@ static bool test_merge_new(void) struct anon_vma_chain dummy_anon_vma_chain_d = { .anon_vma = &dummy_anon_vma, }; + const struct vm_operations_struct vm_ops = { + .close = dummy_close, + }; int count; struct vm_area_struct *vma, *vma_a, *vma_b, *vma_c, *vma_d; bool merged; @@ -430,6 +433,7 @@ static bool test_merge_new(void) * 0123456789abc * AA*B DD CC */ + vma_a->vm_ops = &vm_ops; /* This should have no impact. */ vma_b->anon_vma = &dummy_anon_vma; vma = try_merge_new_vma(&mm, &vmg, 0x2000, 0x3000, 2, flags, &merged); ASSERT_EQ(vma, vma_a); @@ -466,6 +470,7 @@ static bool test_merge_new(void) * AAAAA *DD CC */ vma_d->anon_vma = &dummy_anon_vma; + vma_d->vm_ops = &vm_ops; /* This should have no impact. */ vma = try_merge_new_vma(&mm, &vmg, 0x6000, 0x7000, 6, flags, &merged); ASSERT_EQ(vma, vma_d); /* Prepend. */ @@ -483,6 +488,7 @@ static bool test_merge_new(void) * 0123456789abc * AAAAA*DDD CC */ + vma_d->vm_ops = NULL; /* This would otherwise degrade the merge. */ vma = try_merge_new_vma(&mm, &vmg, 0x5000, 0x6000, 5, flags, &merged); ASSERT_EQ(vma, vma_a); /* Merge with A, delete D. */ @@ -640,13 +646,11 @@ static bool test_vma_merge_with_close(void) const struct vm_operations_struct vm_ops = { .close = dummy_close, }; - struct vm_area_struct *vma_next = - alloc_and_link_vma(&mm, 0x2000, 0x3000, 2, flags); - struct vm_area_struct *vma; + struct vm_area_struct *vma_prev, *vma_next, *vma; /* - * When we merge VMAs we sometimes have to delete others as part of the - * operation. + * When merging VMAs we are not permitted to remove any VMA that has a + * vm_ops->close() hook. * * Considering the two possible adjacent VMAs to which a VMA can be * merged: @@ -697,28 +701,52 @@ static bool test_vma_merge_with_close(void) * would be set too, and thus scenario A would pick this up. */ - ASSERT_NE(vma_next, NULL); - /* - * SCENARIO A + * The only case of a new VMA merge that results in a VMA being deleted + * is one where both the previous and next VMAs are merged - in this + * instance the next VMA is deleted, and the previous VMA is extended. * - * 0123 - * *N + * If we are unable to do so, we reduce the operation to simply + * extending the prev VMA and not merging next. + * + * 0123456789 + * PPP**NNNN + * -> + * 0123456789 + * PPPPPPNNN */ - /* Make the next VMA have a close() callback. 
*/ + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, flags); vma_next->vm_ops = &vm_ops; - /* Our proposed VMA has characteristics that would otherwise be merged. */ - vmg_set_range(&vmg, 0x1000, 0x2000, 1, flags); + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + ASSERT_EQ(merge_new(&vmg), vma_prev); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); + ASSERT_EQ(vma_prev->vm_start, 0); + ASSERT_EQ(vma_prev->vm_end, 0x5000); + ASSERT_EQ(vma_prev->vm_pgoff, 0); - /* The next VMA having a close() operator should cause the merge to fail.*/ - ASSERT_EQ(merge_new(&vmg), NULL); - ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); + ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); - /* Now create the VMA so we can merge via modified flags */ - vmg_set_range(&vmg, 0x1000, 0x2000, 1, flags); - vma = alloc_and_link_vma(&mm, 0x1000, 0x2000, 1, flags); + /* + * When modifying an existing VMA there are further cases where we + * delete VMAs. + * + * <> + * 0123456789 + * PPPVV + * + * In this instance, if vma has a close hook, the merge simply cannot + * proceed. + */ + + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); + vma->vm_ops = &vm_ops; + + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + vmg.prev = vma_prev; vmg.vma = vma; /* @@ -728,38 +756,90 @@ static bool test_vma_merge_with_close(void) ASSERT_EQ(merge_existing(&vmg), NULL); ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); - /* SCENARIO B + ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); + + /* + * This case is mirrored if merging with next. * - * 0123 - * P* + * <> + * 0123456789 + * VVNNNN * - * In order for this scenario to trigger, the VMA currently being - * modified must also have a .close(). + * In this instance, if vma has a close hook, the merge simply cannot + * proceed. */ - /* Reset VMG state. */ - vmg_set_range(&vmg, 0x1000, 0x2000, 1, flags); - /* - * Make next unmergeable, and don't let the scenario A check pick this - * up, we want to reproduce scenario B only. - */ - vma_next->vm_ops = NULL; - vma_next->__vm_flags &= ~VM_MAYWRITE; - /* Allocate prev. */ - vmg.prev = alloc_and_link_vma(&mm, 0, 0x1000, 0, flags); - /* Assign a vm_ops->close() function to VMA explicitly. */ + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, flags); vma->vm_ops = &vm_ops; + + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); vmg.vma = vma; - /* Make sure merge does not occur. */ ASSERT_EQ(merge_existing(&vmg), NULL); /* * Initially this is misapprehended as an out of memory report, as the * close() check is handled in the same way as anon_vma duplication * failures, however a subsequent patch resolves this. */ - ASSERT_EQ(vmg.state, VMA_MERGE_ERROR_NOMEM); + ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); + + ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); + + /* + * Finally, we consider two variants of the case where we modify a VMA + * to merge with both the previous and next VMAs. + * + * The first variant is where vma has a close hook. In this instance, no + * merge can proceed. 
+ * + * <> + * 0123456789 + * PPPVVNNNN + */ + + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, flags); + vma->vm_ops = &vm_ops; + + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + vmg.prev = vma_prev; + vmg.vma = vma; + + ASSERT_EQ(merge_existing(&vmg), NULL); + ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); + + ASSERT_EQ(cleanup_mm(&mm, &vmi), 3); + + /* + * The second variant is where next has a close hook. In this instance, + * we reduce the operation to a merge between prev and vma. + * + * <> + * 0123456789 + * PPPVVNNNN + * -> + * 0123456789 + * PPPPPNNNN + */ + + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, flags); + vma_next->vm_ops = &vm_ops; + + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + vmg.prev = vma_prev; + vmg.vma = vma; + + ASSERT_EQ(merge_existing(&vmg), vma_prev); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); + ASSERT_EQ(vma_prev->vm_start, 0); + ASSERT_EQ(vma_prev->vm_end, 0x5000); + ASSERT_EQ(vma_prev->vm_pgoff, 0); + + ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); - cleanup_mm(&mm, &vmi); return true; } @@ -828,6 +908,9 @@ static bool test_merge_existing(void) .mm = &mm, .vmi = &vmi, }; + const struct vm_operations_struct vm_ops = { + .close = dummy_close, + }; /* * Merge right case - partial span. @@ -840,7 +923,9 @@ static bool test_merge_existing(void) * VNNNNNN */ vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, flags); + vma->vm_ops = &vm_ops; /* This should have no impact. */ vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, flags); + vma_next->vm_ops = &vm_ops; /* This should have no impact. */ vmg_set_range(&vmg, 0x3000, 0x6000, 3, flags); vmg.vma = vma; vmg.prev = vma; @@ -873,6 +958,7 @@ static bool test_merge_existing(void) */ vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, flags); vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, flags); + vma_next->vm_ops = &vm_ops; /* This should have no impact. */ vmg_set_range(&vmg, 0x2000, 0x6000, 2, flags); vmg.vma = vma; vma->anon_vma = &dummy_anon_vma; @@ -899,7 +985,9 @@ static bool test_merge_existing(void) * PPPPPPV */ vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma_prev->vm_ops = &vm_ops; /* This should have no impact. */ vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags); + vma->vm_ops = &vm_ops; /* This should have no impact. */ vmg_set_range(&vmg, 0x3000, 0x6000, 3, flags); vmg.prev = vma_prev; vmg.vma = vma; @@ -932,6 +1020,7 @@ static bool test_merge_existing(void) * PPPPPPP */ vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma_prev->vm_ops = &vm_ops; /* This should have no impact. */ vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags); vmg_set_range(&vmg, 0x3000, 0x7000, 3, flags); vmg.prev = vma_prev; @@ -960,6 +1049,7 @@ static bool test_merge_existing(void) * PPPPPPPPPP */ vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma_prev->vm_ops = &vm_ops; /* This should have no impact. */ vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags); vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, flags); vmg_set_range(&vmg, 0x3000, 0x7000, 3, flags); -- cgit v1.2.3 From 9cb75552f421c5e9667bc19fb430063cb023c219 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 26 Aug 2024 20:03:28 -0700 Subject: selftests/damon: add access_memory_even to .gitignore Patch series "misc fixups for DAMON {self,kunit} tests". 
This patchset is for minor fixups of DAMON selftests and kunit tests. The first three patches make DAMON selftests more cleanly maintained (patches 1 and 2) without unnecessary warnings (patch 3). The following six patches remove an unnecessary test case (patch 4), handle config combinations that can make tests fail (patches 5-7), reorganize the test files following the new guideline (patch 8), and add a reference kunitconfig for DAMON kunit tests (patch 9). This patch (of 9): DAMON selftests build access_memory_even, but it's not on the .gitignore list. Add it to make 'git status' output cleaner. Link: https://lkml.kernel.org/r/20240827030336.7930-1-sj@kernel.org Link: https://lkml.kernel.org/r/20240827030336.7930-2-sj@kernel.org Fixes: c94df805c774 ("selftests/damon: implement a program for even-numbered memory regions access") Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/.gitignore | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/damon/.gitignore b/tools/testing/selftests/damon/.gitignore index e65ef9d9cedc..2ab675fecb6b 100644 --- a/tools/testing/selftests/damon/.gitignore +++ b/tools/testing/selftests/damon/.gitignore @@ -3,3 +3,4 @@ huge_count_read_write debugfs_target_ids_read_before_terminate_race debugfs_target_ids_pid_leak access_memory +access_memory_even -- cgit v1.2.3 From 582c04b07fa91665b4f38682c290fb9f0b085c7e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 26 Aug 2024 20:03:29 -0700 Subject: selftests/damon: cleanup __pycache__/ with 'make clean' Python-based tests create a __pycache__/ directory. Remove it with 'make clean' by defining it as EXTRA_CLEAN. Link: https://lkml.kernel.org/r/20240827030336.7930-3-sj@kernel.org Fixes: b5906f5f7359 ("selftests/damon: add a test for update_schemes_tried_regions sysfs command") Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/Makefile | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index 1e2e98cc809d..5b2a6a5dd1af 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -25,4 +25,6 @@ TEST_PROGS += debugfs_target_ids_pid_leak.sh TEST_PROGS += sysfs_update_removed_scheme_dir.sh TEST_PROGS += sysfs_update_schemes_tried_regions_hang.py +EXTRA_CLEAN = __pycache__ + include ../lib.mk -- cgit v1.2.3 From 8c211412c5dffd090eaea5ee033cd729f8e5f873 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 26 Aug 2024 20:03:30 -0700 Subject: selftests/damon: add execute permissions to test scripts Some test scripts are missing executable permissions. This causes warnings that make the test output unnecessarily verbose. Add executable permissions.
Link: https://lkml.kernel.org/r/20240827030336.7930-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/damon_nr_regions.py | 0 tools/testing/selftests/damon/damos_apply_interval.py | 0 tools/testing/selftests/damon/damos_quota.py | 0 tools/testing/selftests/damon/damos_quota_goal.py | 0 tools/testing/selftests/damon/damos_tried_regions.py | 0 tools/testing/selftests/damon/debugfs_target_ids_pid_leak.sh | 0 .../selftests/damon/debugfs_target_ids_read_before_terminate_race.sh | 0 tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_hang.py | 0 .../selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py | 0 9 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 tools/testing/selftests/damon/damon_nr_regions.py mode change 100644 => 100755 tools/testing/selftests/damon/damos_apply_interval.py mode change 100644 => 100755 tools/testing/selftests/damon/damos_quota.py mode change 100644 => 100755 tools/testing/selftests/damon/damos_quota_goal.py mode change 100644 => 100755 tools/testing/selftests/damon/damos_tried_regions.py mode change 100644 => 100755 tools/testing/selftests/damon/debugfs_target_ids_pid_leak.sh mode change 100644 => 100755 tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.sh mode change 100644 => 100755 tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_hang.py mode change 100644 => 100755 tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py (limited to 'tools') diff --git a/tools/testing/selftests/damon/damon_nr_regions.py b/tools/testing/selftests/damon/damon_nr_regions.py old mode 100644 new mode 100755 diff --git a/tools/testing/selftests/damon/damos_apply_interval.py b/tools/testing/selftests/damon/damos_apply_interval.py old mode 100644 new mode 100755 diff --git a/tools/testing/selftests/damon/damos_quota.py b/tools/testing/selftests/damon/damos_quota.py old mode 100644 new mode 100755 diff --git a/tools/testing/selftests/damon/damos_quota_goal.py b/tools/testing/selftests/damon/damos_quota_goal.py old mode 100644 new mode 100755 diff --git a/tools/testing/selftests/damon/damos_tried_regions.py b/tools/testing/selftests/damon/damos_tried_regions.py old mode 100644 new mode 100755 diff --git a/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.sh b/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.sh old mode 100644 new mode 100755 diff --git a/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.sh b/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.sh old mode 100644 new mode 100755 diff --git a/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_hang.py b/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_hang.py old mode 100644 new mode 100755 diff --git a/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py b/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py old mode 100644 new mode 100755 -- cgit v1.2.3 From 7c65ae81ea86a6ed8086e1a5651acd766187f19b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 3 Sep 2024 21:54:28 -1000 Subject: sched_ext: Don't call put_prev_task_scx() before picking the next task fd03c5b85855 ("sched: Rework pick_next_task()") changed the definition of pick_next_task() from: pick_next_task() := pick_task() + set_next_task(.first = true) to: pick_next_task(prev) := pick_task() + 
put_prev_task() + set_next_task(.first = true) making it pick_next_task()'s responsibility to invoke put_prev_task(). This reordering allows pick_task() to be shared between regular and core-sched paths and put_prev_task() to know the next task. sched_ext depended on put_prev_task_scx() enqueueing the current task before pick_next_task_scx() is called. While pulling sched/core changes, 70cc76aa0d80 ("Merge branch 'tip/sched/core' into for-6.12") added an explicit put_prev_task_scx() call for SCX tasks in pick_next_task_scx() before picking the first task as a workaround. Clean it up and adopt the conventions that other sched classes are following. The logic for keeping the current task running was spread out and required the task to be put on the local DSQ before picking: - balance_one() used SCX_TASK_BAL_KEEP to indicate that the task is still runnable, hasn't exhausted its slice, and thus should keep running. - put_prev_task_scx() enqueued the task to the local DSQ if SCX_TASK_BAL_KEEP is set. It also called do_enqueue_task() with SCX_ENQ_LAST if it is the only runnable task. do_enqueue_task() in turn decided whether to use the local DSQ depending on SCX_OPS_ENQ_LAST. Consolidate the logic in balance_one() as it always knows whether it is going to keep the current task. balance_one() now considers all conditions where the current task should be kept and uses SCX_TASK_BAL_KEEP to tell pick_next_task_scx() to keep the current task instead of picking one from the local DSQ. Accordingly, SCX_ENQ_LAST handling is removed from put_prev_task_scx() and do_enqueue_task(), and pick_next_task_scx() is updated to pick the current task if SCX_TASK_BAL_KEEP is set. The workaround put_prev_task[_scx]() calls are replaced with put_prev_set_next_task(). This causes two behavior changes observable from the BPF scheduler: - When a task keeps running, it no longer goes through an enqueue/dequeue cycle and thus ops.stopping/running() transitions. The new behavior is better and all the existing schedulers should be able to handle it. - The BPF scheduler cannot keep executing the current task by enqueueing an SCX_ENQ_LAST task to the local DSQ. If SCX_OPS_ENQ_LAST is specified, the BPF scheduler is responsible for resuming execution after each SCX_ENQ_LAST. SCX_OPS_ENQ_LAST is mostly useful for cases where scheduling decisions are not made on the local CPU - e.g. central or userspace-driven scheduling - and the new behavior is more logical and shouldn't pose any problems. The SCX_OPS_ENQ_LAST demonstration from scx_qmap is dropped as it doesn't fit that well anymore, and the last task handling is moved to the end of qmap_dispatch(). Signed-off-by: Tejun Heo Cc: David Vernet Cc: Andrea Righi Cc: Changwoo Min Cc: Daniel Hodges Cc: Dan Schatzberg --- kernel/sched/ext.c | 130 ++++++++++++++++++++--------------------- tools/sched_ext/scx_qmap.bpf.c | 22 +++++-- 2 files changed, 80 insertions(+), 72 deletions(-) (limited to 'tools') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 5e862c7c0e6b..be86dbfa75a8 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -630,11 +630,8 @@ enum scx_enq_flags { * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the * %SCX_ENQ_LAST flag set. * - * If the BPF scheduler wants to continue executing the task, - * ops.enqueue() should dispatch the task to %SCX_DSQ_LOCAL immediately. - * If the task gets queued on a different dsq or the BPF side, the BPF - * scheduler is responsible for triggering a follow-up scheduling event. - * Otherwise, Execution may stall.
+ * The BPF scheduler is responsible for triggering a follow-up + * scheduling event. Otherwise, Execution may stall. */ SCX_ENQ_LAST = 1LLU << 41, @@ -1852,12 +1849,8 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, if (!scx_rq_online(rq)) goto local; - if (scx_ops_bypassing()) { - if (enq_flags & SCX_ENQ_LAST) - goto local; - else - goto global; - } + if (scx_ops_bypassing()) + goto global; if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) goto direct; @@ -1867,11 +1860,6 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, unlikely(p->flags & PF_EXITING)) goto local; - /* see %SCX_OPS_ENQ_LAST */ - if (!static_branch_unlikely(&scx_ops_enq_last) && - (enq_flags & SCX_ENQ_LAST)) - goto local; - if (!SCX_HAS_OP(enqueue)) goto global; @@ -2517,7 +2505,6 @@ static int balance_one(struct rq *rq, struct task_struct *prev, bool local) struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); bool prev_on_scx = prev->sched_class == &ext_sched_class; int nr_loops = SCX_DSP_MAX_LOOPS; - bool has_tasks = false; lockdep_assert_rq_held(rq); rq->scx.flags |= SCX_RQ_IN_BALANCE; @@ -2542,9 +2529,9 @@ static int balance_one(struct rq *rq, struct task_struct *prev, bool local) /* * If @prev is runnable & has slice left, it has priority and * fetching more just increases latency for the fetched tasks. - * Tell put_prev_task_scx() to put @prev on local_dsq. If the - * BPF scheduler wants to handle this explicitly, it should - * implement ->cpu_released(). + * Tell pick_next_task_scx() to keep running @prev. If the BPF + * scheduler wants to handle this explicitly, it should + * implement ->cpu_release(). * * See scx_ops_disable_workfn() for the explanation on the * bypassing test. @@ -2570,7 +2557,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev, bool local) goto has_tasks; if (!SCX_HAS_OP(dispatch) || scx_ops_bypassing() || !scx_rq_online(rq)) - goto out; + goto no_tasks; dspc->rq = rq; @@ -2609,13 +2596,23 @@ static int balance_one(struct rq *rq, struct task_struct *prev, bool local) } } while (dspc->nr_tasks); - goto out; +no_tasks: + /* + * Didn't find another task to run. Keep running @prev unless + * %SCX_OPS_ENQ_LAST is in effect. + */ + if ((prev->scx.flags & SCX_TASK_QUEUED) && + (!static_branch_unlikely(&scx_ops_enq_last) || scx_ops_bypassing())) { + if (local) + prev->scx.flags |= SCX_TASK_BAL_KEEP; + goto has_tasks; + } + rq->scx.flags &= ~SCX_RQ_IN_BALANCE; + return false; has_tasks: - has_tasks = true; -out: rq->scx.flags &= ~SCX_RQ_IN_BALANCE; - return has_tasks; + return true; } static int balance_scx(struct rq *rq, struct task_struct *prev, @@ -2728,25 +2725,16 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p, if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED)) SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, true); - /* - * If we're being called from put_prev_task_balance(), balance_scx() may - * have decided that @p should keep running. - */ - if (p->scx.flags & SCX_TASK_BAL_KEEP) { + if (p->scx.flags & SCX_TASK_QUEUED) { p->scx.flags &= ~SCX_TASK_BAL_KEEP; - set_task_runnable(rq, p); - dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); - return; - } - if (p->scx.flags & SCX_TASK_QUEUED) { set_task_runnable(rq, p); /* - * If @p has slice left and balance_scx() didn't tag it for - * keeping, @p is getting preempted by a higher priority - * scheduler class or core-sched forcing a different task. Leave - * it at the head of the local DSQ. 
+ * If @p has slice left and is being put, @p is getting + * preempted by a higher priority scheduler class or core-sched + * forcing a different task. Leave it at the head of the local + * DSQ. */ if (p->scx.slice && !scx_ops_bypassing()) { dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); @@ -2754,18 +2742,17 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p, } /* - * If we're in the pick_next_task path, balance_scx() should - * have already populated the local DSQ if there are any other - * available tasks. If empty, tell ops.enqueue() that @p is the - * only one available for this cpu. ops.enqueue() should put it - * on the local DSQ so that the subsequent pick_next_task_scx() - * can find the task unless it wants to trigger a separate - * follow-up scheduling event. + * If @p is runnable but we're about to enter a lower + * sched_class, %SCX_OPS_ENQ_LAST must be set. Tell + * ops.enqueue() that @p is the only one available for this cpu, + * which should trigger an explicit follow-up scheduling event. */ - if (list_empty(&rq->scx.local_dsq.list)) + if (sched_class_above(&ext_sched_class, next->sched_class)) { + WARN_ON_ONCE(!static_branch_unlikely(&scx_ops_enq_last)); do_enqueue_task(rq, p, SCX_ENQ_LAST, -1); - else + } else { do_enqueue_task(rq, p, 0, -1); + } } } @@ -2780,27 +2767,33 @@ static struct task_struct *pick_next_task_scx(struct rq *rq, { struct task_struct *p; - if (prev->sched_class == &ext_sched_class) - put_prev_task_scx(rq, prev, NULL); - - p = first_local_task(rq); - if (!p) - return NULL; - - if (prev->sched_class != &ext_sched_class) - prev->sched_class->put_prev_task(rq, prev, p); - - set_next_task_scx(rq, p, true); + /* + * If balance_scx() is telling us to keep running @prev, replenish slice + * if necessary and keep running @prev. Otherwise, pop the first one + * from the local DSQ. + */ + if (prev->scx.flags & SCX_TASK_BAL_KEEP) { + prev->scx.flags &= ~SCX_TASK_BAL_KEEP; + p = prev; + if (!p->scx.slice) + p->scx.slice = SCX_SLICE_DFL; + } else { + p = first_local_task(rq); + if (!p) + return NULL; - if (unlikely(!p->scx.slice)) { - if (!scx_ops_bypassing() && !scx_warned_zero_slice) { - printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n", - p->comm, p->pid); - scx_warned_zero_slice = true; + if (unlikely(!p->scx.slice)) { + if (!scx_ops_bypassing() && !scx_warned_zero_slice) { + printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n", + p->comm, p->pid); + scx_warned_zero_slice = true; + } + p->scx.slice = SCX_SLICE_DFL; } - p->scx.slice = SCX_SLICE_DFL; } + put_prev_set_next_task(rq, prev, p); + return p; } @@ -3875,12 +3868,13 @@ bool task_should_scx(struct task_struct *p) * to force global FIFO scheduling. * * a. ops.enqueue() is ignored and tasks are queued in simple global FIFO order. + * %SCX_OPS_ENQ_LAST is also ignored. * * b. ops.dispatch() is ignored. * - * c. balance_scx() never sets %SCX_TASK_BAL_KEEP as the slice value can't be - * trusted. Whenever a tick triggers, the running task is rotated to the tail - * of the queue with core_sched_at touched. + * c. balance_scx() does not set %SCX_TASK_BAL_KEEP on non-zero slice as slice + * can't be trusted. Whenever a tick triggers, the running task is rotated to + * the tail of the queue with core_sched_at touched. * * d. pick_next_task() suppresses zero slice warning. 
* diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 892278f12dce..7e7f28f4117e 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -205,8 +205,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) /* * All enqueued tasks must have their core_sched_seq updated for correct - * core-sched ordering, which is why %SCX_OPS_ENQ_LAST is specified in - * qmap_ops.flags. + * core-sched ordering. Also, take a look at the end of qmap_dispatch(). */ tctx->core_sched_seq = core_sched_tail_seqs[idx]++; @@ -214,7 +213,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) * If qmap_select_cpu() is telling us to or this is the last runnable * task on the CPU, enqueue locally. */ - if (tctx->force_local || (enq_flags & SCX_ENQ_LAST)) { + if (tctx->force_local) { tctx->force_local = false; scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags); return; @@ -285,6 +284,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) { struct task_struct *p; struct cpu_ctx *cpuc; + struct task_ctx *tctx; u32 zero = 0, batch = dsp_batch ?: 1; void *fifo; s32 i, pid; @@ -349,6 +349,21 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) cpuc->dsp_cnt = 0; } + + /* + * No other tasks. @prev will keep running. Update its core_sched_seq as + * if the task were enqueued and dispatched immediately. + */ + if (prev) { + tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0); + if (!tctx) { + scx_bpf_error("task_ctx lookup failed"); + return; + } + + tctx->core_sched_seq = + core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++; + } } void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p) @@ -701,6 +716,5 @@ SCX_OPS_DEFINE(qmap_ops, .cpu_offline = (void *)qmap_cpu_offline, .init = (void *)qmap_init, .exit = (void *)qmap_exit, - .flags = SCX_OPS_ENQ_LAST, .timeout_ms = 5000U, .name = "qmap"); -- cgit v1.2.3 From 487355f111f98a74d86007c4df0ba9f0f9edc172 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:11:13 +0100 Subject: KVM: selftests: get-reg-list: add Permission Overlay registers Add new system registers: - POR_EL1 - POR_EL0 Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Cc: Marc Zyngier Cc: Oliver Upton Cc: Shuah Khan Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240822151113.1479789-31-joey.gouly@arm.com Signed-off-by: Will Deacon --- tools/testing/selftests/kvm/aarch64/get-reg-list.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c b/tools/testing/selftests/kvm/aarch64/get-reg-list.c index 709d7d721760..ac661ebf6859 100644 --- a/tools/testing/selftests/kvm/aarch64/get-reg-list.c +++ b/tools/testing/selftests/kvm/aarch64/get-reg-list.c @@ -40,6 +40,18 @@ static struct feature_id_reg feat_id_regs[] = { ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ 4, 1 + }, + { + ARM64_SYS_REG(3, 0, 10, 2, 4), /* POR_EL1 */ + ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ + 16, + 1 + }, + { + ARM64_SYS_REG(3, 3, 10, 2, 4), /* POR_EL0 */ + ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ + 16, + 1 } }; @@ -468,6 +480,7 @@ static __u64 base_regs[] = { ARM64_SYS_REG(3, 0, 10, 2, 0), /* MAIR_EL1 */ ARM64_SYS_REG(3, 0, 10, 2, 2), /* PIRE0_EL1 */ ARM64_SYS_REG(3, 0, 10, 2, 3), /* PIR_EL1 */ + ARM64_SYS_REG(3, 0, 10, 2, 4), /* POR_EL1 */ ARM64_SYS_REG(3, 0, 10, 3, 0), /* AMAIR_EL1 */ ARM64_SYS_REG(3, 0, 12, 0, 0), /* VBAR_EL1 */ ARM64_SYS_REG(3, 0, 
12, 1, 1), /* DISR_EL1 */ @@ -475,6 +488,7 @@ static __u64 base_regs[] = { ARM64_SYS_REG(3, 0, 13, 0, 4), /* TPIDR_EL1 */ ARM64_SYS_REG(3, 0, 14, 1, 0), /* CNTKCTL_EL1 */ ARM64_SYS_REG(3, 2, 0, 0, 0), /* CSSELR_EL1 */ + ARM64_SYS_REG(3, 3, 10, 2, 4), /* POR_EL0 */ ARM64_SYS_REG(3, 3, 13, 0, 2), /* TPIDR_EL0 */ ARM64_SYS_REG(3, 3, 13, 0, 3), /* TPIDRRO_EL0 */ ARM64_SYS_REG(3, 3, 14, 0, 1), /* CNTPCT_EL0 */ -- cgit v1.2.3 From 6354a0184c542f2b8fade9cb0eb843acd3310191 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:11:07 +0100 Subject: kselftest/arm64: move get_header() Put this function in the header so that it can be used by other tests, without needing to link to testcases.c. This will be used by selftest/mm/protection_keys.c Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Cc: Andrew Morton Cc: Shuah Khan Cc: Dave Hansen Cc: Aneesh Kumar K.V Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240822151113.1479789-25-joey.gouly@arm.com Signed-off-by: Will Deacon --- .../selftests/arm64/signal/testcases/testcases.c | 23 -------------------- .../selftests/arm64/signal/testcases/testcases.h | 25 ++++++++++++++++++++-- 2 files changed, 23 insertions(+), 25 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/arm64/signal/testcases/testcases.c b/tools/testing/selftests/arm64/signal/testcases/testcases.c index 674b88cc8c39..e4331440fed0 100644 --- a/tools/testing/selftests/arm64/signal/testcases/testcases.c +++ b/tools/testing/selftests/arm64/signal/testcases/testcases.c @@ -6,29 +6,6 @@ #include "testcases.h" -struct _aarch64_ctx *get_header(struct _aarch64_ctx *head, uint32_t magic, - size_t resv_sz, size_t *offset) -{ - size_t offs = 0; - struct _aarch64_ctx *found = NULL; - - if (!head || resv_sz < HDR_SZ) - return found; - - while (offs <= resv_sz - HDR_SZ && - head->magic != magic && head->magic) { - offs += head->size; - head = GET_RESV_NEXT_HEAD(head); - } - if (head->magic == magic) { - found = head; - if (offset) - *offset = offs; - } - - return found; -} - bool validate_extra_context(struct extra_context *extra, char **err, void **extra_data, size_t *extra_size) { diff --git a/tools/testing/selftests/arm64/signal/testcases/testcases.h b/tools/testing/selftests/arm64/signal/testcases/testcases.h index 7727126347e0..3185e6875694 100644 --- a/tools/testing/selftests/arm64/signal/testcases/testcases.h +++ b/tools/testing/selftests/arm64/signal/testcases/testcases.h @@ -88,8 +88,29 @@ struct fake_sigframe { bool validate_reserved(ucontext_t *uc, size_t resv_sz, char **err); -struct _aarch64_ctx *get_header(struct _aarch64_ctx *head, uint32_t magic, - size_t resv_sz, size_t *offset); +static inline struct _aarch64_ctx *get_header(struct _aarch64_ctx *head, uint32_t magic, + size_t resv_sz, size_t *offset) +{ + size_t offs = 0; + struct _aarch64_ctx *found = NULL; + + if (!head || resv_sz < HDR_SZ) + return found; + + while (offs <= resv_sz - HDR_SZ && + head->magic != magic && head->magic) { + offs += head->size; + head = GET_RESV_NEXT_HEAD(head); + } + if (head->magic == magic) { + found = head; + if (offset) + *offset = offs; + } + + return found; +} + static inline struct _aarch64_ctx *get_terminator(struct _aarch64_ctx *head, size_t resv_sz, -- cgit v1.2.3 From 41bbcf7b4b046b4e7190c1866625aed0fe6f69f6 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:11:08 +0100 Subject: selftests: mm: move fpregs printing arm64's fpregs are not at a constant offset from sigcontext. 
Since this is not an important part of the test, don't print the fpregs pointer on arm64. Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Cc: Andrew Morton Cc: Shuah Khan Cc: Dave Hansen Cc: Aneesh Kumar K.V Acked-by: Dave Hansen Link: https://lore.kernel.org/r/20240822151113.1479789-26-joey.gouly@arm.com Signed-off-by: Will Deacon --- tools/testing/selftests/mm/pkey-powerpc.h | 1 + tools/testing/selftests/mm/pkey-x86.h | 2 ++ tools/testing/selftests/mm/protection_keys.c | 6 ++++++ 3 files changed, 9 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/mm/pkey-powerpc.h b/tools/testing/selftests/mm/pkey-powerpc.h index ae5df26104e5..6275d0f474b3 100644 --- a/tools/testing/selftests/mm/pkey-powerpc.h +++ b/tools/testing/selftests/mm/pkey-powerpc.h @@ -9,6 +9,7 @@ #endif #define REG_IP_IDX PT_NIP #define REG_TRAPNO PT_TRAP +#define MCONTEXT_FPREGS #define gregs gp_regs #define fpregs fp_regs #define si_pkey_offset 0x20 diff --git a/tools/testing/selftests/mm/pkey-x86.h b/tools/testing/selftests/mm/pkey-x86.h index 814758e109c0..b9170a26bfcb 100644 --- a/tools/testing/selftests/mm/pkey-x86.h +++ b/tools/testing/selftests/mm/pkey-x86.h @@ -15,6 +15,8 @@ #endif +#define MCONTEXT_FPREGS + #ifndef PKEY_DISABLE_ACCESS # define PKEY_DISABLE_ACCESS 0x1 #endif diff --git a/tools/testing/selftests/mm/protection_keys.c b/tools/testing/selftests/mm/protection_keys.c index eaa6d1fc5328..4337106a985e 100644 --- a/tools/testing/selftests/mm/protection_keys.c +++ b/tools/testing/selftests/mm/protection_keys.c @@ -314,7 +314,9 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) ucontext_t *uctxt = vucontext; int trapno; unsigned long ip; +#ifdef MCONTEXT_FPREGS char *fpregs; +#endif #if defined(__i386__) || defined(__x86_64__) /* arch */ u32 *pkey_reg_ptr; int pkey_reg_offset; @@ -330,7 +332,9 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO]; ip = uctxt->uc_mcontext.gregs[REG_IP_IDX]; +#ifdef MCONTEXT_FPREGS fpregs = (char *) uctxt->uc_mcontext.fpregs; +#endif dprintf2("%s() trapno: %d ip: 0x%016lx info->si_code: %s/%d\n", __func__, trapno, ip, si_code_str(si->si_code), @@ -359,7 +363,9 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) #endif /* arch */ dprintf1("siginfo: %p\n", si); +#ifdef MCONTEXT_FPREGS dprintf1(" fpregs: %p\n", fpregs); +#endif if ((si->si_code == SEGV_MAPERR) || (si->si_code == SEGV_ACCERR) || -- cgit v1.2.3 From f5b5ea51f78f2ebd94d5a77702bbe5eee8924b50 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:11:09 +0100 Subject: selftests: mm: make protection_keys test work on arm64 The encoding of the pkey register differs on arm64 from that on x86/ppc. On those platforms, a set bit in the register disables a permission; on arm64, a set bit indicates that the permission is allowed. This drops two asserts of the form: assert(read_pkey_reg() <= orig_pkey_reg); because, due to the encoding, this doesn't hold on arm64. The pkey must be reset to both access allow and write allow in the signal handler. pkey_access_allow() currently works for PowerPC as PKEY_DISABLE_ACCESS and PKEY_DISABLE_WRITE have overlapping bits set. Access to the uc_mcontext is abstracted, as arm64 has a different structure.
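[ Editor's note, illustrative only and not part of the patch: a minimal sketch of the encoding difference described above. The arm64 nibble values follow the POE_* definitions in the pkey-arm64.h header added below; the x86 side assumes the usual two-disable-bits-per-key PKRU layout. ]

    /* x86 PKRU: 2 bits per pkey, a set bit *disables* a permission. */
    static unsigned int pkru_deny_write(unsigned int pkru, int pkey)
    {
            return pkru | (0x2u << (pkey * 2));      /* PKEY_DISABLE_WRITE */
    }

    /* arm64 POR_EL0: 4 bits per pkey, a set bit *allows* a permission. */
    static unsigned long long por_read_exec_only(unsigned long long por, int pkey)
    {
            por &= ~(0xfULL << (pkey * 4));          /* clear the pkey's nibble */
            return por | (0x3ULL << (pkey * 4));     /* POE_RX: read + execute */
    }

With set-means-allow, restricting a pkey makes the register value smaller rather than larger, which is why the ordering asserts removed above cannot be kept on arm64.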
Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Cc: Andrew Morton Cc: Shuah Khan Cc: Dave Hansen Cc: Aneesh Kumar K.V Acked-by: Dave Hansen Link: https://lore.kernel.org/r/20240822151113.1479789-27-joey.gouly@arm.com Signed-off-by: Will Deacon --- .../selftests/arm64/signal/testcases/testcases.h | 3 + tools/testing/selftests/mm/Makefile | 2 +- tools/testing/selftests/mm/pkey-arm64.h | 139 +++++++++++++++++++++ tools/testing/selftests/mm/pkey-helpers.h | 8 ++ tools/testing/selftests/mm/pkey-powerpc.h | 2 + tools/testing/selftests/mm/pkey-x86.h | 2 + tools/testing/selftests/mm/protection_keys.c | 103 +++++++++++++-- 7 files changed, 247 insertions(+), 12 deletions(-) create mode 100644 tools/testing/selftests/mm/pkey-arm64.h (limited to 'tools') diff --git a/tools/testing/selftests/arm64/signal/testcases/testcases.h b/tools/testing/selftests/arm64/signal/testcases/testcases.h index 3185e6875694..9872b8912714 100644 --- a/tools/testing/selftests/arm64/signal/testcases/testcases.h +++ b/tools/testing/selftests/arm64/signal/testcases/testcases.h @@ -26,6 +26,9 @@ #define HDR_SZ \ sizeof(struct _aarch64_ctx) +#define GET_UC_RESV_HEAD(uc) \ + (struct _aarch64_ctx *)(&(uc->uc_mcontext.__reserved)) + #define GET_SF_RESV_HEAD(sf) \ (struct _aarch64_ctx *)(&(sf).uc.uc_mcontext.__reserved) diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 7b8a5def54a1..b474ac7be8a5 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -104,7 +104,7 @@ TEST_GEN_FILES += $(BINARIES_64) endif else -ifneq (,$(findstring $(ARCH),powerpc)) +ifneq (,$(filter $(ARCH),arm64 powerpc)) TEST_GEN_FILES += protection_keys endif diff --git a/tools/testing/selftests/mm/pkey-arm64.h b/tools/testing/selftests/mm/pkey-arm64.h new file mode 100644 index 000000000000..580e1b0bb38e --- /dev/null +++ b/tools/testing/selftests/mm/pkey-arm64.h @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023 Arm Ltd. 
+ */ + +#ifndef _PKEYS_ARM64_H +#define _PKEYS_ARM64_H + +#include "vm_util.h" +/* for signal frame parsing */ +#include "../arm64/signal/testcases/testcases.h" + +#ifndef SYS_mprotect_key +# define SYS_mprotect_key 288 +#endif +#ifndef SYS_pkey_alloc +# define SYS_pkey_alloc 289 +# define SYS_pkey_free 290 +#endif +#define MCONTEXT_IP(mc) mc.pc +#define MCONTEXT_TRAPNO(mc) -1 + +#define PKEY_MASK 0xf + +#define POE_NONE 0x0 +#define POE_X 0x2 +#define POE_RX 0x3 +#define POE_RWX 0x7 + +#define NR_PKEYS 8 +#define NR_RESERVED_PKEYS 1 /* pkey-0 */ + +#define PKEY_ALLOW_ALL 0x77777777 + +#define PKEY_BITS_PER_PKEY 4 +#define PAGE_SIZE sysconf(_SC_PAGESIZE) +#undef HPAGE_SIZE +#define HPAGE_SIZE default_huge_page_size() + +/* 4-byte instructions * 16384 = 64K page */ +#define __page_o_noops() asm(".rept 16384 ; nop; .endr") + +static inline u64 __read_pkey_reg(void) +{ + u64 pkey_reg = 0; + + // POR_EL0 + asm volatile("mrs %0, S3_3_c10_c2_4" : "=r" (pkey_reg)); + + return pkey_reg; +} + +static inline void __write_pkey_reg(u64 pkey_reg) +{ + u64 por = pkey_reg; + + dprintf4("%s() changing %016llx to %016llx\n", + __func__, __read_pkey_reg(), pkey_reg); + + // POR_EL0 + asm volatile("msr S3_3_c10_c2_4, %0\nisb" :: "r" (por) :); + + dprintf4("%s() pkey register after changing %016llx to %016llx\n", + __func__, __read_pkey_reg(), pkey_reg); +} + +static inline int cpu_has_pkeys(void) +{ + /* No simple way to determine this */ + return 1; +} + +static inline u32 pkey_bit_position(int pkey) +{ + return pkey * PKEY_BITS_PER_PKEY; +} + +static inline int get_arch_reserved_keys(void) +{ + return NR_RESERVED_PKEYS; +} + +void expect_fault_on_read_execonly_key(void *p1, int pkey) +{ +} + +void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) +{ + return PTR_ERR_ENOTSUP; +} + +#define set_pkey_bits set_pkey_bits +static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags) +{ + u32 shift = pkey_bit_position(pkey); + u64 new_val = POE_RWX; + + /* mask out bits from pkey in old value */ + reg &= ~((u64)PKEY_MASK << shift); + + if (flags & PKEY_DISABLE_ACCESS) + new_val = POE_X; + else if (flags & PKEY_DISABLE_WRITE) + new_val = POE_RX; + + /* OR in new bits for pkey */ + reg |= new_val << shift; + + return reg; +} + +#define get_pkey_bits get_pkey_bits +static inline u64 get_pkey_bits(u64 reg, int pkey) +{ + u32 shift = pkey_bit_position(pkey); + /* + * shift down the relevant bits to the lowest four, then + * mask off all the other higher bits + */ + u32 perm = (reg >> shift) & PKEY_MASK; + + if (perm == POE_X) + return PKEY_DISABLE_ACCESS; + if (perm == POE_RX) + return PKEY_DISABLE_WRITE; + return 0; +} + +static void aarch64_write_signal_pkey(ucontext_t *uctxt, u64 pkey) +{ + struct _aarch64_ctx *ctx = GET_UC_RESV_HEAD(uctxt); + struct poe_context *poe_ctx = + (struct poe_context *) get_header(ctx, POE_MAGIC, + sizeof(uctxt->uc_mcontext), NULL); + if (poe_ctx) + poe_ctx->por_el0 = pkey; +} + +#endif /* _PKEYS_ARM64_H */ diff --git a/tools/testing/selftests/mm/pkey-helpers.h b/tools/testing/selftests/mm/pkey-helpers.h index 1af3156a9db8..15608350fc01 100644 --- a/tools/testing/selftests/mm/pkey-helpers.h +++ b/tools/testing/selftests/mm/pkey-helpers.h @@ -91,12 +91,17 @@ void record_pkey_malloc(void *ptr, long size, int prot); #include "pkey-x86.h" #elif defined(__powerpc64__) /* arch */ #include "pkey-powerpc.h" +#elif defined(__aarch64__) /* arch */ +#include "pkey-arm64.h" #else /* arch */ #error Architecture not supported #endif /* arch */ +#ifndef PKEY_MASK #define PKEY_MASK 
(PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE) +#endif +#ifndef set_pkey_bits static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags) { u32 shift = pkey_bit_position(pkey); @@ -106,7 +111,9 @@ static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags) reg |= (flags & PKEY_MASK) << shift; return reg; } +#endif +#ifndef get_pkey_bits static inline u64 get_pkey_bits(u64 reg, int pkey) { u32 shift = pkey_bit_position(pkey); @@ -116,6 +123,7 @@ static inline u64 get_pkey_bits(u64 reg, int pkey) */ return ((reg >> shift) & PKEY_MASK); } +#endif extern u64 shadow_pkey_reg; diff --git a/tools/testing/selftests/mm/pkey-powerpc.h b/tools/testing/selftests/mm/pkey-powerpc.h index 6275d0f474b3..3d0c0bdae5bc 100644 --- a/tools/testing/selftests/mm/pkey-powerpc.h +++ b/tools/testing/selftests/mm/pkey-powerpc.h @@ -8,6 +8,8 @@ # define SYS_pkey_free 385 #endif #define REG_IP_IDX PT_NIP +#define MCONTEXT_IP(mc) mc.gp_regs[REG_IP_IDX] +#define MCONTEXT_TRAPNO(mc) mc.gp_regs[REG_TRAPNO] #define REG_TRAPNO PT_TRAP #define MCONTEXT_FPREGS #define gregs gp_regs diff --git a/tools/testing/selftests/mm/pkey-x86.h b/tools/testing/selftests/mm/pkey-x86.h index b9170a26bfcb..5f28e26a2511 100644 --- a/tools/testing/selftests/mm/pkey-x86.h +++ b/tools/testing/selftests/mm/pkey-x86.h @@ -15,6 +15,8 @@ #endif +#define MCONTEXT_IP(mc) mc.gregs[REG_IP_IDX] +#define MCONTEXT_TRAPNO(mc) mc.gregs[REG_TRAPNO] #define MCONTEXT_FPREGS #ifndef PKEY_DISABLE_ACCESS diff --git a/tools/testing/selftests/mm/protection_keys.c b/tools/testing/selftests/mm/protection_keys.c index 4337106a985e..0789981b72b9 100644 --- a/tools/testing/selftests/mm/protection_keys.c +++ b/tools/testing/selftests/mm/protection_keys.c @@ -147,7 +147,7 @@ void abort_hooks(void) * will then fault, which makes sure that the fault code handles * execute-only memory properly. 
*/ -#ifdef __powerpc64__ +#if defined(__powerpc64__) || defined(__aarch64__) /* This way, both 4K and 64K alignment are maintained */ __attribute__((__aligned__(65536))) #else @@ -212,7 +212,6 @@ void pkey_disable_set(int pkey, int flags) unsigned long syscall_flags = 0; int ret; int pkey_rights; - u64 orig_pkey_reg = read_pkey_reg(); dprintf1("START->%s(%d, 0x%x)\n", __func__, pkey, flags); @@ -242,8 +241,6 @@ void pkey_disable_set(int pkey, int flags) dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__, pkey, read_pkey_reg()); - if (flags) - pkey_assert(read_pkey_reg() >= orig_pkey_reg); dprintf1("END<---%s(%d, 0x%x)\n", __func__, pkey, flags); } @@ -253,7 +250,6 @@ void pkey_disable_clear(int pkey, int flags) unsigned long syscall_flags = 0; int ret; int pkey_rights = hw_pkey_get(pkey, syscall_flags); - u64 orig_pkey_reg = read_pkey_reg(); pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); @@ -273,8 +269,6 @@ void pkey_disable_clear(int pkey, int flags) dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__, pkey, read_pkey_reg()); - if (flags) - assert(read_pkey_reg() <= orig_pkey_reg); } void pkey_write_allow(int pkey) @@ -330,8 +324,8 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) __func__, __LINE__, __read_pkey_reg(), shadow_pkey_reg); - trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO]; - ip = uctxt->uc_mcontext.gregs[REG_IP_IDX]; + trapno = MCONTEXT_TRAPNO(uctxt->uc_mcontext); + ip = MCONTEXT_IP(uctxt->uc_mcontext); #ifdef MCONTEXT_FPREGS fpregs = (char *) uctxt->uc_mcontext.fpregs; #endif @@ -395,6 +389,8 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) #elif defined(__powerpc64__) /* arch */ /* restore access and let the faulting instruction continue */ pkey_access_allow(siginfo_pkey); +#elif defined(__aarch64__) + aarch64_write_signal_pkey(uctxt, PKEY_ALLOW_ALL); #endif /* arch */ pkey_faults++; dprintf1("<<<<==================================================\n"); @@ -908,7 +904,9 @@ void expected_pkey_fault(int pkey) * test program continue. We now have to restore it. */ if (__read_pkey_reg() != 0) -#else /* arch */ +#elif defined(__aarch64__) + if (__read_pkey_reg() != PKEY_ALLOW_ALL) +#else if (__read_pkey_reg() != shadow_pkey_reg) #endif /* arch */ pkey_assert(0); @@ -1498,6 +1496,11 @@ void test_executing_on_unreadable_memory(int *ptr, u16 pkey) lots_o_noops_around_write(&scratch); do_not_expect_pkey_fault("executing on PROT_EXEC memory"); expect_fault_on_read_execonly_key(p1, pkey); + + // Reset back to PROT_EXEC | PROT_READ for architectures that support + // non-PKEY execute-only permissions. + ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC | PROT_READ, (u64)pkey); + pkey_assert(!ret); } void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) @@ -1671,6 +1674,84 @@ void test_ptrace_modifies_pkru(int *ptr, u16 pkey) } #endif +#if defined(__aarch64__) +void test_ptrace_modifies_pkru(int *ptr, u16 pkey) +{ + pid_t child; + int status, ret; + struct iovec iov; + u64 trace_pkey; + /* Just a random pkey value.. 
*/ + u64 new_pkey = (POE_X << PKEY_BITS_PER_PKEY * 2) | + (POE_NONE << PKEY_BITS_PER_PKEY) | + POE_RWX; + + child = fork(); + pkey_assert(child >= 0); + dprintf3("[%d] fork() ret: %d\n", getpid(), child); + if (!child) { + ptrace(PTRACE_TRACEME, 0, 0, 0); + + /* Stop and allow the tracer to modify PKRU directly */ + raise(SIGSTOP); + + /* + * need __read_pkey_reg() version so we do not do shadow_pkey_reg + * checking + */ + if (__read_pkey_reg() != new_pkey) + exit(1); + + raise(SIGSTOP); + + exit(0); + } + + pkey_assert(child == waitpid(child, &status, 0)); + dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); + pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); + + iov.iov_base = &trace_pkey; + iov.iov_len = 8; + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_ARM_POE, &iov); + pkey_assert(ret == 0); + pkey_assert(trace_pkey == read_pkey_reg()); + + trace_pkey = new_pkey; + + ret = ptrace(PTRACE_SETREGSET, child, (void *)NT_ARM_POE, &iov); + pkey_assert(ret == 0); + + /* Test that the modification is visible in ptrace before any execution */ + memset(&trace_pkey, 0, sizeof(trace_pkey)); + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_ARM_POE, &iov); + pkey_assert(ret == 0); + pkey_assert(trace_pkey == new_pkey); + + /* Execute the tracee */ + ret = ptrace(PTRACE_CONT, child, 0, 0); + pkey_assert(ret == 0); + + /* Test that the tracee saw the PKRU value change */ + pkey_assert(child == waitpid(child, &status, 0)); + dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); + pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); + + /* Test that the modification is visible in ptrace after execution */ + memset(&trace_pkey, 0, sizeof(trace_pkey)); + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_ARM_POE, &iov); + pkey_assert(ret == 0); + pkey_assert(trace_pkey == new_pkey); + + ret = ptrace(PTRACE_CONT, child, 0, 0); + pkey_assert(ret == 0); + pkey_assert(child == waitpid(child, &status, 0)); + dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); + pkey_assert(WIFEXITED(status)); + pkey_assert(WEXITSTATUS(status) == 0); +} +#endif + void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) { int size = PAGE_SIZE; @@ -1706,7 +1787,7 @@ void (*pkey_tests[])(int *ptr, u16 pkey) = { test_pkey_syscalls_bad_args, test_pkey_alloc_exhaust, test_pkey_alloc_free_attach_pkey0, -#if defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) || defined(__aarch64__) test_ptrace_modifies_pkru, #endif }; -- cgit v1.2.3 From fabf056278b4ccddea4a944a1635fee44033b71f Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:11:10 +0100 Subject: kselftest/arm64: add HWCAP test for FEAT_S1POE Check that when POE is enabled, the POR_EL0 register is accessible. 
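[ Editor's note, illustrative only and not part of the patch: how an arm64 userspace program would gate on the hwcap before reading the register, using the same raw S3_3_C10_C2_4 sysreg encoding as poe_sigill() below so that assemblers which don't know the POR_EL0 name can still build it. The HWCAP2_POE fallback value here is a hypothetical placeholder; the real constant comes from the arm64 uapi hwcap header. ]

    #include <stdio.h>
    #include <sys/auxv.h>

    #ifndef HWCAP2_POE
    #define HWCAP2_POE (1UL << 29)  /* placeholder only; take the real bit from the uapi header */
    #endif

    int main(void)
    {
            unsigned long por;

            if (!(getauxval(AT_HWCAP2) & HWCAP2_POE)) {
                    printf("POE not supported\n");
                    return 0;
            }

            /* mrs x0, POR_EL0 -- without POE this access raises SIGILL */
            asm volatile("mrs %0, S3_3_C10_C2_4" : "=r"(por));
            printf("POR_EL0: %#lx\n", por);
            return 0;
    }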
Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Cc: Mark Brown Cc: Shuah Khan Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240822151113.1479789-28-joey.gouly@arm.com Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/abi/hwcap.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c index d8909b2b535a..f2d6007a2b98 100644 --- a/tools/testing/selftests/arm64/abi/hwcap.c +++ b/tools/testing/selftests/arm64/abi/hwcap.c @@ -156,6 +156,12 @@ static void pmull_sigill(void) asm volatile(".inst 0x0ee0e000" : : : ); } +static void poe_sigill(void) +{ + /* mrs x0, POR_EL0 */ + asm volatile("mrs x0, S3_3_C10_C2_4" : : : "x0"); +} + static void rng_sigill(void) { asm volatile("mrs x0, S3_3_C2_C4_0" : : : "x0"); } @@ -601,6 +607,14 @@ static const struct hwcap_data { .cpuinfo = "pmull", .sigill_fn = pmull_sigill, }, + { + .name = "POE", + .at_hwcap = AT_HWCAP2, + .hwcap_bit = HWCAP2_POE, + .cpuinfo = "poe", + .sigill_fn = poe_sigill, + .sigill_reliable = true, + }, { .name = "RNG", .at_hwcap = AT_HWCAP2, -- cgit v1.2.3 From d3c6e5b1093ac68ef66e3c78564e8fb958180bac Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:11:11 +0100 Subject: kselftest/arm64: parse POE_MAGIC in a signal frame Teach the signal frame parsing about the new POE frame; this avoids a warning when it is generated. Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Cc: Mark Brown Cc: Shuah Khan Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240822151113.1479789-29-joey.gouly@arm.com Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/signal/testcases/testcases.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/arm64/signal/testcases/testcases.c b/tools/testing/selftests/arm64/signal/testcases/testcases.c index e4331440fed0..e6daa94fcd2e 100644 --- a/tools/testing/selftests/arm64/signal/testcases/testcases.c +++ b/tools/testing/selftests/arm64/signal/testcases/testcases.c @@ -161,6 +161,10 @@ bool validate_reserved(ucontext_t *uc, size_t resv_sz, char **err) if (head->size != sizeof(struct esr_context)) *err = "Bad size for esr_context"; break; + case POE_MAGIC: + if (head->size != sizeof(struct poe_context)) + *err = "Bad size for poe_context"; + break; case TPIDR2_MAGIC: if (head->size != sizeof(struct tpidr2_context)) *err = "Bad size for tpidr2_context"; break; -- cgit v1.2.3 From 6a428d63717add52bba4175a7fde54d1f9d166e0 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:11:12 +0100 Subject: kselftest/arm64: Add test case for POR_EL0 signal frame records Ensure that we get signal context for POR_EL0 if and only if POE is present on the system. Copied from the TPIDR2 test.
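[ Editor's note, illustrative only and not part of the patch: the lookup the test relies on. Signal context records in the frame's __reserved area form a chain of (magic, size) headers terminated by a zero magic, so finding the POE record is a linear walk -- a simplified version of the get_header() helper moved into testcases.h earlier in this series, minus the resv_sz bounds checking the real helper performs. ]

    #include <stddef.h>
    #include <stdint.h>

    /* mirrors struct _aarch64_ctx from the arm64 sigcontext uapi */
    struct ctx_hdr {
            uint32_t magic;
            uint32_t size;
    };

    static struct ctx_hdr *find_record(struct ctx_hdr *head, uint32_t magic)
    {
            /* a zero magic marks the terminator record */
            while (head->magic && head->magic != magic)
                    head = (struct ctx_hdr *)((char *)head + head->size);

            return head->magic == magic ? head : NULL;
    }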
Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Cc: Mark Brown Cc: Shuah Khan Reviewed-by: Mark Brown Acked-by: Shuah Khan Link: https://lore.kernel.org/r/20240822151113.1479789-30-joey.gouly@arm.com Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/signal/.gitignore | 1 + .../selftests/arm64/signal/testcases/poe_siginfo.c | 86 ++++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 tools/testing/selftests/arm64/signal/testcases/poe_siginfo.c (limited to 'tools') diff --git a/tools/testing/selftests/arm64/signal/.gitignore b/tools/testing/selftests/arm64/signal/.gitignore index 1ce5b5eac386..b2f2bfd5c6aa 100644 --- a/tools/testing/selftests/arm64/signal/.gitignore +++ b/tools/testing/selftests/arm64/signal/.gitignore @@ -2,6 +2,7 @@ mangle_* fake_sigreturn_* fpmr_* +poe_* sme_* ssve_* sve_* diff --git a/tools/testing/selftests/arm64/signal/testcases/poe_siginfo.c b/tools/testing/selftests/arm64/signal/testcases/poe_siginfo.c new file mode 100644 index 000000000000..36bd9940ee05 --- /dev/null +++ b/tools/testing/selftests/arm64/signal/testcases/poe_siginfo.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023 Arm Limited + * + * Verify that the POR_EL0 register context in signal frames is set up as + * expected. + */ + +#include +#include +#include +#include +#include +#include + +#include "test_signals_utils.h" +#include "testcases.h" + +static union { + ucontext_t uc; + char buf[1024 * 128]; +} context; + +#define SYS_POR_EL0 "S3_3_C10_C2_4" + +static uint64_t get_por_el0(void) +{ + uint64_t val; + + asm volatile( + "mrs %0, " SYS_POR_EL0 "\n" + : "=r"(val) + : + : ); + + return val; +} + +int poe_present(struct tdescr *td, siginfo_t *si, ucontext_t *uc) +{ + struct _aarch64_ctx *head = GET_BUF_RESV_HEAD(context); + struct poe_context *poe_ctx; + size_t offset; + bool in_sigframe; + bool have_poe; + __u64 orig_poe; + + have_poe = getauxval(AT_HWCAP2) & HWCAP2_POE; + if (have_poe) + orig_poe = get_por_el0(); + + if (!get_current_context(td, &context.uc, sizeof(context))) + return 1; + + poe_ctx = (struct poe_context *) + get_header(head, POE_MAGIC, td->live_sz, &offset); + + in_sigframe = poe_ctx != NULL; + + fprintf(stderr, "POR_EL0 sigframe %s on system %s POE\n", + in_sigframe ? "present" : "absent", + have_poe ? "with" : "without"); + + td->pass = (in_sigframe == have_poe); + + /* + * Check that the value we read back was the one present at + * the time that the signal was triggered. + */ + if (have_poe && poe_ctx) { + if (poe_ctx->por_el0 != orig_poe) { + fprintf(stderr, "POR_EL0 in frame is %llx, was %llx\n", + poe_ctx->por_el0, orig_poe); + td->pass = false; + } + } + + return 0; +} + +struct tdescr tde = { + .name = "POR_EL0", + .descr = "Validate that POR_EL0 is present as expected", + .timeout = 3, + .run = poe_present, +}; -- cgit v1.2.3 From 5c26516f090363b548c51ce892b8bcc46c7dcdbc Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Tue, 3 Sep 2024 00:06:10 +0800 Subject: selftests: add selftest for UDP SO_PEEK_OFF support Add the SO_PEEK_OFF selftest for UDP. In this patch, I mainly do three things: 1. rename tcp_so_peek_off.c 2. adjust for UDP protocol 3. add selftests into it Suggested-by: Jon Maloy Reviewed-by: Willem de Bruijn Signed-off-by: Jason Xing Signed-off-by: David S. 
Miller --- tools/testing/selftests/net/.gitignore | 1 + tools/testing/selftests/net/Makefile | 2 +- tools/testing/selftests/net/sk_so_peek_off.c | 202 ++++++++++++++++++++++++++ tools/testing/selftests/net/tcp_so_peek_off.c | 183 ----------------------- 4 files changed, 204 insertions(+), 184 deletions(-) create mode 100644 tools/testing/selftests/net/sk_so_peek_off.c delete mode 100644 tools/testing/selftests/net/tcp_so_peek_off.c (limited to 'tools') diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 666ab7d9390b..923bf098e2eb 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -34,6 +34,7 @@ scm_pidfd scm_rights sk_bind_sendto_listen sk_connect_zero_addr +sk_so_peek_off socket so_incoming_cpu so_netns_cookie diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 1179e3261bef..d5029f978aa9 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -80,7 +80,7 @@ TEST_PROGS += io_uring_zerocopy_tx.sh TEST_GEN_FILES += bind_bhash TEST_GEN_PROGS += sk_bind_sendto_listen TEST_GEN_PROGS += sk_connect_zero_addr -TEST_GEN_PROGS += tcp_so_peek_off +TEST_GEN_PROGS += sk_so_peek_off TEST_PROGS += test_ingress_egress_chaining.sh TEST_GEN_PROGS += so_incoming_cpu TEST_PROGS += sctp_vrf.sh diff --git a/tools/testing/selftests/net/sk_so_peek_off.c b/tools/testing/selftests/net/sk_so_peek_off.c new file mode 100644 index 000000000000..d87dd8d8d491 --- /dev/null +++ b/tools/testing/selftests/net/sk_so_peek_off.c @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include "../kselftest.h" + +static char *afstr(int af, int proto) +{ + if (proto == IPPROTO_TCP) + return af == AF_INET ? "TCP/IPv4" : "TCP/IPv6"; + else + return af == AF_INET ? "UDP/IPv4" : "UDP/IPv6"; +} + +int sk_peek_offset_probe(sa_family_t af, int proto) +{ + int type = (proto == IPPROTO_TCP ? SOCK_STREAM : SOCK_DGRAM); + int optv = 0; + int ret = 0; + int s; + + s = socket(af, type, proto); + if (s < 0) { + ksft_perror("Temporary TCP socket creation failed"); + } else { + if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &optv, sizeof(int))) + ret = 1; + else + printf("%s does not support SO_PEEK_OFF\n", afstr(af, proto)); + close(s); + } + return ret; +} + +static void sk_peek_offset_set(int s, int offset) +{ + if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset))) + ksft_perror("Failed to set SO_PEEK_OFF value\n"); +} + +static int sk_peek_offset_get(int s) +{ + int offset; + socklen_t len = sizeof(offset); + + if (getsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, &len)) + ksft_perror("Failed to get SO_PEEK_OFF value\n"); + return offset; +} + +static int sk_peek_offset_test(sa_family_t af, int proto) +{ + int type = (proto == IPPROTO_TCP ? 
SOCK_STREAM : SOCK_DGRAM); + union { + struct sockaddr sa; + struct sockaddr_in a4; + struct sockaddr_in6 a6; + } a; + int res = 0; + int s[2] = {0, 0}; + int recv_sock = 0; + int offset = 0; + ssize_t len; + char buf[2]; + + memset(&a, 0, sizeof(a)); + a.sa.sa_family = af; + + s[0] = recv_sock = socket(af, type, proto); + s[1] = socket(af, type, proto); + + if (s[0] < 0 || s[1] < 0) { + ksft_perror("Temporary socket creation failed\n"); + goto out; + } + if (bind(s[0], &a.sa, sizeof(a)) < 0) { + ksft_perror("Temporary socket bind() failed\n"); + goto out; + } + if (getsockname(s[0], &a.sa, &((socklen_t) { sizeof(a) })) < 0) { + ksft_perror("Temporary socket getsockname() failed\n"); + goto out; + } + if (proto == IPPROTO_TCP && listen(s[0], 0) < 0) { + ksft_perror("Temporary socket listen() failed\n"); + goto out; + } + if (connect(s[1], &a.sa, sizeof(a)) < 0) { + ksft_perror("Temporary socket connect() failed\n"); + goto out; + } + if (proto == IPPROTO_TCP) { + recv_sock = accept(s[0], NULL, NULL); + if (recv_sock <= 0) { + ksft_perror("Temporary socket accept() failed\n"); + goto out; + } + } + + /* Some basic tests of getting/setting offset */ + offset = sk_peek_offset_get(recv_sock); + if (offset != -1) { + ksft_perror("Initial value of socket offset not -1\n"); + goto out; + } + sk_peek_offset_set(recv_sock, 0); + offset = sk_peek_offset_get(recv_sock); + if (offset != 0) { + ksft_perror("Failed to set socket offset to 0\n"); + goto out; + } + + /* Transfer a message */ + if (send(s[1], (char *)("ab"), 2, 0) != 2) { + ksft_perror("Temporary probe socket send() failed\n"); + goto out; + } + /* Read first byte */ + len = recv(recv_sock, buf, 1, MSG_PEEK); + if (len != 1 || buf[0] != 'a') { + ksft_perror("Failed to read first byte of message\n"); + goto out; + } + offset = sk_peek_offset_get(recv_sock); + if (offset != 1) { + ksft_perror("Offset not forwarded correctly at first byte\n"); + goto out; + } + /* Try to read beyond last byte */ + len = recv(recv_sock, buf, 2, MSG_PEEK); + if (len != 1 || buf[0] != 'b') { + ksft_perror("Failed to read last byte of message\n"); + goto out; + } + offset = sk_peek_offset_get(recv_sock); + if (offset != 2) { + ksft_perror("Offset not forwarded correctly at last byte\n"); + goto out; + } + /* Flush message */ + len = recv(recv_sock, buf, 2, MSG_TRUNC); + if (len != 2) { + ksft_perror("Failed to flush message\n"); + goto out; + } + offset = sk_peek_offset_get(recv_sock); + if (offset != 0) { + ksft_perror("Offset not reverted correctly after flush\n"); + goto out; + } + + printf("%s with MSG_PEEK_OFF works correctly\n", afstr(af, proto)); + res = 1; +out: + if (proto == IPPROTO_TCP && recv_sock >= 0) + close(recv_sock); + if (s[1] >= 0) + close(s[1]); + if (s[0] >= 0) + close(s[0]); + return res; +} + +static int do_test(int proto) +{ + int res4, res6; + + res4 = sk_peek_offset_probe(AF_INET, proto); + res6 = sk_peek_offset_probe(AF_INET6, proto); + + if (!res4 && !res6) + return KSFT_SKIP; + + if (res4) + res4 = sk_peek_offset_test(AF_INET, proto); + + if (res6) + res6 = sk_peek_offset_test(AF_INET6, proto); + + if (!res4 || !res6) + return KSFT_FAIL; + + return KSFT_PASS; +} + +int main(void) +{ + int restcp, resudp; + + restcp = do_test(IPPROTO_TCP); + resudp = do_test(IPPROTO_UDP); + if (restcp == KSFT_FAIL || resudp == KSFT_FAIL) + return KSFT_FAIL; + + return KSFT_PASS; +} diff --git a/tools/testing/selftests/net/tcp_so_peek_off.c b/tools/testing/selftests/net/tcp_so_peek_off.c deleted file mode 100644 index df8a39d9d3c3..000000000000 --- 
a/tools/testing/selftests/net/tcp_so_peek_off.c +++ /dev/null @@ -1,183 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include -#include -#include -#include -#include -#include -#include -#include -#include "../kselftest.h" - -static char *afstr(int af) -{ - return af == AF_INET ? "TCP/IPv4" : "TCP/IPv6"; -} - -int tcp_peek_offset_probe(sa_family_t af) -{ - int optv = 0; - int ret = 0; - int s; - - s = socket(af, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP); - if (s < 0) { - ksft_perror("Temporary TCP socket creation failed"); - } else { - if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &optv, sizeof(int))) - ret = 1; - else - printf("%s does not support SO_PEEK_OFF\n", afstr(af)); - close(s); - } - return ret; -} - -static void tcp_peek_offset_set(int s, int offset) -{ - if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset))) - ksft_perror("Failed to set SO_PEEK_OFF value\n"); -} - -static int tcp_peek_offset_get(int s) -{ - int offset; - socklen_t len = sizeof(offset); - - if (getsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, &len)) - ksft_perror("Failed to get SO_PEEK_OFF value\n"); - return offset; -} - -static int tcp_peek_offset_test(sa_family_t af) -{ - union { - struct sockaddr sa; - struct sockaddr_in a4; - struct sockaddr_in6 a6; - } a; - int res = 0; - int s[2] = {0, 0}; - int recv_sock = 0; - int offset = 0; - ssize_t len; - char buf; - - memset(&a, 0, sizeof(a)); - a.sa.sa_family = af; - - s[0] = socket(af, SOCK_STREAM, IPPROTO_TCP); - s[1] = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP); - - if (s[0] < 0 || s[1] < 0) { - ksft_perror("Temporary socket creation failed\n"); - goto out; - } - if (bind(s[0], &a.sa, sizeof(a)) < 0) { - ksft_perror("Temporary socket bind() failed\n"); - goto out; - } - if (getsockname(s[0], &a.sa, &((socklen_t) { sizeof(a) })) < 0) { - ksft_perror("Temporary socket getsockname() failed\n"); - goto out; - } - if (listen(s[0], 0) < 0) { - ksft_perror("Temporary socket listen() failed\n"); - goto out; - } - if (connect(s[1], &a.sa, sizeof(a)) >= 0 || errno != EINPROGRESS) { - ksft_perror("Temporary socket connect() failed\n"); - goto out; - } - recv_sock = accept(s[0], NULL, NULL); - if (recv_sock <= 0) { - ksft_perror("Temporary socket accept() failed\n"); - goto out; - } - - /* Some basic tests of getting/setting offset */ - offset = tcp_peek_offset_get(recv_sock); - if (offset != -1) { - ksft_perror("Initial value of socket offset not -1\n"); - goto out; - } - tcp_peek_offset_set(recv_sock, 0); - offset = tcp_peek_offset_get(recv_sock); - if (offset != 0) { - ksft_perror("Failed to set socket offset to 0\n"); - goto out; - } - - /* Transfer a message */ - if (send(s[1], (char *)("ab"), 2, 0) <= 0 || errno != EINPROGRESS) { - ksft_perror("Temporary probe socket send() failed\n"); - goto out; - } - /* Read first byte */ - len = recv(recv_sock, &buf, 1, MSG_PEEK); - if (len != 1 || buf != 'a') { - ksft_perror("Failed to read first byte of message\n"); - goto out; - } - offset = tcp_peek_offset_get(recv_sock); - if (offset != 1) { - ksft_perror("Offset not forwarded correctly at first byte\n"); - goto out; - } - /* Try to read beyond last byte */ - len = recv(recv_sock, &buf, 2, MSG_PEEK); - if (len != 1 || buf != 'b') { - ksft_perror("Failed to read last byte of message\n"); - goto out; - } - offset = tcp_peek_offset_get(recv_sock); - if (offset != 2) { - ksft_perror("Offset not forwarded correctly at last byte\n"); - goto out; - } - /* Flush message */ - len = recv(recv_sock, NULL, 2, MSG_TRUNC); - if (len != 2) { - ksft_perror("Failed to 
flush message\n"); - goto out; - } - offset = tcp_peek_offset_get(recv_sock); - if (offset != 0) { - ksft_perror("Offset not reverted correctly after flush\n"); - goto out; - } - - printf("%s with MSG_PEEK_OFF works correctly\n", afstr(af)); - res = 1; -out: - if (recv_sock >= 0) - close(recv_sock); - if (s[1] >= 0) - close(s[1]); - if (s[0] >= 0) - close(s[0]); - return res; -} - -int main(void) -{ - int res4, res6; - - res4 = tcp_peek_offset_probe(AF_INET); - res6 = tcp_peek_offset_probe(AF_INET6); - - if (!res4 && !res6) - return KSFT_SKIP; - - if (res4) - res4 = tcp_peek_offset_test(AF_INET); - - if (res6) - res6 = tcp_peek_offset_test(AF_INET6); - - if (!res4 || !res6) - return KSFT_FAIL; - - return KSFT_PASS; -} -- cgit v1.2.3 From dd7c445beb7baf0ed071aba94341857b07fde69b Mon Sep 17 00:00:00 2001 From: "Yo-Jung (Leo) Lin" <0xff07@gmail.com> Date: Sun, 25 Aug 2024 17:53:50 +0800 Subject: pm-graph: Make git ignore sleepgraph.py artifacts By default, sleepgraph.py creates suspend-{date}-{time} directories to store artifacts, or suspend-{date}-{time}-xN if the --multi option is used. Ignore those directories by adding a .gitignore file. Signed-off-by: Yo-Jung (Leo) Lin <0xff07@gmail.com> Acked-by: Todd Brandt Link: https://patch.msgid.link/20240825095353.7578-1-0xff07@gmail.com [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki --- tools/power/pm-graph/.gitignore | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 tools/power/pm-graph/.gitignore (limited to 'tools') diff --git a/tools/power/pm-graph/.gitignore b/tools/power/pm-graph/.gitignore new file mode 100644 index 000000000000..37762a8a06d6 --- /dev/null +++ b/tools/power/pm-graph/.gitignore @@ -0,0 +1,3 @@ +# sleepgraph.py artifacts +suspend-[0-9]*-[0-9]* +suspend-[0-9]*-[0-9]*-x[0-9]* -- cgit v1.2.3 From 387ce37ed57c5d92400c5cff9d5c887f95db0236 Mon Sep 17 00:00:00 2001 From: Amit Vadhavana Date: Sun, 25 Aug 2024 16:36:20 +0530 Subject: pm-graph: Update directory handling and installation process in Makefile - Standardize directory variables to support more flexible installations. - Add copyright and licensing information to the Makefile. - Introduce ".PHONY" declarations to ensure that specific targets are always executed, regardless of the presence of files with matching names. - Add a help target to provide usage instructions. Signed-off-by: Amit Vadhavana Acked-by: Todd Brandt Link: https://patch.msgid.link/Update directory handling and installation process in Makefile [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki --- tools/power/pm-graph/Makefile | 111 +++++++++++++++++++++++++++--------------- 1 file changed, 73 insertions(+), 38 deletions(-) (limited to 'tools') diff --git a/tools/power/pm-graph/Makefile b/tools/power/pm-graph/Makefile index b5310832c19c..aeddbaf2d4c4 100644 --- a/tools/power/pm-graph/Makefile +++ b/tools/power/pm-graph/Makefile @@ -1,51 +1,86 @@ # SPDX-License-Identifier: GPL-2.0 -PREFIX ?= /usr -DESTDIR ?= +# +# Copyright (c) 2013, Intel Corporation. +# +# This program is free software; you can redistribute it and/or modify it +# under the terms and conditions of the GNU General Public License, +# version 2, as published by the Free Software Foundation. +# +# This program is distributed in the hope it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +# more details. 
+# +# Authors: +# Todd Brandt + +# Prefix to the directories we're installing to +DESTDIR ?= + +# Directory definitions. These are default and most probably +# do not need to be changed. Please note that DESTDIR is +# added in front of any of them + +BINDIR ?= /usr/bin +MANDIR ?= /usr/share/man +LIBDIR ?= /usr/lib + +# Toolchain: what tools do we use, and what options do they need: +INSTALL = /usr/bin/install +INSTALL_DATA = ${INSTALL} -m 644 all: @echo "Nothing to build" install : uninstall - install -d $(DESTDIR)$(PREFIX)/lib/pm-graph - install sleepgraph.py $(DESTDIR)$(PREFIX)/lib/pm-graph - install bootgraph.py $(DESTDIR)$(PREFIX)/lib/pm-graph - install -d $(DESTDIR)$(PREFIX)/lib/pm-graph/config - install -m 644 config/cgskip.txt $(DESTDIR)$(PREFIX)/lib/pm-graph/config - install -m 644 config/freeze-callgraph.cfg $(DESTDIR)$(PREFIX)/lib/pm-graph/config - install -m 644 config/freeze.cfg $(DESTDIR)$(PREFIX)/lib/pm-graph/config - install -m 644 config/freeze-dev.cfg $(DESTDIR)$(PREFIX)/lib/pm-graph/config - install -m 644 config/standby-callgraph.cfg $(DESTDIR)$(PREFIX)/lib/pm-graph/config - install -m 644 config/standby.cfg $(DESTDIR)$(PREFIX)/lib/pm-graph/config - install -m 644 config/standby-dev.cfg $(DESTDIR)$(PREFIX)/lib/pm-graph/config - install -m 644 config/suspend-callgraph.cfg $(DESTDIR)$(PREFIX)/lib/pm-graph/config - install -m 644 config/suspend.cfg $(DESTDIR)$(PREFIX)/lib/pm-graph/config - install -m 644 config/suspend-dev.cfg $(DESTDIR)$(PREFIX)/lib/pm-graph/config - install -m 644 config/suspend-x2-proc.cfg $(DESTDIR)$(PREFIX)/lib/pm-graph/config - - install -d $(DESTDIR)$(PREFIX)/bin - ln -s ../lib/pm-graph/bootgraph.py $(DESTDIR)$(PREFIX)/bin/bootgraph - ln -s ../lib/pm-graph/sleepgraph.py $(DESTDIR)$(PREFIX)/bin/sleepgraph - - install -d $(DESTDIR)$(PREFIX)/share/man/man8 - install bootgraph.8 $(DESTDIR)$(PREFIX)/share/man/man8 - install sleepgraph.8 $(DESTDIR)$(PREFIX)/share/man/man8 + $(INSTALL) -d $(DESTDIR)$(LIBDIR)/pm-graph + $(INSTALL) sleepgraph.py $(DESTDIR)$(LIBDIR)/pm-graph + $(INSTALL) bootgraph.py $(DESTDIR)$(LIBDIR)/pm-graph + $(INSTALL) -d $(DESTDIR)$(LIBDIR)/pm-graph/config + $(INSTALL_DATA) config/cgskip.txt $(DESTDIR)$(LIBDIR)/pm-graph/config + $(INSTALL_DATA) config/freeze-callgraph.cfg $(DESTDIR)$(LIBDIR)/pm-graph/config + $(INSTALL_DATA) config/freeze.cfg $(DESTDIR)$(LIBDIR)/pm-graph/config + $(INSTALL_DATA) config/freeze-dev.cfg $(DESTDIR)$(LIBDIR)/pm-graph/config + $(INSTALL_DATA) config/standby-callgraph.cfg $(DESTDIR)$(LIBDIR)/pm-graph/config + $(INSTALL_DATA) config/standby.cfg $(DESTDIR)$(LIBDIR)/pm-graph/config + $(INSTALL_DATA) config/standby-dev.cfg $(DESTDIR)$(LIBDIR)/pm-graph/config + $(INSTALL_DATA) config/suspend-callgraph.cfg $(DESTDIR)$(LIBDIR)/pm-graph/config + $(INSTALL_DATA) config/suspend.cfg $(DESTDIR)$(LIBDIR)/pm-graph/config + $(INSTALL_DATA) config/suspend-dev.cfg $(DESTDIR)$(LIBDIR)/pm-graph/config + $(INSTALL_DATA) config/suspend-x2-proc.cfg $(DESTDIR)$(LIBDIR)/pm-graph/config + + $(INSTALL) -d $(DESTDIR)$(BINDIR) + ln -s ../lib/pm-graph/bootgraph.py $(DESTDIR)$(BINDIR)/bootgraph + ln -s ../lib/pm-graph/sleepgraph.py $(DESTDIR)$(BINDIR)/sleepgraph + + $(INSTALL) -d $(DESTDIR)$(MANDIR)/man8 + $(INSTALL) bootgraph.8 $(DESTDIR)$(MANDIR)/man8 + $(INSTALL) sleepgraph.8 $(DESTDIR)$(MANDIR)/man8 uninstall : - rm -f $(DESTDIR)$(PREFIX)/share/man/man8/bootgraph.8 - rm -f $(DESTDIR)$(PREFIX)/share/man/man8/sleepgraph.8 + rm -f $(DESTDIR)$(MANDIR)/man8/bootgraph.8 + rm -f $(DESTDIR)$(MANDIR)/man8/sleepgraph.8 - rm -f 
$(DESTDIR)$(PREFIX)/bin/bootgraph - rm -f $(DESTDIR)$(PREFIX)/bin/sleepgraph + rm -f $(DESTDIR)$(BINDIR)/bootgraph + rm -f $(DESTDIR)$(BINDIR)/sleepgraph - rm -f $(DESTDIR)$(PREFIX)/lib/pm-graph/config/* - if [ -d $(DESTDIR)$(PREFIX)/lib/pm-graph/config ] ; then \ - rmdir $(DESTDIR)$(PREFIX)/lib/pm-graph/config; \ + rm -f $(DESTDIR)$(LIBDIR)/pm-graph/config/* + if [ -d $(DESTDIR)$(LIBDIR)/pm-graph/config ] ; then \ + rmdir $(DESTDIR)$(LIBDIR)/pm-graph/config; \ fi; - rm -f $(DESTDIR)$(PREFIX)/lib/pm-graph/__pycache__/* - if [ -d $(DESTDIR)$(PREFIX)/lib/pm-graph/__pycache__ ] ; then \ - rmdir $(DESTDIR)$(PREFIX)/lib/pm-graph/__pycache__; \ + rm -f $(DESTDIR)$(LIBDIR)/pm-graph/__pycache__/* + if [ -d $(DESTDIR)$(LIBDIR)/pm-graph/__pycache__ ] ; then \ + rmdir $(DESTDIR)$(LIBDIR)/pm-graph/__pycache__; \ fi; - rm -f $(DESTDIR)$(PREFIX)/lib/pm-graph/* - if [ -d $(DESTDIR)$(PREFIX)/lib/pm-graph ] ; then \ - rmdir $(DESTDIR)$(PREFIX)/lib/pm-graph; \ + rm -f $(DESTDIR)$(LIBDIR)/pm-graph/* + if [ -d $(DESTDIR)$(LIBDIR)/pm-graph ] ; then \ + rmdir $(DESTDIR)$(LIBDIR)/pm-graph; \ fi; + +help: + @echo 'Building targets:' + @echo ' all - Nothing to build' + @echo ' install - Install the program and create necessary directories' + @echo ' uninstall - Remove installed files and directories' + +.PHONY: all install uninstall help -- cgit v1.2.3 From 70b27c756f95156f51b1e28ec7d1b16a539d5487 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 3 Sep 2024 22:06:04 -0700 Subject: perf parse-events: Add default_breakpoint_len helper The default breakpoint length is "sizeof(long)"; however, this is incorrect on platforms like AArch64, where sizeof(long) is 8 but the breakpoint length is 4. Add a helper function that can be used to determine the correct breakpoint length; in this change it just returns the existing default sizeof(long) value. Use the helper in the bp_account test so that, when modifying the event from a watchpoint to a breakpoint, the breakpoint length is appropriate for the architecture and not just sizeof(long). Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Chaitanya S Prakash Cc: Colin Ian King Cc: Dominique Martinet Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Junhao He Cc: Kan Liang Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Yang Jihong Link: https://lore.kernel.org/r/20240904050606.752788-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/bp_account.c | 4 +++- tools/perf/tests/bp_signal.c | 3 ++- tools/perf/tests/bp_signal_overflow.c | 3 ++- tools/perf/tests/parse-events.c | 3 ++- tools/perf/util/parse-events.c | 7 ++++++- tools/perf/util/parse-events.h | 2 ++ 6 files changed, 17 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/bp_account.c b/tools/perf/tests/bp_account.c index 6f921db33cf9..4cb7d486b5c1 100644 --- a/tools/perf/tests/bp_account.c +++ b/tools/perf/tests/bp_account.c @@ -16,6 +16,7 @@ #include "tests.h" #include "debug.h" #include "event.h" +#include "parse-events.h" #include "../perf-sys.h" #include "cloexec.h" @@ -50,7 +51,7 @@ static int __event(bool is_x, void *addr, struct perf_event_attr *attr) attr->config = 0; attr->bp_type = is_x ? HW_BREAKPOINT_X : HW_BREAKPOINT_W; attr->bp_addr = (unsigned long) addr; - attr->bp_len = sizeof(long); + attr->bp_len = is_x ?
default_breakpoint_len() : sizeof(long); attr->sample_period = 1; attr->sample_type = PERF_SAMPLE_IP; @@ -92,6 +93,7 @@ static int bp_accounting(int wp_cnt, int share) attr_mod = attr; attr_mod.bp_type = HW_BREAKPOINT_X; attr_mod.bp_addr = (unsigned long) test_function; + attr_mod.bp_len = default_breakpoint_len(); ret = ioctl(fd[0], PERF_EVENT_IOC_MODIFY_ATTRIBUTES, &attr_mod); TEST_ASSERT_VAL("failed to modify wp\n", ret == 0); diff --git a/tools/perf/tests/bp_signal.c b/tools/perf/tests/bp_signal.c index 1f2908f02389..3faeb5b6fe0b 100644 --- a/tools/perf/tests/bp_signal.c +++ b/tools/perf/tests/bp_signal.c @@ -26,6 +26,7 @@ #include "tests.h" #include "debug.h" #include "event.h" +#include "parse-events.h" #include "perf-sys.h" #include "cloexec.h" @@ -111,7 +112,7 @@ static int __event(bool is_x, void *addr, int sig) pe.config = 0; pe.bp_type = is_x ? HW_BREAKPOINT_X : HW_BREAKPOINT_W; pe.bp_addr = (unsigned long) addr; - pe.bp_len = sizeof(long); + pe.bp_len = is_x ? default_breakpoint_len() : sizeof(long); pe.sample_period = 1; pe.sample_type = PERF_SAMPLE_IP; diff --git a/tools/perf/tests/bp_signal_overflow.c b/tools/perf/tests/bp_signal_overflow.c index 4e897c2cf26b..ee560e156be6 100644 --- a/tools/perf/tests/bp_signal_overflow.c +++ b/tools/perf/tests/bp_signal_overflow.c @@ -25,6 +25,7 @@ #include "tests.h" #include "debug.h" #include "event.h" +#include "parse-events.h" #include "../perf-sys.h" #include "cloexec.h" @@ -88,7 +89,7 @@ static int test__bp_signal_overflow(struct test_suite *test __maybe_unused, int pe.config = 0; pe.bp_type = HW_BREAKPOINT_X; pe.bp_addr = (unsigned long) test_function; - pe.bp_len = sizeof(long); + pe.bp_len = default_breakpoint_len(); pe.sample_period = THRESHOLD; pe.sample_type = PERF_SAMPLE_IP; diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index edc2adcf1bae..639e65a9bf61 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -5,6 +5,7 @@ #include #include "tests.h" #include "debug.h" +#include "parse-events.h" #include "pmu.h" #include "pmus.h" #include @@ -262,7 +263,7 @@ static int test__checkevent_breakpoint_x(struct evlist *evlist) TEST_ASSERT_VAL("wrong config", test_config(evsel, 0)); TEST_ASSERT_VAL("wrong bp_type", HW_BREAKPOINT_X == evsel->core.attr.bp_type); - TEST_ASSERT_VAL("wrong bp_len", sizeof(long) == evsel->core.attr.bp_len); + TEST_ASSERT_VAL("wrong bp_len", default_breakpoint_len() == evsel->core.attr.bp_len); return TEST_OK; } diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 9c0ce01684c3..8c837c8a6c8c 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -671,6 +671,11 @@ static int add_tracepoint_multi_sys(struct parse_events_state *parse_state, } #endif /* HAVE_LIBTRACEEVENT */ +size_t default_breakpoint_len(void) +{ + return sizeof(long); +} + static int parse_breakpoint_type(const char *type, struct perf_event_attr *attr) { @@ -729,7 +734,7 @@ int parse_events_add_breakpoint(struct parse_events_state *parse_state, /* Provide some defaults if len is not specified */ if (!len) { if (attr.bp_type == HW_BREAKPOINT_X) - len = sizeof(long); + len = default_breakpoint_len(); else len = HW_BREAKPOINT_LEN_4; } diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index b735cd9e0acf..68bfea9ffa70 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -286,4 +286,6 @@ static inline bool is_sdt_event(char *str __maybe_unused) } #endif /* HAVE_LIBELF_SUPPORT 
*/ +size_t default_breakpoint_len(void); + #endif /* __PERF_PARSE_EVENTS_H */ -- cgit v1.2.3 From fa6cc3f932584a5fb5eedd2261fdf0058e1d8828 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 3 Sep 2024 22:06:05 -0700 Subject: perf parse-events: Vary default_breakpoint_len on i386 and arm64 On arm64 the breakpoint length should be 4 bytes, but 8 bytes is tolerated as perf passes that as sizeof(long). Just pass the correct value. On i386 the sizeof(long) check in the kernel needs to match the kernel's long size. Check via a perf_env (which does uname checks) whether 4 or 8 bytes needs to be passed. Cache the value in a static. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Chaitanya S Prakash Cc: Colin Ian King Cc: Dominique Martinet Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Junhao He Cc: Kan Liang Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Yang Jihong Link: https://lore.kernel.org/r/20240904050606.752788-6-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 8c837c8a6c8c..f5eb1af8302c 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -8,6 +8,7 @@ #include #include #include "term.h" +#include "env.h" #include "evlist.h" #include "evsel.h" #include @@ -673,7 +674,22 @@ static int add_tracepoint_multi_sys(struct parse_events_state *parse_state, size_t default_breakpoint_len(void) { +#if defined(__i386__) + static int len; + + if (len == 0) { + struct perf_env env = {}; + + perf_env__init(&env); + len = perf_env__kernel_is_64_bit(&env) ? sizeof(u64) : sizeof(long); + perf_env__exit(&env); + } + return len; +#elif defined(__aarch64__) + return 4; +#else return sizeof(long); +#endif } static int -- cgit v1.2.3 From 1a5efc9e13f357abc396dbf445b25d08914c8060 Mon Sep 17 00:00:00 2001 From: Aditya Gupta Date: Wed, 4 Sep 2024 11:48:30 +0530 Subject: libsubcmd: Don't free the usage string Currently, commands which depend on 'parse_options_subcommand()' don't show the usage string, and instead show '(null)' $ ./perf sched Usage: (null) -D, --dump-raw-trace dump raw trace in ASCII -f, --force don't complain, do it -i, --input input file name -v, --verbose be more verbose (show symbol address, etc) 'parse_options_subcommand()' is generally expected to initialise the usage string, with information in the passed 'subcommands[]' array This behaviour was changed in: 230a7a71f92212e7 ("libsubcmd: Fix parse-options memory leak") Where the generated usage string is deallocated, and usage[0] string is reassigned as NULL. As discussed in [1], free the allocated usage string in the main function itself, and don't reset usage string to NULL in parse_options_subcommand With this change, the behaviour is restored.
$ ./perf sched Usage: perf sched [] {record|latency|map|replay|script|timehist} -D, --dump-raw-trace dump raw trace in ASCII -f, --force don't complain, do it -i, --input input file name -v, --verbose be more verbose (show symbol address, etc) [1]: https://lore.kernel.org/linux-perf-users/htq5vhx6piet4nuq2mmhk7fs2bhfykv52dbppwxmo3s7du2odf@styd27tioc6e/ Fixes: 230a7a71f92212e7 ("libsubcmd: Fix parse-options memory leak") Suggested-by: Namhyung Kim Signed-off-by: Aditya Gupta Acked-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Athira Rajeev Cc: Disha Goel Cc: Ian Rogers Cc: Jiri Olsa Cc: Kajol Jain Cc: Madhavan Srinivasan Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240904061836.55873-2-adityag@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/subcmd/parse-options.c | 8 +++----- tools/perf/builtin-kmem.c | 2 ++ tools/perf/builtin-kvm.c | 3 +++ tools/perf/builtin-kwork.c | 3 +++ tools/perf/builtin-lock.c | 3 +++ tools/perf/builtin-mem.c | 3 +++ tools/perf/builtin-sched.c | 3 +++ 7 files changed, 20 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/lib/subcmd/parse-options.c b/tools/lib/subcmd/parse-options.c index 4b60ec03b0bb..eb896d30545b 100644 --- a/tools/lib/subcmd/parse-options.c +++ b/tools/lib/subcmd/parse-options.c @@ -633,10 +633,11 @@ int parse_options_subcommand(int argc, const char **argv, const struct option *o const char *const subcommands[], const char *usagestr[], int flags) { struct parse_opt_ctx_t ctx; - char *buf = NULL; /* build usage string if it's not provided */ if (subcommands && !usagestr[0]) { + char *buf = NULL; + astrcatf(&buf, "%s %s [] {", subcmd_config.exec_name, argv[0]); for (int i = 0; subcommands[i]; i++) { @@ -678,10 +679,7 @@ int parse_options_subcommand(int argc, const char **argv, const struct option *o astrcatf(&error_buf, "unknown switch `%c'", *ctx.opt); usage_with_options(usagestr, options); } - if (buf) { - usagestr[0] = NULL; - free(buf); - } + return parse_options_end(&ctx); } diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index b3cbac40b8c7..a756147e2eec 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -2057,6 +2057,8 @@ int cmd_kmem(int argc, const char **argv) out_delete: perf_session__delete(session); + /* free usage string allocated by parse_options_subcommand */ + free((void *)kmem_usage[0]); return ret; } diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index 692267b1b7e8..55ea17c5ff02 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -2184,5 +2184,8 @@ int cmd_kvm(int argc, const char **argv) else usage_with_options(kvm_usage, kvm_options); + /* free usage string allocated by parse_options_subcommand */ + free((void *)kvm_usage[0]); + return 0; } diff --git a/tools/perf/builtin-kwork.c b/tools/perf/builtin-kwork.c index 6a4281b8fd10..c1daf82c9b92 100644 --- a/tools/perf/builtin-kwork.c +++ b/tools/perf/builtin-kwork.c @@ -2519,5 +2519,8 @@ int cmd_kwork(int argc, const char **argv) } else usage_with_options(kwork_usage, kwork_options); + /* free usage string allocated by parse_options_subcommand */ + free((void *)kwork_usage[0]); + return 0; } diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index 2c216427e929..062e2b56a2ab 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -2712,6 +2712,9 @@ int cmd_lock(int argc, const char **argv) usage_with_options(lock_usage, lock_options); } + /* free usage string allocated by parse_options_subcommand */ + free((void 
*)lock_usage[0]); + zfree(&lockhash_table); return rc; } diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index 18e5f9a0ddc2..bb38bb5a1c26 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -546,5 +546,8 @@ int cmd_mem(int argc, const char **argv) else usage_with_options(mem_usage, mem_options); + /* free usage string allocated by parse_options_subcommand */ + free((void *)mem_usage[0]); + return 0; } diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 0a7b2b2acd56..5981cc51abc8 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -3954,5 +3954,8 @@ int cmd_sched(int argc, const char **argv) usage_with_options(sched_usage, sched_options); } + /* free usage string allocated by parse_options_subcommand */ + free((void *)sched_usage[0]); + return 0; } -- cgit v1.2.3 From 98ad0b77323ce8c216e5a505fda6b2ea53299231 Mon Sep 17 00:00:00 2001 From: Aditya Gupta Date: Wed, 4 Sep 2024 11:48:31 +0530 Subject: perf check: Introduce 'check' subcommand Currently the presence of a feature is checked with a combination of perf version --build-options and greps, such as: perf version --build-options | grep " on .* HAVE_FEATURE" Instead of this, introduce a subcommand "perf check feature", with which scripts can test for presence of a feature, such as: perf check feature HAVE_FEATURE The 'perf check feature' command is expected to have an exit status of 0 if the feature is built-in, and 1 if it's not built-in or if the feature is not known. Multiple features can also be passed as a comma-separated list, in which case the exit status will be 0 only if all of the passed features are built-in. For example, the below command will have an exit status of 0 only if both libtraceevent and bpf are enabled, and 1 in all other cases: perf check feature libtraceevent,bpf The arguments are case-insensitive.
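To make that contract concrete outside of shell, here is a minimal, hypothetical C caller of the new subcommand (not part of the patch; it assumes a perf binary with the 'check' subcommand is on PATH and relies only on the documented exit status):

#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>

int main(void)
{
	/* -q suppresses the status lines; only the exit code matters */
	int status = system("perf check -q feature libtraceevent,bpf");

	if (status == -1 || !WIFEXITED(status)) {
		perror("system");
		return 2;
	}
	/* 0: all listed features built-in, 1: at least one missing or unknown */
	printf("all built-in: %s\n", WEXITSTATUS(status) == 0 ? "yes" : "no");
	return 0;
}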
An array 'supported_features' has also been introduced that can be used by other commands like 'perf version --build-options', so that new features can be added in one place, with the array Committer testing: $ perf check feature libtraceevent,bpf libtraceevent: [ on ] # HAVE_LIBTRACEEVENT bpf: [ on ] # HAVE_LIBBPF_SUPPORT $ perf check feature libtraceevent libtraceevent: [ on ] # HAVE_LIBTRACEEVENT $ perf check feature bpf bpf: [ on ] # HAVE_LIBBPF_SUPPORT $ perf check -q feature bpf && echo "BPF support is present" BPF support is present $ perf check -q feature Bogus && echo "Bogus support is present" $ Reviewed-by: Athira Rajeev Signed-off-by: Aditya Gupta Tested-by: Arnaldo Carvalho de Melo Cc: Athira Rajeev Cc: Disha Goel Cc: Ian Rogers Cc: Jiri Olsa Cc: Kajol Jain Cc: Madhavan Srinivasan Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240904061836.55873-3-adityag@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Build | 1 + tools/perf/Documentation/perf-check.txt | 82 +++++++++++++++ tools/perf/builtin-check.c | 180 ++++++++++++++++++++++++++++++++ tools/perf/builtin.h | 17 +++ tools/perf/perf.c | 1 + 5 files changed, 281 insertions(+) create mode 100644 tools/perf/Documentation/perf-check.txt create mode 100644 tools/perf/builtin-check.c (limited to 'tools') diff --git a/tools/perf/Build b/tools/perf/Build index 1d4957957d75..3e486f7df94b 100644 --- a/tools/perf/Build +++ b/tools/perf/Build @@ -1,5 +1,6 @@ perf-bench-y += builtin-bench.o perf-y += builtin-annotate.o +perf-y += builtin-check.o perf-y += builtin-config.o perf-y += builtin-diff.o perf-y += builtin-evlist.o diff --git a/tools/perf/Documentation/perf-check.txt b/tools/perf/Documentation/perf-check.txt new file mode 100644 index 000000000000..b081514d240a --- /dev/null +++ b/tools/perf/Documentation/perf-check.txt @@ -0,0 +1,82 @@ +perf-check(1) +=============== + +NAME +---- +perf-check - check if features are present in perf + +SYNOPSIS +-------- +[verse] +'perf check' [] +'perf check' {feature } [] + +DESCRIPTION +----------- +With no subcommands given, 'perf check' command just prints the command +usage on the standard output. + +If the subcommand 'feature' is used, then status of feature is printed +on the standard output (unless '-q' is also passed), ie. whether it is +compiled-in/built-in or not. +Also, 'perf check feature' returns with exit status 0 if the feature +is built-in, otherwise returns with exit status 1. + +SUBCOMMANDS +----------- + +feature:: + + Print whether feature(s) is compiled-in or not, and also returns with an + exit status of 0, if passed feature(s) are compiled-in, else 1. + + It expects a feature list as an argument. There can be a single feature + name/macro, or multiple features can also be passed as a comma-separated + list, in which case the exit status will be 0 only if all of the passed + features are compiled-in. + + The feature names/macros are case-insensitive. 
+ + Example Usage: + perf check feature libtraceevent + perf check feature HAVE_LIBTRACEEVENT + perf check feature libtraceevent,bpf + + Supported feature names/macro: + aio / HAVE_AIO_SUPPORT + bpf / HAVE_LIBBPF_SUPPORT + bpf_skeletons / HAVE_BPF_SKEL + debuginfod / HAVE_DEBUGINFOD_SUPPORT + dwarf / HAVE_DWARF_SUPPORT + dwarf_getlocations / HAVE_DWARF_GETLOCATIONS_SUPPORT + dwarf-unwind-support / HAVE_DWARF_UNWIND_SUPPORT + get_cpuid / HAVE_AUXTRACE_SUPPORT + libaudit / HAVE_LIBAUDIT_SUPPORT + libbfd / HAVE_LIBBFD_SUPPORT + libcapstone / HAVE_LIBCAPSTONE_SUPPORT + libcrypto / HAVE_LIBCRYPTO_SUPPORT + libdw-dwarf-unwind / HAVE_DWARF_SUPPORT + libelf / HAVE_LIBELF_SUPPORT + libnuma / HAVE_LIBNUMA_SUPPORT + libopencsd / HAVE_CSTRACE_SUPPORT + libperl / HAVE_LIBPERL_SUPPORT + libpfm4 / HAVE_LIBPFM + libpython / HAVE_LIBPYTHON_SUPPORT + libslang / HAVE_SLANG_SUPPORT + libtraceevent / HAVE_LIBTRACEEVENT + libunwind / HAVE_LIBUNWIND_SUPPORT + lzma / HAVE_LZMA_SUPPORT + numa_num_possible_cpus / HAVE_LIBNUMA_SUPPORT + syscall_table / HAVE_SYSCALL_TABLE_SUPPORT + zlib / HAVE_ZLIB_SUPPORT + zstd / HAVE_ZSTD_SUPPORT + +OPTIONS +------- +-q:: +--quiet:: + Do not print any messages or warnings + + This can be used along with subcommands such as 'perf check feature' + to hide unnecessary output in test scripts, eg. + 'perf check feature --quiet libtraceevent' diff --git a/tools/perf/builtin-check.c b/tools/perf/builtin-check.c new file mode 100644 index 000000000000..16a04b5456d9 --- /dev/null +++ b/tools/perf/builtin-check.c @@ -0,0 +1,180 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "builtin.h" +#include "color.h" +#include "util/debug.h" +#include "util/header.h" +#include +#include +#include +#include +#include + +static const char * const check_subcommands[] = { "feature", NULL }; +static struct option check_options[] = { + OPT_BOOLEAN('q', "quiet", &quiet, "do not show any warnings or messages"), + OPT_END() +}; +static struct option check_feature_options[] = { OPT_PARENT(check_options) }; + +static const char *check_usage[] = { NULL, NULL }; +static const char *check_feature_usage[] = { + "perf check feature ", + NULL +}; + +struct feature_status supported_features[] = { + FEATURE_STATUS("aio", HAVE_AIO_SUPPORT), + FEATURE_STATUS("bpf", HAVE_LIBBPF_SUPPORT), + FEATURE_STATUS("bpf_skeletons", HAVE_BPF_SKEL), + FEATURE_STATUS("debuginfod", HAVE_DEBUGINFOD_SUPPORT), + FEATURE_STATUS("dwarf", HAVE_DWARF_SUPPORT), + FEATURE_STATUS("dwarf_getlocations", HAVE_DWARF_GETLOCATIONS_SUPPORT), + FEATURE_STATUS("dwarf-unwind-support", HAVE_DWARF_UNWIND_SUPPORT), + FEATURE_STATUS("get_cpuid", HAVE_AUXTRACE_SUPPORT), + FEATURE_STATUS("libaudit", HAVE_LIBAUDIT_SUPPORT), + FEATURE_STATUS("libbfd", HAVE_LIBBFD_SUPPORT), + FEATURE_STATUS("libcapstone", HAVE_LIBCAPSTONE_SUPPORT), + FEATURE_STATUS("libcrypto", HAVE_LIBCRYPTO_SUPPORT), + FEATURE_STATUS("libdw-dwarf-unwind", HAVE_DWARF_SUPPORT), + FEATURE_STATUS("libelf", HAVE_LIBELF_SUPPORT), + FEATURE_STATUS("libnuma", HAVE_LIBNUMA_SUPPORT), + FEATURE_STATUS("libopencsd", HAVE_CSTRACE_SUPPORT), + FEATURE_STATUS("libperl", HAVE_LIBPERL_SUPPORT), + FEATURE_STATUS("libpfm4", HAVE_LIBPFM), + FEATURE_STATUS("libpython", HAVE_LIBPYTHON_SUPPORT), + FEATURE_STATUS("libslang", HAVE_SLANG_SUPPORT), + FEATURE_STATUS("libtraceevent", HAVE_LIBTRACEEVENT), + FEATURE_STATUS("libunwind", HAVE_LIBUNWIND_SUPPORT), + FEATURE_STATUS("lzma", HAVE_LZMA_SUPPORT), + FEATURE_STATUS("numa_num_possible_cpus", HAVE_LIBNUMA_SUPPORT), + FEATURE_STATUS("syscall_table", 
HAVE_SYSCALL_TABLE_SUPPORT), + FEATURE_STATUS("zlib", HAVE_ZLIB_SUPPORT), + FEATURE_STATUS("zstd", HAVE_ZSTD_SUPPORT), + + /* this should remain at end, to know the array end */ + FEATURE_STATUS(NULL, _) +}; + +static void on_off_print(const char *status) +{ + printf("[ "); + + if (!strcmp(status, "OFF")) + color_fprintf(stdout, PERF_COLOR_RED, "%-3s", status); + else + color_fprintf(stdout, PERF_COLOR_GREEN, "%-3s", status); + + printf(" ]"); +} + +/* Helper function to print status of a feature along with name/macro */ +static void status_print(const char *name, const char *macro, + const char *status) +{ + printf("%22s: ", name); + on_off_print(status); + printf(" # %s\n", macro); +} + +#define STATUS(feature) \ +do { \ + if (feature.is_builtin) \ + status_print(feature.name, feature.macro, "on"); \ + else \ + status_print(feature.name, feature.macro, "OFF"); \ +} while (0) + +/** + * check whether "feature" is built-in with perf + * + * returns: + * 0: NOT built-in or Feature not known + * 1: Built-in + */ +static int has_support(const char *feature) +{ + for (int i = 0; supported_features[i].name; ++i) { + if ((strcasecmp(feature, supported_features[i].name) == 0) || + (strcasecmp(feature, supported_features[i].macro) == 0)) { + if (!quiet) + STATUS(supported_features[i]); + return supported_features[i].is_builtin; + } + } + + if (!quiet) + pr_err("Unknown feature '%s', please use 'perf version --build-options' to see which ones are available.\n", feature); + + return 0; +} + + +/** + * Usage: 'perf check feature ' + * + * can be a single feature name/macro, or a comma-separated list + * of feature names/macros + * eg. argument can be "libtraceevent" or "libtraceevent,bpf" etc + * + * In case of a comma-separated list, feature_enabled will be 1, only if + * all features passed in the string are supported + * + * Note that argv will get modified + */ +static int subcommand_feature(int argc, const char **argv) +{ + char *feature_list; + char *feature_name; + int feature_enabled; + + argc = parse_options(argc, argv, check_feature_options, + check_feature_usage, 0); + + if (!argc) + usage_with_options(check_feature_usage, check_feature_options); + + if (argc > 1) { + pr_err("Too many arguments passed to 'perf check feature'\n"); + return -1; + } + + feature_enabled = 1; + /* feature_list is a non-const copy of 'argv[0]' */ + feature_list = strdup(argv[0]); + if (!feature_list) { + pr_err("ERROR: failed to allocate memory for feature list\n"); + return -1; + } + + feature_name = strtok(feature_list, ","); + + while (feature_name) { + feature_enabled &= has_support(feature_name); + feature_name = strtok(NULL, ","); + } + + free(feature_list); + + return !feature_enabled; +} + +int cmd_check(int argc, const char **argv) +{ + argc = parse_options_subcommand(argc, argv, check_options, + check_subcommands, check_usage, 0); + + if (!argc) + usage_with_options(check_usage, check_options); + + if (strcmp(argv[0], "feature") == 0) + return subcommand_feature(argc, argv); + + /* If no subcommand matched above, print usage help */ + pr_err("Unknown subcommand: %s\n", argv[0]); + usage_with_options(check_usage, check_options); + + /* free usage string allocated by parse_options_subcommand */ + free((void *)check_usage[0]); + + return 0; +} diff --git a/tools/perf/builtin.h b/tools/perf/builtin.h index f4375deabfa3..94f4b3769bf7 100644 --- a/tools/perf/builtin.h +++ b/tools/perf/builtin.h @@ -2,6 +2,22 @@ #ifndef BUILTIN_H #define BUILTIN_H +#include +#include +#include + +struct feature_status { + 
const char *name; + const char *macro; + int is_builtin; +}; + +#define FEATURE_STATUS(name_, macro_) { \ + .name = name_, \ + .macro = #macro_, \ + .is_builtin = IS_BUILTIN(macro_) } + +extern struct feature_status supported_features[]; struct cmdnames; void list_common_cmds_help(void); @@ -11,6 +27,7 @@ int cmd_annotate(int argc, const char **argv); int cmd_bench(int argc, const char **argv); int cmd_buildid_cache(int argc, const char **argv); int cmd_buildid_list(int argc, const char **argv); +int cmd_check(int argc, const char **argv); int cmd_config(int argc, const char **argv); int cmd_c2c(int argc, const char **argv); int cmd_diff(int argc, const char **argv); diff --git a/tools/perf/perf.c b/tools/perf/perf.c index bd3f80b5bb46..4def800f4089 100644 --- a/tools/perf/perf.c +++ b/tools/perf/perf.c @@ -52,6 +52,7 @@ static struct cmd_struct commands[] = { { "archive", NULL, 0 }, { "buildid-cache", cmd_buildid_cache, 0 }, { "buildid-list", cmd_buildid_list, 0 }, + { "check", cmd_check, 0 }, { "config", cmd_config, 0 }, { "c2c", cmd_c2c, 0 }, { "diff", cmd_diff, 0 }, -- cgit v1.2.3 From 9b2b9b66d532197d08efa1eba157fc484c3657bc Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 3 Sep 2024 21:43:51 -0700 Subject: perf jevents: Add cpuid to model lookup command When restricting jevents generated json lookup code with JEVENTS_MODEL a list of models must be provided. Some builds don't know model names but know cpuids. Add a command that can convert a cpuid to a model using mapfile.csv files. This can be used with JEVENTS_MODEL like: $ make JEVENTS_MODEL=`./pmu-events/models.py x86 'GenuineIntel-6-8D-1,AuthenticAMD-26-1' pmu-events/arch/` Committer testing: $ tools/perf/pmu-events/models.py x86 'GenuineIntel-6-8D-1,AuthenticAMD-26-1' tools/perf/pmu-events/arch/ tigerlake,amdzen5 $ perf stat -v sleep 1 |& head -1 Using CPUID GenuineIntel-6-B7-1 $ tools/perf/pmu-events/models.py x86 'GenuineIntel-6-B7-1' tools/perf/pmu-events/arch/ alderlake $ Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240904044351.712080-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/models.py | 73 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100755 tools/perf/pmu-events/models.py (limited to 'tools') diff --git a/tools/perf/pmu-events/models.py b/tools/perf/pmu-events/models.py new file mode 100755 index 000000000000..8f727d29c952 --- /dev/null +++ b/tools/perf/pmu-events/models.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +"""List model names from mapfile.csv files.""" +import argparse +import csv +import os +import re +from typing import List + +def main() -> None: + def dir_path(path: str) -> str: + """Validate path is a directory for argparse.""" + if os.path.isdir(path): + return path + raise argparse.ArgumentTypeError(f'\'{path}\' is not a valid directory') + + def find_archs(start_dir: str, arch: str) -> List[str]: + archs = [] + for item in os.scandir(start_dir): + if not item.is_dir(): + continue + if arch in (item.name, 'all'): + archs.append(item.name) + + if len(archs) < 1: + raise IOError(f'Missing architecture directory \'{arch}\'') + + return archs + + def find_mapfiles(start_dir: str, archs: List[str]) -> List[str]: + result = [] + for arch in archs: + for item in 
os.scandir(f'{start_dir}/{arch}'): + if item.is_dir(): + continue + if item.name == 'mapfile.csv': + result.append(f'{start_dir}/{arch}/mapfile.csv') + return result + + def find_cpuids(mapfiles: List[str], cpuids: str) -> List[str]: + result = [] + for mapfile in mapfiles: + with open(mapfile, encoding='utf-8') as csvfile: + first = False + table = csv.reader(csvfile) + for row in table: + if not first or len(row) == 0 or row[0].startswith('#'): + first = True + continue + # Python regular expressions don't handle xdigit. + regex = row[0].replace('[[:xdigit:]]', '[0-9a-fA-F]') + for cpuid in cpuids.split(','): + if re.match(regex, cpuid): + result.append(row[2]) + return result + + ap = argparse.ArgumentParser() + ap.add_argument('arch', help='Architecture name like x86') + ap.add_argument('cpuid', default='all', help='List of cpuids to convert to model names') + ap.add_argument( + 'starting_dir', + type=dir_path, + help='Root of tree containing architecture directories containing json files' + ) + args = ap.parse_args() + + archs = find_archs(args.starting_dir, args.arch) + mapfiles = find_mapfiles(args.starting_dir, archs) + models = find_cpuids(mapfiles, args.cpuid) + print(','.join(models)) + +if __name__ == '__main__': + main() -- cgit v1.2.3 From dce35dd27684640508ff330b8235c4671159743e Mon Sep 17 00:00:00 2001 From: zhang jiao Date: Wed, 4 Sep 2024 15:35:50 +0800 Subject: spi: spidev_fdx: Fix the wrong format specifier The unsigned int should use "%u" instead of "%d". Signed-off-by: zhang jiao Link: https://patch.msgid.link/20240904073550.103618-1-zhangjiao2@cmss.chinamobile.com Signed-off-by: Mark Brown --- tools/spi/spidev_fdx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/spi/spidev_fdx.c b/tools/spi/spidev_fdx.c index 7d2a867cd4ae..bc9c4f6c3ba8 100644 --- a/tools/spi/spidev_fdx.c +++ b/tools/spi/spidev_fdx.c @@ -99,7 +99,7 @@ static void dumpstat(const char *name, int fd) return; } - printf("%s: spi mode 0x%x, %d bits %sper word, %d Hz max\n", + printf("%s: spi mode 0x%x, %d bits %sper word, %u Hz max\n", name, mode, bits, lsb ? "(lsb first) " : "", speed); } -- cgit v1.2.3 From 02baa0a2a677cf543899bc3eb43ed92caf4aba7a Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Tue, 3 Sep 2024 09:28:39 +0800 Subject: selftests/bpf: Fix procmap_query()'s params mismatch and compilation warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When PROCMAP_QUERY is not defined, a compilation error occurs due to the mismatch of procmap_query()'s params; procmap_query() is only called in the file where the function is defined, so modify the params so they match. We get a warning when building samples/bpf: trace_helpers.c:252:5: warning: no previous prototype for ‘procmap_query’ [-Wmissing-prototypes] 252 | int procmap_query(int fd, const void *addr, __u32 query_flags, size_t *start, size_t *offset, int *flags) | ^~~~~~~~~~~~~ As this function is only used in the file, mark it as 'static'.
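The convention the fix restores can be shown in isolation. A sketch with hypothetical names (HAVE_FANCY_QUERY and fancy_query() are illustrations, not the selftest code): both branches of the #ifdef define the same static signature, and the fallback stub returns -EOPNOTSUPP so callers compile unchanged either way:

#include <errno.h>
#include <stddef.h>

#ifdef HAVE_FANCY_QUERY	/* hypothetical feature macro */
static int fancy_query(int fd, const void *addr, unsigned int query_flags,
		       size_t *start)
{
	/* a real implementation would issue an ioctl here */
	(void)fd; (void)addr; (void)query_flags;
	*start = 0;
	return 0;
}
#else
/* fallback stub: the parameter list is kept identical to the real variant */
static int fancy_query(int fd, const void *addr, unsigned int query_flags,
		       size_t *start)
{
	(void)fd; (void)addr; (void)query_flags; (void)start;
	return -EOPNOTSUPP;
}
#endif

int main(void)
{
	size_t start;
	int ret = fancy_query(-1, NULL, 0, &start);

	return ret == -EOPNOTSUPP ? 0 : ret;
}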
Fixes: 4e9e07603ecd ("selftests/bpf: make use of PROCMAP_QUERY ioctl if available") Signed-off-by: Yuan Chen Link: https://lore.kernel.org/r/20240903012839.3178-1-chenyuan_fl@163.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/trace_helpers.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index 1bfd881c0e07..2d742fdac6b9 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -249,7 +249,7 @@ out: #ifdef PROCMAP_QUERY int env_verbosity __weak = 0; -int procmap_query(int fd, const void *addr, __u32 query_flags, size_t *start, size_t *offset, int *flags) +static int procmap_query(int fd, const void *addr, __u32 query_flags, size_t *start, size_t *offset, int *flags) { char path_buf[PATH_MAX], build_id_buf[20]; struct procmap_query q; @@ -293,7 +293,7 @@ int procmap_query(int fd, const void *addr, __u32 query_flags, size_t *start, si return 0; } #else -int procmap_query(int fd, const void *addr, size_t *start, size_t *offset, int *flags) +static int procmap_query(int fd, const void *addr, __u32 query_flags, size_t *start, size_t *offset, int *flags) { return -EOPNOTSUPP; } -- cgit v1.2.3 From d441734d0cfcebc5780bc1880d871f92debb588a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 19 Aug 2024 17:20:28 -0400 Subject: ktest.pl: Always warn on build warnings If a warning happens at build, give a warning at the end: Build time: 1 minute 40 seconds Install time: 17 seconds Reboot time: 25 seconds *** WARNING found in build: 1 *** ******************************************* ******************************************* KTEST RESULT: TEST 1 SUCCESS!!!! ** ******************************************* ******************************************* This way, even if the test isn't made to fail on warnings during the build, a message is still displayed that warnings were found. Link: https://lore.kernel.org/<20240819172028.3a7fae09@gandalf.local.home> Acked-by: John 'Warthog9' Hawley (Tenstorrent) Signed-off-by: Steven Rostedt --- tools/testing/ktest/ktest.pl | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl index eb31cd9c977b..c82b8d55dddb 100755 --- a/tools/testing/ktest/ktest.pl +++ b/tools/testing/ktest/ktest.pl @@ -222,6 +222,8 @@ my $install_time; my $reboot_time; my $test_time; +my $warning_found = 0; + my $pwd; my $dirname = $FindBin::Bin; @@ -729,11 +731,18 @@ sub print_times { show_time($test_time); doprint "\n"; } + if ($warning_found) { + doprint "\n*** WARNING"; + doprint "S" if ($warning_found > 1); + doprint " found in build: $warning_found ***\n\n"; + } + # reset for iterations like bisect $build_time = 0; $install_time = 0; $reboot_time = 0; $test_time = 0; + $warning_found = 0; } sub get_mandatory_configs { @@ -2460,8 +2469,6 @@ sub process_warning_line { # Returns 1 if OK # 0 otherwise sub check_buildlog { - return 1 if (!defined $warnings_file); - my %warnings_list; # Failed builds should not reboot the target @@ -2482,18 +2489,21 @@ sub check_buildlog { close(IN); } - # If warnings file didn't exist, and WARNINGS_FILE exist, - # then we fail on any warning! 
- open(IN, $buildlog) or dodie "Can't open $buildlog"; while () { if (/$check_build_re/) { my $warning = process_warning_line $_; if (!defined $warnings_list{$warning}) { - fail "New warning found (not in $warnings_file)\n$_\n"; - $no_reboot = $save_no_reboot; - return 0; + $warning_found++; + + # If warnings file didn't exist, and WARNINGS_FILE exist, + # then we fail on any warning! + if (defined $warnings_file) { + fail "New warning found (not in $warnings_file)\n$_\n"; + $no_reboot = $save_no_reboot; + return 0; + } } } } -- cgit v1.2.3 From 2351e8c65404aabc433300b6bf90c7a37e8bbc4d Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Wed, 4 Sep 2024 13:55:30 -0400 Subject: ktest.pl: Avoid false positives with grub2 skip regex Some distros have grub2 config files with the lines if [ x"${feature_menuentry_id}" = xy ]; then menuentry_id_option="--id" else menuentry_id_option="" fi which match the skip regex defined for grub2 in get_grub_index(): $skip = '^\s*menuentry'; These false positives cause the grub number to be higher than it should be, and the wrong kernel can end up booting. Grub documents the menuentry command with whitespace between it and the title, so make the skip regex reflect this. Link: https://lore.kernel.org/20240904175530.84175-1-daniel.m.jordan@oracle.com Signed-off-by: Daniel Jordan Acked-by: John 'Warthog9' Hawley (Tenstorrent) Signed-off-by: Steven Rostedt --- tools/testing/ktest/ktest.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl index c82b8d55dddb..dacad94e2be4 100755 --- a/tools/testing/ktest/ktest.pl +++ b/tools/testing/ktest/ktest.pl @@ -2056,7 +2056,7 @@ sub get_grub_index { } elsif ($reboot_type eq "grub2") { $command = "cat $grub_file"; $target = '^\s*menuentry.*' . $grub_menu_qt; - $skip = '^\s*menuentry'; + $skip = '^\s*menuentry\s'; $submenu = '^\s*submenu\s'; } elsif ($reboot_type eq "grub2bls") { $command = $grub_bls_get; -- cgit v1.2.3 From 6cdd7750de408b08c9f77ac23723d48598fbeca7 Mon Sep 17 00:00:00 2001 From: Aditya Gupta Date: Thu, 5 Sep 2024 00:31:28 +0530 Subject: perf version: Update --build-options to use 'supported_features' array Now that the feature list has been duplicated in a global 'supported_features' array, use that array instead of manually checking status of built-in features. 
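The is_builtin values in that table are computed at compile time by FEATURE_STATUS() via IS_BUILTIN(). As a stand-alone sketch of the preprocessor trick, paraphrased from the kernel's <linux/kconfig.h> (simplified; the real header also covers the =m module case via IS_MODULE()/IS_ENABLED()): a macro defined to 1 yields 1, an undefined macro yields 0:

#include <stdio.h>

#define __ARG_PLACEHOLDER_1 0,
#define __take_second_arg(__ignored, val, ...) val
#define ____is_defined(arg1_or_junk) __take_second_arg(arg1_or_junk 1, 0)
#define ___is_defined(val) ____is_defined(__ARG_PLACEHOLDER_##val)
#define __is_defined(x) ___is_defined(x)
#define IS_BUILTIN(option) __is_defined(option)

#define HAVE_EXAMPLE_SUPPORT 1	/* pretend this feature was compiled in */
/* HAVE_OTHER_SUPPORT deliberately left undefined */

int main(void)
{
	printf("example: %s\n", IS_BUILTIN(HAVE_EXAMPLE_SUPPORT) ? "on" : "OFF");
	printf("other: %s\n", IS_BUILTIN(HAVE_OTHER_SUPPORT) ? "on" : "OFF");
	return 0;
}

This is what lets a NULL-terminated {name, macro, is_builtin} table be walked at run time, as the loop in the diff below does.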
This helps in being consistent with commands such as 'perf check feature', so commands can use the same array, and any new feature can be added at one place, in the 'supported_features' array Reviewed-by: Athira Rajeev Signed-off-by: Aditya Gupta Acked-by: Namhyung Kim Cc: Disha Goel Cc: Ian Rogers Cc: Jiri Olsa Cc: Kajol Jain Cc: Madhavan Srinivasan Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240904190132.415212-4-adityag@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-version.c | 44 ++++++++------------------------------------ 1 file changed, 8 insertions(+), 36 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-version.c b/tools/perf/builtin-version.c index 4b252196de12..e149d96c6dc5 100644 --- a/tools/perf/builtin-version.c +++ b/tools/perf/builtin-version.c @@ -46,46 +46,18 @@ static void status_print(const char *name, const char *macro, printf(" # %s\n", macro); } -#define STATUS(__d, __m) \ -do { \ - if (IS_BUILTIN(__d)) \ - status_print(#__m, #__d, "on"); \ - else \ - status_print(#__m, #__d, "OFF"); \ +#define STATUS(feature) \ +do { \ + if (feature.is_builtin) \ + status_print(feature.name, feature.macro, "on"); \ + else \ + status_print(feature.name, feature.macro, "OFF"); \ } while (0) static void library_status(void) { - STATUS(HAVE_DWARF_SUPPORT, dwarf); - STATUS(HAVE_DWARF_GETLOCATIONS_SUPPORT, dwarf_getlocations); -#ifndef HAVE_SYSCALL_TABLE_SUPPORT - STATUS(HAVE_LIBAUDIT_SUPPORT, libaudit); -#endif - STATUS(HAVE_SYSCALL_TABLE_SUPPORT, syscall_table); - STATUS(HAVE_LIBBFD_SUPPORT, libbfd); - STATUS(HAVE_DEBUGINFOD_SUPPORT, debuginfod); - STATUS(HAVE_LIBELF_SUPPORT, libelf); - STATUS(HAVE_LIBLLVM_SUPPORT, libllvm); - STATUS(HAVE_LIBNUMA_SUPPORT, libnuma); - STATUS(HAVE_LIBNUMA_SUPPORT, numa_num_possible_cpus); - STATUS(HAVE_LIBPERL_SUPPORT, libperl); - STATUS(HAVE_LIBPYTHON_SUPPORT, libpython); - STATUS(HAVE_SLANG_SUPPORT, libslang); - STATUS(HAVE_LIBCRYPTO_SUPPORT, libcrypto); - STATUS(HAVE_LIBUNWIND_SUPPORT, libunwind); - STATUS(HAVE_DWARF_SUPPORT, libdw-dwarf-unwind); - STATUS(HAVE_LIBCAPSTONE_SUPPORT, libcapstone); - STATUS(HAVE_ZLIB_SUPPORT, zlib); - STATUS(HAVE_LZMA_SUPPORT, lzma); - STATUS(HAVE_AUXTRACE_SUPPORT, get_cpuid); - STATUS(HAVE_LIBBPF_SUPPORT, bpf); - STATUS(HAVE_AIO_SUPPORT, aio); - STATUS(HAVE_ZSTD_SUPPORT, zstd); - STATUS(HAVE_LIBPFM, libpfm4); - STATUS(HAVE_LIBTRACEEVENT, libtraceevent); - STATUS(HAVE_BPF_SKEL, bpf_skeletons); - STATUS(HAVE_DWARF_UNWIND_SUPPORT, dwarf-unwind-support); - STATUS(HAVE_CSTRACE_SUPPORT, libopencsd); + for (int i = 0; supported_features[i].name; ++i) + STATUS(supported_features[i]); } int cmd_version(int argc, const char **argv) -- cgit v1.2.3 From 8a028502b4124eb9e37b5216e440ebb9e187b0a1 Mon Sep 17 00:00:00 2001 From: Aditya Gupta Date: Thu, 5 Sep 2024 00:31:29 +0530 Subject: perf tools test_task_analyzer.sh: Update to use 'perf check feature' Currently we use output of 'perf version --build-options', to check whether perf was built with libtraceevent support. Instead, use 'perf check feature libtraceevent' to check for libtraceevent support. 
Reviewed-by: Athira Rajeev Signed-off-by: Aditya Gupta Acked-by: Namhyung Kim Cc: Disha Goel Cc: Ian Rogers Cc: Jiri Olsa Cc: Kajol Jain Cc: Madhavan Srinivasan Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240904190132.415212-5-adityag@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/test_task_analyzer.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/test_task_analyzer.sh b/tools/perf/tests/shell/test_task_analyzer.sh index cb02bf23e6a5..7d76fc63d995 100755 --- a/tools/perf/tests/shell/test_task_analyzer.sh +++ b/tools/perf/tests/shell/test_task_analyzer.sh @@ -55,8 +55,8 @@ find_str_or_fail() { # check if perf is compiled with libtraceevent support skip_no_probe_record_support() { - perf version --build-options | grep -q " OFF .* HAVE_LIBTRACEEVENT" && return 2 - return 0 + perf check feature -q libtraceevent && return 0 + return 2 } prepare_perf_data() { -- cgit v1.2.3 From 512fcf7d9d7fb3751b03db45977b7fabaadfaecd Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Thu, 5 Sep 2024 00:31:30 +0530 Subject: perf tests probe_vfs_getname.sh: Update to use 'perf check feature' In probe_vfs_getname.sh, we currently use "perf record --dry-run" to check for libtraceevent and skip the test if perf is not built with libtraceevent. Change the check to use the "perf check feature" option. Signed-off-by: Athira Rajeev Acked-by: Namhyung Kim Cc: Disha Goel Cc: Ian Rogers Cc: Jiri Olsa Cc: Kajol Jain Cc: Madhavan Srinivasan Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240904190132.415212-6-adityag@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/lib/probe_vfs_getname.sh | 4 ++-- tools/perf/tests/shell/record+probe_libc_inet_pton.sh | 5 ++++- tools/perf/tests/shell/record+script_probe_vfs_getname.sh | 5 ++++- 3 files changed, 10 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/lib/probe_vfs_getname.sh b/tools/perf/tests/shell/lib/probe_vfs_getname.sh index b3802d9494b4..0606e693eb59 100644 --- a/tools/perf/tests/shell/lib/probe_vfs_getname.sh +++ b/tools/perf/tests/shell/lib/probe_vfs_getname.sh @@ -32,7 +32,7 @@ skip_if_no_debuginfo() { # check if perf is compiled with libtraceevent support skip_no_probe_record_support() { if [ $had_vfs_getname -eq 1 ] ; then - perf record --dry-run -e $1 2>&1 | grep "libtraceevent is necessary for tracepoint support" && return 2 - return 1 + perf check feature -q libtraceevent && return 1 + return 2 fi } diff --git a/tools/perf/tests/shell/record+probe_libc_inet_pton.sh b/tools/perf/tests/shell/record+probe_libc_inet_pton.sh index 72c65570db37..f38c8ead0b03 100755 --- a/tools/perf/tests/shell/record+probe_libc_inet_pton.sh +++ b/tools/perf/tests/shell/record+probe_libc_inet_pton.sh @@ -63,7 +63,10 @@ trace_libc_inet_pton_backtrace() { # Check presence of libtraceevent support to run perf record skip_no_probe_record_support "$event_name/$eventattr/" - [ $? -eq 2 ] && return 2 + if [ $? -eq 2 ]; then + echo "WARN: Skipping test trace_libc_inet_pton_backtrace. No libtraceevent support." + return 2 + fi perf record -e $event_name/$eventattr/ -o $perf_data ping -6 -c 1 ::1 > /dev/null 2>&1 # check if perf data file got created in above step.
diff --git a/tools/perf/tests/shell/record+script_probe_vfs_getname.sh b/tools/perf/tests/shell/record+script_probe_vfs_getname.sh index 5eedbe29bba1..9a61928e3c9a 100755 --- a/tools/perf/tests/shell/record+script_probe_vfs_getname.sh +++ b/tools/perf/tests/shell/record+script_probe_vfs_getname.sh @@ -21,7 +21,10 @@ record_open_file() { echo "Recording open file:" # Check presence of libtraceevent support to run perf record skip_no_probe_record_support "probe:vfs_getname*" - [ $? -eq 2 ] && return 2 + if [ $? -eq 2 ]; then + echo "WARN: Skipping test record_open_file. No libtraceevent support" + return 2 + fi perf record -o ${perfdata} -e probe:vfs_getname\* touch $file } -- cgit v1.2.3 From 35439fe4e29bca5fe8da8ed3f8524ef98f3e7aae Mon Sep 17 00:00:00 2001 From: Aditya Gupta Date: Thu, 5 Sep 2024 00:31:31 +0530 Subject: perf check: Fix inconsistencies in feature names Fix two inconsistencies in feature names as discussed in [1]: 1. Rename "dwarf-unwind-support" to "dwarf-unwind" 2. 'get_cpuid' feature and 'HAVE_AUXTRACE_SUPPORT' names don't look related, change the feature name to 'auxtrace' to match the macro name, as 'get_cpuid' string is not used anywhere to check the feature presence [1]: https://lore.kernel.org/linux-perf-users/ZoRw5we4HLSTZND6@x1/ Suggested-by: Arnaldo Carvalho de Melo Signed-off-by: Aditya Gupta Cc: Athira Rajeev Cc: Disha Goel Cc: Ian Rogers Cc: Jiri Olsa Cc: Kajol Jain Cc: Madhavan Srinivasan Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240904190132.415212-7-adityag@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-check.txt | 4 ++-- tools/perf/builtin-check.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-check.txt b/tools/perf/Documentation/perf-check.txt index b081514d240a..10f69fb6850b 100644 --- a/tools/perf/Documentation/perf-check.txt +++ b/tools/perf/Documentation/perf-check.txt @@ -49,8 +49,8 @@ feature:: debuginfod / HAVE_DEBUGINFOD_SUPPORT dwarf / HAVE_DWARF_SUPPORT dwarf_getlocations / HAVE_DWARF_GETLOCATIONS_SUPPORT - dwarf-unwind-support / HAVE_DWARF_UNWIND_SUPPORT - get_cpuid / HAVE_AUXTRACE_SUPPORT + dwarf-unwind / HAVE_DWARF_UNWIND_SUPPORT + auxtrace / HAVE_AUXTRACE_SUPPORT libaudit / HAVE_LIBAUDIT_SUPPORT libbfd / HAVE_LIBBFD_SUPPORT libcapstone / HAVE_LIBCAPSTONE_SUPPORT diff --git a/tools/perf/builtin-check.c b/tools/perf/builtin-check.c index 16a04b5456d9..0b76b6e42b78 100644 --- a/tools/perf/builtin-check.c +++ b/tools/perf/builtin-check.c @@ -29,8 +29,8 @@ struct feature_status supported_features[] = { FEATURE_STATUS("debuginfod", HAVE_DEBUGINFOD_SUPPORT), FEATURE_STATUS("dwarf", HAVE_DWARF_SUPPORT), FEATURE_STATUS("dwarf_getlocations", HAVE_DWARF_GETLOCATIONS_SUPPORT), - FEATURE_STATUS("dwarf-unwind-support", HAVE_DWARF_UNWIND_SUPPORT), - FEATURE_STATUS("get_cpuid", HAVE_AUXTRACE_SUPPORT), + FEATURE_STATUS("dwarf-unwind", HAVE_DWARF_UNWIND_SUPPORT), + FEATURE_STATUS("auxtrace", HAVE_AUXTRACE_SUPPORT), FEATURE_STATUS("libaudit", HAVE_LIBAUDIT_SUPPORT), FEATURE_STATUS("libbfd", HAVE_LIBBFD_SUPPORT), FEATURE_STATUS("libcapstone", HAVE_LIBCAPSTONE_SUPPORT), -- cgit v1.2.3 From 743070894724bf5ee0b2c77a28f838f6244d19bd Mon Sep 17 00:00:00 2001 From: Jeongjun Park Date: Sat, 31 Aug 2024 14:47:42 +0900 Subject: selftests/bpf: Add a selftest to check for incorrect names Add selftest for cases where btf_name_valid_section() does not properly check for certain types of names. 
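For orientation, the flavor of validation these tests exercise looks roughly like the following. This is an illustrative user-space sketch, not the kernel's btf_name_valid_section() implementation: a section-style name must be non-empty and consist of printable characters:

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>

static bool name_valid_section(const char *s)
{
	if (!s || s[0] == '\0')
		return false;	/* empty name is invalid */
	for (; *s; s++) {
		if (!isprint((unsigned char)*s))
			return false;	/* e.g. a leading 0x07 byte is invalid */
	}
	return true;
}

int main(void)
{
	printf("%d\n", name_valid_section("?.foo bar:buz"));	/* 1: valid */
	printf("%d\n", name_valid_section("\afoo"));		/* 0: non-printable first char */
	printf("%d\n", name_valid_section(""));			/* 0: empty */
	return 0;
}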
Suggested-by: Eduard Zingerman Signed-off-by: Jeongjun Park Link: https://lore.kernel.org/r/20240831054742.364585-1-aha310510@gmail.com Signed-off-by: Alexei Starovoitov Acked-by: Eduard Zingerman --- tools/testing/selftests/bpf/prog_tests/btf.c | 34 ++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index 00965a6e83bb..61de88cf4ad0 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -3550,6 +3550,40 @@ static struct btf_raw_test raw_tests[] = { }, BTF_STR_SEC("\0x\0?.foo bar:buz"), }, +{ + .descr = "datasec: name with non-printable first char not is ok", + .raw_types = { + /* int */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + /* VAR x */ /* [2] */ + BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_VAR, 0, 0), 1), + BTF_VAR_STATIC, + /* DATASEC ?.data */ /* [3] */ + BTF_TYPE_ENC(3, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 1), 4), + BTF_VAR_SECINFO_ENC(2, 0, 4), + BTF_END_RAW, + }, + BTF_STR_SEC("\0x\0\7foo"), + .err_str = "Invalid name", + .btf_load_err = true, +}, +{ + .descr = "datasec: name '\\0' is not ok", + .raw_types = { + /* int */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + /* VAR x */ /* [2] */ + BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_VAR, 0, 0), 1), + BTF_VAR_STATIC, + /* DATASEC \0 */ /* [3] */ + BTF_TYPE_ENC(3, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 1), 4), + BTF_VAR_SECINFO_ENC(2, 0, 4), + BTF_END_RAW, + }, + BTF_STR_SEC("\0x\0"), + .err_str = "Invalid name", + .btf_load_err = true, +}, { .descr = "type name '?foo' is not ok", .raw_types = { -- cgit v1.2.3 From 23457b37ec3f9bb373d43cca61db371303726a1e Mon Sep 17 00:00:00 2001 From: Feng Yang Date: Tue, 3 Sep 2024 15:25:59 +0800 Subject: selftests: bpf: Replace sizeof(arr)/sizeof(arr[0]) with ARRAY_SIZE The ARRAY_SIZE macro is more compact and more formal in linux source. 
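For reference, the idiom being consolidated, as a self-contained sketch; the actual tools/include version additionally uses a __must_be_array() check so that passing a plain pointer fails to compile:

#include <stdio.h>

#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))

int main(void)
{
	int insns[] = { 10, 20, 30, 40, 50 };

	/* element count of a true array, computed at compile time */
	printf("%zu\n", ARRAY_SIZE(insns));	/* prints 5 */
	return 0;
}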
Signed-off-by: Feng Yang Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240903072559.292607-1-yangfeng59949@163.com --- tools/testing/selftests/bpf/prog_tests/fexit_stress.c | 3 ++- tools/testing/selftests/bpf/prog_tests/log_buf.c | 5 +++-- tools/testing/selftests/bpf/prog_tests/module_fentry_shadow.c | 3 ++- .../selftests/bpf/prog_tests/raw_tp_writable_reject_nbd_invalid.c | 3 ++- tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_run.c | 5 +++-- tools/testing/selftests/bpf/prog_tests/tc_opts.c | 2 +- tools/testing/selftests/bpf/prog_tests/unpriv_bpf_disabled.c | 3 ++- tools/testing/selftests/bpf/progs/syscall.c | 3 ++- tools/testing/selftests/bpf/progs/test_rdonly_maps.c | 3 ++- tools/testing/selftests/bpf/progs/verifier_bits_iter.c | 2 +- 10 files changed, 20 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_stress.c b/tools/testing/selftests/bpf/prog_tests/fexit_stress.c index 49b1ffc9af1f..14c91b6f1e83 100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_stress.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_stress.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ #include +#include "bpf_util.h" void serial_test_fexit_stress(void) { @@ -36,7 +37,7 @@ void serial_test_fexit_stress(void) for (i = 0; i < bpf_max_tramp_links; i++) { fexit_fd[i] = bpf_prog_load(BPF_PROG_TYPE_TRACING, NULL, "GPL", trace_program, - sizeof(trace_program) / sizeof(struct bpf_insn), + ARRAY_SIZE(trace_program), &trace_opts); if (!ASSERT_GE(fexit_fd[i], 0, "fexit load")) goto out; diff --git a/tools/testing/selftests/bpf/prog_tests/log_buf.c b/tools/testing/selftests/bpf/prog_tests/log_buf.c index 0f7ea4d7d9f6..46611040dec3 100644 --- a/tools/testing/selftests/bpf/prog_tests/log_buf.c +++ b/tools/testing/selftests/bpf/prog_tests/log_buf.c @@ -5,6 +5,7 @@ #include #include "test_log_buf.skel.h" +#include "bpf_util.h" static size_t libbpf_log_pos; static char libbpf_log_buf[1024 * 1024]; @@ -143,11 +144,11 @@ static void bpf_prog_load_log_buf(void) BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }; - const size_t good_prog_insn_cnt = sizeof(good_prog_insns) / sizeof(struct bpf_insn); + const size_t good_prog_insn_cnt = ARRAY_SIZE(good_prog_insns); const struct bpf_insn bad_prog_insns[] = { BPF_EXIT_INSN(), }; - size_t bad_prog_insn_cnt = sizeof(bad_prog_insns) / sizeof(struct bpf_insn); + size_t bad_prog_insn_cnt = ARRAY_SIZE(bad_prog_insns); LIBBPF_OPTS(bpf_prog_load_opts, opts); const size_t log_buf_sz = 1024 * 1024; char *log_buf; diff --git a/tools/testing/selftests/bpf/prog_tests/module_fentry_shadow.c b/tools/testing/selftests/bpf/prog_tests/module_fentry_shadow.c index aa9f67eb1c95..bea05f78de5f 100644 --- a/tools/testing/selftests/bpf/prog_tests/module_fentry_shadow.c +++ b/tools/testing/selftests/bpf/prog_tests/module_fentry_shadow.c @@ -4,6 +4,7 @@ #include #include "bpf/libbpf_internal.h" #include "cgroup_helpers.h" +#include "bpf_util.h" static const char *module_name = "bpf_testmod"; static const char *symbol_name = "bpf_fentry_shadow_test"; @@ -100,7 +101,7 @@ void test_module_fentry_shadow(void) load_opts.attach_btf_obj_fd = btf_fd[i]; prog_fd[i] = bpf_prog_load(BPF_PROG_TYPE_TRACING, NULL, "GPL", trace_program, - sizeof(trace_program) / sizeof(struct bpf_insn), + ARRAY_SIZE(trace_program), &load_opts); if (!ASSERT_GE(prog_fd[i], 0, "bpf_prog_load")) goto out; diff --git a/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_reject_nbd_invalid.c 
b/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_reject_nbd_invalid.c index e2f1445b0e10..216b0dfac0fe 100644 --- a/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_reject_nbd_invalid.c +++ b/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_reject_nbd_invalid.c @@ -2,6 +2,7 @@ #include #include +#include "bpf_util.h" void test_raw_tp_writable_reject_nbd_invalid(void) { @@ -25,7 +26,7 @@ void test_raw_tp_writable_reject_nbd_invalid(void) ); bpf_fd = bpf_prog_load(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, NULL, "GPL v2", - program, sizeof(program) / sizeof(struct bpf_insn), + program, ARRAY_SIZE(program), &opts); if (CHECK(bpf_fd < 0, "bpf_raw_tracepoint_writable load", "failed: %d errno %d\n", bpf_fd, errno)) diff --git a/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_run.c b/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_run.c index f4aa7dab4766..e3668058b7bb 100644 --- a/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_run.c +++ b/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_run.c @@ -2,6 +2,7 @@ #include #include +#include "bpf_util.h" /* NOTE: conflict with other tests. */ void serial_test_raw_tp_writable_test_run(void) @@ -24,7 +25,7 @@ void serial_test_raw_tp_writable_test_run(void) ); int bpf_fd = bpf_prog_load(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, NULL, "GPL v2", - trace_program, sizeof(trace_program) / sizeof(struct bpf_insn), + trace_program, ARRAY_SIZE(trace_program), &trace_opts); if (CHECK(bpf_fd < 0, "bpf_raw_tracepoint_writable loaded", "failed: %d errno %d\n", bpf_fd, errno)) @@ -41,7 +42,7 @@ void serial_test_raw_tp_writable_test_run(void) ); int filter_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL v2", - skb_program, sizeof(skb_program) / sizeof(struct bpf_insn), + skb_program, ARRAY_SIZE(skb_program), &skb_opts); if (CHECK(filter_fd < 0, "test_program_loaded", "failed: %d errno %d\n", filter_fd, errno)) diff --git a/tools/testing/selftests/bpf/prog_tests/tc_opts.c b/tools/testing/selftests/bpf/prog_tests/tc_opts.c index 196abf223465..f77f604389aa 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_opts.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_opts.c @@ -2384,7 +2384,7 @@ static int generate_dummy_prog(void) BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }; - const size_t prog_insn_cnt = sizeof(prog_insns) / sizeof(struct bpf_insn); + const size_t prog_insn_cnt = ARRAY_SIZE(prog_insns); LIBBPF_OPTS(bpf_prog_load_opts, opts); const size_t log_buf_sz = 256; char log_buf[log_buf_sz]; diff --git a/tools/testing/selftests/bpf/prog_tests/unpriv_bpf_disabled.c b/tools/testing/selftests/bpf/prog_tests/unpriv_bpf_disabled.c index 0adf8d9475cb..472f4f9fa95f 100644 --- a/tools/testing/selftests/bpf/prog_tests/unpriv_bpf_disabled.c +++ b/tools/testing/selftests/bpf/prog_tests/unpriv_bpf_disabled.c @@ -7,6 +7,7 @@ #include "test_unpriv_bpf_disabled.skel.h" #include "cap_helpers.h" +#include "bpf_util.h" /* Using CAP_LAST_CAP is risky here, since it can get pulled in from * an old /usr/include/linux/capability.h and be < CAP_BPF; as a result @@ -146,7 +147,7 @@ static void test_unpriv_bpf_disabled_negative(struct test_unpriv_bpf_disabled *s BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }; - const size_t prog_insn_cnt = sizeof(prog_insns) / sizeof(struct bpf_insn); + const size_t prog_insn_cnt = ARRAY_SIZE(prog_insns); LIBBPF_OPTS(bpf_prog_load_opts, load_opts); struct bpf_map_info map_info = {}; __u32 map_info_len = sizeof(map_info); diff --git a/tools/testing/selftests/bpf/progs/syscall.c 
b/tools/testing/selftests/bpf/progs/syscall.c index 3d3cafdebe72..0f4dfb770c32 100644 --- a/tools/testing/selftests/bpf/progs/syscall.c +++ b/tools/testing/selftests/bpf/progs/syscall.c @@ -8,6 +8,7 @@ #include #include #include +#include "bpf_misc.h" char _license[] SEC("license") = "GPL"; @@ -119,7 +120,7 @@ int load_prog(struct args *ctx) static __u64 value = 34; static union bpf_attr prog_load_attr = { .prog_type = BPF_PROG_TYPE_XDP, - .insn_cnt = sizeof(insns) / sizeof(insns[0]), + .insn_cnt = ARRAY_SIZE(insns), }; int ret; diff --git a/tools/testing/selftests/bpf/progs/test_rdonly_maps.c b/tools/testing/selftests/bpf/progs/test_rdonly_maps.c index fc8e8a34a3db..7035fb4d4165 100644 --- a/tools/testing/selftests/bpf/progs/test_rdonly_maps.c +++ b/tools/testing/selftests/bpf/progs/test_rdonly_maps.c @@ -4,6 +4,7 @@ #include #include #include +#include "bpf_misc.h" const struct { unsigned a[4]; @@ -64,7 +65,7 @@ int full_loop(struct pt_regs *ctx) { /* prevent compiler to optimize everything out */ unsigned * volatile p = (void *)&rdonly_values.a; - int i = sizeof(rdonly_values.a) / sizeof(rdonly_values.a[0]); + int i = ARRAY_SIZE(rdonly_values.a); unsigned iters = 0, sum = 0; /* validate verifier can allow full loop as well */ diff --git a/tools/testing/selftests/bpf/progs/verifier_bits_iter.c b/tools/testing/selftests/bpf/progs/verifier_bits_iter.c index 716113c2bce2..f4da4d508ddb 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bits_iter.c +++ b/tools/testing/selftests/bpf/progs/verifier_bits_iter.c @@ -87,7 +87,7 @@ int bits_memalloc(void) int *bit; __builtin_memset(&data, 0xf0, sizeof(data)); /* 4 * 16 */ - bpf_for_each(bits, bit, &data[0], sizeof(data) / sizeof(u64)) + bpf_for_each(bits, bit, &data[0], ARRAY_SIZE(data)) nr++; return nr; } -- cgit v1.2.3 From 8195136669661fdfe54e9a8923c33b31c92fc1da Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 4 Sep 2024 10:24:59 -1000 Subject: sched_ext: Add cgroup support Add sched_ext_ops operations to init/exit cgroups, and track task migrations and config changes. A BPF scheduler may not implement cgroup features, or may implement only a subset of them. The implemented features can be indicated using %SCX_OPS_HAS_CGROUP_* flags. If cgroup configuration makes use of features that are not implemented, a warning is triggered. While a BPF scheduler is being enabled and disabled, relevant cgroup operations are locked out using scx_cgroup_rwsem. This avoids situations like task prep taking place while the task is being moved across cgroups, making things easier for BPF schedulers. v7: - cgroup interface file visibility toggling is dropped in favor of just warning messages. Dynamically changing interface visibility caused more confusion than it helped. v6: - Updated to reflect the removal of SCX_KF_SLEEPABLE. - Updated to use CONFIG_GROUP_SCHED_WEIGHT and fixes for !CONFIG_FAIR_GROUP_SCHED && CONFIG_EXT_GROUP_SCHED. v5: - Flipped the locking order between scx_cgroup_rwsem and cpus_read_lock() to avoid locking order conflict w/ cpuset. Better documentation around locking. - sched_move_task() takes an early exit if the source and destination are identical. This triggered the warning in scx_cgroup_can_attach() as it left p->scx.cgrp_moving_from uncleared. Updated the cgroup migration path so that ops.cgroup_prep_move() is skipped for identity migrations so that its invocations always match ops.cgroup_move() one-to-one. v4: - Example schedulers moved into their own patches. - Fix build failure when !CONFIG_CGROUP_SCHED, reported by Andrea Righi.
v3: - Make scx_example_pair switch all tasks by default. - Convert to BPF inline iterators. - scx_bpf_task_cgroup() is added to determine the current cgroup from CPU controller's POV. This allows BPF schedulers to accurately track CPU cgroup membership. - scx_example_flatcg added. This demonstrates flattened hierarchy implementation of CPU cgroup control and shows significant performance improvement when cgroups which are nested multiple levels are under competition. v2: - Build fixes for different CONFIG combinations. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden Reported-by: kernel test robot Cc: Andrea Righi --- include/linux/sched/ext.h | 3 + init/Kconfig | 6 + kernel/sched/core.c | 67 ++- kernel/sched/ext.c | 519 +++++++++++++++++++++++- kernel/sched/ext.h | 22 + kernel/sched/sched.h | 5 + tools/sched_ext/include/scx/common.bpf.h | 1 + tools/testing/selftests/sched_ext/maximal.bpf.c | 32 ++ 8 files changed, 636 insertions(+), 19 deletions(-) (limited to 'tools') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index db2a266113ac..dc646cca4fcf 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -188,6 +188,9 @@ struct sched_ext_entity { bool disallow; /* reject switching into SCX */ /* cold fields */ +#ifdef CONFIG_EXT_GROUP_SCHED + struct cgroup *cgrp_moving_from; +#endif /* must be the last field, see init_scx_entity() */ struct list_head tasks_node; }; diff --git a/init/Kconfig b/init/Kconfig index 84332d3594d0..8c11ed61ca67 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1055,6 +1055,12 @@ config RT_GROUP_SCHED realtime bandwidth for them. See Documentation/scheduler/sched-rt-group.rst for more information. +config EXT_GROUP_SCHED + bool + depends on SCHED_CLASS_EXT && CGROUP_SCHED + select GROUP_SCHED_WEIGHT + default y + endif #CGROUP_SCHED config SCHED_MM_CID diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d302115b1522..b01b63e78d5e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8364,6 +8364,9 @@ void __init sched_init(void) root_task_group.shares = ROOT_TASK_GROUP_LOAD; init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_EXT_GROUP_SCHED + root_task_group.scx_weight = CGROUP_WEIGHT_DFL; +#endif /* CONFIG_EXT_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED root_task_group.rt_se = (struct sched_rt_entity **)ptr; ptr += nr_cpu_ids * sizeof(void **); @@ -8801,6 +8804,7 @@ struct task_group *sched_create_group(struct task_group *parent) if (!alloc_rt_sched_group(tg, parent)) goto err; + scx_group_set_weight(tg, CGROUP_WEIGHT_DFL); alloc_uclamp_sched_group(tg, parent); return tg; @@ -8928,6 +8932,7 @@ void sched_move_task(struct task_struct *tsk) put_prev_task(rq, tsk); sched_change_group(tsk, group); + scx_move_task(tsk); if (queued) enqueue_task(rq, tsk, queue_flags); @@ -8965,6 +8970,11 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); struct task_group *parent = css_tg(css->parent); + int ret; + + ret = scx_tg_online(tg); + if (ret) + return ret; if (parent) sched_online_group(tg, parent); @@ -8979,6 +8989,13 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) return 0; } +static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) +{ + struct task_group *tg = css_tg(css); + + scx_tg_offline(tg); +} + static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); @@ 
-8996,9 +9013,9 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) sched_unregister_group(tg); } -#ifdef CONFIG_RT_GROUP_SCHED static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { +#ifdef CONFIG_RT_GROUP_SCHED struct task_struct *task; struct cgroup_subsys_state *css; @@ -9006,9 +9023,9 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; } - return 0; -} #endif + return scx_cgroup_can_attach(tset); +} static void cpu_cgroup_attach(struct cgroup_taskset *tset) { @@ -9017,6 +9034,13 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) cgroup_taskset_for_each(task, css, tset) sched_move_task(task); + + scx_cgroup_finish_attach(); +} + +static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset) +{ + scx_cgroup_cancel_attach(tset); } #ifdef CONFIG_UCLAMP_TASK_GROUP @@ -9196,15 +9220,25 @@ static int cpu_uclamp_max_show(struct seq_file *sf, void *v) #ifdef CONFIG_GROUP_SCHED_WEIGHT static unsigned long tg_weight(struct task_group *tg) { +#ifdef CONFIG_FAIR_GROUP_SCHED return scale_load_down(tg->shares); +#else + return sched_weight_from_cgroup(tg->scx_weight); +#endif } static int cpu_shares_write_u64(struct cgroup_subsys_state *css, struct cftype *cftype, u64 shareval) { + int ret; + if (shareval > scale_load_down(ULONG_MAX)) shareval = MAX_SHARES; - return sched_group_set_shares(css_tg(css), scale_load(shareval)); + ret = sched_group_set_shares(css_tg(css), scale_load(shareval)); + if (!ret) + scx_group_set_weight(css_tg(css), + sched_weight_to_cgroup(shareval)); + return ret; } static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, @@ -9595,7 +9629,12 @@ static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css, static int cpu_idle_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, s64 idle) { - return sched_group_set_idle(css_tg(css), idle); + int ret; + + ret = sched_group_set_idle(css_tg(css), idle); + if (!ret) + scx_group_set_idle(css_tg(css), idle); + return ret; } #endif @@ -9722,13 +9761,17 @@ static int cpu_weight_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, u64 cgrp_weight) { unsigned long weight; + int ret; if (cgrp_weight < CGROUP_WEIGHT_MIN || cgrp_weight > CGROUP_WEIGHT_MAX) return -ERANGE; weight = sched_weight_from_cgroup(cgrp_weight); - return sched_group_set_shares(css_tg(css), scale_load(weight)); + ret = sched_group_set_shares(css_tg(css), scale_load(weight)); + if (!ret) + scx_group_set_weight(css_tg(css), cgrp_weight); + return ret; } static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css, @@ -9753,7 +9796,7 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, s64 nice) { unsigned long weight; - int idx; + int idx, ret; if (nice < MIN_NICE || nice > MAX_NICE) return -ERANGE; @@ -9762,7 +9805,11 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, idx = array_index_nospec(idx, 40); weight = sched_prio_to_weight[idx]; - return sched_group_set_shares(css_tg(css), scale_load(weight)); + ret = sched_group_set_shares(css_tg(css), scale_load(weight)); + if (!ret) + scx_group_set_weight(css_tg(css), + sched_weight_to_cgroup(weight)); + return ret; } #endif /* CONFIG_GROUP_SCHED_WEIGHT */ @@ -9878,14 +9925,14 @@ static struct cftype cpu_files[] = { struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, .css_online = cpu_cgroup_css_online, + .css_offline = cpu_cgroup_css_offline, .css_released = cpu_cgroup_css_released, .css_free = 
cpu_cgroup_css_free, .css_extra_stat_show = cpu_extra_stat_show, .css_local_stat_show = cpu_local_stat_show, -#ifdef CONFIG_RT_GROUP_SCHED .can_attach = cpu_cgroup_can_attach, -#endif .attach = cpu_cgroup_attach, + .cancel_attach = cpu_cgroup_cancel_attach, .legacy_cftypes = cpu_legacy_files, .dfl_cftypes = cpu_files, .early_init = true, diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 5fd2bfc01403..bba9d805dc2b 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -116,10 +116,16 @@ enum scx_ops_flags { */ SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, + /* + * CPU cgroup support flags + */ + SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* cpu.weight */ + SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | SCX_OPS_ENQ_LAST | SCX_OPS_ENQ_EXITING | - SCX_OPS_SWITCH_PARTIAL, + SCX_OPS_SWITCH_PARTIAL | + SCX_OPS_HAS_CGROUP_WEIGHT, }; /* argument container for ops.init_task() */ @@ -129,6 +135,10 @@ struct scx_init_task_args { * to the scheduler transition path. */ bool fork; +#ifdef CONFIG_EXT_GROUP_SCHED + /* the cgroup the task is joining */ + struct cgroup *cgroup; +#endif }; /* argument container for ops.exit_task() */ @@ -137,6 +147,12 @@ struct scx_exit_task_args { bool cancelled; }; +/* argument container for ops->cgroup_init() */ +struct scx_cgroup_init_args { + /* the weight of the cgroup [1..10000] */ + u32 weight; +}; + enum scx_cpu_preempt_reason { /* next task is being scheduled by &sched_class_rt */ SCX_CPU_PREEMPT_RT, @@ -501,6 +517,79 @@ struct sched_ext_ops { */ void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p); +#ifdef CONFIG_EXT_GROUP_SCHED + /** + * cgroup_init - Initialize a cgroup + * @cgrp: cgroup being initialized + * @args: init arguments, see the struct definition + * + * Either the BPF scheduler is being loaded or @cgrp created, initialize + * @cgrp for sched_ext. This operation may block. + * + * Return 0 for success, -errno for failure. An error return while + * loading will abort loading of the BPF scheduler. During cgroup + * creation, it will abort the specific cgroup creation. + */ + s32 (*cgroup_init)(struct cgroup *cgrp, + struct scx_cgroup_init_args *args); + + /** + * cgroup_exit - Exit a cgroup + * @cgrp: cgroup being exited + * + * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit + * @cgrp for sched_ext. This operation may block. + */ + void (*cgroup_exit)(struct cgroup *cgrp); + + /** + * cgroup_prep_move - Prepare a task to be moved to a different cgroup + * @p: task being moved + * @from: cgroup @p is being moved from + * @to: cgroup @p is being moved to + * + * Prepare @p for move from cgroup @from to @to. This operation may + * block and can be used for allocations. + * + * Return 0 for success, -errno for failure. An error return aborts the + * migration. + */ + s32 (*cgroup_prep_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + + /** + * cgroup_move - Commit cgroup move + * @p: task being moved + * @from: cgroup @p is being moved from + * @to: cgroup @p is being moved to + * + * Commit the move. @p is dequeued during this operation. + */ + void (*cgroup_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + + /** + * cgroup_cancel_move - Cancel cgroup move + * @p: task whose cgroup move is being canceled + * @from: cgroup @p was being moved from + * @to: cgroup @p was being moved to + * + * @p was cgroup_prep_move()'d but failed before reaching cgroup_move(). + * Undo the preparation.
+ */ + void (*cgroup_cancel_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + + /** + * cgroup_set_weight - A cgroup's weight is being changed + * @cgrp: cgroup whose weight is being updated + * @weight: new weight [1..10000] + * + * Update @cgrp's weight to @weight. + */ + void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); +#endif /* CONFIG_EXT_GROUP_SCHED */ + /* * All online ops must come before ops.cpu_online(). */ @@ -683,6 +772,11 @@ enum scx_kick_flags { SCX_KICK_WAIT = 1LLU << 2, }; +enum scx_tg_flags { + SCX_TG_ONLINE = 1U << 0, + SCX_TG_INITED = 1U << 1, +}; + enum scx_ops_enable_state { SCX_OPS_PREPPING, SCX_OPS_ENABLING, @@ -3235,6 +3329,28 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) resched_curr(rq); } +#ifdef CONFIG_EXT_GROUP_SCHED +static struct cgroup *tg_cgrp(struct task_group *tg) +{ + /* + * If CGROUP_SCHED is disabled, @tg is NULL. If @tg is an autogroup, + * @tg->css.cgroup is NULL. In both cases, @tg can be treated as the + * root cgroup. + */ + if (tg && tg->css.cgroup) + return tg->css.cgroup; + else + return &cgrp_dfl_root.cgrp; +} + +#define SCX_INIT_TASK_ARGS_CGROUP(tg) .cgroup = tg_cgrp(tg), + +#else /* CONFIG_EXT_GROUP_SCHED */ + +#define SCX_INIT_TASK_ARGS_CGROUP(tg) + +#endif /* CONFIG_EXT_GROUP_SCHED */ + static enum scx_task_state scx_get_task_state(const struct task_struct *p) { return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT; @@ -3279,6 +3395,7 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool if (SCX_HAS_OP(init_task)) { struct scx_init_task_args args = { + SCX_INIT_TASK_ARGS_CGROUP(tg) .fork = fork, }; @@ -3343,7 +3460,7 @@ static void scx_ops_enable_task(struct task_struct *p) scx_set_task_state(p, SCX_TASK_ENABLED); if (SCX_HAS_OP(set_weight)) - SCX_CALL_OP(SCX_KF_REST, set_weight, p, p->scx.weight); + SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); } static void scx_ops_disable_task(struct task_struct *p) @@ -3555,6 +3672,219 @@ bool scx_can_stop_tick(struct rq *rq) } #endif +#ifdef CONFIG_EXT_GROUP_SCHED + +DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); +static bool cgroup_warned_missing_weight; +static bool cgroup_warned_missing_idle; + +static void scx_cgroup_warn_missing_weight(struct task_group *tg) +{ + if (scx_ops_enable_state() == SCX_OPS_DISABLED || + cgroup_warned_missing_weight) + return; + + if ((scx_ops.flags & SCX_OPS_HAS_CGROUP_WEIGHT) || !tg->css.parent) + return; + + pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.weight\n", + scx_ops.name); + cgroup_warned_missing_weight = true; +} + +static void scx_cgroup_warn_missing_idle(struct task_group *tg) +{ + if (scx_ops_enable_state() == SCX_OPS_DISABLED || + cgroup_warned_missing_idle) + return; + + if (!tg->idle) + return; + + pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.idle\n", + scx_ops.name); + cgroup_warned_missing_idle = true; +} + +int scx_tg_online(struct task_group *tg) +{ + int ret = 0; + + WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED)); + + percpu_down_read(&scx_cgroup_rwsem); + + scx_cgroup_warn_missing_weight(tg); + + if (SCX_HAS_OP(cgroup_init)) { + struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; + + ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, + tg->css.cgroup, &args); + if (!ret) + tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED; + else + ret = ops_sanitize_err("cgroup_init", ret); + } else { + tg->scx_flags |= SCX_TG_ONLINE; + } + + percpu_up_read(&scx_cgroup_rwsem); + return ret; +} +
+void scx_tg_offline(struct task_group *tg) +{ + WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE)); + + percpu_down_read(&scx_cgroup_rwsem); + + if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED)) + SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, tg->css.cgroup); + tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); + + percpu_up_read(&scx_cgroup_rwsem); +} + +int scx_cgroup_can_attach(struct cgroup_taskset *tset) +{ + struct cgroup_subsys_state *css; + struct task_struct *p; + int ret; + + /* released in scx_finish/cancel_attach() */ + percpu_down_read(&scx_cgroup_rwsem); + + if (!scx_enabled()) + return 0; + + cgroup_taskset_for_each(p, css, tset) { + struct cgroup *from = tg_cgrp(task_group(p)); + struct cgroup *to = tg_cgrp(css_tg(css)); + + WARN_ON_ONCE(p->scx.cgrp_moving_from); + + /* + * sched_move_task() omits identity migrations. Let's match the + * behavior so that ops.cgroup_prep_move() and ops.cgroup_move() + * always match one-to-one. + */ + if (from == to) + continue; + + if (SCX_HAS_OP(cgroup_prep_move)) { + ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_prep_move, + p, from, css->cgroup); + if (ret) + goto err; + } + + p->scx.cgrp_moving_from = from; + } + + return 0; + +err: + cgroup_taskset_for_each(p, css, tset) { + if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from) + SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p, + p->scx.cgrp_moving_from, css->cgroup); + p->scx.cgrp_moving_from = NULL; + } + + percpu_up_read(&scx_cgroup_rwsem); + return ops_sanitize_err("cgroup_prep_move", ret); +} + +void scx_move_task(struct task_struct *p) +{ + if (!scx_enabled()) + return; + + /* + * We're called from sched_move_task() which handles both cgroup and + * autogroup moves. Ignore the latter. + * + * Also ignore exiting tasks, because in the exit path tasks transition + * from the autogroup to the root group, so task_group_is_autogroup() + * alone isn't able to catch exiting autogroup tasks. This is safe for + * cgroup_move(), because cgroup migrations never happen for PF_EXITING + * tasks. + */ + if (task_group_is_autogroup(task_group(p)) || (p->flags & PF_EXITING)) + return; + + /* + * @p must have ops.cgroup_prep_move() called on it and thus + * cgrp_moving_from set. 
+ */ + if (SCX_HAS_OP(cgroup_move) && !WARN_ON_ONCE(!p->scx.cgrp_moving_from)) + SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, p, + p->scx.cgrp_moving_from, tg_cgrp(task_group(p))); + p->scx.cgrp_moving_from = NULL; +} + +void scx_cgroup_finish_attach(void) +{ + percpu_up_read(&scx_cgroup_rwsem); +} + +void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) +{ + struct cgroup_subsys_state *css; + struct task_struct *p; + + if (!scx_enabled()) + goto out_unlock; + + cgroup_taskset_for_each(p, css, tset) { + if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from) + SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p, + p->scx.cgrp_moving_from, css->cgroup); + p->scx.cgrp_moving_from = NULL; + } +out_unlock: + percpu_up_read(&scx_cgroup_rwsem); +} + +void scx_group_set_weight(struct task_group *tg, unsigned long weight) +{ + percpu_down_read(&scx_cgroup_rwsem); + + if (tg->scx_weight != weight) { + if (SCX_HAS_OP(cgroup_set_weight)) + SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight, + tg_cgrp(tg), weight); + tg->scx_weight = weight; + } + + percpu_up_read(&scx_cgroup_rwsem); +} + +void scx_group_set_idle(struct task_group *tg, bool idle) +{ + percpu_down_read(&scx_cgroup_rwsem); + scx_cgroup_warn_missing_idle(tg); + percpu_up_read(&scx_cgroup_rwsem); +} + +static void scx_cgroup_lock(void) +{ + percpu_down_write(&scx_cgroup_rwsem); +} + +static void scx_cgroup_unlock(void) +{ + percpu_up_write(&scx_cgroup_rwsem); +} + +#else /* CONFIG_EXT_GROUP_SCHED */ + +static inline void scx_cgroup_lock(void) {} +static inline void scx_cgroup_unlock(void) {} + +#endif /* CONFIG_EXT_GROUP_SCHED */ + /* * Omitted operations: * @@ -3686,6 +4016,96 @@ out_unlock_rcu: rcu_read_unlock(); } +#ifdef CONFIG_EXT_GROUP_SCHED +static void scx_cgroup_exit(void) +{ + struct cgroup_subsys_state *css; + + percpu_rwsem_assert_held(&scx_cgroup_rwsem); + + /* + * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk + * cgroups and exit all the inited ones, all online cgroups are exited. + */ + rcu_read_lock(); + css_for_each_descendant_post(css, &root_task_group.css) { + struct task_group *tg = css_tg(css); + + if (!(tg->scx_flags & SCX_TG_INITED)) + continue; + tg->scx_flags &= ~SCX_TG_INITED; + + if (!scx_ops.cgroup_exit) + continue; + + if (WARN_ON_ONCE(!css_tryget(css))) + continue; + rcu_read_unlock(); + + SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, css->cgroup); + + rcu_read_lock(); + css_put(css); + } + rcu_read_unlock(); +} + +static int scx_cgroup_init(void) +{ + struct cgroup_subsys_state *css; + int ret; + + percpu_rwsem_assert_held(&scx_cgroup_rwsem); + + cgroup_warned_missing_weight = false; + cgroup_warned_missing_idle = false; + + /* + * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk + * cgroups and init, all online cgroups are initialized.
+ */ + rcu_read_lock(); + css_for_each_descendant_pre(css, &root_task_group.css) { + struct task_group *tg = css_tg(css); + struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; + + scx_cgroup_warn_missing_weight(tg); + scx_cgroup_warn_missing_idle(tg); + + if ((tg->scx_flags & + (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE) + continue; + + if (!scx_ops.cgroup_init) { + tg->scx_flags |= SCX_TG_INITED; + continue; + } + + if (WARN_ON_ONCE(!css_tryget(css))) + continue; + rcu_read_unlock(); + + ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, + css->cgroup, &args); + if (ret) { + css_put(css); + return ret; + } + tg->scx_flags |= SCX_TG_INITED; + + rcu_read_lock(); + css_put(css); + } + rcu_read_unlock(); + + return 0; +} + +#else +static void scx_cgroup_exit(void) {} +static int scx_cgroup_init(void) { return 0; } +#endif + /******************************************************************************** * Sysfs interface and ops enable/disable. @@ -3978,11 +4398,12 @@ static void scx_ops_disable_workfn(struct kthread_work *work) WRITE_ONCE(scx_switching_all, false); /* - * Avoid racing against fork. See scx_ops_enable() for explanation on - * the locking order. + * Avoid racing against fork and cgroup changes. See scx_ops_enable() + * for explanation on the locking order. */ percpu_down_write(&scx_fork_rwsem); cpus_read_lock(); + scx_cgroup_lock(); spin_lock_irq(&scx_tasks_lock); scx_task_iter_init(&sti); @@ -4018,6 +4439,9 @@ static void scx_ops_disable_workfn(struct kthread_work *work) static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); synchronize_rcu(); + scx_cgroup_exit(); + + scx_cgroup_unlock(); cpus_read_unlock(); percpu_up_write(&scx_fork_rwsem); @@ -4574,11 +4998,17 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_watchdog_timeout / 2); /* - * Lock out forks before opening the floodgate so that they don't wander - * into the operations prematurely. + * Lock out forks, cgroup on/offlining and moves before opening the + * floodgate so that they don't wander into the operations prematurely. + * + * We don't need to keep the CPUs stable but static_branch_*() requires + * cpus_read_lock() and scx_cgroup_rwsem must nest inside + * cpu_hotplug_lock because of the following dependency chain: + * + * cpu_hotplug_lock --> cgroup_threadgroup_rwsem --> scx_cgroup_rwsem * - * We don't need to keep the CPUs stable but grab cpus_read_lock() to - * ease future locking changes for cgroup suport. + * So, we need to do cpus_read_lock() before scx_cgroup_lock() and use + * static_branch_*_cpuslocked(). * * Note that cpu_hotplug_lock must nest inside scx_fork_rwsem due to the * following dependency chain: @@ -4587,6 +5017,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) */ percpu_down_write(&scx_fork_rwsem); cpus_read_lock(); + scx_cgroup_lock(); check_hotplug_seq(ops); @@ -4609,6 +5040,14 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); } + /* + * All cgroups should be initialized before letting in tasks. cgroup + * on/offlining and task migrations are already locked out. 
+ */ + ret = scx_cgroup_init(); + if (ret) + goto err_disable_unlock_all; + static_branch_enable_cpuslocked(&__scx_ops_enabled); /* @@ -4700,6 +5139,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) spin_unlock_irq(&scx_tasks_lock); preempt_enable(); + scx_cgroup_unlock(); cpus_read_unlock(); percpu_up_write(&scx_fork_rwsem); @@ -4734,6 +5174,7 @@ err_unlock: return ret; err_disable_unlock_all: + scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); err_disable_unlock_cpus: cpus_read_unlock(); @@ -4928,6 +5369,11 @@ static int bpf_scx_check_member(const struct btf_type *t, switch (moff) { case offsetof(struct sched_ext_ops, init_task): +#ifdef CONFIG_EXT_GROUP_SCHED + case offsetof(struct sched_ext_ops, cgroup_init): + case offsetof(struct sched_ext_ops, cgroup_exit): + case offsetof(struct sched_ext_ops, cgroup_prep_move): +#endif case offsetof(struct sched_ext_ops, cpu_online): case offsetof(struct sched_ext_ops, cpu_offline): case offsetof(struct sched_ext_ops, init): @@ -5002,6 +5448,14 @@ static s32 init_task_stub(struct task_struct *p, struct scx_init_task_args *args static void exit_task_stub(struct task_struct *p, struct scx_exit_task_args *args) {} static void enable_stub(struct task_struct *p) {} static void disable_stub(struct task_struct *p) {} +#ifdef CONFIG_EXT_GROUP_SCHED +static s32 cgroup_init_stub(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; } +static void cgroup_exit_stub(struct cgroup *cgrp) {} +static s32 cgroup_prep_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; } +static void cgroup_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} +static void cgroup_cancel_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} +static void cgroup_set_weight_stub(struct cgroup *cgrp, u32 weight) {} +#endif static void cpu_online_stub(s32 cpu) {} static void cpu_offline_stub(s32 cpu) {} static s32 init_stub(void) { return -EINVAL; } @@ -5031,6 +5485,14 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = { .exit_task = exit_task_stub, .enable = enable_stub, .disable = disable_stub, +#ifdef CONFIG_EXT_GROUP_SCHED + .cgroup_init = cgroup_init_stub, + .cgroup_exit = cgroup_exit_stub, + .cgroup_prep_move = cgroup_prep_move_stub, + .cgroup_move = cgroup_move_stub, + .cgroup_cancel_move = cgroup_cancel_move_stub, + .cgroup_set_weight = cgroup_set_weight_stub, +#endif .cpu_online = cpu_online_stub, .cpu_offline = cpu_offline_stub, .init = init_stub, @@ -5280,7 +5742,8 @@ void __init init_sched_ext_class(void) * definitions so that BPF scheduler implementations can use them * through the generated vmlinux.h. */ - WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT); + WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT | + SCX_TG_ONLINE); BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); init_dsq(&scx_dsq_global, SCX_DSQ_GLOBAL); @@ -6340,6 +6803,41 @@ __bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu) return cpu_rq(cpu); } +/** + * scx_bpf_task_cgroup - Return the sched cgroup of a task + * @p: task of interest + * + * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with + * from the scheduler's POV. SCX operations should use this function to + * determine @p's current cgroup as, unlike following @p->cgroups, + * @p->sched_task_group is protected by @p's rq lock and thus atomic w.r.t. all + * rq-locked operations. Can be called on the parameter tasks of rq-locked + * operations. 
The restriction guarantees that @p's rq is locked by the caller. + */ +#ifdef CONFIG_CGROUP_SCHED +__bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) +{ + struct task_group *tg = p->sched_task_group; + struct cgroup *cgrp = &cgrp_dfl_root.cgrp; + + if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p)) + goto out; + + /* + * A task_group may either be a cgroup or an autogroup. In the latter + * case, @tg->css.cgroup is %NULL. A task_group can't become the other + * kind once created. + */ + if (tg && tg->css.cgroup) + cgrp = tg->css.cgroup; + else + cgrp = &cgrp_dfl_root.cgrp; +out: + cgroup_get(cgrp); + return cgrp; +} +#endif + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_any) @@ -6368,6 +6866,9 @@ BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_cpu_rq) +#ifdef CONFIG_CGROUP_SCHED +BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) +#endif BTF_KFUNCS_END(scx_kfunc_ids_any) static const struct btf_kfunc_id_set scx_kfunc_set_any = { diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 32d3a51f591a..246019519231 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -67,3 +67,25 @@ static inline void scx_update_idle(struct rq *rq, bool idle) #else static inline void scx_update_idle(struct rq *rq, bool idle) {} #endif + +#ifdef CONFIG_CGROUP_SCHED +#ifdef CONFIG_EXT_GROUP_SCHED +int scx_tg_online(struct task_group *tg); +void scx_tg_offline(struct task_group *tg); +int scx_cgroup_can_attach(struct cgroup_taskset *tset); +void scx_move_task(struct task_struct *p); +void scx_cgroup_finish_attach(void); +void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); +void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); +void scx_group_set_idle(struct task_group *tg, bool idle); +#else /* CONFIG_EXT_GROUP_SCHED */ +static inline int scx_tg_online(struct task_group *tg) { return 0; } +static inline void scx_tg_offline(struct task_group *tg) {} +static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } +static inline void scx_move_task(struct task_struct *p) {} +static inline void scx_cgroup_finish_attach(void) {} +static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {} +static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {} +static inline void scx_group_set_idle(struct task_group *tg, bool idle) {} +#endif /* CONFIG_EXT_GROUP_SCHED */ +#endif /* CONFIG_CGROUP_SCHED */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0163f4af1c6e..b912752197d4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -459,6 +459,11 @@ struct task_group { struct rt_bandwidth rt_bandwidth; #endif +#ifdef CONFIG_EXT_GROUP_SCHED + u32 scx_flags; /* SCX_TG_* */ + u32 scx_weight; +#endif + struct rcu_head rcu; struct list_head list; diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 20280df62857..457462b19966 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -61,6 +61,7 @@ s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; bool scx_bpf_task_running(const struct task_struct *p) __ksym; s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; +struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym; static inline __attribute__((format(printf, 1, 2))) 
void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} diff --git a/tools/testing/selftests/sched_ext/maximal.bpf.c b/tools/testing/selftests/sched_ext/maximal.bpf.c index 44612fdaf399..00bfa9cb95d3 100644 --- a/tools/testing/selftests/sched_ext/maximal.bpf.c +++ b/tools/testing/selftests/sched_ext/maximal.bpf.c @@ -95,6 +95,32 @@ void BPF_STRUCT_OPS(maximal_exit_task, struct task_struct *p, void BPF_STRUCT_OPS(maximal_disable, struct task_struct *p) {} +s32 BPF_STRUCT_OPS(maximal_cgroup_init, struct cgroup *cgrp, + struct scx_cgroup_init_args *args) +{ + return 0; +} + +void BPF_STRUCT_OPS(maximal_cgroup_exit, struct cgroup *cgrp) +{} + +s32 BPF_STRUCT_OPS(maximal_cgroup_prep_move, struct task_struct *p, + struct cgroup *from, struct cgroup *to) +{ + return 0; +} + +void BPF_STRUCT_OPS(maximal_cgroup_move, struct task_struct *p, + struct cgroup *from, struct cgroup *to) +{} + +void BPF_STRUCT_OPS(maximal_cgroup_cancel_move, struct task_struct *p, + struct cgroup *from, struct cgroup *to) +{} + +void BPF_STRUCT_OPS(maximal_cgroup_set_weight, struct cgroup *cgrp, u32 weight) +{} + s32 BPF_STRUCT_OPS_SLEEPABLE(maximal_init) { return 0; @@ -126,6 +152,12 @@ struct sched_ext_ops maximal_ops = { .enable = maximal_enable, .exit_task = maximal_exit_task, .disable = maximal_disable, + .cgroup_init = maximal_cgroup_init, + .cgroup_exit = maximal_cgroup_exit, + .cgroup_prep_move = maximal_cgroup_prep_move, + .cgroup_move = maximal_cgroup_move, + .cgroup_cancel_move = maximal_cgroup_cancel_move, + .cgroup_set_weight = maximal_cgroup_set_weight, .init = maximal_init, .exit = maximal_exit, .name = "maximal", -- cgit v1.2.3 From a4103eacc2ab408bb65e9902f0857b219fb489de Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 4 Sep 2024 10:24:59 -1000 Subject: sched_ext: Add a cgroup scheduler which uses flattened hierarchy This patch adds the scx_flatcg example scheduler, which implements hierarchical weight-based cgroup CPU control by flattening the cgroup hierarchy into a single layer by compounding the active weight share at each level. This flattening of hierarchy can bring a substantial performance gain when the cgroup hierarchy is nested multiple levels deep. In a simple benchmark using wrk[8] on apache serving a CGI script calculating sha1sum of a small file, it outperforms CFS by ~3% with CPU controller disabled and by ~10% with two apache instances competing with 2:1 weight ratio nested four levels deep. However, the gain comes at the cost of not being able to properly handle a thundering herd of cgroups. For example, if many cgroups which are nested behind a low priority parent cgroup wake up around the same time, they may be able to consume more CPU cycles than they are entitled to. In many use cases, this isn't a real concern especially given the performance gain. Also, there are ways to mitigate the problem further by e.g. introducing an extra scheduling layer on cgroup delegation boundaries. v5: - Updated to specify SCX_OPS_HAS_CGROUP_WEIGHT instead of SCX_OPS_KNOB_CGROUP_WEIGHT. v4: - Revert reference counted kptr for cgv_node as the change caused easily reproducible stalls. v3: - Updated to reflect the core API changes including ops.init/exit_task() and direct dispatch from ops.select_cpu(). Fixes and improvements including additional statistics. - Use reference counted kptr for cgv_node instead of xchg'ing against stash location. - Dropped '-p' option. v2: - Use SCX_BUG[_ON]() to simplify error handling.
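The compounding described above is easy to sanity-check outside the kernel. The following is a minimal, standalone sketch and not part of the patch: the hierarchy (A with weight 100 and D with weight 200 under the root, B and C with weight 100 each under A) matches the worked example in the scx_flatcg sources, while HWEIGHT_ONE, sibling_weight_sum() and hweight() are hypothetical stand-ins for the scheduler's fixed-point hweight arithmetic.

/* flatcg_share_demo.c: standalone illustration of how a flattened
 * hierarchy compounds weight shares. Build with: cc -o demo flatcg_share_demo.c
 */
#include <stdio.h>

#define HWEIGHT_ONE (1u << 16)	/* fixed-point 1.0 */

struct node {
	const char *name;
	unsigned int weight;	/* cpu.weight-style value [1..10000] */
	int parent;		/* index into nodes[], -1 for the root */
};

static struct node nodes[] = {
	{ "R", 100, -1 },
	{ "A", 100,  0 },
	{ "D", 200,  0 },
	{ "B", 100,  1 },
	{ "C", 100,  1 },
};

#define NR_NODES ((int)(sizeof(nodes) / sizeof(nodes[0])))

/* sum of the weights competing at @n's level: @n plus its siblings */
static unsigned int sibling_weight_sum(int n)
{
	unsigned int sum = 0;
	int i;

	for (i = 0; i < NR_NODES; i++)
		if (nodes[i].parent == nodes[n].parent)
			sum += nodes[i].weight;
	return sum;
}

/* compound @n's share of each level's competition up to the root */
static unsigned int hweight(int n)
{
	unsigned long long hw = HWEIGHT_ONE;

	while (nodes[n].parent >= 0) {
		hw = hw * nodes[n].weight / sibling_weight_sum(n);
		n = nodes[n].parent;
	}
	return (unsigned int)hw;
}

int main(void)
{
	int i;

	/* prints ~1/6 for B and C, ~2/3 for D, matching the example */
	for (i = 0; i < NR_NODES; i++)
		printf("%s: hweight=%u (%.3f)\n", nodes[i].name, hweight(i),
		       (double)hweight(i) / HWEIGHT_ONE);
	return 0;
}

The in-kernel counterpart, cgrp_refresh_hweight() in scx_flatcg.bpf.c below, performs the same per-level division top-down but divides by each parent's cached child_weight_sum, which only accumulates children that currently have runnable tasks, so inactive siblings don't dilute anyone's share.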
Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- tools/sched_ext/Makefile | 2 +- tools/sched_ext/README.md | 12 + tools/sched_ext/scx_flatcg.bpf.c | 949 +++++++++++++++++++++++++++++++++++++++ tools/sched_ext/scx_flatcg.c | 233 ++++++++++ tools/sched_ext/scx_flatcg.h | 51 +++ 5 files changed, 1246 insertions(+), 1 deletion(-) create mode 100644 tools/sched_ext/scx_flatcg.bpf.c create mode 100644 tools/sched_ext/scx_flatcg.c create mode 100644 tools/sched_ext/scx_flatcg.h (limited to 'tools') diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index bf7e108f5ae1..ca3815e572d8 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -176,7 +176,7 @@ $(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BP SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR) -c-sched-targets = scx_simple scx_qmap scx_central +c-sched-targets = scx_simple scx_qmap scx_central scx_flatcg $(addprefix $(BINDIR)/,$(c-sched-targets)): \ $(BINDIR)/%: \ diff --git a/tools/sched_ext/README.md b/tools/sched_ext/README.md index 8efe70cc4363..16a42e4060f6 100644 --- a/tools/sched_ext/README.md +++ b/tools/sched_ext/README.md @@ -192,6 +192,18 @@ where this could be particularly useful is running VMs, where running with infinite slices and no timer ticks allows the VM to avoid unnecessary expensive vmexits. +## scx_flatcg + +A flattened cgroup hierarchy scheduler. This scheduler implements hierarchical +weight-based cgroup CPU control by flattening the cgroup hierarchy into a single +layer, by compounding the active weight share at each level. The effect of this +is a much more performant CPU controller, which does not need to descend down +cgroup trees in order to properly compute a cgroup's share. + +Similar to scx_simple, in limited scenarios, this scheduler can perform +reasonably well on single-socket systems with a unified L3 cache and show +significantly lowered hierarchical scheduling overhead. + # Troubleshooting diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c new file mode 100644 index 000000000000..3ab2b60781a0 --- /dev/null +++ b/tools/sched_ext/scx_flatcg.bpf.c @@ -0,0 +1,949 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A demo sched_ext flattened cgroup hierarchy scheduler. It implements + * hierarchical weight-based cgroup CPU control by flattening the cgroup + * hierarchy into a single layer by compounding the active weight share at each + * level. Consider the following hierarchy with weights in parentheses: + * + * R + A (100) + B (100) + * | \ C (100) + * \ D (200) + * + * Ignoring the root and threaded cgroups, only B, C and D can contain tasks. + * Let's say all three have runnable tasks. The total share that each of these + * three cgroups is entitled to can be calculated by compounding its share at + * each level. + * + * For example, B is competing against C and in that competition its share is + * 100/(100+100) == 1/2. At its parent level, A is competing against D and A's + * share in that competition is 100/(200+100) == 1/3. B's eventual share in the + * system can be calculated by multiplying the two shares, 1/2 * 1/3 == 1/6. C's + * eventual share is the same at 1/6. D is only competing at the top level and + * its share is 200/(100+200) == 2/3.
+ * + * So, instead of hierarchically scheduling level-by-level, we can consider it + * as B, C and D competing with each other with respective shares of 1/6, 1/6 and 2/3 + * and keep updating the eventual shares as the cgroups' runnable states change. + * + * This flattening of hierarchy can bring a substantial performance gain when + * the cgroup hierarchy is nested multiple levels deep. In a simple benchmark using + * wrk[8] on apache serving a CGI script calculating sha1sum of a small file, it + * outperforms CFS by ~3% with CPU controller disabled and by ~10% with two + * apache instances competing with 2:1 weight ratio nested four levels deep. + * + * However, the gain comes at the cost of not being able to properly handle + * a thundering herd of cgroups. For example, if many cgroups which are nested + * behind a low priority parent cgroup wake up around the same time, they may be + * able to consume more CPU cycles than they are entitled to. In many use cases, + * this isn't a real concern especially given the performance gain. Also, there + * are ways to mitigate the problem further by e.g. introducing an extra + * scheduling layer on cgroup delegation boundaries. + * + * The scheduler first picks the cgroup to run and then schedules the tasks + * within by using nested weighted vtime scheduling by default. The + * cgroup-internal scheduling can be switched to FIFO with the -f option. + */ +#include <scx/common.bpf.h> +#include "scx_flatcg.h" + +/* + * Maximum amount of retries to find a valid cgroup. + */ +#define CGROUP_MAX_RETRIES 1024 + +char _license[] SEC("license") = "GPL"; + +const volatile u32 nr_cpus = 32; /* !0 for veristat, set during init */ +const volatile u64 cgrp_slice_ns = SCX_SLICE_DFL; +const volatile bool fifo_sched; + +u64 cvtime_now; +UEI_DEFINE(uei); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, FCG_NR_STATS); +} stats SEC(".maps"); + +static void stat_inc(enum fcg_stat_idx idx) +{ + u32 idx_v = idx; + + u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx_v); + if (cnt_p) + (*cnt_p)++; +} + +struct fcg_cpu_ctx { + u64 cur_cgid; + u64 cur_at; +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, struct fcg_cpu_ctx); + __uint(max_entries, 1); +} cpu_ctx SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_CGRP_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct fcg_cgrp_ctx); +} cgrp_ctx SEC(".maps"); + +struct cgv_node { + struct bpf_rb_node rb_node; + __u64 cvtime; + __u64 cgid; +}; + +private(CGV_TREE) struct bpf_spin_lock cgv_tree_lock; +private(CGV_TREE) struct bpf_rb_root cgv_tree __contains(cgv_node, rb_node); + +struct cgv_node_stash { + struct cgv_node __kptr *node; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 16384); + __type(key, __u64); + __type(value, struct cgv_node_stash); +} cgv_node_stash SEC(".maps"); + +struct fcg_task_ctx { + u64 bypassed_at; +}; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct fcg_task_ctx); +} task_ctx SEC(".maps"); + +/* gets inc'd on weight tree changes to expire the cached hweights */ +u64 hweight_gen = 1; + +static u64 div_round_up(u64 dividend, u64 divisor) +{ + return (dividend + divisor - 1) / divisor; +} + +static bool vtime_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +static bool cgv_node_less(struct bpf_rb_node *a, const struct bpf_rb_node *b) +{ + struct cgv_node *cgc_a,
*cgc_b; + + cgc_a = container_of(a, struct cgv_node, rb_node); + cgc_b = container_of(b, struct cgv_node, rb_node); + + return cgc_a->cvtime < cgc_b->cvtime; +} + +static struct fcg_cpu_ctx *find_cpu_ctx(void) +{ + struct fcg_cpu_ctx *cpuc; + u32 idx = 0; + + cpuc = bpf_map_lookup_elem(&cpu_ctx, &idx); + if (!cpuc) { + scx_bpf_error("cpu_ctx lookup failed"); + return NULL; + } + return cpuc; +} + +static struct fcg_cgrp_ctx *find_cgrp_ctx(struct cgroup *cgrp) +{ + struct fcg_cgrp_ctx *cgc; + + cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0); + if (!cgc) { + scx_bpf_error("cgrp_ctx lookup failed for cgid %llu", cgrp->kn->id); + return NULL; + } + return cgc; +} + +static struct fcg_cgrp_ctx *find_ancestor_cgrp_ctx(struct cgroup *cgrp, int level) +{ + struct fcg_cgrp_ctx *cgc; + + cgrp = bpf_cgroup_ancestor(cgrp, level); + if (!cgrp) { + scx_bpf_error("ancestor cgroup lookup failed"); + return NULL; + } + + cgc = find_cgrp_ctx(cgrp); + if (!cgc) + scx_bpf_error("ancestor cgrp_ctx lookup failed"); + bpf_cgroup_release(cgrp); + return cgc; +} + +static void cgrp_refresh_hweight(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc) +{ + int level; + + if (!cgc->nr_active) { + stat_inc(FCG_STAT_HWT_SKIP); + return; + } + + if (cgc->hweight_gen == hweight_gen) { + stat_inc(FCG_STAT_HWT_CACHE); + return; + } + + stat_inc(FCG_STAT_HWT_UPDATES); + bpf_for(level, 0, cgrp->level + 1) { + struct fcg_cgrp_ctx *cgc; + bool is_active; + + cgc = find_ancestor_cgrp_ctx(cgrp, level); + if (!cgc) + break; + + if (!level) { + cgc->hweight = FCG_HWEIGHT_ONE; + cgc->hweight_gen = hweight_gen; + } else { + struct fcg_cgrp_ctx *pcgc; + + pcgc = find_ancestor_cgrp_ctx(cgrp, level - 1); + if (!pcgc) + break; + + /* + * We can be opportunistic here and not grab the + * cgv_tree_lock and deal with the occasional races. + * However, hweight updates are already cached and + * relatively low-frequency. Let's just do the + * straightforward thing. + */ + bpf_spin_lock(&cgv_tree_lock); + is_active = cgc->nr_active; + if (is_active) { + cgc->hweight_gen = pcgc->hweight_gen; + cgc->hweight = + div_round_up(pcgc->hweight * cgc->weight, + pcgc->child_weight_sum); + } + bpf_spin_unlock(&cgv_tree_lock); + + if (!is_active) { + stat_inc(FCG_STAT_HWT_RACE); + break; + } + } + } +} + +static void cgrp_cap_budget(struct cgv_node *cgv_node, struct fcg_cgrp_ctx *cgc) +{ + u64 delta, cvtime, max_budget; + + /* + * A node which is on the rbtree can't be pointed to from elsewhere yet + * and thus can't be updated and repositioned. Instead, we collect the + * vtime deltas separately and apply it asynchronously here. + */ + delta = cgc->cvtime_delta; + __sync_fetch_and_sub(&cgc->cvtime_delta, delta); + cvtime = cgv_node->cvtime + delta; + + /* + * Allow a cgroup to carry the maximum budget proportional to its + * hweight such that a full-hweight cgroup can immediately take up half + * of the CPUs at the most while staying at the front of the rbtree.
+ */ + max_budget = (cgrp_slice_ns * nr_cpus * cgc->hweight) / + (2 * FCG_HWEIGHT_ONE); + if (vtime_before(cvtime, cvtime_now - max_budget)) + cvtime = cvtime_now - max_budget; + + cgv_node->cvtime = cvtime; +} + +static void cgrp_enqueued(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc) +{ + struct cgv_node_stash *stash; + struct cgv_node *cgv_node; + u64 cgid = cgrp->kn->id; + + /* paired with cmpxchg in try_pick_next_cgroup() */ + if (__sync_val_compare_and_swap(&cgc->queued, 0, 1)) { + stat_inc(FCG_STAT_ENQ_SKIP); + return; + } + + stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid); + if (!stash) { + scx_bpf_error("cgv_node lookup failed for cgid %llu", cgid); + return; + } + + /* NULL if the node is already on the rbtree */ + cgv_node = bpf_kptr_xchg(&stash->node, NULL); + if (!cgv_node) { + stat_inc(FCG_STAT_ENQ_RACE); + return; + } + + bpf_spin_lock(&cgv_tree_lock); + cgrp_cap_budget(cgv_node, cgc); + bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less); + bpf_spin_unlock(&cgv_tree_lock); +} + +static void set_bypassed_at(struct task_struct *p, struct fcg_task_ctx *taskc) +{ + /* + * Tell fcg_stopping() that this bypassed the regular scheduling path + * and should be force charged to the cgroup. 0 is used to indicate that + * the task isn't bypassing, so if the current runtime is 0, go back by + * one nanosecond. + */ + taskc->bypassed_at = p->se.sum_exec_runtime ?: (u64)-1; +} + +s32 BPF_STRUCT_OPS(fcg_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) +{ + struct fcg_task_ctx *taskc; + bool is_idle = false; + s32 cpu; + + cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle); + + taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); + if (!taskc) { + scx_bpf_error("task_ctx lookup failed"); + return cpu; + } + + /* + * If select_cpu_dfl() is recommending local enqueue, the target CPU is + * idle. Follow it and charge the cgroup later in fcg_stopping() after + * the fact. + */ + if (is_idle) { + set_bypassed_at(p, taskc); + stat_inc(FCG_STAT_LOCAL); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); + } + + return cpu; +} + +void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags) +{ + struct fcg_task_ctx *taskc; + struct cgroup *cgrp; + struct fcg_cgrp_ctx *cgc; + + taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); + if (!taskc) { + scx_bpf_error("task_ctx lookup failed"); + return; + } + + /* + * Use the direct dispatching and force charging to deal with tasks with + * custom affinities so that we don't have to worry about per-cgroup + * dq's containing tasks that can't be executed from some CPUs. + */ + if (p->nr_cpus_allowed != nr_cpus) { + set_bypassed_at(p, taskc); + + /* + * The global dq is deprioritized as we don't want to let tasks + * boost themselves by constraining their cpumask. The + * deprioritization is rather severe, so let's not apply that to + * per-cpu kernel threads. This is ham-fisted. We probably want to + * implement per-cgroup fallback dq's instead so that we have + * more control over when tasks with custom cpumasks get issued.
+ */ + if (p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD)) { + stat_inc(FCG_STAT_LOCAL); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); + } else { + stat_inc(FCG_STAT_GLOBAL); + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + } + return; + } + + cgrp = scx_bpf_task_cgroup(p); + cgc = find_cgrp_ctx(cgrp); + if (!cgc) + goto out_release; + + if (fifo_sched) { + scx_bpf_dispatch(p, cgrp->kn->id, SCX_SLICE_DFL, enq_flags); + } else { + u64 tvtime = p->scx.dsq_vtime; + + /* + * Limit the amount of budget that an idling task can accumulate + * to one slice. + */ + if (vtime_before(tvtime, cgc->tvtime_now - SCX_SLICE_DFL)) + tvtime = cgc->tvtime_now - SCX_SLICE_DFL; + + scx_bpf_dispatch_vtime(p, cgrp->kn->id, SCX_SLICE_DFL, + tvtime, enq_flags); + } + + cgrp_enqueued(cgrp, cgc); +out_release: + bpf_cgroup_release(cgrp); +} + +/* + * Walk the cgroup tree to update the active weight sums as tasks wake up and + * sleep. The weight sums are used as the base when calculating the proportion a + * given cgroup or task is entitled to at each level. + */ +static void update_active_weight_sums(struct cgroup *cgrp, bool runnable) +{ + struct fcg_cgrp_ctx *cgc; + bool updated = false; + int idx; + + cgc = find_cgrp_ctx(cgrp); + if (!cgc) + return; + + /* + * In most cases, a hot cgroup would have multiple threads going to + * sleep and waking up while the whole cgroup stays active. In leaf + * cgroups, ->nr_runnable which is updated with __sync operations gates + * ->nr_active updates, so that we don't have to grab the cgv_tree_lock + * repeatedly for a busy cgroup which is staying active. + */ + if (runnable) { + if (__sync_fetch_and_add(&cgc->nr_runnable, 1)) + return; + stat_inc(FCG_STAT_ACT); + } else { + if (__sync_sub_and_fetch(&cgc->nr_runnable, 1)) + return; + stat_inc(FCG_STAT_DEACT); + } + + /* + * If @cgrp is becoming runnable, its hweight should be refreshed after + * it's added to the weight tree so that enqueue has the up-to-date + * value. If @cgrp is becoming quiescent, the hweight should be + * refreshed before it's removed from the weight tree so that the usage + * charging which happens afterwards has access to the latest value. + */ + if (!runnable) + cgrp_refresh_hweight(cgrp, cgc); + + /* propagate upwards */ + bpf_for(idx, 0, cgrp->level) { + int level = cgrp->level - idx; + struct fcg_cgrp_ctx *cgc, *pcgc = NULL; + bool propagate = false; + + cgc = find_ancestor_cgrp_ctx(cgrp, level); + if (!cgc) + break; + if (level) { + pcgc = find_ancestor_cgrp_ctx(cgrp, level - 1); + if (!pcgc) + break; + } + + /* + * We need the propagation protected by a lock to synchronize + * against weight changes. There's no reason to drop the lock at + * each level but bpf_spin_lock() doesn't want any function + * calls while locked. 
+ */ + bpf_spin_lock(&cgv_tree_lock); + + if (runnable) { + if (!cgc->nr_active++) { + updated = true; + if (pcgc) { + propagate = true; + pcgc->child_weight_sum += cgc->weight; + } + } + } else { + if (!--cgc->nr_active) { + updated = true; + if (pcgc) { + propagate = true; + pcgc->child_weight_sum -= cgc->weight; + } + } + } + + bpf_spin_unlock(&cgv_tree_lock); + + if (!propagate) + break; + } + + if (updated) + __sync_fetch_and_add(&hweight_gen, 1); + + if (runnable) + cgrp_refresh_hweight(cgrp, cgc); +} + +void BPF_STRUCT_OPS(fcg_runnable, struct task_struct *p, u64 enq_flags) +{ + struct cgroup *cgrp; + + cgrp = scx_bpf_task_cgroup(p); + update_active_weight_sums(cgrp, true); + bpf_cgroup_release(cgrp); +} + +void BPF_STRUCT_OPS(fcg_running, struct task_struct *p) +{ + struct cgroup *cgrp; + struct fcg_cgrp_ctx *cgc; + + if (fifo_sched) + return; + + cgrp = scx_bpf_task_cgroup(p); + cgc = find_cgrp_ctx(cgrp); + if (cgc) { + /* + * @cgc->tvtime_now always progresses forward as tasks start + * executing. The test and update can be performed concurrently + * from multiple CPUs and thus racy. Any error should be + * contained and temporary. Let's just live with it. + */ + if (vtime_before(cgc->tvtime_now, p->scx.dsq_vtime)) + cgc->tvtime_now = p->scx.dsq_vtime; + } + bpf_cgroup_release(cgrp); +} + +void BPF_STRUCT_OPS(fcg_stopping, struct task_struct *p, bool runnable) +{ + struct fcg_task_ctx *taskc; + struct cgroup *cgrp; + struct fcg_cgrp_ctx *cgc; + + /* + * Scale the execution time by the inverse of the weight and charge. + * + * Note that the default yield implementation yields by setting + * @p->scx.slice to zero and the following would treat the yielding task + * as if it has consumed all its slice. If this penalizes yielding tasks + * too much, determine the execution time by taking explicit timestamps + * instead of depending on @p->scx.slice. 
+ */ + if (!fifo_sched) + p->scx.dsq_vtime += + (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; + + taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); + if (!taskc) { + scx_bpf_error("task_ctx lookup failed"); + return; + } + + if (!taskc->bypassed_at) + return; + + cgrp = scx_bpf_task_cgroup(p); + cgc = find_cgrp_ctx(cgrp); + if (cgc) { + __sync_fetch_and_add(&cgc->cvtime_delta, + p->se.sum_exec_runtime - taskc->bypassed_at); + taskc->bypassed_at = 0; + } + bpf_cgroup_release(cgrp); +} + +void BPF_STRUCT_OPS(fcg_quiescent, struct task_struct *p, u64 deq_flags) +{ + struct cgroup *cgrp; + + cgrp = scx_bpf_task_cgroup(p); + update_active_weight_sums(cgrp, false); + bpf_cgroup_release(cgrp); +} + +void BPF_STRUCT_OPS(fcg_cgroup_set_weight, struct cgroup *cgrp, u32 weight) +{ + struct fcg_cgrp_ctx *cgc, *pcgc = NULL; + + cgc = find_cgrp_ctx(cgrp); + if (!cgc) + return; + + if (cgrp->level) { + pcgc = find_ancestor_cgrp_ctx(cgrp, cgrp->level - 1); + if (!pcgc) + return; + } + + bpf_spin_lock(&cgv_tree_lock); + if (pcgc && cgc->nr_active) + pcgc->child_weight_sum += (s64)weight - cgc->weight; + cgc->weight = weight; + bpf_spin_unlock(&cgv_tree_lock); +} + +static bool try_pick_next_cgroup(u64 *cgidp) +{ + struct bpf_rb_node *rb_node; + struct cgv_node_stash *stash; + struct cgv_node *cgv_node; + struct fcg_cgrp_ctx *cgc; + struct cgroup *cgrp; + u64 cgid; + + /* pop the front cgroup and wind cvtime_now accordingly */ + bpf_spin_lock(&cgv_tree_lock); + + rb_node = bpf_rbtree_first(&cgv_tree); + if (!rb_node) { + bpf_spin_unlock(&cgv_tree_lock); + stat_inc(FCG_STAT_PNC_NO_CGRP); + *cgidp = 0; + return true; + } + + rb_node = bpf_rbtree_remove(&cgv_tree, rb_node); + bpf_spin_unlock(&cgv_tree_lock); + + if (!rb_node) { + /* + * This should never happen. bpf_rbtree_first() was called + * above while the tree lock was held, so the node should + * always be present. + */ + scx_bpf_error("node could not be removed"); + return true; + } + + cgv_node = container_of(rb_node, struct cgv_node, rb_node); + cgid = cgv_node->cgid; + + if (vtime_before(cvtime_now, cgv_node->cvtime)) + cvtime_now = cgv_node->cvtime; + + /* + * If lookup fails, the cgroup's gone. Free and move on. See + * fcg_cgroup_exit(). + */ + cgrp = bpf_cgroup_from_id(cgid); + if (!cgrp) { + stat_inc(FCG_STAT_PNC_GONE); + goto out_free; + } + + cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0); + if (!cgc) { + bpf_cgroup_release(cgrp); + stat_inc(FCG_STAT_PNC_GONE); + goto out_free; + } + + if (!scx_bpf_consume(cgid)) { + bpf_cgroup_release(cgrp); + stat_inc(FCG_STAT_PNC_EMPTY); + goto out_stash; + } + + /* + * Successfully consumed from the cgroup. This will be our current + * cgroup for the new slice. Refresh its hweight. + */ + cgrp_refresh_hweight(cgrp, cgc); + + bpf_cgroup_release(cgrp); + + /* + * As the cgroup may have more tasks, add it back to the rbtree. Note + * that here we charge the full slice upfront and then exact later + * according to the actual consumption. This prevents lowpri thundering + * herd from saturating the machine. + */ + bpf_spin_lock(&cgv_tree_lock); + cgv_node->cvtime += cgrp_slice_ns * FCG_HWEIGHT_ONE / (cgc->hweight ?: 1); + cgrp_cap_budget(cgv_node, cgc); + bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less); + bpf_spin_unlock(&cgv_tree_lock); + + *cgidp = cgid; + stat_inc(FCG_STAT_PNC_NEXT); + return true; + +out_stash: + stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid); + if (!stash) { + stat_inc(FCG_STAT_PNC_GONE); + goto out_free; + } + + /* + * Paired with cmpxchg in cgrp_enqueued(). 
If they see the following + * transition, they'll enqueue the cgroup. If they are earlier, we'll + * see their task in the dq below and requeue the cgroup. + */ + __sync_val_compare_and_swap(&cgc->queued, 1, 0); + + if (scx_bpf_dsq_nr_queued(cgid)) { + bpf_spin_lock(&cgv_tree_lock); + bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less); + bpf_spin_unlock(&cgv_tree_lock); + stat_inc(FCG_STAT_PNC_RACE); + } else { + cgv_node = bpf_kptr_xchg(&stash->node, cgv_node); + if (cgv_node) { + scx_bpf_error("unexpected !NULL cgv_node stash"); + goto out_free; + } + } + + return false; + +out_free: + bpf_obj_drop(cgv_node); + return false; +} + +void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev) +{ + struct fcg_cpu_ctx *cpuc; + struct fcg_cgrp_ctx *cgc; + struct cgroup *cgrp; + u64 now = bpf_ktime_get_ns(); + bool picked_next = false; + + cpuc = find_cpu_ctx(); + if (!cpuc) + return; + + if (!cpuc->cur_cgid) + goto pick_next_cgroup; + + if (vtime_before(now, cpuc->cur_at + cgrp_slice_ns)) { + if (scx_bpf_consume(cpuc->cur_cgid)) { + stat_inc(FCG_STAT_CNS_KEEP); + return; + } + stat_inc(FCG_STAT_CNS_EMPTY); + } else { + stat_inc(FCG_STAT_CNS_EXPIRE); + } + + /* + * The current cgroup is expiring. It was already charged a full slice. + * Calculate the actual usage and accumulate the delta. + */ + cgrp = bpf_cgroup_from_id(cpuc->cur_cgid); + if (!cgrp) { + stat_inc(FCG_STAT_CNS_GONE); + goto pick_next_cgroup; + } + + cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0); + if (cgc) { + /* + * We want to update the vtime delta and then look for the next + * cgroup to execute but the latter needs to be done in a loop + * and we can't keep the lock held. Oh well... + */ + bpf_spin_lock(&cgv_tree_lock); + __sync_fetch_and_add(&cgc->cvtime_delta, + (cpuc->cur_at + cgrp_slice_ns - now) * + FCG_HWEIGHT_ONE / (cgc->hweight ?: 1)); + bpf_spin_unlock(&cgv_tree_lock); + } else { + stat_inc(FCG_STAT_CNS_GONE); + } + + bpf_cgroup_release(cgrp); + +pick_next_cgroup: + cpuc->cur_at = now; + + if (scx_bpf_consume(SCX_DSQ_GLOBAL)) { + cpuc->cur_cgid = 0; + return; + } + + bpf_repeat(CGROUP_MAX_RETRIES) { + if (try_pick_next_cgroup(&cpuc->cur_cgid)) { + picked_next = true; + break; + } + } + + /* + * This only happens if try_pick_next_cgroup() races against enqueue + * path for more than CGROUP_MAX_RETRIES times, which is extremely + * unlikely and likely indicates an underlying bug. There shouldn't be + * any stall risk as the race is against enqueue. + */ + if (!picked_next) + stat_inc(FCG_STAT_PNC_FAIL); +} + +s32 BPF_STRUCT_OPS(fcg_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + struct fcg_task_ctx *taskc; + struct fcg_cgrp_ctx *cgc; + + /* + * @p is new. Let's ensure that its task_ctx is available. We can sleep + * in this function and the following will automatically use GFP_KERNEL. + */ + taskc = bpf_task_storage_get(&task_ctx, p, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!taskc) + return -ENOMEM; + + taskc->bypassed_at = 0; + + if (!(cgc = find_cgrp_ctx(args->cgroup))) + return -ENOENT; + + p->scx.dsq_vtime = cgc->tvtime_now; + + return 0; +} + +int BPF_STRUCT_OPS_SLEEPABLE(fcg_cgroup_init, struct cgroup *cgrp, + struct scx_cgroup_init_args *args) +{ + struct fcg_cgrp_ctx *cgc; + struct cgv_node *cgv_node; + struct cgv_node_stash empty_stash = {}, *stash; + u64 cgid = cgrp->kn->id; + int ret; + + /* + * Technically incorrect as cgroup ID is full 64bit while dq ID is + * 63bit. 
Should not be a problem in practice and easy to spot in the + * unlikely case that it breaks. + */ + ret = scx_bpf_create_dsq(cgid, -1); + if (ret) + return ret; + + cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!cgc) { + ret = -ENOMEM; + goto err_destroy_dsq; + } + + cgc->weight = args->weight; + cgc->hweight = FCG_HWEIGHT_ONE; + + ret = bpf_map_update_elem(&cgv_node_stash, &cgid, &empty_stash, + BPF_NOEXIST); + if (ret) { + if (ret != -ENOMEM) + scx_bpf_error("unexpected stash creation error (%d)", + ret); + goto err_destroy_dsq; + } + + stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid); + if (!stash) { + scx_bpf_error("unexpected cgv_node stash lookup failure"); + ret = -ENOENT; + goto err_destroy_dsq; + } + + cgv_node = bpf_obj_new(struct cgv_node); + if (!cgv_node) { + ret = -ENOMEM; + goto err_del_cgv_node; + } + + cgv_node->cgid = cgid; + cgv_node->cvtime = cvtime_now; + + cgv_node = bpf_kptr_xchg(&stash->node, cgv_node); + if (cgv_node) { + scx_bpf_error("unexpected !NULL cgv_node stash"); + ret = -EBUSY; + goto err_drop; + } + + return 0; + +err_drop: + bpf_obj_drop(cgv_node); +err_del_cgv_node: + bpf_map_delete_elem(&cgv_node_stash, &cgid); +err_destroy_dsq: + scx_bpf_destroy_dsq(cgid); + return ret; +} + +void BPF_STRUCT_OPS(fcg_cgroup_exit, struct cgroup *cgrp) +{ + u64 cgid = cgrp->kn->id; + + /* + * For now, there's no way to find and remove the cgv_node if it's on the + * cgv_tree. Let's drain them in the dispatch path as they get popped + * off the front of the tree. + */ + bpf_map_delete_elem(&cgv_node_stash, &cgid); + scx_bpf_destroy_dsq(cgid); +} + +void BPF_STRUCT_OPS(fcg_cgroup_move, struct task_struct *p, + struct cgroup *from, struct cgroup *to) +{ + struct fcg_cgrp_ctx *from_cgc, *to_cgc; + s64 vtime_delta; + + /* find_cgrp_ctx() triggers scx_ops_error() on lookup failures */ + if (!(from_cgc = find_cgrp_ctx(from)) || !(to_cgc = find_cgrp_ctx(to))) + return; + + vtime_delta = p->scx.dsq_vtime - from_cgc->tvtime_now; + p->scx.dsq_vtime = to_cgc->tvtime_now + vtime_delta; +} + +void BPF_STRUCT_OPS(fcg_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SCX_OPS_DEFINE(flatcg_ops, + .select_cpu = (void *)fcg_select_cpu, + .enqueue = (void *)fcg_enqueue, + .dispatch = (void *)fcg_dispatch, + .runnable = (void *)fcg_runnable, + .running = (void *)fcg_running, + .stopping = (void *)fcg_stopping, + .quiescent = (void *)fcg_quiescent, + .init_task = (void *)fcg_init_task, + .cgroup_set_weight = (void *)fcg_cgroup_set_weight, + .cgroup_init = (void *)fcg_cgroup_init, + .cgroup_exit = (void *)fcg_cgroup_exit, + .cgroup_move = (void *)fcg_cgroup_move, + .exit = (void *)fcg_exit, + .flags = SCX_OPS_HAS_CGROUP_WEIGHT | SCX_OPS_ENQ_EXITING, + .name = "flatcg"); diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c new file mode 100644 index 000000000000..5d24ca9c29d9 --- /dev/null +++ b/tools/sched_ext/scx_flatcg.c @@ -0,0 +1,233 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2023 Tejun Heo + * Copyright (c) 2023 David Vernet + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "scx_flatcg.h" +#include "scx_flatcg.bpf.skel.h" + +#ifndef FILEID_KERNFS +#define FILEID_KERNFS 0xfe +#endif + +const char help_fmt[] = +"A flattened cgroup hierarchy sched_ext scheduler.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Usage: %s [-s SLICE_US] [-i INTERVAL] [-f] [-v]\n" +"\n" +" -s SLICE_US Override slice duration\n" +" -i INTERVAL Report interval\n" +" -f Use FIFO scheduling instead of weighted vtime scheduling\n" +" -v Print libbpf debug messages\n" +" -h Display this help and exit\n"; + +static bool verbose; +static volatile int exit_req; + +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + if (level == LIBBPF_DEBUG && !verbose) + return 0; + return vfprintf(stderr, format, args); +} + +static void sigint_handler(int dummy) +{ + exit_req = 1; +} + +static float read_cpu_util(__u64 *last_sum, __u64 *last_idle) +{ + FILE *fp; + char buf[4096]; + char *line, *cur = NULL, *tok; + __u64 sum = 0, idle = 0; + __u64 delta_sum, delta_idle; + int idx; + + fp = fopen("/proc/stat", "r"); + if (!fp) { + perror("fopen(\"/proc/stat\")"); + return 0.0; + } + + if (!fgets(buf, sizeof(buf), fp)) { + perror("fgets(\"/proc/stat\")"); + fclose(fp); + return 0.0; + } + fclose(fp); + + line = buf; + for (idx = 0; (tok = strtok_r(line, " \n", &cur)); idx++) { + char *endp = NULL; + __u64 v; + + if (idx == 0) { + line = NULL; + continue; + } + v = strtoull(tok, &endp, 0); + if (!endp || *endp != '\0') { + fprintf(stderr, "failed to parse %dth field of /proc/stat (\"%s\")\n", + idx, tok); + continue; + } + sum += v; + if (idx == 4) + idle = v; + } + + delta_sum = sum - *last_sum; + delta_idle = idle - *last_idle; + *last_sum = sum; + *last_idle = idle; + + return delta_sum ? 
(float)(delta_sum - delta_idle) / delta_sum : 0.0; +} + +static void fcg_read_stats(struct scx_flatcg *skel, __u64 *stats) +{ + __u64 cnts[FCG_NR_STATS][skel->rodata->nr_cpus]; + __u32 idx; + + memset(stats, 0, sizeof(stats[0]) * FCG_NR_STATS); + + for (idx = 0; idx < FCG_NR_STATS; idx++) { + int ret, cpu; + + ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), + &idx, cnts[idx]); + if (ret < 0) + continue; + for (cpu = 0; cpu < skel->rodata->nr_cpus; cpu++) + stats[idx] += cnts[idx][cpu]; + } +} + +int main(int argc, char **argv) +{ + struct scx_flatcg *skel; + struct bpf_link *link; + struct timespec intv_ts = { .tv_sec = 2, .tv_nsec = 0 }; + bool dump_cgrps = false; + __u64 last_cpu_sum = 0, last_cpu_idle = 0; + __u64 last_stats[FCG_NR_STATS] = {}; + unsigned long seq = 0; + __s32 opt; + __u64 ecode; + + libbpf_set_print(libbpf_print_fn); + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); +restart: + skel = SCX_OPS_OPEN(flatcg_ops, scx_flatcg); + + skel->rodata->nr_cpus = libbpf_num_possible_cpus(); + + while ((opt = getopt(argc, argv, "s:i:dfvh")) != -1) { + double v; + + switch (opt) { + case 's': + v = strtod(optarg, NULL); + skel->rodata->cgrp_slice_ns = v * 1000; + break; + case 'i': + v = strtod(optarg, NULL); + intv_ts.tv_sec = v; + intv_ts.tv_nsec = (v - (float)intv_ts.tv_sec) * 1000000000; + break; + case 'd': + dump_cgrps = true; + break; + case 'f': + skel->rodata->fifo_sched = true; + break; + case 'v': + verbose = true; + break; + case 'h': + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + printf("slice=%.1lfms intv=%.1lfs dump_cgrps=%d", + (double)skel->rodata->cgrp_slice_ns / 1000000.0, + (double)intv_ts.tv_sec + (double)intv_ts.tv_nsec / 1000000000.0, + dump_cgrps); + + SCX_OPS_LOAD(skel, flatcg_ops, scx_flatcg, uei); + link = SCX_OPS_ATTACH(skel, flatcg_ops, scx_flatcg); + + while (!exit_req && !UEI_EXITED(skel, uei)) { + __u64 acc_stats[FCG_NR_STATS]; + __u64 stats[FCG_NR_STATS]; + float cpu_util; + int i; + + cpu_util = read_cpu_util(&last_cpu_sum, &last_cpu_idle); + + fcg_read_stats(skel, acc_stats); + for (i = 0; i < FCG_NR_STATS; i++) + stats[i] = acc_stats[i] - last_stats[i]; + + memcpy(last_stats, acc_stats, sizeof(acc_stats)); + + printf("\n[SEQ %6lu cpu=%5.1lf hweight_gen=%" PRIu64 "]\n", + seq++, cpu_util * 100.0, skel->data->hweight_gen); + printf(" act:%6llu deact:%6llu global:%6llu local:%6llu\n", + stats[FCG_STAT_ACT], + stats[FCG_STAT_DEACT], + stats[FCG_STAT_GLOBAL], + stats[FCG_STAT_LOCAL]); + printf("HWT cache:%6llu update:%6llu skip:%6llu race:%6llu\n", + stats[FCG_STAT_HWT_CACHE], + stats[FCG_STAT_HWT_UPDATES], + stats[FCG_STAT_HWT_SKIP], + stats[FCG_STAT_HWT_RACE]); + printf("ENQ skip:%6llu race:%6llu\n", + stats[FCG_STAT_ENQ_SKIP], + stats[FCG_STAT_ENQ_RACE]); + printf("CNS keep:%6llu expire:%6llu empty:%6llu gone:%6llu\n", + stats[FCG_STAT_CNS_KEEP], + stats[FCG_STAT_CNS_EXPIRE], + stats[FCG_STAT_CNS_EMPTY], + stats[FCG_STAT_CNS_GONE]); + printf("PNC next:%6llu empty:%6llu nocgrp:%6llu gone:%6llu race:%6llu fail:%6llu\n", + stats[FCG_STAT_PNC_NEXT], + stats[FCG_STAT_PNC_EMPTY], + stats[FCG_STAT_PNC_NO_CGRP], + stats[FCG_STAT_PNC_GONE], + stats[FCG_STAT_PNC_RACE], + stats[FCG_STAT_PNC_FAIL]); + printf("BAD remove:%6llu\n", + acc_stats[FCG_STAT_BAD_REMOVAL]); + fflush(stdout); + + nanosleep(&intv_ts, NULL); + } + + bpf_link__destroy(link); + ecode = UEI_REPORT(skel, uei); + scx_flatcg__destroy(skel); + + if (UEI_ECODE_RESTART(ecode)) + goto restart; + return 0; +} diff --git 
a/tools/sched_ext/scx_flatcg.h b/tools/sched_ext/scx_flatcg.h new file mode 100644 index 000000000000..6f2ea50acb1c --- /dev/null +++ b/tools/sched_ext/scx_flatcg.h @@ -0,0 +1,51 @@ +#ifndef __SCX_EXAMPLE_FLATCG_H +#define __SCX_EXAMPLE_FLATCG_H + +enum { + FCG_HWEIGHT_ONE = 1LLU << 16, +}; + +enum fcg_stat_idx { + FCG_STAT_ACT, + FCG_STAT_DEACT, + FCG_STAT_LOCAL, + FCG_STAT_GLOBAL, + + FCG_STAT_HWT_UPDATES, + FCG_STAT_HWT_CACHE, + FCG_STAT_HWT_SKIP, + FCG_STAT_HWT_RACE, + + FCG_STAT_ENQ_SKIP, + FCG_STAT_ENQ_RACE, + + FCG_STAT_CNS_KEEP, + FCG_STAT_CNS_EXPIRE, + FCG_STAT_CNS_EMPTY, + FCG_STAT_CNS_GONE, + + FCG_STAT_PNC_NO_CGRP, + FCG_STAT_PNC_NEXT, + FCG_STAT_PNC_EMPTY, + FCG_STAT_PNC_GONE, + FCG_STAT_PNC_RACE, + FCG_STAT_PNC_FAIL, + + FCG_STAT_BAD_REMOVAL, + + FCG_NR_STATS, +}; + +struct fcg_cgrp_ctx { + u32 nr_active; + u32 nr_runnable; + u32 queued; + u32 weight; + u32 hweight; + u64 child_weight_sum; + u64 hweight_gen; + s64 cvtime_delta; + u64 tvtime_now; +}; + +#endif /* __SCX_EXAMPLE_FLATCG_H */ -- cgit v1.2.3 From f0a6ecebd858658df213d114b0530f8f0b96e396 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 5 Sep 2024 00:30:21 +0900 Subject: selftests/ftrace: Fix eventfs ownership testcase to find mount point Fix eventfs ownership testcase to find mount point if stat -c "%m" failed. This can happen on the system based on busybox. In this case, this will try to use the current working directory, which should be a tracefs top directory (and eventfs is mounted as a part of tracefs.) If it does not work, the test is skipped as UNRESOLVED because of the environmental problem. Fixes: ee9793be08b1 ("tracing/selftests: Add ownership modification tests for eventfs") Signed-off-by: Masami Hiramatsu (Google) Acked-by: Steven Rostedt (Google) Signed-off-by: Shuah Khan --- .../selftests/ftrace/test.d/00basic/test_ownership.tc | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/ftrace/test.d/00basic/test_ownership.tc b/tools/testing/selftests/ftrace/test.d/00basic/test_ownership.tc index 71e43a92352a..094419e190c2 100644 --- a/tools/testing/selftests/ftrace/test.d/00basic/test_ownership.tc +++ b/tools/testing/selftests/ftrace/test.d/00basic/test_ownership.tc @@ -6,6 +6,18 @@ original_group=`stat -c "%g" .` original_owner=`stat -c "%u" .` mount_point=`stat -c '%m' .` + +# If stat -c '%m' does not work (e.g. busybox) or failed, try to use the +# current working directory (which should be a tracefs) as the mount point. +if [ ! -d "$mount_point" ]; then + if mount | grep -qw $PWD ; then + mount_point=$PWD + else + # If PWD doesn't work, that is an environmental problem. + exit_unresolved + fi +fi + mount_options=`mount | grep "$mount_point" | sed -e 's/.*(\(.*\)).*/\1/'` # find another owner and group that is not the original -- cgit v1.2.3 From eff5b5fffc1d39fb9e5e66d6aa4ed3fcb109caac Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 4 Sep 2024 15:12:56 -0700 Subject: selftests/bpf: Add a selftest for x86 jit convergence issues The core part of the selftest, i.e., the je <-> jmp cycle, mimics the original sched-ext bpf program. The test will fail without the previous patch. I tried to create some cases for other potential cycles (je <-> je, jmp <-> je and jmp <-> jmp) with similar pattern to the test in this patch, but failed. So this patch only contains one test for je <-> jmp cycle. 
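To see why such a cycle defeats convergence, consider a toy model (illustrative C, not the kernel JIT's code; encoding sizes assumed from the x86-64 ISA, where a conditional jump is 2 bytes with a rel8 target and 6 bytes with rel32, and an unconditional jmp is 2 or 5 bytes). When two jumps cross each other near the rel8 boundary, the size picked for one can push the other's target in or out of rel8 range on the next pass:

#include <stdio.h>

int main(void)
{
	int filler = 124;	/* code bytes between the two jumps, near the rel8 limit */
	int je = 6, jmp = 2;	/* sizes chosen in the previous pass (mixed state for illustration) */

	for (int pass = 1; pass <= 6; pass++) {
		int new_je  = (filler + jmp <= 127) ? 2 : 6;	/* je jumps forward past the jmp */
		int new_jmp = (filler + je <= 127) ? 2 : 5;	/* jmp jumps forward past the je */

		printf("pass %d: je=%d jmp=%d image=%d\n",
		       pass, new_je, new_jmp, filler + new_je + new_jmp);
		je = new_je;
		jmp = new_jmp;
	}
	return 0;
}

Each pass shrinks one jump and re-grows the other, so the image size alternates and never settles. That oscillation is what the selftest below provokes at the BPF level.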
Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240904221256.37389-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/verifier.c | 2 + .../selftests/bpf/progs/verifier_jit_convergence.c | 114 +++++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_jit_convergence.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 80a90c627182..df398e714dff 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -39,6 +39,7 @@ #include "verifier_int_ptr.skel.h" #include "verifier_iterating_callbacks.skel.h" #include "verifier_jeq_infer_not_null.skel.h" +#include "verifier_jit_convergence.skel.h" #include "verifier_ld_ind.skel.h" #include "verifier_ldsx.skel.h" #include "verifier_leak_ptr.skel.h" @@ -163,6 +164,7 @@ void test_verifier_helper_value_access(void) { RUN(verifier_helper_value_access void test_verifier_int_ptr(void) { RUN(verifier_int_ptr); } void test_verifier_iterating_callbacks(void) { RUN(verifier_iterating_callbacks); } void test_verifier_jeq_infer_not_null(void) { RUN(verifier_jeq_infer_not_null); } +void test_verifier_jit_convergence(void) { RUN(verifier_jit_convergence); } void test_verifier_ld_ind(void) { RUN(verifier_ld_ind); } void test_verifier_ldsx(void) { RUN(verifier_ldsx); } void test_verifier_leak_ptr(void) { RUN(verifier_leak_ptr); } diff --git a/tools/testing/selftests/bpf/progs/verifier_jit_convergence.c b/tools/testing/selftests/bpf/progs/verifier_jit_convergence.c new file mode 100644 index 000000000000..9f3f2b7db450 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_jit_convergence.c @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ + +#include +#include +#include "bpf_misc.h" + +struct value_t { + long long a[32]; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1); + __type(key, long long); + __type(value, struct value_t); +} map_hash SEC(".maps"); + +SEC("socket") +__description("bpf_jit_convergence je <-> jmp") +__success __retval(0) +__arch_x86_64 +__jited(" pushq %rbp") +__naked void btf_jit_convergence_je_jmp(void) +{ + asm volatile ( + "call %[bpf_get_prandom_u32];" + "if r0 == 0 goto l20_%=;" + "if r0 == 1 goto l21_%=;" + "if r0 == 2 goto l22_%=;" + "if r0 == 3 goto l23_%=;" + "if r0 == 4 goto l24_%=;" + "call %[bpf_get_prandom_u32];" + "call %[bpf_get_prandom_u32];" +"l20_%=:" +"l21_%=:" +"l22_%=:" +"l23_%=:" +"l24_%=:" + "r1 = 0;" + "*(u64 *)(r10 - 8) = r1;" + "r2 = r10;" + "r2 += -8;" + "r1 = %[map_hash] ll;" + "call %[bpf_map_lookup_elem];" + "if r0 == 0 goto l1_%=;" + "r6 = r0;" + "call %[bpf_get_prandom_u32];" + "r7 = r0;" + "r5 = r6;" + "if r0 != 0x0 goto l12_%=;" + "call %[bpf_get_prandom_u32];" + "r1 = r0;" + "r2 = r6;" + "if r1 == 0x0 goto l0_%=;" +"l9_%=:" + "r2 = *(u64 *)(r6 + 0x0);" + "r2 += 0x1;" + "*(u64 *)(r6 + 0x0) = r2;" + "goto l1_%=;" +"l12_%=:" + "r1 = r7;" + "r1 += 0x98;" + "r2 = r5;" + "r2 += 0x90;" + "r2 = *(u32 *)(r2 + 0x0);" + "r3 = r7;" + "r3 &= 0x1;" + "r2 *= 0xa8;" + "if r3 == 0x0 goto l2_%=;" + "r1 += r2;" + "r1 -= r7;" + "r1 += 0x8;" + "if r1 <= 0xb20 goto l3_%=;" + "r1 = 0x0;" + "goto l4_%=;" +"l3_%=:" + "r1 += r7;" +"l4_%=:" + "if r1 == 0x0 goto l8_%=;" + "goto l9_%=;" +"l2_%=:" + "r1 += r2;" + "r1 -= r7;" + "r1 += 0x10;" + "if r1 <= 0xb20 goto l6_%=;" + "r1 = 0x0;" + "goto l7_%=;" +"l6_%=:" + "r1 += r7;" +"l7_%=:" + "if r1 == 0x0 goto l8_%=;" + "goto l9_%=;" +"l0_%=:" + "r1 = 0x3;" + "*(u64 *)(r10 - 0x10) = r1;" + "r2 = r1;" + "goto l1_%=;" +"l8_%=:" + "r1 = r5;" + "r1 += 0x4;" + "r1 = *(u32 *)(r1 + 0x0);" + "*(u64 *)(r10 - 0x8) = r1;" +"l1_%=:" + "r0 = 0;" + "exit;" + : + : __imm(bpf_get_prandom_u32), + __imm(bpf_map_lookup_elem), + __imm_addr(map_hash) + : __clobber_all); +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 6ffa72acc9c933a065782cb49afde1130ca722f7 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Wed, 4 Sep 2024 09:44:41 +0800 Subject: selftests: net: convert comma to semicolon Replace comma between expressions with semicolons. Using a ',' in place of a ';' can have unintended side effects. Although that is not the case here, it is seems best to use ';' unless ',' is intended. Found by inspection. No functional change intended. Compile tested only. 
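For context on the "unintended side effects" mentioned above, here is a minimal hypothetical C sketch (not taken from psock_fanout.c): the comma operator fuses consecutive expressions into a single statement, so a stray ',' can silently change which statements a branch guards.

#include <stdio.h>

int main(void)
{
	int fd = -1;

	if (fd < 0)
		fprintf(stderr, "open failed\n"),	/* ',' keeps the next line inside the if body */
		fd = 0;		/* guarded with ','; would run unconditionally with ';' */

	return fd;
}

In the patched file the assignments were unconditional, so the behavior happened to be identical, which is why the commit is a cleanup rather than a bug fix.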
Signed-off-by: Chen Ni Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240904014441.1065753-1-nichen@iscas.ac.cn Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/psock_fanout.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/psock_fanout.c b/tools/testing/selftests/net/psock_fanout.c index 1a736f700be4..4f31e92ebd96 100644 --- a/tools/testing/selftests/net/psock_fanout.c +++ b/tools/testing/selftests/net/psock_fanout.c @@ -165,9 +165,9 @@ static void sock_fanout_set_ebpf(int fd) attr.insns = (unsigned long) prog; attr.insn_cnt = ARRAY_SIZE(prog); attr.license = (unsigned long) "GPL"; - attr.log_buf = (unsigned long) log_buf, - attr.log_size = sizeof(log_buf), - attr.log_level = 1, + attr.log_buf = (unsigned long) log_buf; + attr.log_size = sizeof(log_buf); + attr.log_level = 1; pfd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)); if (pfd < 0) { -- cgit v1.2.3 From e4db2a821b6cff4bd59c0efabdbfdd84ac6005c1 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Sat, 31 Aug 2024 04:19:31 +0000 Subject: libbpf: Access first syscall argument with CO-RE direct read on s390 Currently PT_REGS_PARM1 SYSCALL(x) is consistent with PT_REGS_PARM1_CORE SYSCALL(x), which will introduce the overhead of BPF_CORE_READ(), taking into account the read pt_regs comes directly from the context, let's use CO-RE direct read to access the first system call argument. Suggested-by: Andrii Nakryiko Signed-off-by: Pu Lehui Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240831041934.1629216-2-pulehui@huaweicloud.com --- tools/lib/bpf/bpf_tracing.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h index 9314fa95f04e..1297d8d297d1 100644 --- a/tools/lib/bpf/bpf_tracing.h +++ b/tools/lib/bpf/bpf_tracing.h @@ -163,7 +163,7 @@ struct pt_regs___s390 { unsigned long orig_gpr2; -}; +} __attribute__((preserve_access_index)); /* s390 provides user_pt_regs instead of struct pt_regs to userspace */ #define __PT_REGS_CAST(x) ((const user_pt_regs *)(x)) @@ -179,7 +179,7 @@ struct pt_regs___s390 { #define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG #define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG #define __PT_PARM6_SYSCALL_REG gprs[7] -#define PT_REGS_PARM1_SYSCALL(x) PT_REGS_PARM1_CORE_SYSCALL(x) +#define PT_REGS_PARM1_SYSCALL(x) (((const struct pt_regs___s390 *)(x))->__PT_PARM1_SYSCALL_REG) #define PT_REGS_PARM1_CORE_SYSCALL(x) \ BPF_CORE_READ((const struct pt_regs___s390 *)(x), __PT_PARM1_SYSCALL_REG) -- cgit v1.2.3 From 9ab94078e868a45cbd23828b8377600b83a41fac Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Sat, 31 Aug 2024 04:19:32 +0000 Subject: libbpf: Access first syscall argument with CO-RE direct read on arm64 Currently PT_REGS_PARM1 SYSCALL(x) is consistent with PT_REGS_PARM1_CORE SYSCALL(x), which will introduce the overhead of BPF_CORE_READ(), taking into account the read pt_regs comes directly from the context, let's use CO-RE direct read to access the first system call argument. 
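A rough sketch of what this change amounts to in a BPF program follows, using an illustrative struct flavor along the lines of the s390 and arm64 patches (the names here are placeholders, not the exact libbpf definitions):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_core_read.h>

struct pt_regs___flavor {
	unsigned long orig_x0;
} __attribute__((preserve_access_index));

static __always_inline unsigned long first_syscall_arg(const void *ctx)
{
	/* CO-RE direct read: the field offset is relocated at load time
	 * and this compiles to a plain load from the context. */
	return ((const struct pt_regs___flavor *)ctx)->orig_x0;
}

/* The definition being replaced went through BPF_CORE_READ(), i.e. a
 * bpf_probe_read_kernel() call on every access:
 *
 *	BPF_CORE_READ((const struct pt_regs___flavor *)ctx, orig_x0);
 */

char _license[] SEC("license") = "GPL";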
Suggested-by: Andrii Nakryiko Signed-off-by: Pu Lehui Signed-off-by: Andrii Nakryiko Acked-by: Xu Kuohai Link: https://lore.kernel.org/bpf/20240831041934.1629216-3-pulehui@huaweicloud.com --- tools/lib/bpf/bpf_tracing.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h index 1297d8d297d1..c15831c98990 100644 --- a/tools/lib/bpf/bpf_tracing.h +++ b/tools/lib/bpf/bpf_tracing.h @@ -222,7 +222,7 @@ struct pt_regs___s390 { struct pt_regs___arm64 { unsigned long orig_x0; -}; +} __attribute__((preserve_access_index)); /* arm64 provides struct user_pt_regs instead of struct pt_regs to userspace */ #define __PT_REGS_CAST(x) ((const struct user_pt_regs *)(x)) @@ -241,7 +241,7 @@ struct pt_regs___arm64 { #define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG #define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG #define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG -#define PT_REGS_PARM1_SYSCALL(x) PT_REGS_PARM1_CORE_SYSCALL(x) +#define PT_REGS_PARM1_SYSCALL(x) (((const struct pt_regs___arm64 *)(x))->__PT_PARM1_SYSCALL_REG) #define PT_REGS_PARM1_CORE_SYSCALL(x) \ BPF_CORE_READ((const struct pt_regs___arm64 *)(x), __PT_PARM1_SYSCALL_REG) -- cgit v1.2.3 From 4a4c4c0d0a42079d2fa97a232895c56ac2f3f573 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Sat, 31 Aug 2024 04:19:33 +0000 Subject: selftests/bpf: Enable test_bpf_syscall_macro: Syscall_arg1 on s390 and arm64 Considering that CO-RE direct read access to the first system call argument is already available on s390 and arm64, let's enable test_bpf_syscall_macro:syscall_arg1 on these architectures. Signed-off-by: Pu Lehui Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240831041934.1629216-4-pulehui@huaweicloud.com --- tools/testing/selftests/bpf/prog_tests/test_bpf_syscall_macro.c | 4 ---- tools/testing/selftests/bpf/progs/bpf_syscall_macro.c | 2 -- 2 files changed, 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/test_bpf_syscall_macro.c b/tools/testing/selftests/bpf/prog_tests/test_bpf_syscall_macro.c index 2900c5e9a016..1750c29b94f8 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_bpf_syscall_macro.c +++ b/tools/testing/selftests/bpf/prog_tests/test_bpf_syscall_macro.c @@ -38,11 +38,7 @@ void test_bpf_syscall_macro(void) /* check whether args of syscall are copied correctly */ prctl(exp_arg1, exp_arg2, exp_arg3, exp_arg4, exp_arg5); -#if defined(__aarch64__) || defined(__s390__) - ASSERT_NEQ(skel->bss->arg1, exp_arg1, "syscall_arg1"); -#else ASSERT_EQ(skel->bss->arg1, exp_arg1, "syscall_arg1"); -#endif ASSERT_EQ(skel->bss->arg2, exp_arg2, "syscall_arg2"); ASSERT_EQ(skel->bss->arg3, exp_arg3, "syscall_arg3"); /* it cannot copy arg4 when uses PT_REGS_PARM4 on x86_64 */ diff --git a/tools/testing/selftests/bpf/progs/bpf_syscall_macro.c b/tools/testing/selftests/bpf/progs/bpf_syscall_macro.c index 1a476d8ed354..9e7d9674ce2a 100644 --- a/tools/testing/selftests/bpf/progs/bpf_syscall_macro.c +++ b/tools/testing/selftests/bpf/progs/bpf_syscall_macro.c @@ -43,9 +43,7 @@ int BPF_KPROBE(handle_sys_prctl) /* test for PT_REGS_PARM */ -#if !defined(bpf_target_arm64) && !defined(bpf_target_s390) bpf_probe_read_kernel(&tmp, sizeof(tmp), &PT_REGS_PARM1_SYSCALL(real_regs)); -#endif arg1 = tmp; bpf_probe_read_kernel(&arg2, sizeof(arg2), &PT_REGS_PARM2_SYSCALL(real_regs)); bpf_probe_read_kernel(&arg3, sizeof(arg3), &PT_REGS_PARM3_SYSCALL(real_regs)); -- cgit v1.2.3 From 99857422338b67f0a2927cbc44fdaa8783717858 Mon Sep 17 00:00:00 2001 
From: Pu Lehui Date: Sat, 31 Aug 2024 04:19:34 +0000 Subject: libbpf: Fix accessing first syscall argument on RV64 On RV64, as Ilya mentioned before [0], the first syscall parameter should be accessed through orig_a0 (see arch/riscv64/include/asm/syscall.h), otherwise it will cause selftests like bpf_syscall_macro, vmlinux, test_lsm, etc. to fail on RV64. Let's fix it by using the struct pt_regs style CO-RE direct access. Signed-off-by: Pu Lehui Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220209021745.2215452-1-iii@linux.ibm.com [0] Link: https://lore.kernel.org/bpf/20240831041934.1629216-5-pulehui@huaweicloud.com --- tools/lib/bpf/bpf_tracing.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h index c15831c98990..bad7a0dbf510 100644 --- a/tools/lib/bpf/bpf_tracing.h +++ b/tools/lib/bpf/bpf_tracing.h @@ -351,6 +351,10 @@ struct pt_regs___arm64 { * https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/master/riscv-cc.adoc#risc-v-calling-conventions */ +struct pt_regs___riscv { + unsigned long orig_a0; +} __attribute__((preserve_access_index)); + /* riscv provides struct user_regs_struct instead of struct pt_regs to userspace */ #define __PT_REGS_CAST(x) ((const struct user_regs_struct *)(x)) #define __PT_PARM1_REG a0 @@ -362,12 +366,15 @@ struct pt_regs___arm64 { #define __PT_PARM7_REG a6 #define __PT_PARM8_REG a7 -#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG +#define __PT_PARM1_SYSCALL_REG orig_a0 #define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG #define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG #define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG #define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG #define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG +#define PT_REGS_PARM1_SYSCALL(x) (((const struct pt_regs___riscv *)(x))->__PT_PARM1_SYSCALL_REG) +#define PT_REGS_PARM1_CORE_SYSCALL(x) \ + BPF_CORE_READ((const struct pt_regs___riscv *)(x), __PT_PARM1_SYSCALL_REG) #define __PT_RET_REG ra #define __PT_FP_REG s0 -- cgit v1.2.3 From 5e5cc1eb65256e6017e3deec04f9806f2f317853 Mon Sep 17 00:00:00 2001 From: zhang jiao Date: Mon, 2 Sep 2024 12:21:03 +0800 Subject: tools: hv: rm .*.cmd when make clean rm .*.cmd when make clean Signed-off-by: zhang jiao Reviewed-by: Saurabh Sengar Link: https://lore.kernel.org/r/20240902042103.5867-1-zhangjiao2@cmss.chinamobile.com Signed-off-by: Wei Liu Message-ID: <20240902042103.5867-1-zhangjiao2@cmss.chinamobile.com> --- tools/hv/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/hv/Makefile b/tools/hv/Makefile index 2e60e2c212cd..34ffcec264ab 100644 --- a/tools/hv/Makefile +++ b/tools/hv/Makefile @@ -52,7 +52,7 @@ $(OUTPUT)hv_fcopy_uio_daemon: $(HV_FCOPY_UIO_DAEMON_IN) clean: rm -f $(ALL_PROGRAMS) - find $(or $(OUTPUT),.) -name '*.o' -delete -o -name '\.*.d' -delete + find $(or $(OUTPUT),.) -name '*.o' -delete -o -name '\.*.d' -delete -o -name '\.*.cmd' -delete install: $(ALL_PROGRAMS) install -d -m 755 $(DESTDIR)$(sbindir); \ -- cgit v1.2.3 From 05144ab7b7eaf531fc728fcb79dcf36b621ff42d Mon Sep 17 00:00:00 2001 From: "Nícolas F. R. A. Prado" Date: Mon, 29 Jul 2024 16:56:02 -0400 Subject: kselftest: dt: Ignore nodes that have ancestors disabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Filter out nodes that have one of its ancestors disabled as they aren't expected to probe. 
This removes the following false-positive failures on the sc7180-trogdor-lazor-limozeen-nots-r5 platform: /soc@0/geniqup@8c0000/i2c@894000/proximity@28 /soc@0/geniqup@ac0000/spi@a90000/ec@0 /soc@0/remoteproc@62400000/glink-edge/apr /soc@0/remoteproc@62400000/glink-edge/apr/service@3 /soc@0/remoteproc@62400000/glink-edge/apr/service@4 /soc@0/remoteproc@62400000/glink-edge/apr/service@4/clock-controller /soc@0/remoteproc@62400000/glink-edge/apr/service@4/dais /soc@0/remoteproc@62400000/glink-edge/apr/service@7 /soc@0/remoteproc@62400000/glink-edge/apr/service@7/dais /soc@0/remoteproc@62400000/glink-edge/apr/service@8 /soc@0/remoteproc@62400000/glink-edge/apr/service@8/routing /soc@0/remoteproc@62400000/glink-edge/fastrpc /soc@0/remoteproc@62400000/glink-edge/fastrpc/compute-cb@3 /soc@0/remoteproc@62400000/glink-edge/fastrpc/compute-cb@4 /soc@0/remoteproc@62400000/glink-edge/fastrpc/compute-cb@5 /soc@0/spmi@c440000/pmic@0/pon@800/pwrkey Fixes: 14571ab1ad21 ("kselftest: Add new test for detecting unprobed Devicetree devices") Signed-off-by: Nícolas F. R. A. Prado Link: https://lore.kernel.org/r/20240729-dt-kselftest-parent-disabled-v2-1-d7a001c4930d@collabora.com Signed-off-by: Rob Herring (Arm) --- tools/testing/selftests/dt/test_unprobed_devices.sh | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/dt/test_unprobed_devices.sh b/tools/testing/selftests/dt/test_unprobed_devices.sh index 2d7e70c5ad2d..5e3f42ef249e 100755 --- a/tools/testing/selftests/dt/test_unprobed_devices.sh +++ b/tools/testing/selftests/dt/test_unprobed_devices.sh @@ -34,8 +34,21 @@ nodes_compatible=$( # Check if node is available if [[ -e "${node}"/status ]]; then status=$(tr -d '\000' < "${node}"/status) - [[ "${status}" != "okay" && "${status}" != "ok" ]] && continue + if [[ "${status}" != "okay" && "${status}" != "ok" ]]; then + if [ -n "${disabled_nodes_regex}" ]; then + disabled_nodes_regex="${disabled_nodes_regex}|${node}" + else + disabled_nodes_regex="${node}" + fi + continue + fi fi + + # Ignore this node if one of its ancestors was disabled + if [ -n "${disabled_nodes_regex}" ]; then + echo "${node}" | grep -q -E "${disabled_nodes_regex}" && continue + fi + echo "${node}" | sed -e 's|\/proc\/device-tree||' done | sort ) -- cgit v1.2.3 From 04b01625da130c7521b768996cd5e48052198b97 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 3 Sep 2024 10:46:00 -0700 Subject: perf/uprobe: split uprobe_unregister() With uprobe_unregister() having grown a synchronize_srcu(), it becomes fairly slow to call. Esp. since both users of this API call it in a loop. Peel off the sync_srcu() and do it once, after the loop. We also need to add uprobe_unregister_sync() into uprobe_register()'s error handling path, as we need to be careful about returning to the caller before we have a guarantee that partially attached consumer won't be called anymore. This is an unlikely slow path and this should be totally fine to be slow in the case of a failed attach. 
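The resulting caller-side pattern looks roughly like this (a sketch: my_uprobe and detach_all() are illustrative names, while the two unregister calls are the new API from this patch; compare the bpf_trace.c and trace_uprobe.c hunks below):

#include <linux/types.h>
#include <linux/uprobes.h>

struct my_uprobe {
	struct uprobe *uprobe;
	struct uprobe_consumer consumer;
};

static void detach_all(struct my_uprobe *probes, u32 cnt)
{
	u32 i;

	for (i = 0; i < cnt; i++)
		uprobe_unregister_nosync(probes[i].uprobe, &probes[i].consumer);

	/* a single synchronize_srcu() grace period covers all of the
	 * consumers removed above, instead of one per loop iteration */
	if (cnt)
		uprobe_unregister_sync();
}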
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: "Peter Zijlstra (Intel)" Co-developed-by: Andrii Nakryiko Signed-off-by: Andrii Nakryiko Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20240903174603.3554182-6-andrii@kernel.org --- include/linux/uprobes.h | 8 ++++++-- kernel/events/uprobes.c | 21 +++++++++++++++------ kernel/trace/bpf_trace.c | 5 ++++- kernel/trace/trace_uprobe.c | 6 +++++- .../testing/selftests/bpf/bpf_testmod/bpf_testmod.c | 3 ++- 5 files changed, 32 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index f112b56e9479..2b294bf1881f 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -115,7 +115,8 @@ extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool); -extern void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc); +extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc); +extern void uprobe_unregister_sync(void); extern int uprobe_mmap(struct vm_area_struct *vma); extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void uprobe_start_dup_mmap(void); @@ -164,7 +165,10 @@ uprobe_apply(struct uprobe* uprobe, struct uprobe_consumer *uc, bool add) return -ENOSYS; } static inline void -uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) +uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc) +{ +} +static inline void uprobe_unregister_sync(void) { } static inline int uprobe_mmap(struct vm_area_struct *vma) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index e15c0306e535..694f6790e848 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1105,11 +1105,11 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) } /** - * uprobe_unregister - unregister an already registered probe. + * uprobe_unregister_nosync - unregister an already registered probe. * @uprobe: uprobe to remove * @uc: identify which probe if multiple probes are colocated. */ -void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) +void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc) { int err; @@ -1121,12 +1121,15 @@ void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) /* TODO : cant unregister? 
schedule a worker thread */ if (unlikely(err)) { uprobe_warn(current, "unregister, leaking uprobe"); - goto out_sync; + return; } put_uprobe(uprobe); +} +EXPORT_SYMBOL_GPL(uprobe_unregister_nosync); -out_sync: +void uprobe_unregister_sync(void) +{ /* * Now that handler_chain() and handle_uretprobe_chain() iterate over * uprobe->consumers list under RCU protection without holding @@ -1138,7 +1141,7 @@ out_sync: */ synchronize_srcu(&uprobes_srcu); } -EXPORT_SYMBOL_GPL(uprobe_unregister); +EXPORT_SYMBOL_GPL(uprobe_unregister_sync); /** * uprobe_register - register a probe @@ -1196,7 +1199,13 @@ struct uprobe *uprobe_register(struct inode *inode, up_write(&uprobe->register_rwsem); if (ret) { - uprobe_unregister(uprobe, uc); + uprobe_unregister_nosync(uprobe, uc); + /* + * Registration might have partially succeeded, so we can have + * this consumer being called right at this time. We need to + * sync here. It's ok, it's unlikely slow path. + */ + uprobe_unregister_sync(); return ERR_PTR(ret); } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index c99bf0617602..ac0a01cc8634 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3184,7 +3184,10 @@ static void bpf_uprobe_unregister(struct bpf_uprobe *uprobes, u32 cnt) u32 i; for (i = 0; i < cnt; i++) - uprobe_unregister(uprobes[i].uprobe, &uprobes[i].consumer); + uprobe_unregister_nosync(uprobes[i].uprobe, &uprobes[i].consumer); + + if (cnt) + uprobe_unregister_sync(); } static void bpf_uprobe_multi_link_release(struct bpf_link *link) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 7eb79e0a5352..f7443e996b1b 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1097,6 +1097,7 @@ static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter) static void __probe_event_disable(struct trace_probe *tp) { struct trace_uprobe *tu; + bool sync = false; tu = container_of(tp, struct trace_uprobe, tp); WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter)); @@ -1105,9 +1106,12 @@ static void __probe_event_disable(struct trace_probe *tp) if (!tu->uprobe) continue; - uprobe_unregister(tu->uprobe, &tu->consumer); + uprobe_unregister_nosync(tu->uprobe, &tu->consumer); + sync = true; tu->uprobe = NULL; } + if (sync) + uprobe_unregister_sync(); } static int probe_event_enable(struct trace_event_call *call, diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index 3c0515a27842..1fc16657cf42 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -475,7 +475,8 @@ static void testmod_unregister_uprobe(void) mutex_lock(&testmod_uprobe_mutex); if (uprobe.uprobe) { - uprobe_unregister(uprobe.uprobe, &uprobe.consumer); + uprobe_unregister_nosync(uprobe.uprobe, &uprobe.consumer); + uprobe_unregister_sync(); path_put(&uprobe.path); uprobe.uprobe = NULL; } -- cgit v1.2.3 From dfc58f467f7163b28d8aaadafd86d93dd08ea01c Mon Sep 17 00:00:00 2001 From: zhangjiao Date: Thu, 29 Aug 2024 13:33:09 +0800 Subject: tools: iio: rm .*.cmd when make clean rm .*.cmd when make clean Signed-off-by: zhangjiao Link: https://patch.msgid.link/20240829053309.10563-1-zhangjiao2@cmss.chinamobile.com Signed-off-by: Jonathan Cameron --- tools/iio/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/iio/Makefile b/tools/iio/Makefile index fa720f062229..3bcce0b7d10f 100644 --- a/tools/iio/Makefile +++ 
b/tools/iio/Makefile @@ -58,7 +58,7 @@ $(OUTPUT)iio_generic_buffer: $(IIO_GENERIC_BUFFER_IN) clean: rm -f $(ALL_PROGRAMS) rm -rf $(OUTPUT)include/linux/iio - find $(or $(OUTPUT),.) -name '*.o' -delete -o -name '\.*.d' -delete + find $(or $(OUTPUT),.) -name '*.o' -delete -o -name '\.*.d' -delete -o -name '\.*.cmd' -delete install: $(ALL_PROGRAMS) install -d -m 755 $(DESTDIR)$(bindir); \ -- cgit v1.2.3 From 0b0bb453716f1469105f075044c68ee9670fe4ee Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 5 Sep 2024 14:51:22 +0300 Subject: selftests/bpf: Add child argument to spawn_child function Adding child argument to spawn_child function to allow to create multiple children in following change. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240905115124.1503998-3-jolsa@kernel.org --- .../selftests/bpf/prog_tests/uprobe_multi_test.c | 85 ++++++++++------------ 1 file changed, 39 insertions(+), 46 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c index acb62675ff65..250eb47c68f9 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c @@ -68,29 +68,27 @@ static void kick_child(struct child *child) fflush(NULL); } -static struct child *spawn_child(void) +static int spawn_child(struct child *child) { - static struct child child; - int err; - int c; + int err, c; /* pipe to notify child to execute the trigger functions */ - if (pipe(child.go)) - return NULL; + if (pipe(child->go)) + return -1; - child.pid = child.tid = fork(); - if (child.pid < 0) { - release_child(&child); + child->pid = child->tid = fork(); + if (child->pid < 0) { + release_child(child); errno = EINVAL; - return NULL; + return -1; } /* child */ - if (child.pid == 0) { - close(child.go[1]); + if (child->pid == 0) { + close(child->go[1]); /* wait for parent's kick */ - err = read(child.go[0], &c, 1); + err = read(child->go[0], &c, 1); if (err != 1) exit(err); @@ -102,7 +100,7 @@ static struct child *spawn_child(void) exit(errno); } - return &child; + return 0; } static void *child_thread(void *ctx) @@ -131,39 +129,38 @@ static void *child_thread(void *ctx) pthread_exit(&err); } -static struct child *spawn_thread(void) +static int spawn_thread(struct child *child) { - static struct child child; int c, err; /* pipe to notify child to execute the trigger functions */ - if (pipe(child.go)) - return NULL; + if (pipe(child->go)) + return -1; /* pipe to notify parent that child thread is ready */ - if (pipe(child.c2p)) { - close(child.go[0]); - close(child.go[1]); - return NULL; + if (pipe(child->c2p)) { + close(child->go[0]); + close(child->go[1]); + return -1; } - child.pid = getpid(); + child->pid = getpid(); - err = pthread_create(&child.thread, NULL, child_thread, &child); + err = pthread_create(&child->thread, NULL, child_thread, child); if (err) { err = -errno; - close(child.go[0]); - close(child.go[1]); - close(child.c2p[0]); - close(child.c2p[1]); + close(child->go[0]); + close(child->go[1]); + close(child->c2p[0]); + close(child->c2p[1]); errno = -err; - return NULL; + return -1; } - err = read(child.c2p[0], &c, 1); + err = read(child->c2p[0], &c, 1); if (!ASSERT_EQ(err, 1, "child_thread_ready")) - return NULL; + return -1; - return &child; + return 0; } static void uprobe_multi_test_run(struct uprobe_multi *skel, struct child *child) @@ -304,24 +301,22 @@ cleanup: static void 
test_attach_api(const char *binary, const char *pattern, struct bpf_uprobe_multi_opts *opts) { - struct child *child; + static struct child child; /* no pid filter */ __test_attach_api(binary, pattern, opts, NULL); /* pid filter */ - child = spawn_child(); - if (!ASSERT_OK_PTR(child, "spawn_child")) + if (!ASSERT_OK(spawn_child(&child), "spawn_child")) return; - __test_attach_api(binary, pattern, opts, child); + __test_attach_api(binary, pattern, opts, &child); /* pid filter (thread) */ - child = spawn_thread(); - if (!ASSERT_OK_PTR(child, "spawn_thread")) + if (!ASSERT_OK(spawn_thread(&child), "spawn_thread")) return; - __test_attach_api(binary, pattern, opts, child); + __test_attach_api(binary, pattern, opts, &child); } static void test_attach_api_pattern(void) @@ -712,24 +707,22 @@ cleanup: static void test_link_api(void) { - struct child *child; + static struct child child; /* no pid filter */ __test_link_api(NULL); /* pid filter */ - child = spawn_child(); - if (!ASSERT_OK_PTR(child, "spawn_child")) + if (!ASSERT_OK(spawn_child(&child), "spawn_child")) return; - __test_link_api(child); + __test_link_api(&child); /* pid filter (thread) */ - child = spawn_thread(); - if (!ASSERT_OK_PTR(child, "spawn_thread")) + if (!ASSERT_OK(spawn_thread(&child), "spawn_thread")) return; - __test_link_api(child); + __test_link_api(&child); } static struct bpf_program * -- cgit v1.2.3 From 8df43e859454952b0dd18c5821b32e6b2d48fddd Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 5 Sep 2024 14:51:23 +0300 Subject: selftests/bpf: Add uprobe multi pid filter test for fork-ed processes The idea is to create and monitor 3 uprobes, each trigered in separate process and make sure the bpf program gets executed just for the proper PID specified via pid filter. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240905115124.1503998-4-jolsa@kernel.org --- .../selftests/bpf/prog_tests/uprobe_multi_test.c | 67 ++++++++++++++++++++++ .../selftests/bpf/progs/uprobe_multi_pid_filter.c | 40 +++++++++++++ 2 files changed, 107 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/uprobe_multi_pid_filter.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c index 250eb47c68f9..9c2f99233304 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c @@ -7,6 +7,7 @@ #include "uprobe_multi_bench.skel.h" #include "uprobe_multi_usdt.skel.h" #include "uprobe_multi_consumers.skel.h" +#include "uprobe_multi_pid_filter.skel.h" #include "bpf/libbpf_internal.h" #include "testing_helpers.h" #include "../sdt.h" @@ -935,6 +936,70 @@ static void test_consumers(void) uprobe_multi_consumers__destroy(skel); } +static struct bpf_program *uprobe_multi_program(struct uprobe_multi_pid_filter *skel, int idx) +{ + switch (idx) { + case 0: return skel->progs.uprobe_multi_0; + case 1: return skel->progs.uprobe_multi_1; + case 2: return skel->progs.uprobe_multi_2; + } + return NULL; +} + +#define TASKS 3 + +static void run_pid_filter(struct uprobe_multi_pid_filter *skel, bool retprobe) +{ + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts, .retprobe = retprobe); + struct bpf_link *link[TASKS] = {}; + struct child child[TASKS] = {}; + int i; + + memset(skel->bss->test, 0, sizeof(skel->bss->test)); + + for (i = 0; i < TASKS; i++) { + if (!ASSERT_OK(spawn_child(&child[i]), "spawn_child")) + goto cleanup; + skel->bss->pids[i] = 
child[i].pid; + } + + for (i = 0; i < TASKS; i++) { + link[i] = bpf_program__attach_uprobe_multi(uprobe_multi_program(skel, i), + child[i].pid, "/proc/self/exe", + "uprobe_multi_func_1", &opts); + if (!ASSERT_OK_PTR(link[i], "bpf_program__attach_uprobe_multi")) + goto cleanup; + } + + for (i = 0; i < TASKS; i++) + kick_child(&child[i]); + + for (i = 0; i < TASKS; i++) { + ASSERT_EQ(skel->bss->test[i][0], 1, "pid"); + ASSERT_EQ(skel->bss->test[i][1], 0, "unknown"); + } + +cleanup: + for (i = 0; i < TASKS; i++) + bpf_link__destroy(link[i]); + for (i = 0; i < TASKS; i++) + release_child(&child[i]); +} + +static void test_pid_filter_process(void) +{ + struct uprobe_multi_pid_filter *skel; + + skel = uprobe_multi_pid_filter__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_multi_pid_filter__open_and_load")) + return; + + run_pid_filter(skel, false); + run_pid_filter(skel, true); + + uprobe_multi_pid_filter__destroy(skel); +} + static void test_bench_attach_uprobe(void) { long attach_start_ns = 0, attach_end_ns = 0; @@ -1027,4 +1092,6 @@ void test_uprobe_multi_test(void) test_attach_uprobe_fails(); if (test__start_subtest("consumers")) test_consumers(); + if (test__start_subtest("filter_fork")) + test_pid_filter_process(); } diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi_pid_filter.c b/tools/testing/selftests/bpf/progs/uprobe_multi_pid_filter.c new file mode 100644 index 000000000000..67fcbad36661 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/uprobe_multi_pid_filter.c @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "vmlinux.h" +#include +#include + +char _license[] SEC("license") = "GPL"; + +__u32 pids[3]; +__u32 test[3][2]; + +static void update_pid(int idx) +{ + __u32 pid = bpf_get_current_pid_tgid() >> 32; + + if (pid == pids[idx]) + test[idx][0]++; + else + test[idx][1]++; +} + +SEC("uprobe.multi") +int uprobe_multi_0(struct pt_regs *ctx) +{ + update_pid(0); + return 0; +} + +SEC("uprobe.multi") +int uprobe_multi_1(struct pt_regs *ctx) +{ + update_pid(1); + return 0; +} + +SEC("uprobe.multi") +int uprobe_multi_2(struct pt_regs *ctx) +{ + update_pid(2); + return 0; +} -- cgit v1.2.3 From d2520bdb1932fe615d87458ea2df2495bb25c886 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 5 Sep 2024 14:51:24 +0300 Subject: selftests/bpf: Add uprobe multi pid filter test for clone-ed processes The idea is to run same test as for test_pid_filter_process, but instead of standard fork-ed process we create the process with clone(CLONE_VM..) to make sure the thread leader process filter works properly in this case. 
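As a reference for the technique, a minimal standalone sketch of spawning a CLONE_VM child from userspace follows (illustrative only; the test's actual helper is in the diff below and passes the midpoint of a preallocated stack to clone()):

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <unistd.h>

static char child_stack[65536];

static int child_fn(void *arg)
{
	return 0;	/* returning from fn exits the child with this status */
}

int main(void)
{
	/* a new thread-group leader that shares the parent's address
	 * space -- the case the uprobe pid filter must keep matching */
	pid_t pid = clone(child_fn, child_stack + sizeof(child_stack),
			  CLONE_VM | SIGCHLD, NULL);

	return pid < 0 ? 1 : 0;
}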
Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240905115124.1503998-5-jolsa@kernel.org --- .../selftests/bpf/prog_tests/uprobe_multi_test.c | 66 ++++++++++++++-------- 1 file changed, 42 insertions(+), 24 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c index 9c2f99233304..f160d01ba5da 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c @@ -40,6 +40,7 @@ struct child { int pid; int tid; pthread_t thread; + char stack[65536]; }; static void release_child(struct child *child) @@ -69,41 +70,56 @@ static void kick_child(struct child *child) fflush(NULL); } -static int spawn_child(struct child *child) +static int child_func(void *arg) { + struct child *child = arg; int err, c; + close(child->go[1]); + + /* wait for parent's kick */ + err = read(child->go[0], &c, 1); + if (err != 1) + exit(err); + + uprobe_multi_func_1(); + uprobe_multi_func_2(); + uprobe_multi_func_3(); + usdt_trigger(); + + exit(errno); +} + +static int spawn_child_flag(struct child *child, bool clone_vm) +{ /* pipe to notify child to execute the trigger functions */ if (pipe(child->go)) return -1; - child->pid = child->tid = fork(); + if (clone_vm) { + child->pid = child->tid = clone(child_func, child->stack + sizeof(child->stack)/2, + CLONE_VM|SIGCHLD, child); + } else { + child->pid = child->tid = fork(); + } if (child->pid < 0) { release_child(child); errno = EINVAL; return -1; } - /* child */ - if (child->pid == 0) { - close(child->go[1]); - - /* wait for parent's kick */ - err = read(child->go[0], &c, 1); - if (err != 1) - exit(err); - - uprobe_multi_func_1(); - uprobe_multi_func_2(); - uprobe_multi_func_3(); - usdt_trigger(); - - exit(errno); - } + /* fork-ed child */ + if (!clone_vm && child->pid == 0) + child_func(child); return 0; } +static int spawn_child(struct child *child) +{ + return spawn_child_flag(child, false); +} + static void *child_thread(void *ctx) { struct child *child = ctx; @@ -948,7 +964,7 @@ static struct bpf_program *uprobe_multi_program(struct uprobe_multi_pid_filter * #define TASKS 3 -static void run_pid_filter(struct uprobe_multi_pid_filter *skel, bool retprobe) +static void run_pid_filter(struct uprobe_multi_pid_filter *skel, bool clone_vm, bool retprobe) { LIBBPF_OPTS(bpf_uprobe_multi_opts, opts, .retprobe = retprobe); struct bpf_link *link[TASKS] = {}; @@ -958,7 +974,7 @@ static void run_pid_filter(struct uprobe_multi_pid_filter *skel, bool retprobe) memset(skel->bss->test, 0, sizeof(skel->bss->test)); for (i = 0; i < TASKS; i++) { - if (!ASSERT_OK(spawn_child(&child[i]), "spawn_child")) + if (!ASSERT_OK(spawn_child_flag(&child[i], clone_vm), "spawn_child")) goto cleanup; skel->bss->pids[i] = child[i].pid; } @@ -986,7 +1002,7 @@ cleanup: release_child(&child[i]); } -static void test_pid_filter_process(void) +static void test_pid_filter_process(bool clone_vm) { struct uprobe_multi_pid_filter *skel; @@ -994,8 +1010,8 @@ static void test_pid_filter_process(void) if (!ASSERT_OK_PTR(skel, "uprobe_multi_pid_filter__open_and_load")) return; - run_pid_filter(skel, false); - run_pid_filter(skel, true); + run_pid_filter(skel, clone_vm, false); + run_pid_filter(skel, clone_vm, true); uprobe_multi_pid_filter__destroy(skel); } @@ -1093,5 +1109,7 @@ void test_uprobe_multi_test(void) if (test__start_subtest("consumers")) test_consumers(); if 
(test__start_subtest("filter_fork")) - test_pid_filter_process(); + test_pid_filter_process(false); + if (test__start_subtest("filter_clone_vm")) + test_pid_filter_process(true); } -- cgit v1.2.3 From 5db0ba6766f8a6606e655ddad745c87bc01349c7 Mon Sep 17 00:00:00 2001 From: Lin Yikai Date: Thu, 5 Sep 2024 19:03:05 +0800 Subject: selftests/bpf: fix some typos in selftests Hi, fix some spelling errors in selftest, the details are as follows: -in the codes: test_bpf_sk_stoarge_map_iter_fd(void) ->test_bpf_sk_storage_map_iter_fd(void) load BTF from btf_data.o->load BTF from btf_data.bpf.o -in the code comments: preample->preamble multi-contollers->multi-controllers errono->errno unsighed/unsinged->unsigned egree->egress shoud->should regsiter->register assummed->assumed conditiona->conditional rougly->roughly timetamp->timestamp ingores->ignores null-termainted->null-terminated slepable->sleepable implemenation->implementation veriables->variables timetamps->timestamps substitue a costant->substitute a constant secton->section unreferened->unreferenced verifer->verifier libppf->libbpf ... Signed-off-by: Lin Yikai Link: https://lore.kernel.org/r/20240905110354.3274546-1-yikai.lin@vivo.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/benchs/bench_trigger.c | 2 +- tools/testing/selftests/bpf/cgroup_helpers.c | 2 +- tools/testing/selftests/bpf/map_tests/htab_map_batch_ops.c | 2 +- .../selftests/bpf/map_tests/lpm_trie_map_batch_ops.c | 2 +- tools/testing/selftests/bpf/prog_tests/bpf_iter.c | 4 ++-- tools/testing/selftests/bpf/prog_tests/btf.c | 6 +++--- tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c | 2 +- tools/testing/selftests/bpf/prog_tests/log_buf.c | 4 ++-- tools/testing/selftests/bpf/prog_tests/reg_bounds.c | 14 +++++++------- tools/testing/selftests/bpf/prog_tests/resolve_btfids.c | 2 +- tools/testing/selftests/bpf/prog_tests/tc_redirect.c | 2 +- tools/testing/selftests/bpf/prog_tests/test_bprm_opts.c | 2 +- tools/testing/selftests/bpf/prog_tests/test_strncmp.c | 2 +- tools/testing/selftests/bpf/prog_tests/token.c | 4 ++-- tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c | 2 +- tools/testing/selftests/bpf/prog_tests/user_ringbuf.c | 2 +- tools/testing/selftests/bpf/progs/bpf_cubic.c | 6 +++--- tools/testing/selftests/bpf/progs/strobemeta.h | 4 ++-- .../testing/selftests/bpf/progs/test_cls_redirect_dynptr.c | 2 +- tools/testing/selftests/bpf/progs/test_core_read_macros.c | 2 +- tools/testing/selftests/bpf/progs/test_global_func15.c | 2 +- tools/testing/selftests/bpf/progs/test_global_map_resize.c | 2 +- tools/testing/selftests/bpf/test_maps.c | 2 +- tools/testing/selftests/bpf/test_progs.c | 2 +- tools/testing/selftests/bpf/verifier/map_kptr.c | 2 +- 25 files changed, 39 insertions(+), 39 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c index a220545a3238..2ed0ef6f21ee 100644 --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -276,7 +276,7 @@ static void trigger_rawtp_setup(void) * instructions. So use two different targets, one of which starts with nop * and another doesn't. * - * GCC doesn't generate stack setup preample for these functions due to them + * GCC doesn't generate stack setup preamble for these functions due to them * having no input arguments and doing nothing in the body. 
*/ __nocf_check __weak void uprobe_target_nop(void) diff --git a/tools/testing/selftests/bpf/cgroup_helpers.c b/tools/testing/selftests/bpf/cgroup_helpers.c index 23bb9a9e6a7d..e4535451322e 100644 --- a/tools/testing/selftests/bpf/cgroup_helpers.c +++ b/tools/testing/selftests/bpf/cgroup_helpers.c @@ -644,7 +644,7 @@ unsigned long long get_classid_cgroup_id(void) /** * get_cgroup1_hierarchy_id - Retrieves the ID of a cgroup1 hierarchy from the cgroup1 subsys name. * @subsys_name: The cgroup1 subsys name, which can be retrieved from /proc/self/cgroup. It can be - * a named cgroup like "name=systemd", a controller name like "net_cls", or multi-contollers like + * a named cgroup like "name=systemd", a controller name like "net_cls", or multi-controllers like * "net_cls,net_prio". */ int get_cgroup1_hierarchy_id(const char *subsys_name) diff --git a/tools/testing/selftests/bpf/map_tests/htab_map_batch_ops.c b/tools/testing/selftests/bpf/map_tests/htab_map_batch_ops.c index 1230ccf90128..5da493b94ae2 100644 --- a/tools/testing/selftests/bpf/map_tests/htab_map_batch_ops.c +++ b/tools/testing/selftests/bpf/map_tests/htab_map_batch_ops.c @@ -197,7 +197,7 @@ void __test_map_lookup_and_delete_batch(bool is_pcpu) CHECK(total != max_entries, "delete with steps", "total = %u, max_entries = %u\n", total, max_entries); - /* check map is empty, errono == ENOENT */ + /* check map is empty, errno == ENOENT */ err = bpf_map_get_next_key(map_fd, NULL, &key); CHECK(!err || errno != ENOENT, "bpf_map_get_next_key()", "error: %s\n", strerror(errno)); diff --git a/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c b/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c index b66d56ddb7ef..fe3e19f96244 100644 --- a/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c +++ b/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c @@ -135,7 +135,7 @@ void test_lpm_trie_map_batch_ops(void) CHECK(total != max_entries, "delete with steps", "total = %u, max_entries = %u\n", total, max_entries); - /* check map is empty, errono == ENOENT */ + /* check map is empty, errno == ENOENT */ err = bpf_map_get_next_key(map_fd, NULL, &key); CHECK(!err || errno != ENOENT, "bpf_map_get_next_key()", "error: %s\n", strerror(errno)); diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c index 618af9dfae9b..52e6f7570475 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c @@ -1218,7 +1218,7 @@ out: bpf_iter_bpf_sk_storage_helpers__destroy(skel); } -static void test_bpf_sk_stoarge_map_iter_fd(void) +static void test_bpf_sk_storage_map_iter_fd(void) { struct bpf_iter_bpf_sk_storage_map *skel; @@ -1693,7 +1693,7 @@ void test_bpf_iter(void) if (test__start_subtest("bpf_sk_storage_map")) test_bpf_sk_storage_map(); if (test__start_subtest("bpf_sk_storage_map_iter_fd")) - test_bpf_sk_stoarge_map_iter_fd(); + test_bpf_sk_storage_map_iter_fd(); if (test__start_subtest("bpf_sk_storage_delete")) test_bpf_sk_storage_delete(); if (test__start_subtest("bpf_sk_storage_get")) diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index 00965a6e83bb..7eafcf91b02e 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -4986,7 +4986,7 @@ struct pprint_mapv_int128 { static struct btf_raw_test pprint_test_template[] = { { .raw_types = { - /* unsighed char */ /* [1] */ + /* unsigned char */ /* 
[1] */ BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 8, 1), /* unsigned short */ /* [2] */ BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 16, 2), @@ -5053,7 +5053,7 @@ static struct btf_raw_test pprint_test_template[] = { * be encoded with kind_flag set. */ .raw_types = { - /* unsighed char */ /* [1] */ + /* unsigned char */ /* [1] */ BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 8, 1), /* unsigned short */ /* [2] */ BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 16, 2), @@ -5120,7 +5120,7 @@ static struct btf_raw_test pprint_test_template[] = { * will have both int and enum types. */ .raw_types = { - /* unsighed char */ /* [1] */ + /* unsigned char */ /* [1] */ BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 8, 1), /* unsigned short */ /* [2] */ BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 16, 2), diff --git a/tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c b/tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c index 63ee892bc757..10224f845568 100644 --- a/tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c +++ b/tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c @@ -214,7 +214,7 @@ static void test_isolated(int parent_cgroup_fd, int child_cgroup_fd) /* Attach to parent and child cgroup, trigger packet from child. * Assert that there is six additional runs, parent cgroup egresses and * ingress, child cgroup egresses and ingress. - * Assert that egree and ingress storages are separate. + * Assert that egress and ingress storages are separate. */ child_egress1_link = bpf_program__attach_cgroup(obj->progs.egress1, child_cgroup_fd); diff --git a/tools/testing/selftests/bpf/prog_tests/log_buf.c b/tools/testing/selftests/bpf/prog_tests/log_buf.c index 46611040dec3..27676a04d0b6 100644 --- a/tools/testing/selftests/bpf/prog_tests/log_buf.c +++ b/tools/testing/selftests/bpf/prog_tests/log_buf.c @@ -160,7 +160,7 @@ static void bpf_prog_load_log_buf(void) opts.log_buf = log_buf; opts.log_size = log_buf_sz; - /* with log_level == 0 log_buf shoud stay empty for good prog */ + /* with log_level == 0 log_buf should stay empty for good prog */ log_buf[0] = '\0'; opts.log_level = 0; fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, "good_prog", "GPL", @@ -222,7 +222,7 @@ static void bpf_btf_load_log_buf(void) opts.log_buf = log_buf; opts.log_size = log_buf_sz; - /* with log_level == 0 log_buf shoud stay empty for good BTF */ + /* with log_level == 0 log_buf should stay empty for good BTF */ log_buf[0] = '\0'; opts.log_level = 0; fd = bpf_btf_load(raw_btf_data, raw_btf_size, &opts); diff --git a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c index 467027236d30..39d42271cc46 100644 --- a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c +++ b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c @@ -503,7 +503,7 @@ static const char *op_str(enum op op) /* Can register with range [x.a, x.b] *EVER* satisfy * OP (<, <=, >, >=, ==, !=) relation to - * a regsiter with range [y.a, y.b] + * a register with range [y.a, y.b] * _in *num_t* domain_ */ static bool range_canbe_op(enum num_t t, struct range x, struct range y, enum op op) @@ -532,7 +532,7 @@ static bool range_canbe_op(enum num_t t, struct range x, struct range y, enum op /* Does register with range [x.a, x.b] *ALWAYS* satisfy * OP (<, <=, >, >=, ==, !=) relation to - * a regsiter with range [y.a, y.b] + * a register with range [y.a, y.b] * _in *num_t* domain_ */ static bool range_always_op(enum num_t t, struct range x, struct range y, enum op op) @@ -543,7 +543,7 @@ static bool range_always_op(enum num_t t, struct range x, struct range y, enum o /* 
Does register with range [x.a, x.b] *NEVER* satisfy * OP (<, <=, >, >=, ==, !=) relation to - * a regsiter with range [y.a, y.b] + * a register with range [y.a, y.b] * _in *num_t* domain_ */ static bool range_never_op(enum num_t t, struct range x, struct range y, enum op op) @@ -1018,11 +1018,11 @@ static int parse_reg_state(const char *s, struct reg_state *reg) * - umin=%llu, if missing, assumed 0; * - umax=%llu, if missing, assumed U64_MAX; * - smin=%lld, if missing, assumed S64_MIN; - * - smax=%lld, if missing, assummed S64_MAX; + * - smax=%lld, if missing, assumed S64_MAX; * - umin32=%d, if missing, assumed 0; * - umax32=%d, if missing, assumed U32_MAX; * - smin32=%d, if missing, assumed S32_MIN; - * - smax32=%d, if missing, assummed S32_MAX; + * - smax32=%d, if missing, assumed S32_MAX; * - var_off=(%#llx; %#llx), tnum part, we don't care about it. * * If some of the values are equal, they will be grouped (but min/max @@ -1884,7 +1884,7 @@ cleanup: * envvar is not set, this test is skipped during test_progs testing. * * We split this up into smaller subsets based on initialization and - * conditiona numeric domains to get an easy parallelization with test_progs' + * conditional numeric domains to get an easy parallelization with test_progs' * -j argument. */ @@ -1938,7 +1938,7 @@ static u64 rand_u64() { /* RAND_MAX is guaranteed to be at least 1<<15, but in practice it * seems to be 1<<31, so we need to call it thrice to get full u64; - * we'll use rougly equal split: 22 + 21 + 21 bits + * we'll use roughly equal split: 22 + 21 + 21 bits */ return ((u64)random() << 42) | (((u64)random() & RAND_21BIT_MASK) << 21) | diff --git a/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c b/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c index f81d08d429a2..51544372f52e 100644 --- a/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c +++ b/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c @@ -103,7 +103,7 @@ static int resolve_symbols(void) btf = btf__parse_elf("btf_data.bpf.o", NULL); if (CHECK(libbpf_get_error(btf), "resolve", - "Failed to load BTF from btf_data.o\n")) + "Failed to load BTF from btf_data.bpf.o\n")) return -1; nr = btf__type_cnt(btf); diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c index 974f9d6269c9..c85798966aec 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c @@ -872,7 +872,7 @@ static void test_tcp_dtime(struct test_tc_dtime *skel, int family, bool bpf_fwd) test_inet_dtime(family, SOCK_STREAM, addr, 50000 + t); /* fwdns_prio100 prog does not read delivery_time_type, so - * kernel puts the (rcv) timetamp in __sk_buff->tstamp + * kernel puts the (rcv) timestamp in __sk_buff->tstamp */ ASSERT_EQ(dtimes[INGRESS_FWDNS_P100], 0, dtime_cnt_str(t, INGRESS_FWDNS_P100)); diff --git a/tools/testing/selftests/bpf/prog_tests/test_bprm_opts.c b/tools/testing/selftests/bpf/prog_tests/test_bprm_opts.c index a0054019e677..9c0200c132d9 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_bprm_opts.c +++ b/tools/testing/selftests/bpf/prog_tests/test_bprm_opts.c @@ -51,7 +51,7 @@ static int run_set_secureexec(int map_fd, int secureexec) exit(ret); /* If the binary is executed with securexec=1, the dynamic - * loader ingores and unsets certain variables like LD_PRELOAD, + * loader ignores and unsets certain variables like LD_PRELOAD, * TMPDIR etc. 
TMPDIR is used here to simplify the example, as * LD_PRELOAD requires a real .so file. * diff --git a/tools/testing/selftests/bpf/prog_tests/test_strncmp.c b/tools/testing/selftests/bpf/prog_tests/test_strncmp.c index 7ddd6615b7e7..baceb0de9d49 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_strncmp.c +++ b/tools/testing/selftests/bpf/prog_tests/test_strncmp.c @@ -72,7 +72,7 @@ static void test_strncmp_ret(void) got = trigger_strncmp(skel); ASSERT_EQ(got, 0, "strncmp: same str"); - /* Not-null-termainted string */ + /* Not-null-terminated string */ memcpy(skel->bss->str, skel->rodata->target, sizeof(skel->bss->str)); skel->bss->str[sizeof(skel->bss->str) - 1] = 'A'; got = trigger_strncmp(skel); diff --git a/tools/testing/selftests/bpf/prog_tests/token.c b/tools/testing/selftests/bpf/prog_tests/token.c index fc4a175d8d76..fe86e4fdb89c 100644 --- a/tools/testing/selftests/bpf/prog_tests/token.c +++ b/tools/testing/selftests/bpf/prog_tests/token.c @@ -867,7 +867,7 @@ static int userns_obj_priv_implicit_token(int mnt_fd, struct token_lsm *lsm_skel } unsetenv(TOKEN_ENVVAR); - /* now the same struct_ops skeleton should succeed thanks to libppf + /* now the same struct_ops skeleton should succeed thanks to libbpf * creating BPF token from /sys/fs/bpf mount point */ skel = dummy_st_ops_success__open_and_load(); @@ -929,7 +929,7 @@ static int userns_obj_priv_implicit_token_envvar(int mnt_fd, struct token_lsm *l if (!ASSERT_OK(err, "setenv_token_path")) goto err_out; - /* now the same struct_ops skeleton should succeed thanks to libppf + /* now the same struct_ops skeleton should succeed thanks to libbpf * creating BPF token from custom mount point */ skel = dummy_st_ops_success__open_and_load(); diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c index f160d01ba5da..844f6fc8487b 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c @@ -213,7 +213,7 @@ static void uprobe_multi_test_run(struct uprobe_multi *skel, struct child *child /* * There are 2 entry and 2 exit probe called for each uprobe_multi_func_[123] - * function and each slepable probe (6) increments uprobe_multi_sleep_result. + * function and each sleepable probe (6) increments uprobe_multi_sleep_result. */ ASSERT_EQ(skel->bss->uprobe_multi_func_1_result, 2, "uprobe_multi_func_1_result"); ASSERT_EQ(skel->bss->uprobe_multi_func_2_result, 2, "uprobe_multi_func_2_result"); diff --git a/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c b/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c index dfff6feac12c..d424e7ecbd12 100644 --- a/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c +++ b/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c @@ -643,7 +643,7 @@ static void test_user_ringbuf_blocking_reserve(void) if (!ASSERT_EQ(err, 0, "deferred_kick_thread\n")) goto cleanup; - /* After spawning another thread that asychronously kicks the kernel to + /* After spawning another thread that asynchronously kicks the kernel to * drain the messages, we're able to block and successfully get a * sample once we receive an event notification. 
*/ diff --git a/tools/testing/selftests/bpf/progs/bpf_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cubic.c index d665b8a15cc4..f089faa97ae6 100644 --- a/tools/testing/selftests/bpf/progs/bpf_cubic.c +++ b/tools/testing/selftests/bpf/progs/bpf_cubic.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only -/* WARNING: This implemenation is not necessarily the same +/* WARNING: This implementation is not necessarily the same * as the tcp_cubic.c. The purpose is mainly for testing * the kernel BPF logic. * @@ -314,7 +314,7 @@ static void bictcp_update(struct bpf_bictcp *ca, __u32 cwnd, __u32 acked) * (so time^3 is done by using 64 bit) * and without the support of division of 64bit numbers * (so all divisions are done by using 32 bit) - * also NOTE the unit of those veriables + * also NOTE the unit of those variables * time = (t - K) / 2^bictcp_HZ * c = bic_scale >> 10 * rtt = (srtt >> 3) / HZ @@ -507,7 +507,7 @@ void BPF_PROG(bpf_cubic_acked, struct sock *sk, const struct ack_sample *sample) __u32 delay; bpf_cubic_acked_called = 1; - /* Some calls are for duplicates without timetamps */ + /* Some calls are for duplicates without timestamps */ if (sample->rtt_us < 0) return; diff --git a/tools/testing/selftests/bpf/progs/strobemeta.h b/tools/testing/selftests/bpf/progs/strobemeta.h index f74459eead26..a5c74d31a244 100644 --- a/tools/testing/selftests/bpf/progs/strobemeta.h +++ b/tools/testing/selftests/bpf/progs/strobemeta.h @@ -373,7 +373,7 @@ static __always_inline uint64_t read_str_var(struct strobemeta_cfg *cfg, len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, value->ptr); /* * if bpf_probe_read_user_str returns error (<0), due to casting to - * unsinged int, it will become big number, so next check is + * unsigned int, it will become big number, so next check is * sufficient to check for errors AND prove to BPF verifier, that * bpf_probe_read_user_str won't return anything bigger than * STROBE_MAX_STR_LEN @@ -557,7 +557,7 @@ static void *read_strobe_meta(struct task_struct *task, return NULL; payload_off = ctx.payload_off; - /* this should not really happen, here only to satisfy verifer */ + /* this should not really happen, here only to satisfy verifier */ if (payload_off > sizeof(data->payload)) payload_off = sizeof(data->payload); #else diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c b/tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c index da54c09e9a15..464515b824b9 100644 --- a/tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c +++ b/tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c @@ -503,7 +503,7 @@ static ret_t get_next_hop(struct bpf_dynptr *dynptr, __u64 *offset, encap_header * * fill_tuple(&t, foo, sizeof(struct iphdr), 123, 321) * - * clang will substitue a costant for sizeof, which allows the verifier + * clang will substitute a constant for sizeof, which allows the verifier * to track it's value. Based on this, it can figure out the constant * return value, and calling code works while still being "generic" to * IPv4 and IPv6. 
diff --git a/tools/testing/selftests/bpf/progs/test_core_read_macros.c b/tools/testing/selftests/bpf/progs/test_core_read_macros.c index fd54caa17319..873d85a4739b 100644 --- a/tools/testing/selftests/bpf/progs/test_core_read_macros.c +++ b/tools/testing/selftests/bpf/progs/test_core_read_macros.c @@ -36,7 +36,7 @@ int handler(void *ctx) return 0; /* next pointers for kernel address space have to be initialized from - * BPF side, user-space mmaped addresses are stil user-space addresses + * BPF side, user-space mmaped addresses are still user-space addresses */ k_probe_in.next = &k_probe_in; __builtin_preserve_access_index(({k_core_in.next = &k_core_in;})); diff --git a/tools/testing/selftests/bpf/progs/test_global_func15.c b/tools/testing/selftests/bpf/progs/test_global_func15.c index b4e089d6981d..201cc000b3f4 100644 --- a/tools/testing/selftests/bpf/progs/test_global_func15.c +++ b/tools/testing/selftests/bpf/progs/test_global_func15.c @@ -44,7 +44,7 @@ __naked int global_func15_tricky_pruning(void) * case we have a valid 1 stored in R0 register, but in * a branch case we assign some random value to R0. So if * there is something wrong with precision tracking for R0 at - * program exit, we might erronenously prune branch case, + * program exit, we might erroneously prune branch case, * because R0 in fallthrough case is imprecise (and thus any * value is valid from POV of verifier is_state_equal() logic) */ diff --git a/tools/testing/selftests/bpf/progs/test_global_map_resize.c b/tools/testing/selftests/bpf/progs/test_global_map_resize.c index 714b29c7f8b2..a3f220ba7025 100644 --- a/tools/testing/selftests/bpf/progs/test_global_map_resize.c +++ b/tools/testing/selftests/bpf/progs/test_global_map_resize.c @@ -16,7 +16,7 @@ const volatile size_t data_array_len; int sum = 0; int array[1]; -/* custom data secton */ +/* custom data section */ int my_array[1] SEC(".data.custom"); /* custom data section which should NOT be resizable, diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index dfbab214f4d1..905d5981ace1 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -1515,7 +1515,7 @@ again: value == key); } - /* Now let's delete all elemenets in parallel. */ + /* Now let's delete all elements in parallel. */ data[1] = DO_DELETE; run_parallel(TASKS, test_update_delete, data); diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 83f390a31681..c7a70e1a1085 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -1500,7 +1500,7 @@ static void calculate_summary_and_print_errors(struct test_env *env) /* * We only print error logs summary when there are failed tests and - * verbose mode is not enabled. Otherwise, results may be incosistent. + * verbose mode is not enabled. Otherwise, results may be inconsistent. 
* */ if (!verbose() && fail_cnt) { diff --git a/tools/testing/selftests/bpf/verifier/map_kptr.c b/tools/testing/selftests/bpf/verifier/map_kptr.c index d25c3e9605f1..f420c0312aa0 100644 --- a/tools/testing/selftests/bpf/verifier/map_kptr.c +++ b/tools/testing/selftests/bpf/verifier/map_kptr.c @@ -153,7 +153,7 @@ .result = REJECT, .errstr = "variable untrusted_ptr_ access var_off=(0x0; 0x7) disallowed", }, -/* Tests for unreferened PTR_TO_BTF_ID */ +/* Tests for unreferenced PTR_TO_BTF_ID */ { "map_kptr: unref: reject btf_struct_ids_match == false", .insns = { -- cgit v1.2.3 From a86857d2546c98f6c4e5677af8c6b8a80edd0704 Mon Sep 17 00:00:00 2001 From: Lin Yikai Date: Thu, 5 Sep 2024 19:03:06 +0800 Subject: bpftool: fix some typos in bpftool Hi, fix some spelling errors in bpftool, the details are as follows: -in file "bpftool-gen.rst" libppf->libbpf -in the code comments: ouptut->output Signed-off-by: Lin Yikai Link: https://lore.kernel.org/r/20240905110354.3274546-2-yikai.lin@vivo.com Signed-off-by: Alexei Starovoitov --- tools/bpf/bpftool/Documentation/bpftool-gen.rst | 2 +- tools/bpf/bpftool/feature.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst index c768e6d4ae09..1b426e58a7cd 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-gen.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst @@ -104,7 +104,7 @@ bpftool gen skeleton *FILE* - **example__load**. This function creates maps, loads and verifies BPF programs, initializes - global data maps. It corresponds to libppf's **bpf_object__load**\ () + global data maps. It corresponds to libbpf's **bpf_object__load**\ () API. - **example__open_and_load** combines **example__open** and diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index c754a428c8c6..4dbc4fcdf473 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -196,7 +196,7 @@ static void probe_unprivileged_disabled(void) { long res; - /* No support for C-style ouptut */ + /* No support for C-style output */ res = read_procfs("/proc/sys/kernel/unprivileged_bpf_disabled"); if (json_output) { @@ -225,7 +225,7 @@ static void probe_jit_enable(void) { long res; - /* No support for C-style ouptut */ + /* No support for C-style output */ res = read_procfs("/proc/sys/net/core/bpf_jit_enable"); if (json_output) { @@ -255,7 +255,7 @@ static void probe_jit_harden(void) { long res; - /* No support for C-style ouptut */ + /* No support for C-style output */ res = read_procfs("/proc/sys/net/core/bpf_jit_harden"); if (json_output) { @@ -285,7 +285,7 @@ static void probe_jit_kallsyms(void) { long res; - /* No support for C-style ouptut */ + /* No support for C-style output */ res = read_procfs("/proc/sys/net/core/bpf_jit_kallsyms"); if (json_output) { @@ -311,7 +311,7 @@ static void probe_jit_limit(void) { long res; - /* No support for C-style ouptut */ + /* No support for C-style output */ res = read_procfs("/proc/sys/net/core/bpf_jit_limit"); if (json_output) { -- cgit v1.2.3 From bd4d67f8ae55d42273c6cd661b622ed713d20350 Mon Sep 17 00:00:00 2001 From: Lin Yikai Date: Thu, 5 Sep 2024 19:03:07 +0800 Subject: libbpf: fix some typos in libbpf Hi, fix some spelling errors in libbpf, the details are as follows: -in the code comments: termintaing->terminating architecutre->architecture requring->requiring recored->recoded sanitise->sanities allowd->allowed abover->above see bpf_udst_arg()->see 
bpf_usdt_arg() Signed-off-by: Lin Yikai Link: https://lore.kernel.org/r/20240905110354.3274546-3-yikai.lin@vivo.com Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/bpf.h | 4 ++-- tools/lib/bpf/bpf_tracing.h | 4 ++-- tools/lib/bpf/btf.c | 2 +- tools/lib/bpf/libbpf.c | 2 +- tools/lib/bpf/linker.c | 4 ++-- tools/lib/bpf/usdt.bpf.h | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 972e17ec0c09..a4a7b1ad1b63 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -100,7 +100,7 @@ struct bpf_prog_load_opts { __u32 log_level; __u32 log_size; char *log_buf; - /* output: actual total log contents size (including termintaing zero). + /* output: actual total log contents size (including terminating zero). * It could be both larger than original log_size (if log was * truncated), or smaller (if log buffer wasn't filled completely). * If kernel doesn't support this feature, log_size is left unchanged. @@ -129,7 +129,7 @@ struct bpf_btf_load_opts { char *log_buf; __u32 log_level; __u32 log_size; - /* output: actual total log contents size (including termintaing zero). + /* output: actual total log contents size (including terminating zero). * It could be both larger than original log_size (if log was * truncated), or smaller (if log buffer wasn't filled completely). * If kernel doesn't support this feature, log_size is left unchanged. diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h index bad7a0dbf510..4eab132a963e 100644 --- a/tools/lib/bpf/bpf_tracing.h +++ b/tools/lib/bpf/bpf_tracing.h @@ -480,7 +480,7 @@ struct pt_regs; #endif /* * Similarly, syscall-specific conventions might differ between function call - * conventions within each architecutre. All supported architectures pass + * conventions within each architecture. All supported architectures pass * either 6 or 7 syscall arguments in registers. * * See syscall(2) manpage for succinct table with information on each arch. @@ -658,7 +658,7 @@ struct pt_regs; * BPF_PROG is a convenience wrapper for generic tp_btf/fentry/fexit and * similar kinds of BPF programs, that accept input arguments as a single * pointer to untyped u64 array, where each u64 can actually be a typed - * pointer or integer of different size. Instead of requring user to write + * pointer or integer of different size. Instead of requiring user to write * manual casts and work with array elements by index, BPF_PROG macro * allows user to declare a list of named and typed input arguments in the * same syntax as for normal C function. All the casting is hidden and diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 40aae244e35f..8d51e73d55a8 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -4192,7 +4192,7 @@ static bool btf_dedup_identical_structs(struct btf_dedup *d, __u32 id1, __u32 id * and canonical graphs are not compatible structurally, whole graphs are * incompatible. If types are structurally equivalent (i.e., all information * except referenced type IDs is exactly the same), a mapping from `canon_id` to - * a `cand_id` is recored in hypothetical mapping (`btf_dedup->hypot_map`). + * a `cand_id` is recoded in hypothetical mapping (`btf_dedup->hypot_map`). * If a type references other types, then those referenced types are checked * for equivalence recursively. 
* diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index d3a542649e6b..27ad3c6ee868 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1848,7 +1848,7 @@ static char *internal_map_name(struct bpf_object *obj, const char *real_name) snprintf(map_name, sizeof(map_name), "%.*s%.*s", pfx_len, obj->name, sfx_len, real_name); - /* sanitise map name to characters allowed by kernel */ + /* sanities map name to characters allowed by kernel */ for (p = map_name; *p && p < map_name + sizeof(map_name); p++) if (!isalnum(*p) && *p != '_' && *p != '.') *p = '_'; diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c index 9cd3d4109788..e0005c6ade88 100644 --- a/tools/lib/bpf/linker.c +++ b/tools/lib/bpf/linker.c @@ -1413,7 +1413,7 @@ recur: return true; case BTF_KIND_PTR: /* just validate overall shape of the referenced type, so no - * contents comparison for struct/union, and allowd fwd vs + * contents comparison for struct/union, and allowed fwd vs * struct/union */ exact = false; @@ -1962,7 +1962,7 @@ static int linker_append_elf_sym(struct bpf_linker *linker, struct src_obj *obj, /* If existing symbol is a strong resolved symbol, bail out, * because we lost resolution battle have nothing to - * contribute. We already checked abover that there is no + * contribute. We already checked above that there is no * strong-strong conflict. We also already tightened binding * and visibility, so nothing else to contribute at that point. */ diff --git a/tools/lib/bpf/usdt.bpf.h b/tools/lib/bpf/usdt.bpf.h index 76359bcdc94a..b811f754939f 100644 --- a/tools/lib/bpf/usdt.bpf.h +++ b/tools/lib/bpf/usdt.bpf.h @@ -39,7 +39,7 @@ enum __bpf_usdt_arg_type { struct __bpf_usdt_arg_spec { /* u64 scalar interpreted depending on arg_type, see below */ __u64 val_off; - /* arg location case, see bpf_udst_arg() for details */ + /* arg location case, see bpf_usdt_arg() for details */ enum __bpf_usdt_arg_type arg_type; /* offset of referenced register within struct pt_regs */ short reg_off; -- cgit v1.2.3 From dc3a8804d790cc7dda442e5504a0c67435b356f4 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Thu, 5 Sep 2024 08:13:52 +0000 Subject: selftests/bpf: Adapt OUTPUT appending logic to lower versions of Make The $(let ...) function is only supported by GNU Make version 4.4 [0] and above, otherwise the following exception file or directory will be generated: tools/testing/selftests/bpfFEATURE-DUMP.selftests tools/testing/selftests/bpffeature/ Considering that the GNU Make version of most Linux distributions is lower than 4.4, let us adapt the corresponding logic to it. 
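Versions of Make below 4.4 do not recognize $(let ...) as a function: the call degrades into an ordinary (empty) variable expansion while the inner $(eval include ...) still runs with OUTPUT missing its trailing slash, which is how the stray tools/testing/selftests/bpfFEATURE-DUMP.selftests artifacts above get created. A minimal sketch of the portable pattern, with feature.mk as a hypothetical stand-in for Makefile.feature:

    ifeq ($(shell expr $(MAKE_VERSION) \>= 4.4), 1)
    # Make 4.4+: $(let ...) gives the include a locally scoped OUTPUT
    $(let OUTPUT,$(OUTPUT)/,$(eval include feature.mk))
    else
    # older Make: mutate OUTPUT around the include, then strip the slash again
    OUTPUT := $(OUTPUT)/
    $(eval include feature.mk)
    OUTPUT := $(patsubst %/,%,$(OUTPUT))
    endif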
Link: https://lists.gnu.org/archive/html/info-gnu/2022-10/msg00008.html [0] Acked-by: Eduard Zingerman Signed-off-by: Pu Lehui Link: https://lore.kernel.org/r/20240905081401.1894789-2-pulehui@huaweicloud.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 7660d19b66c2..9905e3739dd0 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -187,8 +187,14 @@ FEATURE_TESTS := llvm FEATURE_DISPLAY := $(FEATURE_TESTS) # Makefile.feature expects OUTPUT to end with a slash +ifeq ($(shell expr $(MAKE_VERSION) \>= 4.4), 1) $(let OUTPUT,$(OUTPUT)/,\ $(eval include ../../../build/Makefile.feature)) +else +OUTPUT := $(OUTPUT)/ +$(eval include ../../../build/Makefile.feature) +OUTPUT := $(patsubst %/,%,$(OUTPUT)) +endif endif ifeq ($(feature-llvm),1) -- cgit v1.2.3 From a48a43884cdd46dba8cc4a15160c116ffd965800 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Thu, 5 Sep 2024 08:13:53 +0000 Subject: selftests/bpf: Rename fallback in bpf_dctcp to avoid naming conflict Recently, when compiling bpf selftests on RV64, the following compilation failure occurred: progs/bpf_dctcp.c:29:21: error: redefinition of 'fallback' as different kind of symbol 29 | volatile const char fallback[TCP_CA_NAME_MAX]; | ^ /workspace/tools/testing/selftests/bpf/tools/include/vmlinux.h:86812:15: note: previous definition is here 86812 | typedef u32 (*fallback)(u32, const unsigned char *, size_t); The reason is that the `fallback` symbol has been defined in arch/riscv/lib/crc32.c, which will cause symbol conflicts when vmlinux.h is included in bpf_dctcp. Let us rename the `fallback` symbol to `fallback_cc` in bpf_dctcp to fix this compilation failure.
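The clash is easy to reproduce in isolation; a minimal sketch built from the two quoted declarations, where the typedefs and TCP_CA_NAME_MAX are stand-ins so the snippet compiles on its own:

    typedef unsigned int u32;       /* stand-in for the kernel typedef */
    typedef unsigned long size_t;   /* stand-in; normally from stddef.h */
    #define TCP_CA_NAME_MAX 16

    /* what vmlinux.h surfaces from arch/riscv/lib/crc32.c */
    typedef u32 (*fallback)(u32, const unsigned char *, size_t);

    /* what the BPF program declares: same name, different kind of symbol */
    volatile const char fallback[TCP_CA_NAME_MAX];

Since vmlinux.h pulls in every type name BTF knows about, program-side globals have to dodge all of them, and the rename to fallback_cc does exactly that.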
Signed-off-by: Pu Lehui Link: https://lore.kernel.org/r/20240905081401.1894789-3-pulehui@huaweicloud.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c | 2 +- tools/testing/selftests/bpf/progs/bpf_dctcp.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index 1d494b4453f4..409a06975823 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -285,7 +285,7 @@ static void test_dctcp_fallback(void) dctcp_skel = bpf_dctcp__open(); if (!ASSERT_OK_PTR(dctcp_skel, "dctcp_skel")) return; - strcpy(dctcp_skel->rodata->fallback, "cubic"); + strcpy(dctcp_skel->rodata->fallback_cc, "cubic"); if (!ASSERT_OK(bpf_dctcp__load(dctcp_skel), "bpf_dctcp__load")) goto done; diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp.c b/tools/testing/selftests/bpf/progs/bpf_dctcp.c index 02f552e7fd4d..7cd73e75f52a 100644 --- a/tools/testing/selftests/bpf/progs/bpf_dctcp.c +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp.c @@ -26,7 +26,7 @@ static bool before(__u32 seq1, __u32 seq2) char _license[] SEC("license") = "GPL"; -volatile const char fallback[TCP_CA_NAME_MAX]; +volatile const char fallback_cc[TCP_CA_NAME_MAX]; const char bpf_dctcp[] = "bpf_dctcp"; const char tcp_cdg[] = "cdg"; char cc_res[TCP_CA_NAME_MAX]; @@ -71,10 +71,10 @@ void BPF_PROG(bpf_dctcp_init, struct sock *sk) struct bpf_dctcp *ca = inet_csk_ca(sk); int *stg; - if (!(tp->ecn_flags & TCP_ECN_OK) && fallback[0]) { + if (!(tp->ecn_flags & TCP_ECN_OK) && fallback_cc[0]) { /* Switch to fallback */ if (bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, - (void *)fallback, sizeof(fallback)) == -EBUSY) + (void *)fallback_cc, sizeof(fallback_cc)) == -EBUSY) ebusy_cnt++; /* Switch back to myself and the recurred bpf_dctcp_init() @@ -87,7 +87,7 @@ void BPF_PROG(bpf_dctcp_init, struct sock *sk) /* Switch back to fallback */ if (bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, - (void *)fallback, sizeof(fallback)) == -EBUSY) + (void *)fallback_cc, sizeof(fallback_cc)) == -EBUSY) ebusy_cnt++; /* Expecting -ENOTSUPP for tcp_cdg_res */ -- cgit v1.2.3 From 67ab80a01886a178c69c91728c49f94ccb71f338 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 5 Sep 2024 08:13:54 +0000 Subject: selftests/bpf: Prefer static linking for LLVM libraries It is not always convenient to have LLVM libraries installed inside CI rootfs images, thus request static libraries from llvm-config. 
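The difference is visible directly from llvm-config; an illustrative session, assuming an LLVM install that ships static archives (outputs abbreviated):

    $ llvm-config --libs mcdisassembler all-targets
    -lLLVM-17                          # shared mode: one monolithic library
    $ llvm-config --link-static --libs mcdisassembler all-targets
    -lLLVMMCDisassembler -lLLVMMC ...  # individual static archives
    $ llvm-config --link-static --system-libs
    -lrt -ldl -lpthread -lm -lz ...    # their transitive system deps

With --link-static the link no longer depends on which LLVM shared objects happen to be installed in the CI rootfs, which is the point of the change.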
Suggested-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Acked-by: Daniel Borkmann Tested-by: Daniel Borkmann Link: https://lore.kernel.org/r/20240905081401.1894789-4-pulehui@huaweicloud.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 9905e3739dd0..04716a5e43f1 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -202,11 +202,9 @@ ifeq ($(feature-llvm),1) LLVM_CONFIG_LIB_COMPONENTS := mcdisassembler all-targets # both llvm-config and lib.mk add -D_GNU_SOURCE, which ends up as conflict LLVM_CFLAGS += $(filter-out -D_GNU_SOURCE,$(shell $(LLVM_CONFIG) --cflags)) - LLVM_LDLIBS += $(shell $(LLVM_CONFIG) --libs $(LLVM_CONFIG_LIB_COMPONENTS)) - ifeq ($(shell $(LLVM_CONFIG) --shared-mode),static) - LLVM_LDLIBS += $(shell $(LLVM_CONFIG) --system-libs $(LLVM_CONFIG_LIB_COMPONENTS)) - LLVM_LDLIBS += -lstdc++ - endif + LLVM_LDLIBS += $(shell $(LLVM_CONFIG) --link-static --libs $(LLVM_CONFIG_LIB_COMPONENTS)) + LLVM_LDLIBS += $(shell $(LLVM_CONFIG) --link-static --system-libs $(LLVM_CONFIG_LIB_COMPONENTS)) + LLVM_LDLIBS += -lstdc++ LLVM_LDFLAGS += $(shell $(LLVM_CONFIG) --ldflags) endif -- cgit v1.2.3 From 0c3fc330be6d85febd0f68b66c657c752e9cd63c Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Thu, 5 Sep 2024 08:13:55 +0000 Subject: selftests/bpf: Limit URLS parsing logic to actual scope in vmtest The URLS array is only valid in the download_rootfs function and does not need to be parsed globally in advance. At the same time, the logic of loading rootfs is refactored to prepare vmtest for supporting local rootfs. Signed-off-by: Pu Lehui Link: https://lore.kernel.org/r/20240905081401.1894789-5-pulehui@huaweicloud.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/vmtest.sh | 39 +++++++++++++++++------------------ 1 file changed, 19 insertions(+), 20 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/vmtest.sh b/tools/testing/selftests/bpf/vmtest.sh index 65d14f3bbe30..87d93f29c565 100755 --- a/tools/testing/selftests/bpf/vmtest.sh +++ b/tools/testing/selftests/bpf/vmtest.sh @@ -92,19 +92,6 @@ populate_url_map() fi } -download() -{ - local file="$1" - - if [[ ! -v URLS[$file] ]]; then - echo "$file not found" >&2 - return 1 - fi - - echo "Downloading $file..." >&2 - curl -Lsf "${URLS[$file]}" "${@:2}" -} - newest_rootfs_version() { { @@ -118,16 +105,30 @@ newest_rootfs_version() download_rootfs() { - local rootfsversion="$1" - local dir="$2" + populate_url_map + + local rootfsversion="$(newest_rootfs_version)" + local file="${ARCH}/libbpf-vmtest-rootfs-$rootfsversion.tar.zst" + + if [[ ! -v URLS[$file] ]]; then + echo "$file not found" >&2 + return 1 + fi + + echo "Downloading $file..." >&2 + curl -Lsf "${URLS[$file]}" "${@:2}" +} + +load_rootfs() +{ + local dir="$1" if ! 
which zstd &> /dev/null; then echo 'Could not find "zstd" on the system, please install zstd' exit 1 fi - download "${ARCH}/libbpf-vmtest-rootfs-$rootfsversion.tar.zst" | - zstd -d | sudo tar -C "$dir" -x + download_rootfs | zstd -d | sudo tar -C "$dir" -x } recompile_kernel() @@ -227,7 +228,7 @@ create_vm_image() mkfs.ext4 -q "${rootfs_img}" mount_image - download_rootfs "$(newest_rootfs_version)" "${mount_dir}" + load_rootfs "${mount_dir}" unmount_image } @@ -402,8 +403,6 @@ main() make_command="${make_command} KBUILD_OUTPUT=${KBUILD_OUTPUT}" fi - populate_url_map - local rootfs_img="${OUTPUT_DIR}/${ROOTFS_IMAGE}" local mount_dir="${OUTPUT_DIR}/${MOUNT_DIR}" -- cgit v1.2.3 From 2294073dce32af8535e9d4a32e1efdeea35c1786 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Thu, 5 Sep 2024 08:13:56 +0000 Subject: selftests/bpf: Support local rootfs image for vmtest Support vmtest to use local rootfs image generated by [0] that is consistent with BPF CI. Now we can specify the local rootfs image through the `-l` parameter like as follows: vmtest.sh -l ./libbpf-vmtest-rootfs-2024.08.22-noble-amd64.tar.zst -- ./test_progs Meanwhile, some descriptions have been flushed. Link: https://github.com/libbpf/ci/blob/main/rootfs/mkrootfs_debian.sh [0] Acked-by: Eduard Zingerman Signed-off-by: Pu Lehui Link: https://lore.kernel.org/r/20240905081401.1894789-6-pulehui@huaweicloud.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/README.rst | 2 -- tools/testing/selftests/bpf/vmtest.sh | 21 ++++++++++++++++----- 2 files changed, 16 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst index 9b974e425af3..4a1e74b17109 100644 --- a/tools/testing/selftests/bpf/README.rst +++ b/tools/testing/selftests/bpf/README.rst @@ -85,8 +85,6 @@ In case of linker errors when running selftests, try using static linking: If you want to change pahole and llvm, you can change `PATH` environment variable in the beginning of script. -.. note:: The script currently only supports x86_64 and s390x architectures. - Additional information about selftest failures are documented here. diff --git a/tools/testing/selftests/bpf/vmtest.sh b/tools/testing/selftests/bpf/vmtest.sh index 87d93f29c565..7bd2b44deb08 100755 --- a/tools/testing/selftests/bpf/vmtest.sh +++ b/tools/testing/selftests/bpf/vmtest.sh @@ -4,9 +4,11 @@ set -u set -e -# This script currently only works for x86_64 and s390x, as -# it is based on the VM image used by the BPF CI, which is -# available only for these architectures. +# This script currently only works for the following platforms, +# as it is based on the VM image used by the BPF CI, which is +# available only for these architectures. We can also specify +# the local rootfs image generated by the following script: +# https://github.com/libbpf/ci/blob/main/rootfs/mkrootfs_debian.sh ARCH="$(uname -m)" case "${ARCH}" in s390x) @@ -34,6 +36,7 @@ aarch64) esac DEFAULT_COMMAND="./test_progs" MOUNT_DIR="mnt" +LOCAL_ROOTFS_IMAGE="" ROOTFS_IMAGE="root.img" OUTPUT_DIR="$HOME/.bpf_selftests" KCONFIG_REL_PATHS=("tools/testing/selftests/bpf/config" @@ -69,6 +72,7 @@ or Options: + -l) Specify the path to the local rootfs image. -i) Update the rootfs image with a newer version. 
-d) Update the output directory (default: ${OUTPUT_DIR}) -j) Number of jobs for compilation, similar to -j in make @@ -128,7 +132,11 @@ load_rootfs() exit 1 fi - download_rootfs | zstd -d | sudo tar -C "$dir" -x + if [[ -n "${LOCAL_ROOTFS_IMAGE}" ]]; then + cat "${LOCAL_ROOTFS_IMAGE}" | zstd -d | sudo tar -C "$dir" -x + else + download_rootfs | zstd -d | sudo tar -C "$dir" -x + fi } recompile_kernel() @@ -342,8 +350,11 @@ main() local exit_command="poweroff -f" local debug_shell="no" - while getopts ':hskid:j:' opt; do + while getopts ':hskl:id:j:' opt; do case ${opt} in + l) + LOCAL_ROOTFS_IMAGE="$OPTARG" + ;; i) update_image="yes" ;; -- cgit v1.2.3 From d95d56519026322b25ffa964bb92182620176972 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Thu, 5 Sep 2024 08:13:57 +0000 Subject: selftests/bpf: Enable cross platform testing for vmtest Add support cross platform testing for vmtest. The variable $ARCH in the current script is platform semantics, not kernel semantics. Rename it to $PLATFORM so that we can easily use $ARCH in cross-compilation. And drop `set -u` unbound variable check as we will use CROSS_COMPILE env variable. For now, Using PLATFORM= and CROSS_COMPILE= options will enable cross platform testing: PLATFORM= CROSS_COMPILE= vmtest.sh -- ./test_progs Tested-by: Eduard Zingerman Signed-off-by: Pu Lehui Link: https://lore.kernel.org/r/20240905081401.1894789-7-pulehui@huaweicloud.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/vmtest.sh | 42 ++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/vmtest.sh b/tools/testing/selftests/bpf/vmtest.sh index 7bd2b44deb08..91f940e8ce45 100755 --- a/tools/testing/selftests/bpf/vmtest.sh +++ b/tools/testing/selftests/bpf/vmtest.sh @@ -1,7 +1,6 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -set -u set -e # This script currently only works for the following platforms, @@ -9,25 +8,31 @@ set -e # available only for these architectures. We can also specify # the local rootfs image generated by the following script: # https://github.com/libbpf/ci/blob/main/rootfs/mkrootfs_debian.sh -ARCH="$(uname -m)" -case "${ARCH}" in +PLATFORM="${PLATFORM:-$(uname -m)}" +case "${PLATFORM}" in s390x) QEMU_BINARY=qemu-system-s390x QEMU_CONSOLE="ttyS1" - QEMU_FLAGS=(-smp 2) + HOST_FLAGS=(-smp 2 -enable-kvm) + CROSS_FLAGS=(-smp 2) BZIMAGE="arch/s390/boot/vmlinux" + ARCH="s390" ;; x86_64) QEMU_BINARY=qemu-system-x86_64 QEMU_CONSOLE="ttyS0,115200" - QEMU_FLAGS=(-cpu host -smp 8) + HOST_FLAGS=(-cpu host -enable-kvm -smp 8) + CROSS_FLAGS=(-smp 8) BZIMAGE="arch/x86/boot/bzImage" + ARCH="x86" ;; aarch64) QEMU_BINARY=qemu-system-aarch64 QEMU_CONSOLE="ttyAMA0,115200" - QEMU_FLAGS=(-M virt,gic-version=3 -cpu host -smp 8) + HOST_FLAGS=(-M virt,gic-version=3 -cpu host -enable-kvm -smp 8) + CROSS_FLAGS=(-M virt,gic-version=3 -cpu cortex-a76 -smp 8) BZIMAGE="arch/arm64/boot/Image" + ARCH="arm64" ;; *) echo "Unsupported architecture" @@ -41,7 +46,7 @@ ROOTFS_IMAGE="root.img" OUTPUT_DIR="$HOME/.bpf_selftests" KCONFIG_REL_PATHS=("tools/testing/selftests/bpf/config" "tools/testing/selftests/bpf/config.vm" - "tools/testing/selftests/bpf/config.${ARCH}") + "tools/testing/selftests/bpf/config.${PLATFORM}") INDEX_URL="https://raw.githubusercontent.com/libbpf/ci/master/INDEX" NUM_COMPILE_JOBS="$(nproc)" LOG_FILE_BASE="$(date +"bpf_selftests.%Y-%m-%d_%H-%M-%S")" @@ -61,6 +66,10 @@ tools/testing/selftests/bpf. 
e.g: If no command is specified and a debug shell (-s) is not requested, "${DEFAULT_COMMAND}" will be run by default. +Using PLATFORM= and CROSS_COMPILE= options will enable cross platform testing: + + PLATFORM= CROSS_COMPILE= $0 -- ./test_progs -t test_lsm + If you build your kernel using KBUILD_OUTPUT= or O= options, these can be passed as environment variables to the script: @@ -100,7 +109,7 @@ newest_rootfs_version() { { for file in "${!URLS[@]}"; do - if [[ $file =~ ^"${ARCH}"/libbpf-vmtest-rootfs-(.*)\.tar\.zst$ ]]; then + if [[ $file =~ ^"${PLATFORM}"/libbpf-vmtest-rootfs-(.*)\.tar\.zst$ ]]; then echo "${BASH_REMATCH[1]}" fi done @@ -112,7 +121,7 @@ download_rootfs() populate_url_map local rootfsversion="$(newest_rootfs_version)" - local file="${ARCH}/libbpf-vmtest-rootfs-$rootfsversion.tar.zst" + local file="${PLATFORM}/libbpf-vmtest-rootfs-$rootfsversion.tar.zst" if [[ ! -v URLS[$file] ]]; then echo "$file not found" >&2 @@ -253,12 +262,17 @@ EOF exit 1 fi + if [[ "${PLATFORM}" != "$(uname -m)" ]]; then + QEMU_FLAGS=("${CROSS_FLAGS[@]}") + else + QEMU_FLAGS=("${HOST_FLAGS[@]}") + fi + ${QEMU_BINARY} \ -nodefaults \ -display none \ -serial mon:stdio \ "${QEMU_FLAGS[@]}" \ - -enable-kvm \ -m 4G \ -drive file="${rootfs_img}",format=raw,index=1,media=disk,if=virtio,cache=none \ -kernel "${kernel_bzimage}" \ @@ -389,6 +403,11 @@ main() trap 'catch "$?"' EXIT + if [[ "${PLATFORM}" != "$(uname -m)" ]] && [[ -z "${CROSS_COMPILE}" ]]; then + echo "Cross-platform testing needs to specify CROSS_COMPILE" + exit 1 + fi + if [[ $# -eq 0 && "${debug_shell}" == "no" ]]; then echo "No command specified, will run ${DEFAULT_COMMAND} in the vm" else @@ -396,7 +415,8 @@ main() fi local kconfig_file="${OUTPUT_DIR}/latest.config" - local make_command="make -j ${NUM_COMPILE_JOBS} KCONFIG_CONFIG=${kconfig_file}" + local make_command="make ARCH=${ARCH} CROSS_COMPILE=${CROSS_COMPILE} \ + -j ${NUM_COMPILE_JOBS} KCONFIG_CONFIG=${kconfig_file}" # Figure out where the kernel is being built. # O takes precedence over KBUILD_OUTPUT. -- cgit v1.2.3 From 897b3680484b8163c9eb51ed55103504ffd4a7d1 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Thu, 5 Sep 2024 08:13:58 +0000 Subject: selftests/bpf: Add config.riscv64 Add config.riscv64 for both BPF CI and local vmtest. 
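The fragment stacks on top of the generic BPF selftest configs just like the existing per-arch files; a sketch of composing a riscv64 test kernel by hand with the in-tree merge_config.sh, run from the kernel source root:

    $ make ARCH=riscv CROSS_COMPILE=riscv64-linux-gnu- defconfig
    $ ./scripts/kconfig/merge_config.sh -m .config \
          tools/testing/selftests/bpf/config \
          tools/testing/selftests/bpf/config.vm \
          tools/testing/selftests/bpf/config.riscv64
    $ make ARCH=riscv CROSS_COMPILE=riscv64-linux-gnu- olddefconfig Image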
Tested-by: Eduard Zingerman Signed-off-by: Pu Lehui Link: https://lore.kernel.org/r/20240905081401.1894789-8-pulehui@huaweicloud.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/config.riscv64 | 84 ++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 tools/testing/selftests/bpf/config.riscv64 (limited to 'tools') diff --git a/tools/testing/selftests/bpf/config.riscv64 b/tools/testing/selftests/bpf/config.riscv64 new file mode 100644 index 000000000000..bb7043a80e1a --- /dev/null +++ b/tools/testing/selftests/bpf/config.riscv64 @@ -0,0 +1,84 @@ +CONFIG_AUDIT=y +CONFIG_BLK_CGROUP=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_BLK_DEV_RAM=y +CONFIG_BONDING=y +CONFIG_BPF_JIT_ALWAYS_ON=y +CONFIG_BPF_PRELOAD=y +CONFIG_BPF_PRELOAD_UMD=y +CONFIG_CGROUPS=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_DEVICE=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CGROUP_HUGETLB=y +CONFIG_CGROUP_NET_CLASSID=y +CONFIG_CGROUP_PERF=y +CONFIG_CGROUP_PIDS=y +CONFIG_CGROUP_SCHED=y +CONFIG_CPUSETS=y +CONFIG_DEBUG_ATOMIC_SLEEP=y +CONFIG_DEBUG_FS=y +CONFIG_DETECT_HUNG_TASK=y +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_MOUNT=y +CONFIG_EXPERT=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_EXT4_FS_SECURITY=y +CONFIG_FRAME_POINTER=y +CONFIG_HARDLOCKUP_DETECTOR=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_HUGETLBFS=y +CONFIG_INET=y +CONFIG_IPV6_SEG6_LWTUNNEL=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_JUMP_LABEL=y +CONFIG_KALLSYMS_ALL=y +CONFIG_KPROBES=y +CONFIG_MEMCG=y +CONFIG_NAMESPACES=y +CONFIG_NET=y +CONFIG_NETDEVICES=y +CONFIG_NETFILTER_XT_MATCH_BPF=y +CONFIG_NET_ACT_BPF=y +CONFIG_NET_L3_MASTER_DEV=y +CONFIG_NET_VRF=y +CONFIG_NONPORTABLE=y +CONFIG_NO_HZ_IDLE=y +CONFIG_NR_CPUS=256 +CONFIG_PACKET=y +CONFIG_PANIC_ON_OOPS=y +CONFIG_PARTITION_ADVANCED=y +CONFIG_PCI=y +CONFIG_PCI_HOST_GENERIC=y +CONFIG_POSIX_MQUEUE=y +CONFIG_PRINTK_TIME=y +CONFIG_PROC_KCORE=y +CONFIG_PROFILING=y +CONFIG_RCU_CPU_STALL_TIMEOUT=60 +CONFIG_RISCV_EFFICIENT_UNALIGNED_ACCESS=y +CONFIG_RISCV_ISA_C=y +CONFIG_RISCV_PMU=y +CONFIG_RISCV_PMU_SBI=y +CONFIG_RT_GROUP_SCHED=y +CONFIG_SECURITY_NETWORK=y +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_OF_PLATFORM=y +CONFIG_SMP=y +CONFIG_SOC_VIRT=y +CONFIG_SYSVIPC=y +CONFIG_TCP_CONG_ADVANCED=y +CONFIG_TLS=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_TUN=y +CONFIG_UNIX=y +CONFIG_UPROBES=y +CONFIG_USER_NS=y +CONFIG_VETH=y +CONFIG_VLAN_8021Q=y +CONFIG_VSOCKETS_LOOPBACK=y +CONFIG_XFRM_USER=y -- cgit v1.2.3 From c402cb85802fa059f1d42954f25acf14c3bf89e7 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Thu, 5 Sep 2024 08:13:59 +0000 Subject: selftests/bpf: Add DENYLIST.riscv64 This patch adds DENYLIST.riscv64 file for riscv64. It will help BPF CI and local vmtest to mask failing and unsupported test cases. We can use the following command to use deny list in local vmtest as previously mentioned by Manu. 
PLATFORM=riscv64 CROSS_COMPILE=riscv64-linux-gnu- vmtest.sh \ -l ./libbpf-vmtest-rootfs-2024.08.30-noble-riscv64.tar.zst -- \ ./test_progs -d \ \"$(cat tools/testing/selftests/bpf/DENYLIST.riscv64 \ | cut -d'#' -f1 \ | sed -e 's/^[[:space:]]*//' \ -e 's/[[:space:]]*$//' \ | tr -s '\n' ','\ )\" Tested-by: Eduard Zingerman Signed-off-by: Pu Lehui Link: https://lore.kernel.org/r/20240905081401.1894789-9-pulehui@huaweicloud.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/DENYLIST.riscv64 | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 tools/testing/selftests/bpf/DENYLIST.riscv64 (limited to 'tools') diff --git a/tools/testing/selftests/bpf/DENYLIST.riscv64 b/tools/testing/selftests/bpf/DENYLIST.riscv64 new file mode 100644 index 000000000000..4fc4dfdde293 --- /dev/null +++ b/tools/testing/selftests/bpf/DENYLIST.riscv64 @@ -0,0 +1,3 @@ +# riscv64 deny list for BPF CI and local vmtest +exceptions # JIT does not support exceptions +tailcalls/tailcall_bpf2bpf* # JIT does not support mixing bpf2bpf and tailcalls -- cgit v1.2.3 From b2bc9d50549939e3e103f986f3123970a6b81e44 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Thu, 5 Sep 2024 08:14:00 +0000 Subject: selftests/bpf: Add riscv64 configurations to local vmtest Add riscv64 configurations to local vmtest. We can now perform cross platform testing for riscv64 bpf using the following command: PLATFORM=riscv64 CROSS_COMPILE=riscv64-linux-gnu- vmtest.sh \ -l ./libbpf-vmtest-rootfs-2024.08.30-noble-riscv64.tar.zst -- \ ./test_progs -d \ \"$(cat tools/testing/selftests/bpf/DENYLIST.riscv64 \ | cut -d'#' -f1 \ | sed -e 's/^[[:space:]]*//' \ -e 's/[[:space:]]*$//' \ | tr -s '\n' ','\ )\" Tested-by: Eduard Zingerman Signed-off-by: Pu Lehui Link: https://lore.kernel.org/r/20240905081401.1894789-10-pulehui@huaweicloud.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/vmtest.sh | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/vmtest.sh b/tools/testing/selftests/bpf/vmtest.sh index 91f940e8ce45..79505d294c44 100755 --- a/tools/testing/selftests/bpf/vmtest.sh +++ b/tools/testing/selftests/bpf/vmtest.sh @@ -34,6 +34,15 @@ aarch64) BZIMAGE="arch/arm64/boot/Image" ARCH="arm64" ;; +riscv64) + # required qemu version v7.2.0+ + QEMU_BINARY=qemu-system-riscv64 + QEMU_CONSOLE="ttyS0,115200" + HOST_FLAGS=(-M virt -cpu host -enable-kvm -smp 8) + CROSS_FLAGS=(-M virt -cpu rv64,sscofpmf=true -smp 8) + BZIMAGE="arch/riscv/boot/Image" + ARCH="riscv" + ;; *) echo "Unsupported architecture" exit 1 -- cgit v1.2.3 From 95b1c5d17832f61d0438cd5060a997bafaf75f2c Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Thu, 5 Sep 2024 08:14:01 +0000 Subject: selftests/bpf: Add description for running vmtest on RV64 Add description in tools/testing/selftests/bpf/README.rst for running vmtest on RV64. 
Signed-off-by: Pu Lehui Link: https://lore.kernel.org/r/20240905081401.1894789-11-pulehui@huaweicloud.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/README.rst | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst index 4a1e74b17109..776fbe3cb8f9 100644 --- a/tools/testing/selftests/bpf/README.rst +++ b/tools/testing/selftests/bpf/README.rst @@ -85,6 +85,38 @@ In case of linker errors when running selftests, try using static linking: If you want to change pahole and llvm, you can change `PATH` environment variable in the beginning of script. +Running vmtest on RV64 +====================== +To speed up testing and avoid various dependency issues, it is recommended to +run vmtest in a Docker container. Before running vmtest, we need to prepare +Docker container and local rootfs image. The overall steps are as follows: + +1. Create Docker container as shown in link [0]. + +2. Use mkrootfs_debian.sh script [1] to build local rootfs image: + +.. code-block:: console + + $ sudo ./mkrootfs_debian.sh --arch riscv64 --distro noble + +3. Start Docker container [0] and run vmtest in the container: + +.. code-block:: console + + $ PLATFORM=riscv64 CROSS_COMPILE=riscv64-linux-gnu- \ + tools/testing/selftests/bpf/vmtest.sh \ + -l -- \ + ./test_progs -d \ + \"$(cat tools/testing/selftests/bpf/DENYLIST.riscv64 \ + | cut -d'#' -f1 \ + | sed -e 's/^[[:space:]]*//' \ + -e 's/[[:space:]]*$//' \ + | tr -s '\n' ',' \ + )\" + +Link: https://github.com/pulehui/riscv-bpf-vmtest.git [0] +Link: https://github.com/libbpf/ci/blob/main/rootfs/mkrootfs_debian.sh [1] + Additional information about selftest failures are documented here. -- cgit v1.2.3 From e4af74a53b7aa865e7fcc104630ebb7a9129b71f Mon Sep 17 00:00:00 2001 From: Jamie Bainbridge Date: Wed, 4 Sep 2024 16:12:26 +1000 Subject: selftests: net: enable bind tests bind_wildcard is compiled but not run, bind_timewait is not compiled. These two tests complete in a very short time, use the test harness properly, and seem reasonable to enable. The author of the tests confirmed via email that these were intended to be run. Enable these two tests. 
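The mechanism behind the one-line fix: in the kselftest framework, TEST_GEN_FILES are generated build artifacts only (helper binaries, test data), while TEST_GEN_PROGS are both built and executed by the runner. A sketch of the distinction as lib.mk consumes it:

    TEST_GEN_PROGS += bind_wildcard   # built and run by the kselftest runner
    TEST_GEN_PROGS += bind_timewait   # likewise, now actually exercised
    TEST_GEN_FILES += sctp_hello      # built as a helper, never run directly

Moving bind_wildcard out of TEST_GEN_FILES and adding bind_timewait to TEST_GEN_PROGS is therefore all it takes for both tests to run.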
Fixes: 13715acf8ab5 ("selftest: Add test for bind() conflicts.") Fixes: 2c042e8e54ef ("tcp: Add selftest for bind() and TIME_WAIT.") Signed-off-by: Jamie Bainbridge Reviewed-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/5a009b26cf5fb1ad1512d89c61b37e2fac702323.1725430322.git.jamie.bainbridge@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 8eaffd7a641c..9d5aa817411b 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -85,7 +85,8 @@ TEST_GEN_PROGS += so_incoming_cpu TEST_PROGS += sctp_vrf.sh TEST_GEN_FILES += sctp_hello TEST_GEN_FILES += ip_local_port_range -TEST_GEN_FILES += bind_wildcard +TEST_GEN_PROGS += bind_wildcard +TEST_GEN_PROGS += bind_timewait TEST_PROGS += test_vxlan_mdb.sh TEST_PROGS += test_bridge_neigh_suppress.sh TEST_PROGS += test_vxlan_nolocalbypass.sh -- cgit v1.2.3 From 6fda63c45fe8a0870226c13dcce1cc21b7c4d508 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kubalewski Date: Wed, 4 Sep 2024 15:50:34 +0200 Subject: tools/net/ynl: fix cli.py --subscribe feature Execution of command: ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/dpll.yaml / --subscribe "monitor" --sleep 10 fails with: File "/repo/./tools/net/ynl/cli.py", line 109, in main ynl.check_ntf() File "/repo/tools/net/ynl/lib/ynl.py", line 924, in check_ntf op = self.rsp_by_value[nl_msg.cmd()] KeyError: 19 Parsing Generic Netlink notification messages performs lookup for op in the message. The message was not yet decoded, and is not yet considered GenlMsg, thus msg.cmd() returns Generic Netlink family id (19) instead of proper notification command id (i.e.: DPLL_CMD_PIN_CHANGE_NTF=13). Allow the op to be obtained within NetlinkProtocol.decode(..) itself if the op was not passed to the decode function, thus allow parsing of Generic Netlink notifications without causing the failure. 
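With the fallback in place, the monitor flow from the failing command also works from a short script; a minimal sketch, assuming it is run from tools/net/ynl/ so the lib package resolves:

    import time
    from lib import YnlFamily

    ynl = YnlFamily("../../../Documentation/netlink/specs/dpll.yaml")
    ynl.ntf_subscribe("monitor")
    time.sleep(10)
    ynl.check_ntf()  # decode() now looks up op via rsp_by_value when none is passed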
Suggested-by: Donald Hunter Link: https://lore.kernel.org/netdev/m2le0n5xpn.fsf@gmail.com/ Fixes: 0a966d606c68 ("tools/net/ynl: Fix extack decoding for directional ops") Signed-off-by: Arkadiusz Kubalewski Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20240904135034.316033-1-arkadiusz.kubalewski@intel.com Signed-off-by: Jakub Kicinski --- tools/net/ynl/lib/ynl.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/net/ynl/lib/ynl.py b/tools/net/ynl/lib/ynl.py index d42c1d605969..c22c22bf2cb7 100644 --- a/tools/net/ynl/lib/ynl.py +++ b/tools/net/ynl/lib/ynl.py @@ -388,6 +388,8 @@ class NetlinkProtocol: def decode(self, ynl, nl_msg, op): msg = self._decode(nl_msg) + if op is None: + op = ynl.rsp_by_value[msg.cmd()] fixed_header_size = ynl._struct_size(op.fixed_header) msg.raw_attrs = NlAttrs(msg.raw, fixed_header_size) return msg @@ -921,8 +923,7 @@ class YnlFamily(SpecFamily): print("Netlink done while checking for ntf!?") continue - op = self.rsp_by_value[nl_msg.cmd()] - decoded = self.nlproto.decode(self, nl_msg, op) + decoded = self.nlproto.decode(self, nl_msg, None) if decoded.cmd() not in self.async_msg_ids: print("Unexpected msg id done while checking for ntf", decoded) continue @@ -980,7 +981,7 @@ class YnlFamily(SpecFamily): if nl_msg.extack: self._decode_extack(req_msg, op, nl_msg.extack) else: - op = self.rsp_by_value[nl_msg.cmd()] + op = None req_flags = [] if nl_msg.error: -- cgit v1.2.3 From 1b3bc648f5067d32959d7389a7c17c8c04e8e689 Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Thu, 5 Sep 2024 15:38:12 -0700 Subject: bpf/selftests: coverage for tp and perf event progs using kfuncs This coverage ensures that kfuncs are allowed within tracepoint and perf event programs. Signed-off-by: JP Kobryn Link: https://lore.kernel.org/r/20240905223812.141857-3-inwardvessel@gmail.com Signed-off-by: Alexei Starovoitov --- .../bpf/progs/verifier_kfunc_prog_types.c | 48 ++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c b/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c index cb32b0cfc84b..a509cad97e69 100644 --- a/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c +++ b/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c @@ -47,6 +47,22 @@ int BPF_PROG(task_kfunc_syscall) return 0; } +SEC("tracepoint") +__success +int BPF_PROG(task_kfunc_tracepoint) +{ + task_kfunc_load_test(); + return 0; +} + +SEC("perf_event") +__success +int BPF_PROG(task_kfunc_perf_event) +{ + task_kfunc_load_test(); + return 0; +} + /***************** * cgroup kfuncs * *****************/ @@ -85,6 +101,22 @@ int BPF_PROG(cgrp_kfunc_syscall) return 0; } +SEC("tracepoint") +__success +int BPF_PROG(cgrp_kfunc_tracepoint) +{ + cgrp_kfunc_load_test(); + return 0; +} + +SEC("perf_event") +__success +int BPF_PROG(cgrp_kfunc_perf_event) +{ + cgrp_kfunc_load_test(); + return 0; +} + /****************** * cpumask kfuncs * ******************/ @@ -120,3 +152,19 @@ int BPF_PROG(cpumask_kfunc_syscall) cpumask_kfunc_load_test(); return 0; } + +SEC("tracepoint") +__success +int BPF_PROG(cpumask_kfunc_tracepoint) +{ + cpumask_kfunc_load_test(); + return 0; +} + +SEC("perf_event") +__success +int BPF_PROG(cpumask_kfunc_perf_event) +{ + cpumask_kfunc_load_test(); + return 0; +} -- cgit v1.2.3 From 4b80294fb53845dc5c98cca0c989da09150f2ca9 Mon Sep 17 00:00:00 2001 From: "John B. 
Wyatt IV" Date: Wed, 4 Sep 2024 22:19:08 -0400 Subject: pm:cpupower: Add missing powercap_set_enabled() stub function There was a symbol listed in the powercap.h file that was not implemented. Implement it with a stub return of 0. Programs like SWIG require that functions that are defined in the headers be implemented. Fixes: c2294c1496b7 ("cpupower: Introduce powercap intel-rapl library and powercap-info command") Suggested-by: Shuah Khan Signed-off-by: John B. Wyatt IV Signed-off-by: John B. Wyatt IV Signed-off-by: Shuah Khan --- tools/power/cpupower/lib/powercap.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'tools') diff --git a/tools/power/cpupower/lib/powercap.c b/tools/power/cpupower/lib/powercap.c index a7a59c6bacda..94a0c69e55ef 100644 --- a/tools/power/cpupower/lib/powercap.c +++ b/tools/power/cpupower/lib/powercap.c @@ -77,6 +77,14 @@ int powercap_get_enabled(int *mode) return sysfs_get_enabled(path, mode); } +/* + * TODO: implement function. Returns dummy 0 for now. + */ +int powercap_set_enabled(int mode) +{ + return 0; +} + /* * Hardcoded, because rapl is the only powercap implementation - * this needs to get more generic if more powercap implementations -- cgit v1.2.3 From 338f490e07bc9e83c6fe60125ed9dea3e5d6a45d Mon Sep 17 00:00:00 2001 From: "John B. Wyatt IV" Date: Wed, 4 Sep 2024 22:19:09 -0400 Subject: pm:cpupower: Add SWIG bindings files for libcpupower SWIG is a tool packaged in Fedora and other distros that can generate bindings from C and C++ code for several languages including Python, Perl, and Go. These bindings allows users to easily write scripts that use and extend libcpupower's functionality. Currently, only Python is provided in the makefile, but additional languages may be added if there is demand. Added suggestions from Shuah Khan for the README and license discussion. Note that while SWIG itself is GPL v3+ licensed; the resulting output, the bindings code, is permissively licensed + the license of the .o files. Please see https://swig.org/legal.html and [1] for more details. [1] https://lore.kernel.org/linux-pm/Zqv9BOjxLAgyNP5B@hatbackup/ Suggested-by: Shuah Khan Signed-off-by: John B. Wyatt IV Signed-off-by: John B. 
Wyatt IV Signed-off-by: Shuah Khan --- tools/power/cpupower/bindings/python/.gitignore | 8 + tools/power/cpupower/bindings/python/Makefile | 31 +++ tools/power/cpupower/bindings/python/README | 59 +++++ .../cpupower/bindings/python/raw_pylibcpupower.i | 247 +++++++++++++++++++++ 4 files changed, 345 insertions(+) create mode 100644 tools/power/cpupower/bindings/python/.gitignore create mode 100644 tools/power/cpupower/bindings/python/Makefile create mode 100644 tools/power/cpupower/bindings/python/README create mode 100644 tools/power/cpupower/bindings/python/raw_pylibcpupower.i (limited to 'tools') diff --git a/tools/power/cpupower/bindings/python/.gitignore b/tools/power/cpupower/bindings/python/.gitignore new file mode 100644 index 000000000000..5c9a1f0212dd --- /dev/null +++ b/tools/power/cpupower/bindings/python/.gitignore @@ -0,0 +1,8 @@ +__pycache__/ +raw_pylibcpupower_wrap.c +*.o +*.so +*.py +!test_raw_pylibcpupower.py +# git keeps ignoring this file, use git add -f raw_libcpupower.i +!raw_pylibcpupower.i diff --git a/tools/power/cpupower/bindings/python/Makefile b/tools/power/cpupower/bindings/python/Makefile new file mode 100644 index 000000000000..d0418f902795 --- /dev/null +++ b/tools/power/cpupower/bindings/python/Makefile @@ -0,0 +1,31 @@ +# SPDX-License-Identifier: GPL-2.0-only +# Makefile for libcpupower's Python bindings +# +# This Makefile expects you have already run the makefile for cpupower to build +# the .o files in the lib directory for the bindings to be created. + +CC=gcc + +LIB_DIR = ../../lib +BIND_DIR = . +PY_INCLUDE := $(firstword $(shell python-config --includes)) +#PY_INCLUDE = $(shell python-config --includes | awk '{ print $1 }') + +OBJECTS_LIB = $(wildcard $(LIB_DIR)/*.o) +OBJECTS_BIND = $(wildcard $(BIND_DIR)/*.o) + +all: _raw_pylibcpupower.so + +_raw_pylibcpupower.so: raw_pylibcpupower_wrap.o + $(CC) -shared $(OBJECTS_LIB) raw_pylibcpupower_wrap.o -o _raw_pylibcpupower.so # raw_pylibcpupower_wrap.o +# $(CC) -shared $(OBJECTS_BIND) $(OBJECTS_LIB) -o _raw_pylibcpupower.so # raw_pylibcpupower_wrap.o + +raw_pylibcpupower_wrap.o: raw_pylibcpupower_wrap.c + $(CC) -fPIC -c raw_pylibcpupower_wrap.c $(PY_INCLUDE) + +raw_pylibcpupower_wrap.c: raw_pylibcpupower.i + swig -python raw_pylibcpupower.i + +# Will only clean the bindings folder; will not clean the actual cpupower folder +clean: + rm -f raw_pylibcpupower.py raw_pylibcpupower_wrap.c raw_pylibcpupower_wrap.o _raw_pylibcpupower.so diff --git a/tools/power/cpupower/bindings/python/README b/tools/power/cpupower/bindings/python/README new file mode 100644 index 000000000000..0a4bb2581e8a --- /dev/null +++ b/tools/power/cpupower/bindings/python/README @@ -0,0 +1,59 @@ +This folder contains the necessary files to build the Python bindings for +libcpupower (aside from the libcpupower object files). + + +requirements +------------ + +* You need the object files in the libcpupower directory compiled by +cpupower's makefile. +* The SWIG program must be installed. +* The Python's development libraries installed. + +Please check that your version of SWIG is compatible with the version of Python +installed on your machine by checking the SWIG changelog on their website. +https://swig.org/ + +Note that while SWIG itself is GPL v3+ licensed; the resulting output, +the bindings code: is permissively licensed + the license of libcpupower's .o +files. For these bindings that means GPL v2. + +Please see https://swig.org/legal.html and the discussion [1] for more details. 
+ +[1] +https://lore.kernel.org/linux-pm/Zqv9BOjxLAgyNP5B@hatbackup/ + + +build +----- + +Install SWIG and the Python development files provided by your distribution. + +Build the object files for libcpupower by running make in the cpupower +directory. + +Return to the directory this README is in to run: + +$ make + + +testing +------- + +Please verify the _raw_pylibcpupower.so and raw_pylibcpupower.py files have +been created. + +To run the test script: + +$ python test_raw_pylibcpupower.py + + +credits +------- + +Original Bindings Author: +John B. Wyatt IV +jwyatt@redhat.com +sageofredondo@gmail.com + +Copyright (C) 2024 Red Hat diff --git a/tools/power/cpupower/bindings/python/raw_pylibcpupower.i b/tools/power/cpupower/bindings/python/raw_pylibcpupower.i new file mode 100644 index 000000000000..96556d87a745 --- /dev/null +++ b/tools/power/cpupower/bindings/python/raw_pylibcpupower.i @@ -0,0 +1,247 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +%module raw_pylibcpupower +%{ +#include "../../lib/cpupower_intern.h" +#include "../../lib/acpi_cppc.h" +#include "../../lib/cpufreq.h" +#include "../../lib/cpuidle.h" +#include "../../lib/cpupower.h" +#include "../../lib/powercap.h" +%} + +/* + * cpupower_intern.h + */ + +#define PATH_TO_CPU "/sys/devices/system/cpu/" +#define MAX_LINE_LEN 4096 +#define SYSFS_PATH_MAX 255 + +int is_valid_path(const char *path); + +unsigned int cpupower_read_sysfs(const char *path, char *buf, size_t buflen); + +unsigned int cpupower_write_sysfs(const char *path, char *buf, size_t buflen); + +/* + * acpi_cppc.h + */ + +enum acpi_cppc_value { + HIGHEST_PERF, + LOWEST_PERF, + NOMINAL_PERF, + LOWEST_NONLINEAR_PERF, + LOWEST_FREQ, + NOMINAL_FREQ, + REFERENCE_PERF, + WRAPAROUND_TIME, + MAX_CPPC_VALUE_FILES +}; + +unsigned long acpi_cppc_get_data(unsigned int cpu, + enum acpi_cppc_value which); + +/* + * cpufreq.h + */ + +struct cpufreq_policy { + unsigned long min; + unsigned long max; + char *governor; +}; + +struct cpufreq_available_governors { + char *governor; + struct cpufreq_available_governors *next; + struct cpufreq_available_governors *first; +}; + +struct cpufreq_available_frequencies { + unsigned long frequency; + struct cpufreq_available_frequencies *next; + struct cpufreq_available_frequencies *first; +}; + + +struct cpufreq_affected_cpus { + unsigned int cpu; + struct cpufreq_affected_cpus *next; + struct cpufreq_affected_cpus *first; +}; + +struct cpufreq_stats { + unsigned long frequency; + unsigned long long time_in_state; + struct cpufreq_stats *next; + struct cpufreq_stats *first; +}; + +unsigned long cpufreq_get_freq_kernel(unsigned int cpu); + +unsigned long cpufreq_get_freq_hardware(unsigned int cpu); + +#define cpufreq_get(cpu) cpufreq_get_freq_kernel(cpu); + +unsigned long cpufreq_get_transition_latency(unsigned int cpu); + +int cpufreq_get_hardware_limits(unsigned int cpu, + unsigned long *min, + unsigned long *max); + +char *cpufreq_get_driver(unsigned int cpu); + +void cpufreq_put_driver(char *ptr); + +struct cpufreq_policy *cpufreq_get_policy(unsigned int cpu); + +void cpufreq_put_policy(struct cpufreq_policy *policy); + +struct cpufreq_available_governors +*cpufreq_get_available_governors(unsigned int cpu); + +void cpufreq_put_available_governors( + struct cpufreq_available_governors *first); + +struct cpufreq_available_frequencies +*cpufreq_get_available_frequencies(unsigned int cpu); + +void cpufreq_put_available_frequencies( + struct cpufreq_available_frequencies *first); + +struct cpufreq_available_frequencies 
+*cpufreq_get_boost_frequencies(unsigned int cpu); + +void cpufreq_put_boost_frequencies( + struct cpufreq_available_frequencies *first); + +struct cpufreq_affected_cpus *cpufreq_get_affected_cpus(unsigned + int cpu); + +void cpufreq_put_affected_cpus(struct cpufreq_affected_cpus *first); + +struct cpufreq_affected_cpus *cpufreq_get_related_cpus(unsigned + int cpu); + +void cpufreq_put_related_cpus(struct cpufreq_affected_cpus *first); + +struct cpufreq_stats *cpufreq_get_stats(unsigned int cpu, + unsigned long long *total_time); + +void cpufreq_put_stats(struct cpufreq_stats *stats); + +unsigned long cpufreq_get_transitions(unsigned int cpu); + +int cpufreq_set_policy(unsigned int cpu, struct cpufreq_policy *policy); + +int cpufreq_modify_policy_min(unsigned int cpu, unsigned long min_freq); + +int cpufreq_modify_policy_max(unsigned int cpu, unsigned long max_freq); + +int cpufreq_modify_policy_governor(unsigned int cpu, char *governor); + +int cpufreq_set_frequency(unsigned int cpu, + unsigned long target_frequency); + +unsigned long cpufreq_get_sysfs_value_from_table(unsigned int cpu, + const char **table, + unsigned int index, + unsigned int size); + +/* + * cpuidle.h + */ + +int cpuidle_is_state_disabled(unsigned int cpu, + unsigned int idlestate); +int cpuidle_state_disable(unsigned int cpu, unsigned int idlestate, + unsigned int disable); +unsigned long cpuidle_state_latency(unsigned int cpu, + unsigned int idlestate); +unsigned long cpuidle_state_usage(unsigned int cpu, + unsigned int idlestate); +unsigned long long cpuidle_state_time(unsigned int cpu, + unsigned int idlestate); +char *cpuidle_state_name(unsigned int cpu, + unsigned int idlestate); +char *cpuidle_state_desc(unsigned int cpu, + unsigned int idlestate); +unsigned int cpuidle_state_count(unsigned int cpu); + +char *cpuidle_get_governor(void); + +char *cpuidle_get_driver(void); + +/* + * cpupower.h + */ + +struct cpupower_topology { + /* Amount of CPU cores, packages and threads per core in the system */ + unsigned int cores; + unsigned int pkgs; + unsigned int threads; /* per core */ + + /* Array gets mallocated with cores entries, holding per core info */ + struct cpuid_core_info *core_info; +}; + +struct cpuid_core_info { + int pkg; + int core; + int cpu; + + /* flags */ + unsigned int is_online:1; +}; + +int get_cpu_topology(struct cpupower_topology *cpu_top); + +void cpu_topology_release(struct cpupower_topology cpu_top); + +int cpupower_is_cpu_online(unsigned int cpu); + +/* + * powercap.h + */ + +struct powercap_zone { + char name[MAX_LINE_LEN]; + /* + * sys_name relative to PATH_TO_POWERCAP, + * do not forget the / in between + */ + char sys_name[SYSFS_PATH_MAX]; + int tree_depth; + struct powercap_zone *parent; + struct powercap_zone *children[POWERCAP_MAX_CHILD_ZONES]; + /* More possible caps or attributes to be added? 
*/ + uint32_t has_power_uw:1, + has_energy_uj:1; + +}; + +int powercap_walk_zones(struct powercap_zone *zone, + int (*f)(struct powercap_zone *zone)); + +struct powercap_zone *powercap_init_zones(void); + +int powercap_get_enabled(int *mode); + +int powercap_set_enabled(int mode); + +int powercap_get_driver(char *driver, int buflen); + +int powercap_get_max_energy_range_uj(struct powercap_zone *zone, uint64_t *val); + +int powercap_get_energy_uj(struct powercap_zone *zone, uint64_t *val); + +int powercap_get_max_power_range_uw(struct powercap_zone *zone, uint64_t *val); + +int powercap_get_power_uw(struct powercap_zone *zone, uint64_t *val); + +int powercap_zone_get_enabled(struct powercap_zone *zone, int *mode); + +int powercap_zone_set_enabled(struct powercap_zone *zone, int mode); -- cgit v1.2.3 From 660475266b744ab02695c6f3cdf28b120b884125 Mon Sep 17 00:00:00 2001 From: "John B. Wyatt IV" Date: Wed, 4 Sep 2024 22:19:10 -0400 Subject: pm:cpupower: Include test_raw_pylibcpupower.py This script demonstrates how to make use of, and tests, the bindings. In the future, this script could become part of a larger test suite to test the bindings and libcpupower. Signed-off-by: John B. Wyatt IV Signed-off-by: John B. Wyatt IV Signed-off-by: Shuah Khan --- .../bindings/python/test_raw_pylibcpupower.py | 42 ++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100755 tools/power/cpupower/bindings/python/test_raw_pylibcpupower.py (limited to 'tools') diff --git a/tools/power/cpupower/bindings/python/test_raw_pylibcpupower.py b/tools/power/cpupower/bindings/python/test_raw_pylibcpupower.py new file mode 100755 index 000000000000..3d6f62b9556a --- /dev/null +++ b/tools/power/cpupower/bindings/python/test_raw_pylibcpupower.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0-only + +import raw_pylibcpupower as p + +# Simple function call + +""" +Get cstate count +""" +cpu_cstates_count = p.cpuidle_state_count(0) +if cpu_cstates_count > -1: + print(f"CPU 0 has {cpu_cstates_count} c-states") +else: + print(f"cstate count error: return code: {cpu_cstates_count}") + +""" +Disable cstate (will fail if the above is 0, ex: a virtual machine) +""" +cstate_disabled = p.cpuidle_state_disable(0, 0, 1) +if cpu_cstates_count == 0: + print(f"CPU 0 has {cpu_cstates_count} c-states") +else: + print(f"cstate count error: return code: {cpu_cstates_count}") + +match cstate_disabled: + case 0: + print(f"CPU state disabled") + case -1: + print(f"Idlestate not available") + case _: + print(f"Not documented") + + +# Pointer example + +topo = p.cpupower_topology() +total_cpus = p.get_cpu_topology(topo) +if total_cpus > 0: + print(f"Number of total cpus: {total_cpus} and number of cores: {topo.cores}") +else: + print(f"Error: could not get cpu topology") -- cgit v1.2.3 From f2dbc7790929b2e39a348a344311a8b70e6ed4b4 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 4 Sep 2024 14:17:05 -0700 Subject: perf jevents: Ignore sys when determining a model directory Existing sys directories aren't placed under a model directory like skylake. Placing a sys directory there causes the `is_leaf_dir` test to fail and consequently no events or metrics are generated for the model. Ignore sys directories in this case and update the comments to reflect why. This change has no effect on existing models, but when testing with a sys directory for a model, people have reported running into the no event/metric issue.
Reported-by: Stephane Eranian Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sandipan Das Cc: Thomas Richter Cc: Xu Yang Link: https://lore.kernel.org/r/20240904211705.915101-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/jevents.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/jevents.py b/tools/perf/pmu-events/jevents.py index 1d96b2204e52..bb0a5d92df4a 100755 --- a/tools/perf/pmu-events/jevents.py +++ b/tools/perf/pmu-events/jevents.py @@ -635,14 +635,17 @@ def preprocess_one_file(parents: Sequence[str], item: os.DirEntry) -> None: def process_one_file(parents: Sequence[str], item: os.DirEntry) -> None: """Process a JSON file during the main walk.""" - def is_leaf_dir(path: str) -> bool: + def is_leaf_dir_ignoring_sys(path: str) -> bool: for item in os.scandir(path): - if item.is_dir(): + if item.is_dir() and item.name != 'sys': return False return True - # model directory, reset topic - if item.is_dir() and is_leaf_dir(item.path): + # Model directories are leaves (ignoring possible sys + # directories). The FTW will walk into the directory next. Flush + # pending events and metrics and update the table names for the new + # model directory. + if item.is_dir() and is_leaf_dir_ignoring_sys(item.path): print_pending_events() print_pending_metrics() -- cgit v1.2.3 From 4bef6168c145852cebefe2f0bd78f9aaeba52843 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 5 Sep 2024 08:07:55 -0700 Subject: perf script python: Avoid buffer overflow in python PEBS register interface Running a script that processes PEBS records gives buffer overflows in valgrind. The problem is that the allocation of the register string doesn't include the terminating 0 byte. Fix this. I also replaced the very magic "28" with a more reasonable larger buffer that should fit all registers. There's no need to conserve memory here. ==2106591== Memcheck, a memory error detector ==2106591== Copyright (C) 2002-2022, and GNU GPL'd, by Julian Seward et al. 
==2106591== Using Valgrind-3.22.0 and LibVEX; rerun with -h for copyright info ==2106591== Command: ../perf script -i tcall.data gcov.py tcall.gcov ==2106591== ==2106591== Invalid write of size 1 ==2106591== at 0x713354: regs_map (trace-event-python.c:748) ==2106591== by 0x7134EB: set_regs_in_dict (trace-event-python.c:784) ==2106591== by 0x713E58: get_perf_sample_dict (trace-event-python.c:940) ==2106591== by 0x716327: python_process_general_event (trace-event-python.c:1499) ==2106591== by 0x7164E1: python_process_event (trace-event-python.c:1531) ==2106591== by 0x44F9AF: process_sample_event (builtin-script.c:2549) ==2106591== by 0x6294DC: evlist__deliver_sample (session.c:1534) ==2106591== by 0x6296D0: machines__deliver_event (session.c:1573) ==2106591== by 0x629C39: perf_session__deliver_event (session.c:1655) ==2106591== by 0x625830: ordered_events__deliver_event (session.c:193) ==2106591== by 0x630B23: do_flush (ordered-events.c:245) ==2106591== by 0x630E7A: __ordered_events__flush (ordered-events.c:324) ==2106591== Address 0x7186fe0 is 0 bytes after a block of size 0 alloc'd ==2106591== at 0x484280F: malloc (vg_replace_malloc.c:442) ==2106591== by 0x7134AD: set_regs_in_dict (trace-event-python.c:780) ==2106591== by 0x713E58: get_perf_sample_dict (trace-event-python.c:940) ==2106591== by 0x716327: python_process_general_event (trace-event-python.c:1499) ==2106591== by 0x7164E1: python_process_event (trace-event-python.c:1531) ==2106591== by 0x44F9AF: process_sample_event (builtin-script.c:2549) ==2106591== by 0x6294DC: evlist__deliver_sample (session.c:1534) ==2106591== by 0x6296D0: machines__deliver_event (session.c:1573) ==2106591== by 0x629C39: perf_session__deliver_event (session.c:1655) ==2106591== by 0x625830: ordered_events__deliver_event (session.c:193) ==2106591== by 0x630B23: do_flush (ordered-events.c:245) ==2106591== by 0x630E7A: __ordered_events__flush (ordered-events.c:324) ==2106591== ==2106591== Invalid read of size 1 ==2106591== at 0x484B6C6: strlen (vg_replace_strmem.c:502) ==2106591== by 0x555D494: PyUnicode_FromString (unicodeobject.c:1899) ==2106591== by 0x7134F7: set_regs_in_dict (trace-event-python.c:786) ==2106591== by 0x713E58: get_perf_sample_dict (trace-event-python.c:940) ==2106591== by 0x716327: python_process_general_event (trace-event-python.c:1499) ==2106591== by 0x7164E1: python_process_event (trace-event-python.c:1531) ==2106591== by 0x44F9AF: process_sample_event (builtin-script.c:2549) ==2106591== by 0x6294DC: evlist__deliver_sample (session.c:1534) ==2106591== by 0x6296D0: machines__deliver_event (session.c:1573) ==2106591== by 0x629C39: perf_session__deliver_event (session.c:1655) ==2106591== by 0x625830: ordered_events__deliver_event (session.c:193) ==2106591== by 0x630B23: do_flush (ordered-events.c:245) ==2106591== Address 0x7186fe0 is 0 bytes after a block of size 0 alloc'd ==2106591== at 0x484280F: malloc (vg_replace_malloc.c:442) ==2106591== by 0x7134AD: set_regs_in_dict (trace-event-python.c:780) ==2106591== by 0x713E58: get_perf_sample_dict (trace-event-python.c:940) ==2106591== by 0x716327: python_process_general_event (trace-event-python.c:1499) ==2106591== by 0x7164E1: python_process_event (trace-event-python.c:1531) ==2106591== by 0x44F9AF: process_sample_event (builtin-script.c:2549) ==2106591== by 0x6294DC: evlist__deliver_sample (session.c:1534) ==2106591== by 0x6296D0: machines__deliver_event (session.c:1573) ==2106591== by 0x629C39: perf_session__deliver_event (session.c:1655) ==2106591== by 0x625830: 
ordered_events__deliver_event (session.c:193) ==2106591== by 0x630B23: do_flush (ordered-events.c:245) ==2106591== by 0x630E7A: __ordered_events__flush (ordered-events.c:324) ==2106591== ==2106591== Invalid write of size 1 ==2106591== at 0x713354: regs_map (trace-event-python.c:748) ==2106591== by 0x713539: set_regs_in_dict (trace-event-python.c:789) ==2106591== by 0x713E58: get_perf_sample_dict (trace-event-python.c:940) ==2106591== by 0x716327: python_process_general_event (trace-event-python.c:1499) ==2106591== by 0x7164E1: python_process_event (trace-event-python.c:1531) ==2106591== by 0x44F9AF: process_sample_event (builtin-script.c:2549) ==2106591== by 0x6294DC: evlist__deliver_sample (session.c:1534) ==2106591== by 0x6296D0: machines__deliver_event (session.c:1573) ==2106591== by 0x629C39: perf_session__deliver_event (session.c:1655) ==2106591== by 0x625830: ordered_events__deliver_event (session.c:193) ==2106591== by 0x630B23: do_flush (ordered-events.c:245) ==2106591== by 0x630E7A: __ordered_events__flush (ordered-events.c:324) ==2106591== Address 0x7186fe0 is 0 bytes after a block of size 0 alloc'd ==2106591== at 0x484280F: malloc (vg_replace_malloc.c:442) ==2106591== by 0x7134AD: set_regs_in_dict (trace-event-python.c:780) ==2106591== by 0x713E58: get_perf_sample_dict (trace-event-python.c:940) ==2106591== by 0x716327: python_process_general_event (trace-event-python.c:1499) ==2106591== by 0x7164E1: python_process_event (trace-event-python.c:1531) ==2106591== by 0x44F9AF: process_sample_event (builtin-script.c:2549) ==2106591== by 0x6294DC: evlist__deliver_sample (session.c:1534) ==2106591== by 0x6296D0: machines__deliver_event (session.c:1573) ==2106591== by 0x629C39: perf_session__deliver_event (session.c:1655) ==2106591== by 0x625830: ordered_events__deliver_event (session.c:193) ==2106591== by 0x630B23: do_flush (ordered-events.c:245) ==2106591== by 0x630E7A: __ordered_events__flush (ordered-events.c:324) ==2106591== ==2106591== Invalid read of size 1 ==2106591== at 0x484B6C6: strlen (vg_replace_strmem.c:502) ==2106591== by 0x555D494: PyUnicode_FromString (unicodeobject.c:1899) ==2106591== by 0x713545: set_regs_in_dict (trace-event-python.c:791) ==2106591== by 0x713E58: get_perf_sample_dict (trace-event-python.c:940) ==2106591== by 0x716327: python_process_general_event (trace-event-python.c:1499) ==2106591== by 0x7164E1: python_process_event (trace-event-python.c:1531) ==2106591== by 0x44F9AF: process_sample_event (builtin-script.c:2549) ==2106591== by 0x6294DC: evlist__deliver_sample (session.c:1534) ==2106591== by 0x6296D0: machines__deliver_event (session.c:1573) ==2106591== by 0x629C39: perf_session__deliver_event (session.c:1655) ==2106591== by 0x625830: ordered_events__deliver_event (session.c:193) ==2106591== by 0x630B23: do_flush (ordered-events.c:245) ==2106591== Address 0x7186fe0 is 0 bytes after a block of size 0 alloc'd ==2106591== at 0x484280F: malloc (vg_replace_malloc.c:442) ==2106591== by 0x7134AD: set_regs_in_dict (trace-event-python.c:780) ==2106591== by 0x713E58: get_perf_sample_dict (trace-event-python.c:940) ==2106591== by 0x716327: python_process_general_event (trace-event-python.c:1499) ==2106591== by 0x7164E1: python_process_event (trace-event-python.c:1531) ==2106591== by 0x44F9AF: process_sample_event (builtin-script.c:2549) ==2106591== by 0x6294DC: evlist__deliver_sample (session.c:1534) ==2106591== by 0x6296D0: machines__deliver_event (session.c:1573) ==2106591== by 0x629C39: perf_session__deliver_event (session.c:1655) ==2106591== by 0x625830: 
ordered_events__deliver_event (session.c:193) ==2106591== by 0x630B23: do_flush (ordered-events.c:245) ==2106591== by 0x630E7A: __ordered_events__flush (ordered-events.c:324) ==2106591== 73056 total, 29 ignored Signed-off-by: Andi Kleen Cc: Adrian Hunter Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240905151058.2127122-2-ak@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/scripting-engines/trace-event-python.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index 6971dd6c231f..d7183134b669 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -762,6 +762,8 @@ static void regs_map(struct regs_dump *regs, uint64_t mask, const char *arch, ch } } +#define MAX_REG_SIZE 128 + static int set_regs_in_dict(PyObject *dict, struct perf_sample *sample, struct evsel *evsel) @@ -769,14 +771,7 @@ static int set_regs_in_dict(PyObject *dict, struct perf_event_attr *attr = &evsel->core.attr; const char *arch = perf_env__arch(evsel__env(evsel)); - /* - * Here value 28 is a constant size which can be used to print - * one register value and its corresponds to: - * 16 chars is to specify 64 bit register in hexadecimal. - * 2 chars is for appending "0x" to the hexadecimal value and - * 10 chars is for register name. - */ - int size = __sw_hweight64(attr->sample_regs_intr) * 28; + int size = (__sw_hweight64(attr->sample_regs_intr) * MAX_REG_SIZE) + 1; char *bf = malloc(size); if (!bf) return -1; -- cgit v1.2.3 From 6e05d28ff232cf445cc6ae59336b7f2081ef9b96 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 5 Sep 2024 10:07:35 -0700 Subject: perf mem: Check mem_events for all eligible PMUs The current perf_pmu__mem_events_init() only checks the availability of the mem_events for the first eligible PMU. It works for non-hybrid machines and for hybrid machines that have the same mem_events. However, it may bring issues if a hybrid machine has different mem_events on different PMUs, e.g., Alder Lake and Raptor Lake. A mem-loads-aux event is only required for the p-core. The mem_events on both the e-core and the p-core should be checked and marked. The issue was not found earlier because it was hidden by another bug, which only records the mem-events for the e-core, so the wrong check for the p-core events never complained.
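Condensed, the fix (full diff below, after the tags) replaces the single check with a scan over every eligible mem PMU, so that any one PMU without usable mem_events fails the whole init:

int perf_pmu__mem_events_init(void)
{
	struct perf_pmu *pmu = NULL;

	/* check every mem PMU, not just the first eligible one */
	while ((pmu = perf_pmus__scan_mem(pmu)) != NULL) {
		if (__perf_pmu__mem_events_init(pmu))
			return -ENOENT;
	}

	return 0;
}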
Fixes: abbdd79b786e036e ("perf mem: Clean up perf_mem_events__name()") Signed-off-by: Kan Liang Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240905170737.4070743-1-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-c2c.c | 2 +- tools/perf/builtin-mem.c | 2 +- tools/perf/util/mem-events.c | 14 +++++++++++++- tools/perf/util/mem-events.h | 2 +- 4 files changed, 16 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index cd2bd573bfc3..cef95b2781c0 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -3261,7 +3261,7 @@ static int perf_c2c__record(int argc, const char **argv) return -1; } - if (perf_pmu__mem_events_init(pmu)) { + if (perf_pmu__mem_events_init()) { pr_err("failed: memory events not supported\n"); return -1; } diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index bb38bb5a1c26..a9f94785809e 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -88,7 +88,7 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem, return -1; } - if (perf_pmu__mem_events_init(pmu)) { + if (perf_pmu__mem_events_init()) { pr_err("failed: memory events not supported\n"); return -1; } diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c index be048bd02f36..17f80013e574 100644 --- a/tools/perf/util/mem-events.c +++ b/tools/perf/util/mem-events.c @@ -192,7 +192,7 @@ static bool perf_pmu__mem_events_supported(const char *mnt, struct perf_pmu *pmu return !stat(path, &st); } -int perf_pmu__mem_events_init(struct perf_pmu *pmu) +static int __perf_pmu__mem_events_init(struct perf_pmu *pmu) { const char *mnt = sysfs__mount(); bool found = false; @@ -219,6 +219,18 @@ int perf_pmu__mem_events_init(struct perf_pmu *pmu) return found ? 0 : -ENOENT; } +int perf_pmu__mem_events_init(void) +{ + struct perf_pmu *pmu = NULL; + + while ((pmu = perf_pmus__scan_mem(pmu)) != NULL) { + if (__perf_pmu__mem_events_init(pmu)) + return -ENOENT; + } + + return 0; +} + void perf_pmu__mem_events_list(struct perf_pmu *pmu) { int j; diff --git a/tools/perf/util/mem-events.h b/tools/perf/util/mem-events.h index ca31014d7934..a6fc2a593938 100644 --- a/tools/perf/util/mem-events.h +++ b/tools/perf/util/mem-events.h @@ -30,7 +30,7 @@ extern unsigned int perf_mem_events__loads_ldlat; extern struct perf_mem_event perf_mem_events[PERF_MEM_EVENTS__MAX]; int perf_pmu__mem_events_parse(struct perf_pmu *pmu, const char *str); -int perf_pmu__mem_events_init(struct perf_pmu *pmu); +int perf_pmu__mem_events_init(void); struct perf_mem_event *perf_pmu__mem_events_ptr(struct perf_pmu *pmu, int i); struct perf_pmu *perf_mem_events_find_pmu(void); -- cgit v1.2.3 From 5ad7db2c3f941cde3045ce38a9c4c40b0c7d56b9 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 5 Sep 2024 10:07:36 -0700 Subject: perf mem: Fix missed p-core mem events on ADL and RPL The p-core mem events are missed when launching 'perf mem record' on ADL and RPL. root@number:~# perf mem record sleep 1 Memory events are enabled on a subset of CPUs: 16-27 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.032 MB perf.data ] root@number:~# perf evlist cpu_atom/mem-loads,ldlat=30/P cpu_atom/mem-stores/P dummy:u A 'record' variable in 'struct perf_mem_event' indicates whether a mem event in a mem_events[] should be recorded. The current code only configures the variable for the first eligible PMU.
It's good enough for a non-hybrid machine or a hybrid machine which has the same mem_events[]. However, if a different mem_events[] is used for different PMUs on a hybrid machine, e.g., ADL or RPL, the 'record' for the second PMU never gets a chance to be set. The mem_events[] of the second PMU are always ignored. 'perf mem' doesn't support per-PMU configuration now, so a per-PMU mem_events[] 'record' variable doesn't make sense. Make it global. That also avoids searching for the per-PMU mem_events[] via perf_pmu__mem_events_ptr every time. Committer testing: root@number:~# perf evlist -g cpu_atom/mem-loads,ldlat=30/P cpu_atom/mem-stores/P {cpu_core/mem-loads-aux/,cpu_core/mem-loads,ldlat=30/} cpu_core/mem-stores/P dummy:u root@number:~# The :S for '{cpu_core/mem-loads-aux/,cpu_core/mem-loads,ldlat=30/}' is not being added by 'perf evlist -g', to be checked. Fixes: abbdd79b786e036e ("perf mem: Clean up perf_mem_events__name()") Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Kan Liang Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Closes: https://lore.kernel.org/lkml/Zthu81fA3kLC2CS2@x1/ Link: https://lore.kernel.org/r/20240905170737.4070743-2-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-c2c.c | 12 ++++-------- tools/perf/builtin-mem.c | 17 ++++++----------- tools/perf/util/mem-events.c | 6 ++++-- tools/perf/util/mem-events.h | 2 +- 4 files changed, 15 insertions(+), 22 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index cef95b2781c0..15e1fce71c72 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -3285,19 +3285,15 @@ static int perf_c2c__record(int argc, const char **argv) * PERF_MEM_EVENTS__LOAD_STORE if it is supported.
*/ if (e->tag) { - e->record = true; + perf_mem_record[PERF_MEM_EVENTS__LOAD_STORE] = true; rec_argv[i++] = "-W"; } else { - e = perf_pmu__mem_events_ptr(pmu, PERF_MEM_EVENTS__LOAD); - e->record = true; - - e = perf_pmu__mem_events_ptr(pmu, PERF_MEM_EVENTS__STORE); - e->record = true; + perf_mem_record[PERF_MEM_EVENTS__LOAD] = true; + perf_mem_record[PERF_MEM_EVENTS__STORE] = true; } } - e = perf_pmu__mem_events_ptr(pmu, PERF_MEM_EVENTS__LOAD); - if (e->record) + if (perf_mem_record[PERF_MEM_EVENTS__LOAD]) rec_argv[i++] = "-W"; rec_argv[i++] = "-d"; diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index a9f94785809e..615e905450b3 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -117,22 +117,17 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem, if (e->tag && (mem->operation & MEM_OPERATION_LOAD) && (mem->operation & MEM_OPERATION_STORE)) { - e->record = true; + perf_mem_record[PERF_MEM_EVENTS__LOAD_STORE] = true; rec_argv[i++] = "-W"; } else { - if (mem->operation & MEM_OPERATION_LOAD) { - e = perf_pmu__mem_events_ptr(pmu, PERF_MEM_EVENTS__LOAD); - e->record = true; - } + if (mem->operation & MEM_OPERATION_LOAD) + perf_mem_record[PERF_MEM_EVENTS__LOAD] = true; - if (mem->operation & MEM_OPERATION_STORE) { - e = perf_pmu__mem_events_ptr(pmu, PERF_MEM_EVENTS__STORE); - e->record = true; - } + if (mem->operation & MEM_OPERATION_STORE) + perf_mem_record[PERF_MEM_EVENTS__STORE] = true; } - e = perf_pmu__mem_events_ptr(pmu, PERF_MEM_EVENTS__LOAD); - if (e->record) + if (perf_mem_record[PERF_MEM_EVENTS__LOAD]) rec_argv[i++] = "-W"; rec_argv[i++] = "-d"; diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c index 17f80013e574..051feb93ed8d 100644 --- a/tools/perf/util/mem-events.c +++ b/tools/perf/util/mem-events.c @@ -29,6 +29,8 @@ struct perf_mem_event perf_mem_events[PERF_MEM_EVENTS__MAX] = { }; #undef E +bool perf_mem_record[PERF_MEM_EVENTS__MAX] = { 0 }; + static char mem_loads_name[100]; static char mem_stores_name[100]; @@ -163,7 +165,7 @@ int perf_pmu__mem_events_parse(struct perf_pmu *pmu, const char *str) continue; if (strstr(e->tag, tok)) - e->record = found = true; + perf_mem_record[j] = found = true; } tok = strtok_r(NULL, ",", &saveptr); @@ -261,7 +263,7 @@ int perf_mem_events__record_args(const char **rec_argv, int *argv_nr) for (int j = 0; j < PERF_MEM_EVENTS__MAX; j++) { e = perf_pmu__mem_events_ptr(pmu, j); - if (!e->record) + if (!perf_mem_record[j]) continue; if (!e->supported) { diff --git a/tools/perf/util/mem-events.h b/tools/perf/util/mem-events.h index a6fc2a593938..8dc27db9fd52 100644 --- a/tools/perf/util/mem-events.h +++ b/tools/perf/util/mem-events.h @@ -6,7 +6,6 @@ #include struct perf_mem_event { - bool record; bool supported; bool ldlat; u32 aux_event; @@ -28,6 +27,7 @@ struct perf_pmu; extern unsigned int perf_mem_events__loads_ldlat; extern struct perf_mem_event perf_mem_events[PERF_MEM_EVENTS__MAX]; +extern bool perf_mem_record[PERF_MEM_EVENTS__MAX]; int perf_pmu__mem_events_parse(struct perf_pmu *pmu, const char *str); int perf_pmu__mem_events_init(void); -- cgit v1.2.3 From 003265bb6f028d7bcd7cbd92d6ba2b4e26382796 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 5 Sep 2024 10:07:37 -0700 Subject: perf mem: Fix the wrong reference in parse_record_events() A segmentation fault can be triggered when running 'perf mem record -e ldlat-loads' The commit 35b38a71c92fa033 ("perf mem: Rework command option handling") moves the OPT_CALLBACK of event from __cmd_record() to cmd_mem(). 
When __cmd_record() is invoked, 'mem' has already been referenced (&), so the &mem passed into parse_record_events() is a double reference (&&) of the original struct perf_mem mem. But in cmd_mem(), the &mem is a single reference (&) of the original struct perf_mem mem. Fixes: 35b38a71c92fa033 ("perf mem: Rework command option handling") Signed-off-by: Kan Liang Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/r/20240905170737.4070743-3-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-mem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index 615e905450b3..651188c1d825 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -47,7 +47,7 @@ struct perf_mem { static int parse_record_events(const struct option *opt, const char *str, int unset __maybe_unused) { - struct perf_mem *mem = *(struct perf_mem **)opt->value; + struct perf_mem *mem = (struct perf_mem *)opt->value; struct perf_pmu *pmu; pmu = perf_mem_events_find_pmu(); -- cgit v1.2.3 From 7beaf1da074f7ea25454d6c11da142c3892d3c4e Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 5 Sep 2024 12:02:31 -0600 Subject: selftests:resctrl: Fix build failure on archs without __cpuid_count() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When resctrl is built on architectures without __cpuid_count() support, the build fails. resctrl uses __cpuid_count() defined in kselftest.h. Even though the problem is seen while building resctrl on aarch64, this error can be seen on any platform that doesn't support CPUID. CPUID is an x86/x86-64 feature, and code paths with CPUID asm commands will fail to build on all other architectures. All other tests that call __cpuid_count() do so from x86/x86_64 code paths when __i386__ or __x86_64__ are defined; resctrl is an exception. Fix the problem by defining __cpuid_count() only when __i386__ or __x86_64__ are defined in kselftest.h, and by changing resctrl to call __cpuid_count() only when __i386__ or __x86_64__ are defined.
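Condensed, the guard is the following sketch (with a generic function name for illustration; the actual patch, shown after the build error transcript below, keeps the fallback return false after the #endif instead of using an #else):

static bool arch_supports_feature(void)
{
#if defined(__i386__) || defined(__x86_64__) /* arch */
	unsigned int eax, ebx, ecx, edx;

	/* CPUID probe, only compiled on x86 */
	__cpuid_count(0x10, 1, eax, ebx, ecx, edx);
	return (ecx >> 3) & 1;
#else
	return false;	/* no CPUID on this architecture */
#endif
}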
In file included from resctrl.h:24, from cat_test.c:11: In function ‘arch_supports_noncont_cat’, inlined from ‘noncont_cat_run_test’ at cat_test.c:326:6: ../kselftest.h:74:9: error: impossible constraint in ‘asm’ 74 | __asm__ __volatile__ ("cpuid\n\t" \ | ^~~~~~~ cat_test.c:304:17: note: in expansion of macro ‘__cpuid_count’ 304 | __cpuid_count(0x10, 1, eax, ebx, ecx, edx); | ^~~~~~~~~~~~~ ../kselftest.h:74:9: error: impossible constraint in ‘asm’ 74 | __asm__ __volatile__ ("cpuid\n\t" \ | ^~~~~~~ cat_test.c:306:17: note: in expansion of macro ‘__cpuid_count’ 306 | __cpuid_count(0x10, 2, eax, ebx, ecx, edx); Fixes: ae638551ab64 ("selftests/resctrl: Add non-contiguous CBMs CAT test") Reported-by: Muhammad Usama Anjum Closes: https://lore.kernel.org/lkml/20240809071059.265914-1-usama.anjum@collabora.com/ Reported-by: Ilpo Järvinen Signed-off-by: Shuah Khan Acked-by: Reinette Chatre Reviewed-by: Muhammad Usama Anjum Reviewed-by: Ilpo Järvinen Signed-off-by: Shuah Khan --- tools/testing/selftests/kselftest.h | 2 ++ tools/testing/selftests/resctrl/cat_test.c | 7 +++++-- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index b8967b6e29d5..e195ec156859 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -61,6 +61,7 @@ #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) #endif +#if defined(__i386__) || defined(__x86_64__) /* arch */ /* * gcc cpuid.h provides __cpuid_count() since v4.4. * Clang/LLVM cpuid.h provides __cpuid_count() since v3.4.0. @@ -75,6 +76,7 @@ : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \ : "0" (level), "2" (count)) #endif +#endif /* end arch */ /* define kselftest exit codes */ #define KSFT_PASS 0 diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c index 742782438ca3..94cfdba5308d 100644 --- a/tools/testing/selftests/resctrl/cat_test.c +++ b/tools/testing/selftests/resctrl/cat_test.c @@ -290,12 +290,12 @@ static int cat_run_test(const struct resctrl_test *test, const struct user_param static bool arch_supports_noncont_cat(const struct resctrl_test *test) { - unsigned int eax, ebx, ecx, edx; - /* AMD always supports non-contiguous CBM. */ if (get_vendor() == ARCH_AMD) return true; +#if defined(__i386__) || defined(__x86_64__) /* arch */ + unsigned int eax, ebx, ecx, edx; /* Intel support for non-contiguous CBM needs to be discovered. */ if (!strcmp(test->resource, "L3")) __cpuid_count(0x10, 1, eax, ebx, ecx, edx); @@ -305,6 +305,9 @@ static bool arch_supports_noncont_cat(const struct resctrl_test *test) return false; return ((ecx >> 3) & 1); +#endif /* end arch */ + + return false; } static int noncont_cat_run_test(const struct resctrl_test *test, -- cgit v1.2.3 From 80e67f1802d0fc21543216557a68320c71d7dbe1 Mon Sep 17 00:00:00 2001 From: "John B. Wyatt IV" Date: Fri, 6 Sep 2024 09:00:06 -0400 Subject: pm:cpupower: Add error warning when SWIG is not installed Add an error message to better explain to the user when SWIG and python-config are missing from the path. The Makefile was cleaned up and unneeded elements were removed. Suggested-by: Shuah Khan Signed-off-by: John B. Wyatt IV Signed-off-by: John B.
Wyatt IV Signed-off-by: Shuah Khan --- tools/power/cpupower/bindings/python/Makefile | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/power/cpupower/bindings/python/Makefile b/tools/power/cpupower/bindings/python/Makefile index d0418f902795..dc09c5b66ead 100644 --- a/tools/power/cpupower/bindings/python/Makefile +++ b/tools/power/cpupower/bindings/python/Makefile @@ -4,26 +4,28 @@ # This Makefile expects you have already run the makefile for cpupower to build # the .o files in the lib directory for the bindings to be created. -CC=gcc - -LIB_DIR = ../../lib -BIND_DIR = . -PY_INCLUDE := $(firstword $(shell python-config --includes)) -#PY_INCLUDE = $(shell python-config --includes | awk '{ print $1 }') +CC := gcc +HAVE_SWIG := $(shell if which swig >/dev/null 2>&1; then echo 1; else echo 0; fi) +HAVE_PYCONFIG := $(shell if which python-config >/dev/null 2>&1; then echo 1; else echo 0; fi) +LIB_DIR := ../../lib +PY_INCLUDE = $(firstword $(shell python-config --includes)) OBJECTS_LIB = $(wildcard $(LIB_DIR)/*.o) -OBJECTS_BIND = $(wildcard $(BIND_DIR)/*.o) all: _raw_pylibcpupower.so _raw_pylibcpupower.so: raw_pylibcpupower_wrap.o - $(CC) -shared $(OBJECTS_LIB) raw_pylibcpupower_wrap.o -o _raw_pylibcpupower.so # raw_pylibcpupower_wrap.o -# $(CC) -shared $(OBJECTS_BIND) $(OBJECTS_LIB) -o _raw_pylibcpupower.so # raw_pylibcpupower_wrap.o + $(CC) -shared $(OBJECTS_LIB) raw_pylibcpupower_wrap.o -o _raw_pylibcpupower.so raw_pylibcpupower_wrap.o: raw_pylibcpupower_wrap.c $(CC) -fPIC -c raw_pylibcpupower_wrap.c $(PY_INCLUDE) raw_pylibcpupower_wrap.c: raw_pylibcpupower.i +ifeq ($(HAVE_SWIG),0) + $(error "swig was not found. Make sure you have it installed and in the PATH to generate the bindings.") +else ifeq ($(HAVE_PYCONFIG),0) + $(error "python-config was not found. Make sure you have it installed and in the PATH to generate the bindings.") +endif swig -python raw_pylibcpupower.i # Will only clean the bindings folder; will not clean the actual cpupower folder -- cgit v1.2.3 From af1ec38c6ccc31ec963ac4bcf8f6a7d8f44d210a Mon Sep 17 00:00:00 2001 From: zhang jiao Date: Fri, 6 Sep 2024 10:52:59 +0800 Subject: selftests/timers: Remove unused NSEC_PER_SEC macro By reading the code, I found the macro NSEC_PER_SEC is never referenced in the code. Just remove it. 
Signed-off-by: zhang jiao Reviewed-by: Shuah Khan Acked-by: John Stultz Signed-off-by: Shuah Khan --- tools/testing/selftests/timers/change_skew.c | 3 --- tools/testing/selftests/timers/skew_consistency.c | 2 -- 2 files changed, 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/timers/change_skew.c b/tools/testing/selftests/timers/change_skew.c index 4421cd562c24..18e794a46c23 100644 --- a/tools/testing/selftests/timers/change_skew.c +++ b/tools/testing/selftests/timers/change_skew.c @@ -30,9 +30,6 @@ #include #include "../kselftest.h" -#define NSEC_PER_SEC 1000000000LL - - int change_skew_test(int ppm) { struct timex tx; diff --git a/tools/testing/selftests/timers/skew_consistency.c b/tools/testing/selftests/timers/skew_consistency.c index c8e6bffe4e0a..83450145fe65 100644 --- a/tools/testing/selftests/timers/skew_consistency.c +++ b/tools/testing/selftests/timers/skew_consistency.c @@ -36,8 +36,6 @@ #include #include "../kselftest.h" -#define NSEC_PER_SEC 1000000000LL - int main(int argc, char **argv) { struct timex tx; -- cgit v1.2.3 From f8c6b7913dfaa67475883f94261c278adbcaa0ae Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Fri, 6 Sep 2024 14:24:53 +0100 Subject: bpftool: Improve btf c dump sorting stability The existing algorithm for BTF C dump sorting uses only the types and names of the structs and unions for ordering. As the dump contains structs with the same names but different contents, the relative ordering of those structs ends up accidental. This patch addresses this problem by introducing a new sorting field that contains a hash of the struct/union field names and types to disambiguate comparison of the non-uniquely named structs. Signed-off-by: Mykyta Yatsenko Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240906132453.146085-1-mykyta.yatsenko5@gmail.com --- tools/bpf/bpftool/btf.c | 80 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 75 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index 3b57ba095ab6..7d2af1ff3c8d 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -50,6 +50,7 @@ struct sort_datum { int type_rank; const char *sort_name; const char *own_name; + __u64 disambig_hash; }; static const char *btf_int_enc_str(__u8 encoding) @@ -584,20 +585,88 @@ static const char *btf_type_sort_name(const struct btf *btf, __u32 index, bool f return NULL; } +static __u64 hasher(__u64 hash, __u64 val) +{ + return hash * 31 + val; +} + +static __u64 btf_name_hasher(__u64 hash, const struct btf *btf, __u32 name_off) +{ + if (!name_off) + return hash; + + return hasher(hash, str_hash(btf__name_by_offset(btf, name_off))); +} + +static __u64 btf_type_disambig_hash(const struct btf *btf, __u32 id, bool include_members) +{ + const struct btf_type *t = btf__type_by_id(btf, id); + int i; + size_t hash = 0; + + hash = btf_name_hasher(hash, btf, t->name_off); + + switch (btf_kind(t)) { + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + for (i = 0; i < btf_vlen(t); i++) { + __u32 name_off = btf_is_enum(t) ?
+ btf_enum(t)[i].name_off : + btf_enum64(t)[i].name_off; + + hash = btf_name_hasher(hash, btf, name_off); + } + break; + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + if (!include_members) + break; + for (i = 0; i < btf_vlen(t); i++) { + const struct btf_member *m = btf_members(t) + i; + + hash = btf_name_hasher(hash, btf, m->name_off); + /* resolve field type's name and hash it as well */ + hash = hasher(hash, btf_type_disambig_hash(btf, m->type, false)); + } + break; + case BTF_KIND_TYPE_TAG: + case BTF_KIND_CONST: + case BTF_KIND_PTR: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_TYPEDEF: + case BTF_KIND_DECL_TAG: + hash = hasher(hash, btf_type_disambig_hash(btf, t->type, include_members)); + break; + case BTF_KIND_ARRAY: { + struct btf_array *arr = btf_array(t); + + hash = hasher(hash, arr->nelems); + hash = hasher(hash, btf_type_disambig_hash(btf, arr->type, include_members)); + break; + } + default: + break; + } + return hash; +} + static int btf_type_compare(const void *left, const void *right) { const struct sort_datum *d1 = (const struct sort_datum *)left; const struct sort_datum *d2 = (const struct sort_datum *)right; int r; - if (d1->type_rank != d2->type_rank) - return d1->type_rank < d2->type_rank ? -1 : 1; - - r = strcmp(d1->sort_name, d2->sort_name); + r = d1->type_rank - d2->type_rank; + r = r ?: strcmp(d1->sort_name, d2->sort_name); + r = r ?: strcmp(d1->own_name, d2->own_name); if (r) return r; - return strcmp(d1->own_name, d2->own_name); + if (d1->disambig_hash != d2->disambig_hash) + return d1->disambig_hash < d2->disambig_hash ? -1 : 1; + + return d1->index - d2->index; } static struct sort_datum *sort_btf_c(const struct btf *btf) @@ -618,6 +687,7 @@ static struct sort_datum *sort_btf_c(const struct btf *btf) d->type_rank = btf_type_rank(btf, i, false); d->sort_name = btf_type_sort_name(btf, i, false); d->own_name = btf__name_by_offset(btf, t->name_off); + d->disambig_hash = btf_type_disambig_hash(btf, i, true); } qsort(datums, n, sizeof(struct sort_datum), btf_type_compare); -- cgit v1.2.3 From 8a3f14bb1e944f496d1a16096435ddb2e12e4753 Mon Sep 17 00:00:00 2001 From: Sam James Date: Fri, 6 Sep 2024 14:48:14 +0100 Subject: libbpf: Workaround (another) -Wmaybe-uninitialized false positive MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We get this with GCC 15 -O3 (at least): ``` libbpf.c: In function ‘bpf_map__init_kern_struct_ops’: libbpf.c:1109:18: error: ‘mod_btf’ may be used uninitialized [-Werror=maybe-uninitialized] 1109 | kern_btf = mod_btf ? mod_btf->btf : obj->btf_vmlinux; | ~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ libbpf.c:1094:28: note: ‘mod_btf’ was declared here 1094 | struct module_btf *mod_btf; | ^~~~~~~ In function ‘find_struct_ops_kern_types’, inlined from ‘bpf_map__init_kern_struct_ops’ at libbpf.c:1102:8: libbpf.c:982:21: error: ‘btf’ may be used uninitialized [-Werror=maybe-uninitialized] 982 | kern_type = btf__type_by_id(btf, kern_type_id); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ libbpf.c: In function ‘bpf_map__init_kern_struct_ops’: libbpf.c:967:21: note: ‘btf’ was declared here 967 | struct btf *btf; | ^~~ ``` This is similar to the other libbpf fix from a few weeks ago for the same modelling-errno issue (fab45b962749184e1a1a57c7c583782b78fad539). 
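In sketch form, the warning comes from the errno-style out-parameter pattern: the callee guarantees the pointer is written whenever it returns 0, but GCC cannot model that, so the workaround is an explicit NULL initialization (find_btf() here is a hypothetical stand-in for the libbpf internals):

#include <errno.h>
#include <stddef.h>

struct btf;

/* hypothetical helper: fills *out and returns 0, or returns -errno */
extern int find_btf(int id, struct btf **out);

static int use_btf(int id)
{
	struct btf *btf = NULL;	/* without "= NULL", GCC 15 -O3 may warn that
				 * btf is used uninitialized below */
	int err = find_btf(id, &btf);

	if (err)
		return err;

	return btf != NULL;	/* only reached when find_btf() wrote btf */
}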
Signed-off-by: Sam James Signed-off-by: Andrii Nakryiko Link: https://bugs.gentoo.org/939106 Link: https://lore.kernel.org/bpf/f6962729197ae7cdf4f6d1512625bd92f2322d31.1725630494.git.sam@gentoo.org --- tools/lib/bpf/libbpf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 27ad3c6ee868..4f29e06c2641 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -986,7 +986,7 @@ find_struct_ops_kern_types(struct bpf_object *obj, const char *tname_raw, { const struct btf_type *kern_type, *kern_vtype; const struct btf_member *kern_data_member; - struct btf *btf; + struct btf *btf = NULL; __s32 kern_vtype_id, kern_type_id; char tname[256]; __u32 i; @@ -1116,7 +1116,7 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map) const struct btf *btf = obj->btf; struct bpf_struct_ops *st_ops; const struct btf *kern_btf; - struct module_btf *mod_btf; + struct module_btf *mod_btf = NULL; void *data, *kern_data; const char *tname; int err; -- cgit v1.2.3 From 94e86b174d103d941b4afc4f016af8af9e5352fa Mon Sep 17 00:00:00 2001 From: Zhu Jun Date: Fri, 6 Sep 2024 02:13:33 -0700 Subject: tools/hv: Add memory allocation check in hv_fcopy_start Added error handling for memory allocation failures of file_name and path_name. Signed-off-by: Zhu Jun Reviewed-by: Dexuan Cui Tested-by: Saurabh Sengar Link: https://lore.kernel.org/r/20240906091333.11419-1-zhujun2@cmss.chinamobile.com Signed-off-by: Wei Liu Message-ID: <20240906091333.11419-1-zhujun2@cmss.chinamobile.com> --- tools/hv/hv_fcopy_uio_daemon.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'tools') diff --git a/tools/hv/hv_fcopy_uio_daemon.c b/tools/hv/hv_fcopy_uio_daemon.c index 3ce316cc9f97..7a00f3066a98 100644 --- a/tools/hv/hv_fcopy_uio_daemon.c +++ b/tools/hv/hv_fcopy_uio_daemon.c @@ -296,6 +296,13 @@ static int hv_fcopy_start(struct hv_start_fcopy *smsg_in) file_name = (char *)malloc(file_size * sizeof(char)); path_name = (char *)malloc(path_size * sizeof(char)); + if (!file_name || !path_name) { + free(file_name); + free(path_name); + syslog(LOG_ERR, "Can't allocate memory for file name and/or path name"); + return HV_E_FAIL; + } + wcstoutf8(file_name, (__u16 *)in_file_name, file_size); wcstoutf8(path_name, (__u16 *)in_path_name, path_size); -- cgit v1.2.3 From 8c9c01ce695eea84d19482e7429e3d54ceb7585c Mon Sep 17 00:00:00 2001 From: Madhavan Srinivasan Date: Mon, 12 Aug 2024 15:11:52 +0530 Subject: selftests/powerpc: Allow building without static libc Currently exec-target.c is linked statically with libc, which on Fedora at least requires installing an additional package (glibc-static). If that package is not installed the build fails with: CC exec_target /usr/bin/ld: cannot find -lc: No such file or directory collect2: error: ld returned 1 exit status All exec_target.c does is call sys_exit, which can be done easily enough using inline assembly, and removes the requirement for a static libc to be installed. 
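For comparison only (not part of the patch), the same no-libc exit would look like this sketch on x86-64, where the syscall number goes in rax, the status in rdi, and 'syscall' plays the role of 'sc':

#define _GNU_SOURCE
#include <sys/syscall.h>

void _start(void)
{
	asm volatile (
		"mov %[nr], %%eax\n\t"	/* syscall number: SYS_exit */
		"xor %%edi, %%edi\n\t"	/* exit status: 0 */
		"syscall"
		:
		: [nr] "i" (SYS_exit)
		/* syscall clobbers rcx/r11; rax and rdi are written above */
		: "rax", "rdi", "rcx", "r11", "memory");
}

As with the powerpc version, this only links when built with -nostartfiles.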
Suggested-by: Michael Ellerman Signed-off-by: Madhavan Srinivasan Signed-off-by: Michael Ellerman Link: https://msgid.link/20240812094152.418586-1-maddy@linux.ibm.com --- tools/testing/selftests/powerpc/benchmarks/Makefile | 2 +- tools/testing/selftests/powerpc/benchmarks/exec_target.c | 16 ++++++++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/benchmarks/Makefile b/tools/testing/selftests/powerpc/benchmarks/Makefile index 1321922038d0..ca4483c238b9 100644 --- a/tools/testing/selftests/powerpc/benchmarks/Makefile +++ b/tools/testing/selftests/powerpc/benchmarks/Makefile @@ -18,4 +18,4 @@ $(OUTPUT)/context_switch: LDLIBS += -lpthread $(OUTPUT)/fork: LDLIBS += -lpthread -$(OUTPUT)/exec_target: CFLAGS += -static -nostartfiles +$(OUTPUT)/exec_target: CFLAGS += -nostartfiles diff --git a/tools/testing/selftests/powerpc/benchmarks/exec_target.c b/tools/testing/selftests/powerpc/benchmarks/exec_target.c index c14b0fc1edde..a6408d3f26cd 100644 --- a/tools/testing/selftests/powerpc/benchmarks/exec_target.c +++ b/tools/testing/selftests/powerpc/benchmarks/exec_target.c @@ -7,10 +7,22 @@ */ #define _GNU_SOURCE -#include #include void _start(void) { - syscall(SYS_exit, 0); + asm volatile ( + "li %%r0, %[sys_exit];" + "li %%r3, 0;" + "sc;" + : + : [sys_exit] "i" (SYS_exit) + /* + * "sc" will clobber r0, r3-r13, cr0, ctr, xer and memory. + * Even though sys_exit never returns, handle clobber + * registers. + */ + : "r0", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", + "r11", "r12", "r13", "cr0", "ctr", "xer", "memory" + ); } -- cgit v1.2.3 From 5c6e3d5a5da118be2ae074bd70d111994147c708 Mon Sep 17 00:00:00 2001 From: Yanfei Xu Date: Wed, 28 Aug 2024 16:42:29 +0800 Subject: cxl/pci: Remove duplicated implementation of waiting for memory_info_valid commit ce17ad0d5498 ("cxl: Wait Memory_Info_Valid before access memory related info") added another implementation, which is cxl_dvsec_mem_range_valid(), of waiting for memory_info_valid without realizing it duplicated wait_for_valid(). Remove wait_for_valid() and retain cxl_dvsec_mem_range_valid() as the former is hardcoded to check only the Memory_Info_Valid bit of DVSEC range 1, while the latter allows for selection between DVSEC range 1 or 2 via parameter. Suggested-by: Dan Williams Reviewed-by: Jonathan Cameron Signed-off-by: Yanfei Xu Reviewed-by: Alison Schofield Link: https://patch.msgid.link/20240828084231.1378789-3-yanfei.xu@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/pci.c | 41 +++++------------------------------------ drivers/cxl/cxl.h | 2 +- drivers/cxl/port.c | 2 +- tools/testing/cxl/test/mock.c | 4 ++-- 4 files changed, 9 insertions(+), 40 deletions(-) (limited to 'tools') diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 075a2b2d2ea0..37e537e50b34 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -211,37 +211,6 @@ int cxl_await_media_ready(struct cxl_dev_state *cxlds) } EXPORT_SYMBOL_NS_GPL(cxl_await_media_ready, CXL); -static int wait_for_valid(struct pci_dev *pdev, int d) -{ - u32 val; - int rc; - - /* - * Memory_Info_Valid: When set, indicates that the CXL Range 1 Size high - * and Size Low registers are valid. Must be set within 1 second of - * deassertion of reset to CXL device. Likely it is already set by the - * time this runs, but otherwise give a 1.5 second timeout in case of - * clock skew. 
- */ - rc = pci_read_config_dword(pdev, d + CXL_DVSEC_RANGE_SIZE_LOW(0), &val); - if (rc) - return rc; - - if (val & CXL_DVSEC_MEM_INFO_VALID) - return 0; - - msleep(1500); - - rc = pci_read_config_dword(pdev, d + CXL_DVSEC_RANGE_SIZE_LOW(0), &val); - if (rc) - return rc; - - if (val & CXL_DVSEC_MEM_INFO_VALID) - return 0; - - return -ETIMEDOUT; -} - static int cxl_set_mem_enable(struct cxl_dev_state *cxlds, u16 val) { struct pci_dev *pdev = to_pci_dev(cxlds->dev); @@ -322,11 +291,13 @@ static int devm_cxl_enable_hdm(struct device *host, struct cxl_hdm *cxlhdm) return devm_add_action_or_reset(host, disable_hdm, cxlhdm); } -int cxl_dvsec_rr_decode(struct device *dev, int d, +int cxl_dvsec_rr_decode(struct device *dev, struct cxl_port *port, struct cxl_endpoint_dvsec_info *info) { struct pci_dev *pdev = to_pci_dev(dev); + struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); int hdm_count, rc, i, ranges = 0; + int d = cxlds->cxl_dvsec; u16 cap, ctrl; if (!d) { @@ -353,11 +324,9 @@ int cxl_dvsec_rr_decode(struct device *dev, int d, if (!hdm_count || hdm_count > 2) return -EINVAL; - rc = wait_for_valid(pdev, d); - if (rc) { - dev_dbg(dev, "Failure awaiting MEM_INFO_VALID (%d)\n", rc); + rc = cxl_dvsec_mem_range_valid(cxlds, 0); + if (rc) return rc; - } /* * The current DVSEC values are moot if the memory capability is diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 2890f7e2aad5..0fc96f8bf15c 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -811,7 +811,7 @@ struct cxl_hdm *devm_cxl_setup_hdm(struct cxl_port *port, int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm, struct cxl_endpoint_dvsec_info *info); int devm_cxl_add_passthrough_decoder(struct cxl_port *port); -int cxl_dvsec_rr_decode(struct device *dev, int dvsec, +int cxl_dvsec_rr_decode(struct device *dev, struct cxl_port *port, struct cxl_endpoint_dvsec_info *info); bool is_cxl_region(struct device *dev); diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c index d7d5d982ce69..861dde65768f 100644 --- a/drivers/cxl/port.c +++ b/drivers/cxl/port.c @@ -98,7 +98,7 @@ static int cxl_endpoint_port_probe(struct cxl_port *port) struct cxl_port *root; int rc; - rc = cxl_dvsec_rr_decode(cxlds->dev, cxlds->cxl_dvsec, &info); + rc = cxl_dvsec_rr_decode(cxlds->dev, port, &info); if (rc < 0) return rc; diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c index bbd7d938156d..f4ce96cc11d4 100644 --- a/tools/testing/cxl/test/mock.c +++ b/tools/testing/cxl/test/mock.c @@ -228,7 +228,7 @@ int __wrap_cxl_hdm_decode_init(struct cxl_dev_state *cxlds, } EXPORT_SYMBOL_NS_GPL(__wrap_cxl_hdm_decode_init, CXL); -int __wrap_cxl_dvsec_rr_decode(struct device *dev, int dvsec, +int __wrap_cxl_dvsec_rr_decode(struct device *dev, struct cxl_port *port, struct cxl_endpoint_dvsec_info *info) { int rc = 0, index; @@ -237,7 +237,7 @@ int __wrap_cxl_dvsec_rr_decode(struct device *dev, int dvsec, if (ops && ops->is_mock_dev(dev)) rc = 0; else - rc = cxl_dvsec_rr_decode(dev, dvsec, info); + rc = cxl_dvsec_rr_decode(dev, port, info); put_cxl_mock_ops(index); return rc; -- cgit v1.2.3 From c1632cc5ed386cc8d85693ef578a106f4fb1932c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 6 Sep 2024 15:37:43 -0300 Subject: perf trace augmented_syscalls.bpf: Move the renameat aumenter to renameat2, temporarily While trying to shape Howard Chu's generic BPF augmenter transition into the codebase I got stuck with the renameat2 syscall. 
Until I noticed that the attempt at reusing augmenters was making it use the 'openat' syscall augmenter, which collects just one string syscall arg, for the 'renameat2' syscall, which takes two strings. So, for the moment, just to help in this transition period, since 'renameat2' is what is used these days in the 'mv' utility, just associate the BPF collector with the more widely used syscall; hopefully the transition to Howard's generic BPF augmenter will cure this, so get it out of the way for now! So now we still have that odd "reuse", but for something we're not testing, so it won't get in the way anymore: root@number:~# rm -f 987654 ; touch 123456 ; perf trace -vv -e rename* mv 123456 987654 |& grep renameat Reusing "openat" BPF sys_enter augmenter for "renameat" 0.000 ( 0.079 ms): mv/1158612 renameat2(olddfd: CWD, oldname: "123456", newdfd: CWD, newname: "987654", flags: NOREPLACE) = 0 root@number:~# Reviewed-by: Ian Rogers Cc: Adrian Hunter Cc: Howard Chu Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/CAP-5=fXjGYs=tpBgETK-P9U-CuXssytk9pSnTXpfphrmmOydWA@mail.gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c index 0acbd74e8c76..0f9bd2690d4e 100644 --- a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c +++ b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c @@ -260,8 +260,8 @@ int sys_enter_rename(struct syscall_enter_args *args) return augmented__output(args, augmented_args, len); } -SEC("tp/syscalls/sys_enter_renameat") -int sys_enter_renameat(struct syscall_enter_args *args) +SEC("tp/syscalls/sys_enter_renameat2") +int sys_enter_renameat2(struct syscall_enter_args *args) { struct augmented_args_payload *augmented_args = augmented_args_payload(); const void *oldpath_arg = (const void *)args->args[1], -- cgit v1.2.3 From c90a88d33a23a8b3c58ee0e1d18d7392244b9b03 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 6 Sep 2024 16:26:45 -0300 Subject: perf trace: Use a common encoding for augmented arguments, with size + error + payload We were using a more compact format, without explicitly encoding the size and possible error in the payload for an argument. To do it generically, at least as Howard Chu did in his GSoC activities, it is more convenient to use the same model that was being used for string arguments, passing { size, error, payload }.
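In sketch form, each augmented argument then travels as a small header followed by the payload, so userspace can locate and validate every value the same way (the next_arg() walker below is an illustrative assumption, not code from the patch):

/* common shape of one augmented syscall argument */
struct augmented_arg {
	unsigned int size;	/* bytes of 'value' collected by BPF */
	int	     err;	/* 0, or the bpf_probe_read_user() error */
	char	     value[];	/* string, sockaddr, timespec, ... */
};

/* illustrative: step to the next augmented arg in a buffer, assuming
 * the payload is padded to 8 bytes as the BPF side's PERF_ALIGN() does */
static const struct augmented_arg *next_arg(const struct augmented_arg *arg)
{
	return (const struct augmented_arg *)
		((const char *)arg + sizeof(*arg) + ((arg->size + 7) & ~7u));
}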
So use that for the non-string syscall args we have so far: struct timespec struct perf_event_attr struct sockaddr (this one has even a variable size) With this in place we have the userspace pretty printers: perf_event_attr___scnprintf() syscall_arg__scnprintf_augmented_sockaddr() syscall_arg__scnprintf_augmented_timespec() Ready to have the generic BPF collector in tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c sending its generic payload and thus we'll use them instead of a generic libbpf btf_dump interface that doesn't know about the sockaddr mux, perf_event_attr non-trivial fields (sample_type, etc), leaving it as a (useful) fallback that prints just basic types until we put in place a more sophisticated pretty printer infrastructure that associates synthesized enums to struct fields using the header scrapers we have in tools/perf/trace/beauty/, some of them in this list: $ ls tools/perf/trace/beauty/*.sh tools/perf/trace/beauty/arch_errno_names.sh tools/perf/trace/beauty/kcmp_type.sh tools/perf/trace/beauty/perf_ioctl.sh tools/perf/trace/beauty/statx_mask.sh tools/perf/trace/beauty/clone.sh tools/perf/trace/beauty/kvm_ioctl.sh tools/perf/trace/beauty/pkey_alloc_access_rights.sh tools/perf/trace/beauty/sync_file_range.sh tools/perf/trace/beauty/drm_ioctl.sh tools/perf/trace/beauty/madvise_behavior.sh tools/perf/trace/beauty/prctl_option.sh tools/perf/trace/beauty/usbdevfs_ioctl.sh tools/perf/trace/beauty/fadvise.sh tools/perf/trace/beauty/mmap_flags.sh tools/perf/trace/beauty/rename_flags.sh tools/perf/trace/beauty/vhost_virtio_ioctl.sh tools/perf/trace/beauty/fs_at_flags.sh tools/perf/trace/beauty/mmap_prot.sh tools/perf/trace/beauty/sndrv_ctl_ioctl.sh tools/perf/trace/beauty/x86_arch_prctl.sh tools/perf/trace/beauty/fsconfig.sh tools/perf/trace/beauty/mount_flags.sh tools/perf/trace/beauty/sndrv_pcm_ioctl.sh tools/perf/trace/beauty/fsmount.sh tools/perf/trace/beauty/move_mount_flags.sh tools/perf/trace/beauty/sockaddr.sh tools/perf/trace/beauty/fspick.sh tools/perf/trace/beauty/mremap_flags.sh tools/perf/trace/beauty/socket.sh $ Testing it: root@number:~# rm -f 987654 ; touch 123456 ; perf trace -e rename* mv 123456 987654 0.000 ( 0.031 ms): mv/1193096 renameat2(olddfd: CWD, oldname: "123456", newdfd: CWD, newname: "987654", flags: NOREPLACE) = 0 root@number:~# perf trace -e *nanosleep sleep 1.2345678901 0.000 (1234.654 ms): sleep/1192697 clock_nanosleep(rqtp: { .tv_sec: 1, .tv_nsec: 234567891 }, rmtp: 0x7ffe1ea80460) = 0 root@number:~# perf trace -e perf_event_open* perf stat -e cpu-clock sleep 1 0.000 ( 0.011 ms): perf/1192701 perf_event_open(attr_uptr: { type: 1 (software), size: 136, config: 0 (PERF_COUNT_SW_CPU_CLOCK), sample_type: IDENTIFIER, read_format: TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING, disabled: 1, inherit: 1, enable_on_exec: 1, exclude_guest: 1 }, pid: 1192702 (perf), cpu: -1, group_fd: -1, flags: FD_CLOEXEC) = 3 Performance counter stats for 'sleep 1': 0.51 msec cpu-clock # 0.001 CPUs utilized 1.001242090 seconds time elapsed 0.000000000 seconds user 0.001010000 seconds sys root@number:~# perf trace -e connect* ping -c 1 bsky.app 0.000 ( 0.130 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: LOCAL, path: /run/systemd/resolve/io.systemd.Resolve }, addrlen: 42) = 0 23.907 ( 0.006 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: INET, port: 0, addr: 3.20.108.158 }, addrlen: 16) = 0 23.915 PING bsky.app (3.20.108.158) 56(84) bytes of data.
( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: UNSPEC }, addrlen: 16) = 0 23.917 ( 0.002 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: INET, port: 0, addr: 3.12.170.30 }, addrlen: 16) = 0 23.921 ( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: UNSPEC }, addrlen: 16) = 0 23.923 ( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: INET, port: 0, addr: 18.217.70.179 }, addrlen: 16) = 0 23.925 ( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: UNSPEC }, addrlen: 16) = 0 23.927 ( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: INET, port: 0, addr: 3.132.20.46 }, addrlen: 16) = 0 23.930 ( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: UNSPEC }, addrlen: 16) = 0 23.931 ( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: INET, port: 0, addr: 3.142.89.165 }, addrlen: 16) = 0 23.934 ( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: UNSPEC }, addrlen: 16) = 0 23.935 ( 0.002 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: INET, port: 0, addr: 18.119.147.159 }, addrlen: 16) = 0 23.938 ( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: UNSPEC }, addrlen: 16) = 0 23.940 ( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: INET, port: 0, addr: 3.22.38.164 }, addrlen: 16) = 0 23.942 ( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: UNSPEC }, addrlen: 16) = 0 23.944 ( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: INET, port: 0, addr: 3.13.14.133 }, addrlen: 16) = 0 23.956 ( 0.001 ms): ping/1192740 connect(fd: 5, uservaddr: { .family: INET, port: 1025, addr: 3.20.108.158 }, addrlen: 16) = 0 ^C --- bsky.app ping statistics --- 1 packets transmitted, 0 received, 100% packet loss, time 0ms root@number:~# Reviewed-by: Ian Rogers Cc: Adrian Hunter Cc: Howard Chu Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/CAP-5=fW4=2GoP6foAN6qbrCiUzy0a_TzHbd8rvDsakTPfdzvfg@mail.gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/trace/beauty/perf_event_open.c | 2 +- tools/perf/trace/beauty/sockaddr.c | 2 +- tools/perf/trace/beauty/timespec.c | 2 +- .../util/bpf_skel/augmented_raw_syscalls.bpf.c | 104 +++++++++++++-------- 4 files changed, 66 insertions(+), 44 deletions(-) (limited to 'tools') diff --git a/tools/perf/trace/beauty/perf_event_open.c b/tools/perf/trace/beauty/perf_event_open.c index 01ee15fe9d0c..632237128640 100644 --- a/tools/perf/trace/beauty/perf_event_open.c +++ b/tools/perf/trace/beauty/perf_event_open.c @@ -76,7 +76,7 @@ static size_t perf_event_attr___scnprintf(struct perf_event_attr *attr, char *bf static size_t syscall_arg__scnprintf_augmented_perf_event_attr(struct syscall_arg *arg, char *bf, size_t size) { - return perf_event_attr___scnprintf((void *)arg->augmented.args, bf, size, arg->trace->show_zeros); + return perf_event_attr___scnprintf((void *)arg->augmented.args->value, bf, size, arg->trace->show_zeros); } static size_t syscall_arg__scnprintf_perf_event_attr(char *bf, size_t size, struct syscall_arg *arg) diff --git a/tools/perf/trace/beauty/sockaddr.c b/tools/perf/trace/beauty/sockaddr.c index 2e0e867c0c1b..a17a27ac2a6f 100644 --- a/tools/perf/trace/beauty/sockaddr.c +++ b/tools/perf/trace/beauty/sockaddr.c @@ -47,7 +47,7 @@ static size_t (*af_scnprintfs[])(struct sockaddr *sa, char *bf, size_t size) = { static size_t syscall_arg__scnprintf_augmented_sockaddr(struct syscall_arg *arg, char *bf, size_t size) { - struct sockaddr *sa = (struct sockaddr *)arg->augmented.args; + struct 
sockaddr *sa = (struct sockaddr *)&arg->augmented.args->value; char family[32]; size_t printed; diff --git a/tools/perf/trace/beauty/timespec.c b/tools/perf/trace/beauty/timespec.c index e1a61f092aad..b14ab72a2738 100644 --- a/tools/perf/trace/beauty/timespec.c +++ b/tools/perf/trace/beauty/timespec.c @@ -7,7 +7,7 @@ static size_t syscall_arg__scnprintf_augmented_timespec(struct syscall_arg *arg, char *bf, size_t size) { - struct timespec *ts = (struct timespec *)arg->augmented.args; + struct timespec *ts = (struct timespec *)arg->augmented.args->value; return scnprintf(bf, size, "{ .tv_sec: %" PRIu64 ", .tv_nsec: %" PRIu64 " }", ts->tv_sec, ts->tv_nsec); } diff --git a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c index 0f9bd2690d4e..9c7d2f855294 100644 --- a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c +++ b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c @@ -10,6 +10,9 @@ #include #include +#define PERF_ALIGN(x, a) __PERF_ALIGN_MASK(x, (typeof(x))(a)-1) +#define __PERF_ALIGN_MASK(x, mask) (((x)+(mask))&~(mask)) + /** * is_power_of_2() - check if a value is a power of two * @n: the value to check @@ -66,19 +69,6 @@ struct syscall_exit_args { long ret; }; -struct augmented_arg { - unsigned int size; - int err; - char value[PATH_MAX]; -}; - -struct pids_filtered { - __uint(type, BPF_MAP_TYPE_HASH); - __type(key, pid_t); - __type(value, bool); - __uint(max_entries, 64); -} pids_filtered SEC(".maps"); - /* * Desired design of maximum size and alignment (see RFC2553) */ @@ -105,17 +95,27 @@ struct sockaddr_storage { }; }; -struct augmented_args_payload { - struct syscall_enter_args args; - union { - struct { - struct augmented_arg arg, arg2; - }; +struct augmented_arg { + unsigned int size; + int err; + union { + char value[PATH_MAX]; struct sockaddr_storage saddr; - char __data[sizeof(struct augmented_arg)]; }; }; +struct pids_filtered { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, pid_t); + __type(value, bool); + __uint(max_entries, 64); +} pids_filtered SEC(".maps"); + +struct augmented_args_payload { + struct syscall_enter_args args; + struct augmented_arg arg, arg2; // We have to reserve space for two arguments (rename, etc) +}; + // We need more tmp space than the BPF stack can give us struct augmented_args_tmp { __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); @@ -182,15 +182,17 @@ int sys_enter_connect(struct syscall_enter_args *args) struct augmented_args_payload *augmented_args = augmented_args_payload(); const void *sockaddr_arg = (const void *)args->args[1]; unsigned int socklen = args->args[2]; - unsigned int len = sizeof(augmented_args->args); + unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs if (augmented_args == NULL) return 1; /* Failure: don't filter */ - _Static_assert(is_power_of_2(sizeof(augmented_args->saddr)), "sizeof(augmented_args->saddr) needs to be a power of two"); - socklen &= sizeof(augmented_args->saddr) - 1; + _Static_assert(is_power_of_2(sizeof(augmented_args->arg.saddr)), "sizeof(augmented_args->arg.saddr) needs to be a power of two"); + socklen &= sizeof(augmented_args->arg.saddr) - 1; - bpf_probe_read_user(&augmented_args->saddr, socklen, sockaddr_arg); + bpf_probe_read_user(&augmented_args->arg.saddr, socklen, sockaddr_arg); + augmented_args->arg.size = socklen; + augmented_args->arg.err = 0; return augmented__output(args, augmented_args, len + socklen); } @@ -201,14 +203,14 @@ int sys_enter_sendto(struct 
syscall_enter_args *args) struct augmented_args_payload *augmented_args = augmented_args_payload(); const void *sockaddr_arg = (const void *)args->args[4]; unsigned int socklen = args->args[5]; - unsigned int len = sizeof(augmented_args->args); + unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs if (augmented_args == NULL) return 1; /* Failure: don't filter */ - socklen &= sizeof(augmented_args->saddr) - 1; + socklen &= sizeof(augmented_args->arg.saddr) - 1; - bpf_probe_read_user(&augmented_args->saddr, socklen, sockaddr_arg); + bpf_probe_read_user(&augmented_args->arg.saddr, socklen, sockaddr_arg); return augmented__output(args, augmented_args, len + socklen); } @@ -249,13 +251,23 @@ int sys_enter_rename(struct syscall_enter_args *args) struct augmented_args_payload *augmented_args = augmented_args_payload(); const void *oldpath_arg = (const void *)args->args[0], *newpath_arg = (const void *)args->args[1]; - unsigned int len = sizeof(augmented_args->args), oldpath_len; + unsigned int len = sizeof(augmented_args->args), oldpath_len, newpath_len; if (augmented_args == NULL) return 1; /* Failure: don't filter */ + len += 2 * sizeof(u64); // The overhead of size and err, just before the payload... + oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value)); - len += oldpath_len + augmented_arg__read_str((void *)(&augmented_args->arg) + oldpath_len, newpath_arg, sizeof(augmented_args->arg.value)); + augmented_args->arg.size = PERF_ALIGN(oldpath_len + 1, sizeof(u64)); + len += augmented_args->arg.size; + + struct augmented_arg *arg2 = (void *)&augmented_args->arg.value + augmented_args->arg.size; + + newpath_len = augmented_arg__read_str(arg2, newpath_arg, sizeof(augmented_args->arg.value)); + arg2->size = newpath_len; + + len += newpath_len; return augmented__output(args, augmented_args, len); } @@ -266,13 +278,23 @@ int sys_enter_renameat2(struct syscall_enter_args *args) struct augmented_args_payload *augmented_args = augmented_args_payload(); const void *oldpath_arg = (const void *)args->args[1], *newpath_arg = (const void *)args->args[3]; - unsigned int len = sizeof(augmented_args->args), oldpath_len; + unsigned int len = sizeof(augmented_args->args), oldpath_len, newpath_len; if (augmented_args == NULL) return 1; /* Failure: don't filter */ + len += 2 * sizeof(u64); // The overhead of size and err, just before the payload... 
+ oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value)); - len += oldpath_len + augmented_arg__read_str((void *)(&augmented_args->arg) + oldpath_len, newpath_arg, sizeof(augmented_args->arg.value)); + augmented_args->arg.size = PERF_ALIGN(oldpath_len + 1, sizeof(u64)); + len += augmented_args->arg.size; + + struct augmented_arg *arg2 = (void *)&augmented_args->arg.value + augmented_args->arg.size; + + newpath_len = augmented_arg__read_str(arg2, newpath_arg, sizeof(augmented_args->arg.value)); + arg2->size = newpath_len; + + len += newpath_len; return augmented__output(args, augmented_args, len); } @@ -293,26 +315,26 @@ int sys_enter_perf_event_open(struct syscall_enter_args *args) { struct augmented_args_payload *augmented_args = augmented_args_payload(); const struct perf_event_attr_size *attr = (const struct perf_event_attr_size *)args->args[0], *attr_read; - unsigned int len = sizeof(augmented_args->args); + unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs if (augmented_args == NULL) goto failure; - if (bpf_probe_read_user(&augmented_args->__data, sizeof(*attr), attr) < 0) + if (bpf_probe_read_user(&augmented_args->arg.value, sizeof(*attr), attr) < 0) goto failure; - attr_read = (const struct perf_event_attr_size *)augmented_args->__data; + attr_read = (const struct perf_event_attr_size *)augmented_args->arg.value; __u32 size = attr_read->size; if (!size) size = PERF_ATTR_SIZE_VER0; - if (size > sizeof(augmented_args->__data)) + if (size > sizeof(augmented_args->arg.value)) goto failure; // Now that we read attr->size and tested it against the size limits, read it completely - if (bpf_probe_read_user(&augmented_args->__data, size, attr) < 0) + if (bpf_probe_read_user(&augmented_args->arg.value, size, attr) < 0) goto failure; return augmented__output(args, augmented_args, len + size); @@ -325,16 +347,16 @@ int sys_enter_clock_nanosleep(struct syscall_enter_args *args) { struct augmented_args_payload *augmented_args = augmented_args_payload(); const void *rqtp_arg = (const void *)args->args[2]; - unsigned int len = sizeof(augmented_args->args); + unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs __u32 size = sizeof(struct timespec64); if (augmented_args == NULL) goto failure; - if (size > sizeof(augmented_args->__data)) + if (size > sizeof(augmented_args->arg.value)) goto failure; - bpf_probe_read_user(&augmented_args->__data, size, rqtp_arg); + bpf_probe_read_user(&augmented_args->arg.value, size, rqtp_arg); return augmented__output(args, augmented_args, len + size); failure: @@ -352,10 +374,10 @@ int sys_enter_nanosleep(struct syscall_enter_args *args) if (augmented_args == NULL) goto failure; - if (size > sizeof(augmented_args->__data)) + if (size > sizeof(augmented_args->arg.value)) goto failure; - bpf_probe_read_user(&augmented_args->__data, size, req_arg); + bpf_probe_read_user(&augmented_args->arg.value, size, req_arg); return augmented__output(args, augmented_args, len + size); failure: -- cgit v1.2.3 From 2f2e439ba56f45b9d8aff300b39fa0bfa49ec2d8 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 9 Sep 2024 16:29:44 -0300 Subject: perf trace: Mark which syscall arguments go from user space to kernel space We need to know where to collect each argument in the BPF augmenters, whether in the sys_enter hook or in the sys_exit hook. Start with the SCA_FILENAME one, which goes just from user to kernel space, as shown below.
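Concretely, the marking boils down to a new boolean in the per-argument format descriptor plus a macro that sets it together with the pretty printer. A trimmed sketch of what the diff below adds to builtin-trace.c (other struct fields omitted):

struct syscall_arg_fmt {
	size_t (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	/* ... */
	bool from_user;		/* payload must be collected at sys_enter time */
	bool show_zero;
	/* ... */
};

#define SCA_FILENAME_FROM_USER(argname) \
	{ .scnprintf = SCA_FILENAME, \
	  .from_user = true, }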
A better alternative, which takes a bit more time than I have now, is to use the __user information that is already in the syscall args and encoded in BTF via a tag; do it later. Cc: Adrian Hunter Cc: Howard Chu Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 41 +++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 53690bbb143e..64f0aefeba3b 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -102,6 +102,12 @@ /* * strtoul: Go from a string to a value, i.e. for msr: MSR_FS_BASE to 0xc0000100 + * + * We have to explicitly mark the direction of the flow of data, if from the + * kernel to user space or the other way around, since the BPF collector we + * have so far copies only from user to kernel space, mark the arguments that + * go that direction, so that we don't end up collecting the previous contents + * for syscall args that go from kernel to user space. */ struct syscall_arg_fmt { size_t (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg); @@ -110,6 +116,7 @@ struct syscall_arg_fmt { void *parm; const char *name; u16 nr_entries; // for arrays + bool from_user; bool show_zero; #ifdef HAVE_LIBBPF_SUPPORT const struct btf_type *type; @@ -851,6 +858,11 @@ static size_t syscall_arg__scnprintf_filename(char *bf, size_t size, #define SCA_FILENAME syscall_arg__scnprintf_filename +// 'argname' is just documentational at this point, to remove the previous comment with that info +#define SCA_FILENAME_FROM_USER(argname) \ + { .scnprintf = SCA_FILENAME, \ + .from_user = true, } + static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size, struct syscall_arg *arg) { @@ -1091,11 +1103,11 @@ static const struct syscall_fmt syscall_fmts[] = { .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, }, { .name = "faccessat", .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, - [1] = { .scnprintf = SCA_FILENAME, /* pathname */ }, + [1] = SCA_FILENAME_FROM_USER(pathname), [2] = { .scnprintf = SCA_ACCMODE, /* mode */ }, }, }, { .name = "faccessat2", .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, - [1] = { .scnprintf = SCA_FILENAME, /* pathname */ }, + [1] = SCA_FILENAME_FROM_USER(pathname), [2] = { .scnprintf = SCA_ACCMODE, /* mode */ }, [3] = { .scnprintf = SCA_FACCESSAT2_FLAGS, /* flags */ }, }, }, { .name = "fchmodat", @@ -1117,7 +1129,7 @@ static const struct syscall_fmt syscall_fmts[] = { [2] = { .scnprintf = SCA_FSMOUNT_ATTR_FLAGS, /* attr_flags */ }, }, }, { .name = "fspick", .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, - [1] = { .scnprintf = SCA_FILENAME, /* path */ }, + [1] = SCA_FILENAME_FROM_USER(path), [2] = { .scnprintf = SCA_FSPICK_FLAGS, /* flags */ }, }, }, { .name = "fstat", .alias = "newfstat", }, { .name = "futex", @@ -1181,20 +1193,20 @@ static const struct syscall_fmt syscall_fmts[] = { .parm = &strarray__mmap_flags, }, [5] = { .scnprintf = SCA_HEX, /* offset */ }, }, }, { .name = "mount", - .arg = { [0] = { .scnprintf = SCA_FILENAME, /* dev_name */ }, + .arg = { [0] = SCA_FILENAME_FROM_USER(devname), [3] = { .scnprintf = SCA_MOUNT_FLAGS, /* flags */ .mask_val = SCAMV_MOUNT_FLAGS, /* flags */ }, }, }, { .name = "move_mount", .arg = { [0] = { .scnprintf = SCA_FDAT, /* from_dfd */ }, - [1] = { .scnprintf = SCA_FILENAME, /* from_pathname */ }, + [1] = SCA_FILENAME_FROM_USER(pathname), [2] = {
.scnprintf = SCA_FDAT, /* to_dfd */ }, - [3] = { .scnprintf = SCA_FILENAME, /* to_pathname */ }, + [3] = SCA_FILENAME_FROM_USER(pathname), [4] = { .scnprintf = SCA_MOVE_MOUNT_FLAGS, /* flags */ }, }, }, { .name = "mprotect", .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ }, [2] = { .scnprintf = SCA_MMAP_PROT, .show_zero = true, /* prot */ }, }, }, { .name = "mq_unlink", - .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, }, + .arg = { [0] = SCA_FILENAME_FROM_USER(u_name), }, }, { .name = "mremap", .hexret = true, .arg = { [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ }, }, }, { .name = "name_to_handle_at", @@ -1203,7 +1215,7 @@ static const struct syscall_fmt syscall_fmts[] = { .arg = { [0] = { .scnprintf = SCA_TIMESPEC, /* req */ }, }, }, { .name = "newfstatat", .alias = "fstatat", .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, - [1] = { .scnprintf = SCA_FILENAME, /* pathname */ }, + [1] = SCA_FILENAME_FROM_USER(pathname), [3] = { .scnprintf = SCA_FS_AT_FLAGS, /* flags */ }, }, }, { .name = "open", .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, }, @@ -1299,9 +1311,9 @@ static const struct syscall_fmt syscall_fmts[] = { [2] = { .scnprintf = SCA_FS_AT_FLAGS, /* flags */ } , [3] = { .scnprintf = SCA_STATX_MASK, /* mask */ }, }, }, { .name = "swapoff", - .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, }, + .arg = { [0] = SCA_FILENAME_FROM_USER(specialfile), }, }, { .name = "swapon", - .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, }, + .arg = { [0] = SCA_FILENAME_FROM_USER(specialfile), }, }, { .name = "symlinkat", .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, { .name = "sync_file_range", @@ -1311,11 +1323,11 @@ static const struct syscall_fmt syscall_fmts[] = { { .name = "tkill", .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, }, { .name = "umount2", .alias = "umount", - .arg = { [0] = { .scnprintf = SCA_FILENAME, /* name */ }, }, }, + .arg = { [0] = SCA_FILENAME_FROM_USER(name), }, }, { .name = "uname", .alias = "newuname", }, { .name = "unlinkat", .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, - [1] = { .scnprintf = SCA_FILENAME, /* pathname */ }, + [1] = SCA_FILENAME_FROM_USER(pathname), [2] = { .scnprintf = SCA_FS_AT_FLAGS, /* flags */ }, }, }, { .name = "utimensat", .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, }, @@ -1903,9 +1915,10 @@ syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field if (strcmp(field->type, "const char *") == 0 && ((len >= 4 && strcmp(field->name + len - 4, "name") == 0) || - strstr(field->name, "path") != NULL)) + strstr(field->name, "path") != NULL)) { arg->scnprintf = SCA_FILENAME; - else if ((field->flags & TEP_FIELD_IS_POINTER) || strstr(field->name, "addr")) + arg->from_user = true; + } else if ((field->flags & TEP_FIELD_IS_POINTER) || strstr(field->name, "addr")) arg->scnprintf = SCA_PTR; else if (strcmp(field->type, "pid_t") == 0) arg->scnprintf = SCA_PID; -- cgit v1.2.3 From 690eda6508c23023586ab81b11cae86476754d0b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 9 Sep 2024 16:57:22 -0300 Subject: perf trace: Introduce SCA_PERF_ATTR_FROM_USER() to set .from_user = true Paving the way for the generic BPF BTF based syscall arg augmenter. 
Cc: Adrian Hunter Cc: Howard Chu Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 2 +- tools/perf/trace/beauty/perf_event_open.c | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 64f0aefeba3b..5f3146651684 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1226,7 +1226,7 @@ static const struct syscall_fmt syscall_fmts[] = { .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, }, { .name = "perf_event_open", - .arg = { [0] = { .scnprintf = SCA_PERF_ATTR, /* attr */ }, + .arg = { [0] = SCA_PERF_ATTR_FROM_USER(attr), [2] = { .scnprintf = SCA_INT, /* cpu */ }, [3] = { .scnprintf = SCA_FD, /* group_fd */ }, [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, }, diff --git a/tools/perf/trace/beauty/perf_event_open.c b/tools/perf/trace/beauty/perf_event_open.c index 632237128640..9f1ed989c775 100644 --- a/tools/perf/trace/beauty/perf_event_open.c +++ b/tools/perf/trace/beauty/perf_event_open.c @@ -88,3 +88,7 @@ static size_t syscall_arg__scnprintf_perf_event_attr(char *bf, size_t size, stru } #define SCA_PERF_ATTR syscall_arg__scnprintf_perf_event_attr +// 'argname' is just documentational at this point, to remove the previous comment with that info +#define SCA_PERF_ATTR_FROM_USER(argname) \ + { .scnprintf = SCA_PERF_ATTR, \ + .from_user = true, } -- cgit v1.2.3 From be14a71984e16b95ff725dbda19899868c5ec8a6 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 9 Sep 2024 16:57:22 -0300 Subject: perf trace: Introduce SCA_SOCKADDR_FROM_USER() to set .from_user = true Paving the way for the generic BPF BTF based syscall arg augmenter. 
Cc: Adrian Hunter Cc: Howard Chu Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 6 +++--- tools/perf/trace/beauty/beauty.h | 5 +++++ 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 5f3146651684..64c3f7f7a9aa 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1075,7 +1075,7 @@ static const struct syscall_fmt syscall_fmts[] = { [1] = { .scnprintf = SCA_PTR, /* arg2 */ }, }, }, { .name = "bind", .arg = { [0] = { .scnprintf = SCA_INT, /* fd */ }, - [1] = { .scnprintf = SCA_SOCKADDR, /* umyaddr */ }, + [1] = SCA_SOCKADDR_FROM_USER(umyaddr), [2] = { .scnprintf = SCA_INT, /* addrlen */ }, }, }, { .name = "bpf", .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, }, @@ -1095,7 +1095,7 @@ static const struct syscall_fmt syscall_fmts[] = { .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, }, { .name = "connect", .arg = { [0] = { .scnprintf = SCA_INT, /* fd */ }, - [1] = { .scnprintf = SCA_SOCKADDR, /* servaddr */ }, + [1] = SCA_SOCKADDR_FROM_USER(servaddr), [2] = { .scnprintf = SCA_INT, /* addrlen */ }, }, }, { .name = "epoll_ctl", .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, }, @@ -1289,7 +1289,7 @@ static const struct syscall_fmt syscall_fmts[] = { .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, }, { .name = "sendto", .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, - [4] = { .scnprintf = SCA_SOCKADDR, /* addr */ }, }, }, + [4] = SCA_SOCKADDR_FROM_USER(addr), }, }, { .name = "set_tid_address", .errpid = true, }, { .name = "setitimer", .arg = { [0] = STRARRAY(which, itimers), }, }, diff --git a/tools/perf/trace/beauty/beauty.h b/tools/perf/trace/beauty/beauty.h index 3ed11e18ee2d..9c6bfb7d0ef1 100644 --- a/tools/perf/trace/beauty/beauty.h +++ b/tools/perf/trace/beauty/beauty.h @@ -229,6 +229,11 @@ size_t syscall_arg__scnprintf_renameat2_flags(char *bf, size_t size, struct sysc size_t syscall_arg__scnprintf_sockaddr(char *bf, size_t size, struct syscall_arg *arg); #define SCA_SOCKADDR syscall_arg__scnprintf_sockaddr +// 'argname' is just documentational at this point, to remove the previous comment with that info +#define SCA_SOCKADDR_FROM_USER(argname) \ + { .scnprintf = SCA_SOCKADDR, \ + .from_user = true, } + size_t syscall_arg__scnprintf_socket_protocol(char *bf, size_t size, struct syscall_arg *arg); #define SCA_SK_PROTO syscall_arg__scnprintf_socket_protocol -- cgit v1.2.3 From c790f2bafb7a17d97c49c17607fa2ff919891f51 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 9 Sep 2024 16:57:22 -0300 Subject: perf trace: Introduce SCA_TIMESPEC_FROM_USER() to set .from_user = true Paving the way for the generic BPF BTF based syscall arg augmenter. 
Cc: Adrian Hunter Cc: Howard Chu Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 4 ++-- tools/perf/trace/beauty/beauty.h | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 64c3f7f7a9aa..2bdbb6813512 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1084,7 +1084,7 @@ static const struct syscall_fmt syscall_fmts[] = { { .name = "clock_gettime", .arg = { [0] = STRARRAY(clk_id, clockid), }, }, { .name = "clock_nanosleep", - .arg = { [2] = { .scnprintf = SCA_TIMESPEC, /* rqtp */ }, }, }, + .arg = { [2] = SCA_TIMESPEC_FROM_USER(req), }, }, { .name = "clone", .errpid = true, .nr_args = 5, .arg = { [0] = { .name = "flags", .scnprintf = SCA_CLONE_FLAGS, }, [1] = { .name = "child_stack", .scnprintf = SCA_HEX, }, @@ -1212,7 +1212,7 @@ static const struct syscall_fmt syscall_fmts[] = { { .name = "name_to_handle_at", .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, { .name = "nanosleep", - .arg = { [0] = { .scnprintf = SCA_TIMESPEC, /* req */ }, }, }, + .arg = { [0] = SCA_TIMESPEC_FROM_USER(req), }, }, { .name = "newfstatat", .alias = "fstatat", .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, [1] = SCA_FILENAME_FROM_USER(pathname), diff --git a/tools/perf/trace/beauty/beauty.h b/tools/perf/trace/beauty/beauty.h index 9c6bfb7d0ef1..0a07ad158f87 100644 --- a/tools/perf/trace/beauty/beauty.h +++ b/tools/perf/trace/beauty/beauty.h @@ -255,6 +255,11 @@ size_t syscall_arg__scnprintf_sync_file_range_flags(char *bf, size_t size, struc size_t syscall_arg__scnprintf_timespec(char *bf, size_t size, struct syscall_arg *arg); #define SCA_TIMESPEC syscall_arg__scnprintf_timespec +// 'argname' is just documentational at this point, to remove the previous comment with that info +#define SCA_TIMESPEC_FROM_USER(argname) \ + { .scnprintf = SCA_TIMESPEC, \ + .from_user = true, } + size_t open__scnprintf_flags(unsigned long flags, char *bf, size_t size, bool show_prefix); void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg, -- cgit v1.2.3 From 12707b9159e631af91c051fd102d469364b6ecc3 Mon Sep 17 00:00:00 2001 From: Shuyi Cheng Date: Sun, 8 Sep 2024 17:23:53 +0800 Subject: libbpf: Fixed getting wrong return address on arm64 architecture ARM64 has a separate lr register that stores the return address, so here you only need to read the lr register to get the return address; there is no need to dereference it again.
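For illustration, a minimal kretprobe that records where the probed function returns to could look like the sketch below. The probed function and the vmlinux.h-based setup are illustrative placeholders, not part of this patch:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char LICENSE[] SEC("license") = "GPL";

SEC("kretprobe/do_unlinkat")
int BPF_KRETPROBE(do_unlinkat_exit, long ret)
{
	unsigned long ip;

	/* With this fix, on arm64 this reads the return address straight
	 * out of lr instead of dereferencing it as before. */
	BPF_KRETPROBE_READ_RET_IP(ip, ctx);
	bpf_printk("ret ip %lx, retval %ld", ip, ret);
	return 0;
}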
Signed-off-by: Shuyi Cheng Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1725787433-77262-1-git-send-email-chengshuyi@linux.alibaba.com --- tools/lib/bpf/bpf_tracing.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h index 4eab132a963e..aa3b04f5542a 100644 --- a/tools/lib/bpf/bpf_tracing.h +++ b/tools/lib/bpf/bpf_tracing.h @@ -522,7 +522,7 @@ struct pt_regs; #define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = (ctx)->link; }) #define BPF_KRETPROBE_READ_RET_IP BPF_KPROBE_READ_RET_IP -#elif defined(bpf_target_sparc) +#elif defined(bpf_target_sparc) || defined(bpf_target_arm64) #define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = PT_REGS_RET(ctx); }) #define BPF_KRETPROBE_READ_RET_IP BPF_KPROBE_READ_RET_IP -- cgit v1.2.3 From 4cdc0e4ce5e893bc92255f5f734d983012f2bc2e Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sun, 8 Sep 2024 22:00:09 +0800 Subject: bpftool: Fix undefined behavior caused by shifting into the sign bit Replace shifts of '1' with '1U' in bitwise operations within __show_dev_tc_bpf() to prevent undefined behavior caused by shifting into the sign bit of a signed integer. By using '1U', the operations are explicitly performed on unsigned integers, avoiding potential integer overflow or sign-related issues. Signed-off-by: Kuan-Wei Chiu Signed-off-by: Andrii Nakryiko Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20240908140009.3149781-1-visitorckw@gmail.com --- tools/bpf/bpftool/net.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c index 2a51f1c25732..3c50fc66bb4a 100644 --- a/tools/bpf/bpftool/net.c +++ b/tools/bpf/bpftool/net.c @@ -486,9 +486,9 @@ static void __show_dev_tc_bpf(const struct ip_devname_ifindex *dev, if (prog_flags[i] || json_output) { NET_START_ARRAY("prog_flags", "%s "); for (j = 0; prog_flags[i] && j < 32; j++) { - if (!(prog_flags[i] & (1 << j))) + if (!(prog_flags[i] & (1U << j))) continue; - NET_DUMP_UINT_ONLY(1 << j); + NET_DUMP_UINT_ONLY(1U << j); } NET_END_ARRAY(""); } @@ -497,9 +497,9 @@ static void __show_dev_tc_bpf(const struct ip_devname_ifindex *dev, if (link_flags[i] || json_output) { NET_START_ARRAY("link_flags", "%s "); for (j = 0; link_flags[i] && j < 32; j++) { - if (!(link_flags[i] & (1 << j))) + if (!(link_flags[i] & (1U << j))) continue; - NET_DUMP_UINT_ONLY(1 << j); + NET_DUMP_UINT_ONLY(1U << j); } NET_END_ARRAY(""); } -- cgit v1.2.3 From f028d7716cdeae3be7d8d211482248916b25b1c2 Mon Sep 17 00:00:00 2001 From: Andrew Kreimer Date: Mon, 9 Sep 2024 12:24:41 +0300 Subject: bpftool: Fix typos Fix typos in documentation. Reported-by: Matthew Wilcox Reported-by: Quentin Monnet Signed-off-by: Andrew Kreimer Signed-off-by: Andrii Nakryiko Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20240909092452.4293-1-algonell@gmail.com --- tools/bpf/bpftool/Documentation/bpftool-gen.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst index 1b426e58a7cd..ca860fd97d8d 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-gen.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst @@ -172,7 +172,7 @@ bpftool gen min_core_btf *INPUT* *OUTPUT* *OBJECT* [*OBJECT*...] CO-RE based application, turning the application portable to different kernel versions. 
- Check examples bellow for more information how to use it. + Check examples below for more information on how to use it. bpftool gen help Print short help message. -- cgit v1.2.3 From bee109b7b3e50739b88252a219fa07ecd78ad628 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Mon, 9 Sep 2024 16:39:09 +0300 Subject: bpf: Fix error message on kfunc arg type mismatch When "arg#%d expected pointer to ctx, but got %s" error is printed, both template parts actually point to the type of the argument, therefore, it will also say "but got PTR", regardless of what was the actual register type. Fix the message to print the register type in the second part of the template, change the existing test to adapt to the new format, and add a new test to test the case when arg is a pointer to context, but reg is a scalar. Fixes: 00b85860feb8 ("bpf: Rewrite kfunc argument handling") Signed-off-by: Maxim Mikityanskiy Signed-off-by: Andrii Nakryiko Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20240909133909.1315460-1-maxim@isovalent.com --- kernel/bpf/verifier.c | 3 ++- tools/testing/selftests/bpf/prog_tests/kfunc_call.c | 1 + tools/testing/selftests/bpf/progs/kfunc_call_fail.c | 7 +++++++ tools/testing/selftests/bpf/verifier/calls.c | 2 +- 4 files changed, 11 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 53d0556fbbf3..f35b80c16cda 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12132,7 +12132,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ switch (kf_arg_type) { case KF_ARG_PTR_TO_CTX: if (reg->type != PTR_TO_CTX) { - verbose(env, "arg#%d expected pointer to ctx, but got %s\n", i, btf_type_str(t)); + verbose(env, "arg#%d expected pointer to ctx, but got %s\n", + i, reg_type_str(env, reg->type)); return -EINVAL; } diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c index 5b743212292f..f79c8e53cb3e 100644 --- a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c +++ b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c @@ -68,6 +68,7 @@ static struct kfunc_test_params kfunc_tests[] = { TC_FAIL(kfunc_call_test_get_mem_fail_oob, 0, "min value is outside of the allowed memory range"), TC_FAIL(kfunc_call_test_get_mem_fail_not_const, 0, "is not a const"), TC_FAIL(kfunc_call_test_mem_acquire_fail, 0, "acquire kernel function does not return PTR_TO_BTF_ID"), + TC_FAIL(kfunc_call_test_pointer_arg_type_mismatch, 0, "arg#0 expected pointer to ctx, but got scalar"), /* success cases */ TC_TEST(kfunc_call_test1, 12), diff --git a/tools/testing/selftests/bpf/progs/kfunc_call_fail.c b/tools/testing/selftests/bpf/progs/kfunc_call_fail.c index 4b0b7b79cdfb..08fae306539c 100644 --- a/tools/testing/selftests/bpf/progs/kfunc_call_fail.c +++ b/tools/testing/selftests/bpf/progs/kfunc_call_fail.c @@ -150,4 +150,11 @@ int kfunc_call_test_mem_acquire_fail(struct __sk_buff *skb) return ret; } +SEC("?tc") +int kfunc_call_test_pointer_arg_type_mismatch(struct __sk_buff *skb) +{ + bpf_kfunc_call_test_pass_ctx((void *)10); + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index d0cdd156cd55..7afc2619ab14 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -76,7 +76,7 @@ }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, - .errstr = "arg#0 expected 
pointer to ctx, but got PTR", + .errstr = "arg#0 expected pointer to ctx, but got fp", .fixup_kfunc_btf_id = { { "bpf_kfunc_call_test_pass_ctx", 2 }, }, -- cgit v1.2.3 From 41d0c4677feee1ea063e0f2c2af72dc953b1f1cc Mon Sep 17 00:00:00 2001 From: Yusheng Zheng Date: Mon, 9 Sep 2024 22:59:52 +0000 Subject: libbpf: Fix some typos in comments Fix some spelling errors in the code comments of libbpf: betwen -> between paremeters -> parameters knowning -> knowing definiton -> definition compatiblity -> compatibility overriden -> overridden occured -> occurred proccess -> process managment -> management nessary -> necessary Signed-off-by: Yusheng Zheng Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240909225952.30324-1-yunwei356@gmail.com --- tools/lib/bpf/bpf_helpers.h | 2 +- tools/lib/bpf/bpf_tracing.h | 2 +- tools/lib/bpf/btf.c | 2 +- tools/lib/bpf/btf.h | 2 +- tools/lib/bpf/btf_dump.c | 2 +- tools/lib/bpf/libbpf.h | 10 +++++----- tools/lib/bpf/libbpf_legacy.h | 4 ++-- tools/lib/bpf/skel_internal.h | 2 +- 8 files changed, 13 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index 305c62817dd3..80bc0242e8dc 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -341,7 +341,7 @@ extern void bpf_iter_num_destroy(struct bpf_iter_num *it) __weak __ksym; * I.e., it looks almost like high-level for each loop in other languages, * supports continue/break, and is verifiable by BPF verifier. * - * For iterating integers, the difference betwen bpf_for_each(num, i, N, M) + * For iterating integers, the difference between bpf_for_each(num, i, N, M) * and bpf_for(i, N, M) is in that bpf_for() provides additional proof to * verifier that i is in [N, M) range, and in bpf_for_each() case i is `int * *`, not just `int`. So for integers bpf_for() is more convenient. diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h index aa3b04f5542a..a8f6cd4841b0 100644 --- a/tools/lib/bpf/bpf_tracing.h +++ b/tools/lib/bpf/bpf_tracing.h @@ -808,7 +808,7 @@ struct pt_regs; * tp_btf/fentry/fexit BPF programs. It hides the underlying platform-specific * low-level way of getting kprobe input arguments from struct pt_regs, and * provides a familiar typed and named function arguments syntax and - * semantics of accessing kprobe input paremeters. + * semantics of accessing kprobe input parameters. * * Original struct pt_regs* context is preserved as 'ctx' argument. This might * be necessary when using BPF helpers like bpf_perf_event_output(). diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 8d51e73d55a8..3c131039c523 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -4230,7 +4230,7 @@ static bool btf_dedup_identical_structs(struct btf_dedup *d, __u32 id1, __u32 id * consists of portions of the graph that come from multiple compilation units. * This is due to the fact that types within single compilation unit are always * deduplicated and FWDs are already resolved, if referenced struct/union - * definiton is available. So, if we had unresolved FWD and found corresponding + * definition is available. So, if we had unresolved FWD and found corresponding * STRUCT/UNION, they will be from different compilation units. 
This * consequently means that when we "link" FWD to corresponding STRUCT/UNION, * type graph will likely have at least two different BTF types that describe diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index b68d216837a9..4e349ad79ee6 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -286,7 +286,7 @@ LIBBPF_API void btf_dump__free(struct btf_dump *d); LIBBPF_API int btf_dump__dump_type(struct btf_dump *d, __u32 id); struct btf_dump_emit_type_decl_opts { - /* size of this struct, for forward/backward compatiblity */ + /* size of this struct, for forward/backward compatibility */ size_t sz; /* optional field name for type declaration, e.g.: * - struct my_struct diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 894860111ddb..0a7327541c17 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -304,7 +304,7 @@ int btf_dump__dump_type(struct btf_dump *d, __u32 id) * definition, in which case they have to be declared inline as part of field * type declaration; or as a top-level anonymous enum, typically used for * declaring global constants. It's impossible to distinguish between two - * without knowning whether given enum type was referenced from other type: + * without knowing whether given enum type was referenced from other type: * top-level anonymous enum won't be referenced by anything, while embedded * one will. */ diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 64a6a3d323e3..6917653ef9fa 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -152,7 +152,7 @@ struct bpf_object_open_opts { * log_buf and log_level settings. * * If specified, this log buffer will be passed for: - * - each BPF progral load (BPF_PROG_LOAD) attempt, unless overriden + * - each BPF progral load (BPF_PROG_LOAD) attempt, unless overridden * with bpf_program__set_log() on per-program level, to get * BPF verifier log output. * - during BPF object's BTF load into kernel (BPF_BTF_LOAD) to get @@ -455,7 +455,7 @@ LIBBPF_API int bpf_link__destroy(struct bpf_link *link); /** * @brief **bpf_program__attach()** is a generic function for attaching * a BPF program based on auto-detection of program type, attach type, - * and extra paremeters, where applicable. + * and extra parameters, where applicable. * * @param prog BPF program to attach * @return Reference to the newly created BPF link; or NULL is returned on error, @@ -679,7 +679,7 @@ struct bpf_uprobe_opts { /** * @brief **bpf_program__attach_uprobe()** attaches a BPF program * to the userspace function which is found by binary path and - * offset. You can optionally specify a particular proccess to attach + * offset. You can optionally specify a particular process to attach * to. You can also optionally attach the program to the function * exit instead of entry. * @@ -1593,11 +1593,11 @@ LIBBPF_API int perf_buffer__buffer_fd(const struct perf_buffer *pb, size_t buf_i * memory region of the ring buffer. * This ring buffer can be used to implement a custom events consumer. * The ring buffer starts with the *struct perf_event_mmap_page*, which - * holds the ring buffer managment fields, when accessing the header + * holds the ring buffer management fields, when accessing the header * structure it's important to be SMP aware. * You can refer to *perf_event_read_simple* for a simple example. 
* @param pb the perf buffer structure - * @param buf_idx the buffer index to retreive + * @param buf_idx the buffer index to retrieve * @param buf (out) gets the base pointer of the mmap()'ed memory * @param buf_size (out) gets the size of the mmap()'ed region * @return 0 on success, negative error code for failure diff --git a/tools/lib/bpf/libbpf_legacy.h b/tools/lib/bpf/libbpf_legacy.h index 1e1be467bede..60b2600be88a 100644 --- a/tools/lib/bpf/libbpf_legacy.h +++ b/tools/lib/bpf/libbpf_legacy.h @@ -76,7 +76,7 @@ enum libbpf_strict_mode { * first BPF program or map creation operation. This is done only if * kernel is too old to support memcg-based memory accounting for BPF * subsystem. By default, RLIMIT_MEMLOCK limit is set to RLIM_INFINITY, - * but it can be overriden with libbpf_set_memlock_rlim() API. + * but it can be overridden with libbpf_set_memlock_rlim() API. * Note that libbpf_set_memlock_rlim() needs to be called before * the very first bpf_prog_load(), bpf_map_create() or bpf_object__load() * operation. @@ -97,7 +97,7 @@ LIBBPF_API int libbpf_set_strict_mode(enum libbpf_strict_mode mode); * @brief **libbpf_get_error()** extracts the error code from the passed * pointer * @param ptr pointer returned from libbpf API function - * @return error code; or 0 if no error occured + * @return error code; or 0 if no error occurred * * Note, as of libbpf 1.0 this function is not necessary and not recommended * to be used. Libbpf doesn't return error code embedded into the pointer diff --git a/tools/lib/bpf/skel_internal.h b/tools/lib/bpf/skel_internal.h index 1e82ab06c3eb..0875452521e9 100644 --- a/tools/lib/bpf/skel_internal.h +++ b/tools/lib/bpf/skel_internal.h @@ -107,7 +107,7 @@ static inline void skel_free(const void *p) * The loader program will perform probe_read_kernel() from maps.rodata.initial_value. * skel_finalize_map_data() sets skel->rodata to point to actual value in a bpf map and * does maps.rodata.initial_value = ~0ULL to signal skel_free_map_data() that kvfree - * is not nessary. + * is not necessary. * * For user space: * skel_prep_map_data() mmaps anon memory into skel->rodata that can be accessed directly. -- cgit v1.2.3 From 391e86971161906e720f5d3c110ca9124c14fb55 Mon Sep 17 00:00:00 2001 From: Alexander Zhu Date: Fri, 30 Aug 2024 11:03:37 +0100 Subject: mm: selftest to verify zero-filled pages are mapped to zeropage When a THP is split, any subpage that is zero-filled will be mapped to the shared zeropage, hence saving memory. Add selftest to verify this by allocating zero-filled THP and comparing RssAnon before and after split. 
Link: https://lkml.kernel.org/r/20240830100438.3623486-4-usamaarif642@gmail.com Signed-off-by: Alexander Zhu Signed-off-by: Usama Arif Acked-by: Rik van Riel Cc: Barry Song Cc: David Hildenbrand Cc: Domenico Cerasuolo Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kairui Song Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Nico Pache Cc: Roman Gushchin Cc: Ryan Roberts Cc: Shakeel Butt Cc: Shuang Zhai Cc: Yu Zhao Cc: Shuang Zhai Cc: Hugh Dickins Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/split_huge_page_test.c | 71 +++++++++++++++++++++++ tools/testing/selftests/mm/vm_util.c | 22 +++++++ tools/testing/selftests/mm/vm_util.h | 1 + 3 files changed, 94 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index e5e8dafc9d94..eb6d1b9fc362 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -84,6 +84,76 @@ static void write_debugfs(const char *fmt, ...) write_file(SPLIT_DEBUGFS, input, ret + 1); } +static char *allocate_zero_filled_hugepage(size_t len) +{ + char *result; + size_t i; + + result = memalign(pmd_pagesize, len); + if (!result) { + printf("Fail to allocate memory\n"); + exit(EXIT_FAILURE); + } + + madvise(result, len, MADV_HUGEPAGE); + + for (i = 0; i < len; i++) + result[i] = (char)0; + + return result; +} + +static void verify_rss_anon_split_huge_page_all_zeroes(char *one_page, int nr_hpages, size_t len) +{ + unsigned long rss_anon_before, rss_anon_after; + size_t i; + + if (!check_huge_anon(one_page, 4, pmd_pagesize)) { + printf("No THP is allocated\n"); + exit(EXIT_FAILURE); + } + + rss_anon_before = rss_anon(); + if (!rss_anon_before) { + printf("No RssAnon is allocated before split\n"); + exit(EXIT_FAILURE); + } + + /* split all THPs */ + write_debugfs(PID_FMT, getpid(), (uint64_t)one_page, + (uint64_t)one_page + len, 0); + + for (i = 0; i < len; i++) + if (one_page[i] != (char)0) { + printf("%ld byte corrupted\n", i); + exit(EXIT_FAILURE); + } + + if (!check_huge_anon(one_page, 0, pmd_pagesize)) { + printf("Still AnonHugePages not split\n"); + exit(EXIT_FAILURE); + } + + rss_anon_after = rss_anon(); + if (rss_anon_after >= rss_anon_before) { + printf("Incorrect RssAnon value. 
Before: %ld After: %ld\n", + rss_anon_before, rss_anon_after); + exit(EXIT_FAILURE); + } +} + +void split_pmd_zero_pages(void) +{ + char *one_page; + int nr_hpages = 4; + size_t len = nr_hpages * pmd_pagesize; + + one_page = allocate_zero_filled_hugepage(len); + verify_rss_anon_split_huge_page_all_zeroes(one_page, nr_hpages, len); + printf("Split zero filled huge pages successful\n"); + free(one_page); +} + void split_pmd_thp(void) { char *one_page; @@ -431,6 +501,7 @@ int main(int argc, char **argv) fd_size = 2 * pmd_pagesize; + split_pmd_zero_pages(); split_pmd_thp(); split_pte_mapped_thp(); split_file_backed_thp(); diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c index 5a62530da3b5..d8d0cf04bb57 100644 --- a/tools/testing/selftests/mm/vm_util.c +++ b/tools/testing/selftests/mm/vm_util.c @@ -12,6 +12,7 @@ #define PMD_SIZE_FILE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size" #define SMAP_FILE_PATH "/proc/self/smaps" +#define STATUS_FILE_PATH "/proc/self/status" #define MAX_LINE_LENGTH 500 unsigned int __page_size; @@ -171,6 +172,27 @@ uint64_t read_pmd_pagesize(void) return strtoul(buf, NULL, 10); } +unsigned long rss_anon(void) +{ + unsigned long rss_anon = 0; + FILE *fp; + char buffer[MAX_LINE_LENGTH]; + + fp = fopen(STATUS_FILE_PATH, "r"); + if (!fp) + ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, STATUS_FILE_PATH); + + if (!check_for_pattern(fp, "RssAnon:", buffer, sizeof(buffer))) + goto err_out; + + if (sscanf(buffer, "RssAnon:%10lu kB", &rss_anon) != 1) + ksft_exit_fail_msg("Reading status error\n"); + +err_out: + fclose(fp); + return rss_anon; +} + bool __check_huge(void *addr, char *pattern, int nr_hpages, uint64_t hpage_size) { diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index 9007c420d52c..2eaed8209925 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -39,6 +39,7 @@ unsigned long pagemap_get_pfn(int fd, char *start); void clear_softdirty(void); bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len); uint64_t read_pmd_pagesize(void); +unsigned long rss_anon(void); bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size); bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size); bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size); -- cgit v1.2.3 From 536ab838a5b37b6ae3f8d53552560b7c51daeb41 Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Fri, 30 Aug 2024 10:46:09 +0530 Subject: selftests/mm: relax test to fail after 100 migration failures It was recently observed at [1] that during the folio unmapping stage of migration, when the PTEs are cleared, a racing thread faulting on that folio may increase the refcount of the folio, sleep on the folio lock (the migration path has the lock), and migration ultimately fails when asserting the actual refcount against the expected. As a result, the migration selftest fails on shared-anon mappings. The above reinforces the fact that migration is a best-effort service; therefore, it is wrong to fail the test for just a single failure. Hence, fail the test after 100 consecutive failures (where 100 is still a subjective choice). Note that this has no effect on the execution time of the test since that is controlled by a timeout.
[1] https://lore.kernel.org/all/20240801081657.1386743-1-dev.jain@arm.com/ Link: https://lkml.kernel.org/r/20240830051609.4037834-1-dev.jain@arm.com Signed-off-by: Dev Jain Suggested-by: David Hildenbrand Reviewed-by: Ryan Roberts Tested-by: Ryan Roberts Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Baolin Wang Cc: Barry Song Cc: Catalin Marinas Cc: Christoph Lameter Cc: Dave Hansen Cc: Gavin Shan Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Lance Yang Cc: Mark Brown Cc: Mark Rutland Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Oscar Salvador Cc: Shuah Khan Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/migration.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/migration.c b/tools/testing/selftests/mm/migration.c index 6908569ef406..64bcbb7151cf 100644 --- a/tools/testing/selftests/mm/migration.c +++ b/tools/testing/selftests/mm/migration.c @@ -15,10 +15,10 @@ #include #include -#define TWOMEG (2<<20) -#define RUNTIME (20) - -#define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1))) +#define TWOMEG (2<<20) +#define RUNTIME (20) +#define MAX_RETRIES 100 +#define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1))) FIXTURE(migration) { @@ -65,6 +65,7 @@ int migrate(uint64_t *ptr, int n1, int n2) int ret, tmp; int status = 0; struct timespec ts1, ts2; + int failures = 0; if (clock_gettime(CLOCK_MONOTONIC, &ts1)) return -1; @@ -79,13 +80,17 @@ int migrate(uint64_t *ptr, int n1, int n2) ret = move_pages(0, 1, (void **) &ptr, &n2, &status, MPOL_MF_MOVE_ALL); if (ret) { - if (ret > 0) + if (ret > 0) { + /* Migration is best effort; try again */ + if (++failures < MAX_RETRIES) + continue; printf("Didn't migrate %d pages\n", ret); + } else perror("Couldn't migrate pages"); return -2; } - + failures = 0; tmp = n2; n2 = n1; n1 = tmp; -- cgit v1.2.3 From a7e387375f22a4fd46c8ffcfa395a0ca8c186f9e Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Fri, 6 Sep 2024 00:00:35 +0800 Subject: selftests: return failure when timestamps can't be reported When I was trying to modify the tx timestamping feature, I found that running "./txtimestamp -4 -C -L 127.0.0.1" didn't reflect the error: I succeeded in generating the timestamp stored in the skb but later failed to report it to the userspace (which means failed to put css into cmsg). It can happen when someone writes buggy code in __sock_recv_timestamp(), for example. So add a check so that running ./txtimestamp reflects the result correctly when there is a bug in the reporting phase, like this: protocol: TCP payload: 10 server port: 9000 family: INET test SND USR: 1725458477 s 667997 us (seq=0, len=0) Failed to report timestamps USR: 1725458477 s 718128 us (seq=0, len=0) Failed to report timestamps USR: 1725458477 s 768273 us (seq=0, len=0) Failed to report timestamps USR: 1725458477 s 818416 us (seq=0, len=0) Failed to report timestamps ... In the future, it will help us detect whether a newly arriving patch has bugs or not.
Signed-off-by: Jason Xing Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20240905160035.62407-1-kerneljasonxing@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/txtimestamp.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/txtimestamp.c b/tools/testing/selftests/net/txtimestamp.c index ec60a16c9307..d626f22f9550 100644 --- a/tools/testing/selftests/net/txtimestamp.c +++ b/tools/testing/selftests/net/txtimestamp.c @@ -356,8 +356,12 @@ static void __recv_errmsg_cmsg(struct msghdr *msg, int payload_len) } } - if (batch > 1) + if (batch > 1) { fprintf(stderr, "batched %d timestamps\n", batch); + } else if (!batch) { + fprintf(stderr, "Failed to report timestamps\n"); + test_failed = true; + } } static int recv_errmsg(int fd) -- cgit v1.2.3 From 4c30f5ce4f7af4f639af99e0bdeada8b268b7361 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 9 Sep 2024 13:42:47 -1000 Subject: sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq() Once a task is put into a DSQ, the allowed operations are fairly limited. Tasks in the built-in local and global DSQs are executed automatically and, ignoring dequeue, there is only one way a task in a user DSQ can be manipulated - scx_bpf_consume() moves the first task to the dispatching local DSQ. This inflexibility sometimes gets in the way and is an area where multiple feature requests have been made. Implement scx_bpf_dispatch[_vtime]_from_dsq(), which can be called during DSQ iteration and can move the task to any DSQ - local DSQs, global DSQ and user DSQs. The kfuncs can be called from ops.dispatch() and any BPF context which doesn't hold a rq lock including BPF timers and SYSCALL programs. This is an expansion of an earlier patch which only allowed moving into the dispatching local DSQ: http://lkml.kernel.org/r/Zn4Cw4FDTmvXnhaf@slm.duckdns.org v2: Remove @slice and @vtime from scx_bpf_dispatch_from_dsq[_vtime]() as they push scx_bpf_dispatch_from_dsq_vtime() over the kfunc argument count limit and often won't be needed anyway. Instead provide scx_bpf_dispatch_from_dsq_set_{slice|vtime}() kfuncs which can be called only when needed and override the specified parameter for the subsequent dispatch.
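To make the intended usage concrete, a scan-and-preempt loop in a BPF scheduler could look like this sketch. It assumes the bpf_for_each(scx_dsq, ...) iterator and the BPF_FOR_EACH_ITER cursor macro from the scx common headers; MY_DSQ_ID and target_pid are made-up placeholders:

	struct task_struct *p;

	bpf_for_each(scx_dsq, p, MY_DSQ_ID, 0) {
		if (p->pid != target_pid)
			continue;
		/* give @p a fresh 5ms slice for the upcoming dispatch */
		scx_bpf_dispatch_from_dsq_set_slice(BPF_FOR_EACH_ITER,
						    5 * 1000 * 1000);
		/* move @p from MY_DSQ_ID into CPU 3's local DSQ, preempting */
		if (scx_bpf_dispatch_from_dsq(BPF_FOR_EACH_ITER, p,
					      SCX_DSQ_LOCAL_ON | 3,
					      SCX_ENQ_PREEMPT))
			break;
	}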
Signed-off-by: Tejun Heo Cc: Daniel Hodges Cc: David Vernet Cc: Changwoo Min Cc: Andrea Righi Cc: Dan Schatzberg --- kernel/sched/ext.c | 232 ++++++++++++++++++++++++++++++- tools/sched_ext/include/scx/common.bpf.h | 10 ++ 2 files changed, 239 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 44a2d1b9e837..b6e8c224f894 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1158,6 +1158,11 @@ static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask, return true; } +static bool scx_kf_allowed_if_unlocked(void) +{ + return !current->scx.kf_mask; +} + /** * nldsq_next_task - Iterate to the next task in a non-local DSQ * @dsq: user dsq being interated @@ -1211,13 +1216,20 @@ enum scx_dsq_iter_flags { /* iterate in the reverse dispatch order */ SCX_DSQ_ITER_REV = 1U << 16, + __SCX_DSQ_ITER_HAS_SLICE = 1U << 30, + __SCX_DSQ_ITER_HAS_VTIME = 1U << 31, + __SCX_DSQ_ITER_USER_FLAGS = SCX_DSQ_ITER_REV, - __SCX_DSQ_ITER_ALL_FLAGS = __SCX_DSQ_ITER_USER_FLAGS, + __SCX_DSQ_ITER_ALL_FLAGS = __SCX_DSQ_ITER_USER_FLAGS | + __SCX_DSQ_ITER_HAS_SLICE | + __SCX_DSQ_ITER_HAS_VTIME, }; struct bpf_iter_scx_dsq_kern { struct scx_dsq_list_node cursor; struct scx_dispatch_q *dsq; + u64 slice; + u64 vtime; } __attribute__((aligned(8))); struct bpf_iter_scx_dsq { @@ -5872,7 +5884,7 @@ __bpf_kfunc_start_defs(); * scx_bpf_dispatch - Dispatch a task into the FIFO queue of a DSQ * @p: task_struct to dispatch * @dsq_id: DSQ to dispatch to - * @slice: duration @p can run for in nsecs + * @slice: duration @p can run for in nsecs, 0 to keep the current value * @enq_flags: SCX_ENQ_* * * Dispatch @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe @@ -5922,7 +5934,7 @@ __bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, * scx_bpf_dispatch_vtime - Dispatch a task into the vtime priority queue of a DSQ * @p: task_struct to dispatch * @dsq_id: DSQ to dispatch to - * @slice: duration @p can run for in nsecs + * @slice: duration @p can run for in nsecs, 0 to keep the current value * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ * @enq_flags: SCX_ENQ_* * @@ -5963,6 +5975,118 @@ static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { .set = &scx_kfunc_ids_enqueue_dispatch, }; +static bool scx_dispatch_from_dsq(struct bpf_iter_scx_dsq_kern *kit, + struct task_struct *p, u64 dsq_id, + u64 enq_flags) +{ + struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq; + struct rq *this_rq, *src_rq, *dst_rq, *locked_rq; + bool dispatched = false; + bool in_balance; + unsigned long flags; + + if (!scx_kf_allowed_if_unlocked() && !scx_kf_allowed(SCX_KF_DISPATCH)) + return false; + + /* + * Can be called from either ops.dispatch() locking this_rq() or any + * context where no rq lock is held. If latter, lock @p's task_rq which + * we'll likely need anyway. + */ + src_rq = task_rq(p); + + local_irq_save(flags); + this_rq = this_rq(); + in_balance = this_rq->scx.flags & SCX_RQ_IN_BALANCE; + + if (in_balance) { + if (this_rq != src_rq) { + raw_spin_rq_unlock(this_rq); + raw_spin_rq_lock(src_rq); + } + } else { + raw_spin_rq_lock(src_rq); + } + + locked_rq = src_rq; + raw_spin_lock(&src_dsq->lock); + + /* + * Did someone else get to it? @p could have already left $src_dsq, got + * re-enqueud, or be in the process of being consumed by someone else. 
+ */ + if (unlikely(p->scx.dsq != src_dsq || + u32_before(kit->cursor.priv, p->scx.dsq_seq) || + p->scx.holding_cpu >= 0) || + WARN_ON_ONCE(src_rq != task_rq(p))) { + raw_spin_unlock(&src_dsq->lock); + goto out; + } + + /* @p is still on $src_dsq and stable, determine the destination */ + dst_dsq = find_dsq_for_dispatch(this_rq, dsq_id, p); + + if (dst_dsq->id == SCX_DSQ_LOCAL) { + dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); + if (!task_can_run_on_remote_rq(p, dst_rq, true)) { + dst_dsq = &scx_dsq_global; + dst_rq = src_rq; + } + } else { + /* no need to migrate if destination is a non-local DSQ */ + dst_rq = src_rq; + } + + /* + * Move @p into $dst_dsq. If $dst_dsq is the local DSQ of a different + * CPU, @p will be migrated. + */ + if (dst_dsq->id == SCX_DSQ_LOCAL) { + /* @p is going from a non-local DSQ to a local DSQ */ + if (src_rq == dst_rq) { + task_unlink_from_dsq(p, src_dsq); + move_local_task_to_local_dsq(p, enq_flags, + src_dsq, dst_rq); + raw_spin_unlock(&src_dsq->lock); + } else { + raw_spin_unlock(&src_dsq->lock); + move_remote_task_to_local_dsq(p, enq_flags, + src_rq, dst_rq); + locked_rq = dst_rq; + } + } else { + /* + * @p is going from a non-local DSQ to a non-local DSQ. As + * $src_dsq is already locked, do an abbreviated dequeue. + */ + task_unlink_from_dsq(p, src_dsq); + p->scx.dsq = NULL; + raw_spin_unlock(&src_dsq->lock); + + if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME) + p->scx.dsq_vtime = kit->vtime; + dispatch_enqueue(dst_dsq, p, enq_flags); + } + + if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_SLICE) + p->scx.slice = kit->slice; + + dispatched = true; +out: + if (in_balance) { + if (this_rq != locked_rq) { + raw_spin_rq_unlock(locked_rq); + raw_spin_rq_lock(this_rq); + } + } else { + raw_spin_rq_unlock_irqrestore(locked_rq, flags); + } + + kit->cursor.flags &= ~(__SCX_DSQ_ITER_HAS_SLICE | + __SCX_DSQ_ITER_HAS_VTIME); + return dispatched; +} + __bpf_kfunc_start_defs(); /** @@ -6042,12 +6166,112 @@ __bpf_kfunc bool scx_bpf_consume(u64 dsq_id) } } +/** + * scx_bpf_dispatch_from_dsq_set_slice - Override slice when dispatching from DSQ + * @it__iter: DSQ iterator in progress + * @slice: duration the dispatched task can run for in nsecs + * + * Override the slice of the next task that will be dispatched from @it__iter + * using scx_bpf_dispatch[_vtime]_from_dsq(). If this function is not called, + * the previous slice duration is kept. + */ +__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_slice( + struct bpf_iter_scx_dsq *it__iter, u64 slice) +{ + struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter; + + kit->slice = slice; + kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE; +} + +/** + * scx_bpf_dispatch_from_dsq_set_vtime - Override vtime when dispatching from DSQ + * @it__iter: DSQ iterator in progress + * @vtime: task's ordering inside the vtime-sorted queue of the target DSQ + * + * Override the vtime of the next task that will be dispatched from @it__iter + * using scx_bpf_dispatch_vtime_from_dsq(). If this function is not called, the + * previous vtime is kept. If scx_bpf_dispatch_from_dsq() is used to + * dispatch the next task, the override is ignored and cleared.
+ */ +__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime( + struct bpf_iter_scx_dsq *it__iter, u64 vtime) +{ + struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter; + + kit->vtime = vtime; + kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME; +} + +/** + * scx_bpf_dispatch_from_dsq - Move a task from DSQ iteration to a DSQ + * @it__iter: DSQ iterator in progress + * @p: task to transfer + * @dsq_id: DSQ to move @p to + * @enq_flags: SCX_ENQ_* + * + * Transfer @p which is on the DSQ currently iterated by @it__iter to the DSQ + * specified by @dsq_id. All DSQs - local DSQs, global DSQ and user DSQs - can + * be the destination. + * + * For the transfer to be successful, @p must still be on the DSQ and have been + * queued before the DSQ iteration started. This function doesn't care whether + * @p was obtained from the DSQ iteration itself; it only has to be on the DSQ + * and have been queued before the iteration started. + * + * @p's slice is kept by default. Use scx_bpf_dispatch_from_dsq_set_slice() to + * update. + * + * Can be called from ops.dispatch() or any BPF context which doesn't hold an + * rq lock (e.g. BPF timers or SYSCALL programs). + * + * Returns %true if @p has been consumed, %false if @p had already been consumed + * or dequeued. + */ +__bpf_kfunc bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, + struct task_struct *p, u64 dsq_id, + u64 enq_flags) +{ + return scx_dispatch_from_dsq((struct bpf_iter_scx_dsq_kern *)it__iter, + p, dsq_id, enq_flags); +} + +/** + * scx_bpf_dispatch_vtime_from_dsq - Move a task from DSQ iteration to a PRIQ DSQ + * @it__iter: DSQ iterator in progress + * @p: task to transfer + * @dsq_id: DSQ to move @p to + * @enq_flags: SCX_ENQ_* + * + * Transfer @p which is on the DSQ currently iterated by @it__iter to the + * priority queue of the DSQ specified by @dsq_id. The destination must be a + * user DSQ as only user DSQs support the priority queue. + * + * @p's slice and vtime are kept by default. Use + * scx_bpf_dispatch_from_dsq_set_slice() and + * scx_bpf_dispatch_from_dsq_set_vtime() to update. + * + * All other aspects are identical to scx_bpf_dispatch_from_dsq(). See + * scx_bpf_dispatch_vtime() for more information on @vtime.
+ */ +__bpf_kfunc bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter, + struct task_struct *p, u64 dsq_id, + u64 enq_flags) +{ + return scx_dispatch_from_dsq((struct bpf_iter_scx_dsq_kern *)it__iter, + p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_dispatch) BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots) BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel) BTF_ID_FLAGS(func, scx_bpf_consume) +BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice) +BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime) +BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_dispatch) static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { @@ -6144,6 +6368,8 @@ __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_unlocked) BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE) +BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_unlocked) static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = { diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 457462b19966..f538c75db183 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -35,6 +35,10 @@ void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vt u32 scx_bpf_dispatch_nr_slots(void) __ksym; void scx_bpf_dispatch_cancel(void) __ksym; bool scx_bpf_consume(u64 dsq_id) __ksym; +void scx_bpf_dispatch_from_dsq_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym; +void scx_bpf_dispatch_from_dsq_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym; +bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; +bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; u32 scx_bpf_reenqueue_local(void) __ksym; void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; @@ -63,6 +67,12 @@ s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym; +/* + * Use the following as @it__iter when calling + * scx_bpf_dispatch[_vtime]_from_dsq() from within bpf_for_each() loops. + */ +#define BPF_FOR_EACH_ITER (&___it) + static inline __attribute__((format(printf, 1, 2))) void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} -- cgit v1.2.3 From 2d285d561543d76b4a13263731b447e646c258f2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 9 Sep 2024 13:42:47 -1000 Subject: scx_qmap: Implement highpri boosting Implement a silly boosting mechanism for nice -20 tasks. The only purpose is demonstrating and testing scx_bpf_dispatch_from_dsq(). The boosting only works within SHARED_DSQ and makes only a minor difference, visible with an increased dispatch batch (-b). This exercises moving tasks to a user DSQ and all local DSQs from ops.dispatch() and BPF timerfn. v2: - Updated to use scx_bpf_dispatch_from_dsq_set_{slice|vtime}(). - Drop the workaround for the iterated tasks not being trusted by the verifier. The issue is fixed on the BPF side.
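The HIGHPRI_WEIGHT constant used by this patch follows from the default scheduler weight table: nice -20 corresponds to load weight 88761 and sched_ext scales task weights so that nice 0 (load weight 1024) reads as 100. A standalone sanity check of that arithmetic, quoting the sched_prio_to_weight[] constants from kernel/sched/core.c as an assumption:

	#include <assert.h>

	int main(void)
	{
		unsigned long w_nice_m20 = 88761;	/* sched_prio_to_weight[0] */
		unsigned long w_nice_0 = 1024;		/* sched_prio_to_weight[20] */

		/* p->scx.weight scaling: nice 0 maps to 100 */
		assert(w_nice_m20 * 100 / w_nice_0 == 8668);
		return 0;
	}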
Signed-off-by: Tejun Heo Cc: Daniel Hodges Cc: David Vernet Cc: Changwoo Min Cc: Andrea Righi Cc: Dan Schatzberg --- tools/sched_ext/scx_qmap.bpf.c | 133 +++++++++++++++++++++++++++++++++++++---- tools/sched_ext/scx_qmap.c | 11 +++- 2 files changed, 130 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 7e7f28f4117e..83c8f54c1e31 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -27,6 +27,8 @@ enum consts { ONE_SEC_IN_NS = 1000000000, SHARED_DSQ = 0, + HIGHPRI_DSQ = 1, + HIGHPRI_WEIGHT = 8668, /* this is what nice -20 maps to */ }; char _license[] SEC("license") = "GPL"; @@ -36,10 +38,12 @@ const volatile u32 stall_user_nth; const volatile u32 stall_kernel_nth; const volatile u32 dsp_inf_loop_after; const volatile u32 dsp_batch; +const volatile bool highpri_boosting; const volatile bool print_shared_dsq; const volatile s32 disallow_tgid; const volatile bool suppress_dump; +u64 nr_highpri_queued; u32 test_error_cnt; UEI_DEFINE(uei); @@ -95,6 +99,7 @@ static u64 core_sched_tail_seqs[5]; /* Per-task scheduling context */ struct task_ctx { bool force_local; /* Dispatch directly to local_dsq */ + bool highpri; u64 core_sched_seq; }; @@ -122,6 +127,7 @@ struct { /* Statistics */ u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued, nr_ddsp_from_enq; u64 nr_core_sched_execed; +u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer; u32 cpuperf_min, cpuperf_avg, cpuperf_max; u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max; @@ -140,17 +146,25 @@ static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu) return -1; } +static struct task_ctx *lookup_task_ctx(struct task_struct *p) +{ + struct task_ctx *tctx; + + if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) { + scx_bpf_error("task_ctx lookup failed"); + return NULL; + } + return tctx; +} + s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) { struct task_ctx *tctx; s32 cpu; - tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); - if (!tctx) { - scx_bpf_error("task_ctx lookup failed"); + if (!(tctx = lookup_task_ctx(p))) return -ESRCH; - } cpu = pick_direct_dispatch_cpu(p, prev_cpu); @@ -197,11 +211,8 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) if (test_error_cnt && !--test_error_cnt) scx_bpf_error("test triggering error"); - tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); - if (!tctx) { - scx_bpf_error("task_ctx lookup failed"); + if (!(tctx = lookup_task_ctx(p))) return; - } /* * All enqueued tasks must have their core_sched_seq updated for correct @@ -255,6 +266,10 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) return; } + if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) { + tctx->highpri = true; + __sync_fetch_and_add(&nr_highpri_queued, 1); + } + __sync_fetch_and_add(&nr_enqueued, 1); } @@ -271,13 +286,80 @@ void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags) static void update_core_sched_head_seq(struct task_struct *p) { - struct task_ctx *tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); int idx = weight_to_idx(p->scx.weight); + struct task_ctx *tctx; - if (tctx) + if ((tctx = lookup_task_ctx(p))) core_sched_head_seqs[idx] = tctx->core_sched_seq; - else - scx_bpf_error("task_ctx lookup failed"); +} + +/* + * To demonstrate the use of scx_bpf_dispatch_from_dsq(), implement a silly + * selective priority boosting mechanism by
scanning SHARED_DSQ looking for + * highpri tasks, moving them to HIGHPRI_DSQ and then consuming them first. This + * makes a minor difference only when dsp_batch is larger than 1. + * + * scx_bpf_dispatch[_vtime]_from_dsq() are allowed both from ops.dispatch() and + * non-rq-lock holding BPF programs. As a demonstration, this function is called + * from qmap_dispatch() and monitor_timerfn(). + */ +static bool dispatch_highpri(bool from_timer) +{ + struct task_struct *p; + s32 this_cpu = bpf_get_smp_processor_id(); + + /* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */ + bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) { + static u64 highpri_seq; + struct task_ctx *tctx; + + if (!(tctx = lookup_task_ctx(p))) + return false; + + if (tctx->highpri) { + /* exercise the set_*() and vtime interface too */ + scx_bpf_dispatch_from_dsq_set_slice( + BPF_FOR_EACH_ITER, slice_ns * 2); + scx_bpf_dispatch_from_dsq_set_vtime( + BPF_FOR_EACH_ITER, highpri_seq++); + scx_bpf_dispatch_vtime_from_dsq( + BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0); + } + } + + /* + * Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU + * is found. + */ + bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) { + bool dispatched = false; + s32 cpu; + + if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr)) + cpu = this_cpu; + else + cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0); + + if (scx_bpf_dispatch_from_dsq(BPF_FOR_EACH_ITER, p, + SCX_DSQ_LOCAL_ON | cpu, + SCX_ENQ_PREEMPT)) { + if (cpu == this_cpu) { + dispatched = true; + __sync_fetch_and_add(&nr_expedited_local, 1); + } else { + __sync_fetch_and_add(&nr_expedited_remote, 1); + } + if (from_timer) + __sync_fetch_and_add(&nr_expedited_from_timer, 1); + } else { + __sync_fetch_and_add(&nr_expedited_lost, 1); + } + + if (dispatched) + return true; + } + + return false; } void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) @@ -289,7 +371,10 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) void *fifo; s32 i, pid; - if (scx_bpf_consume(SHARED_DSQ)) + if (dispatch_highpri(false)) + return; + + if (!nr_highpri_queued && scx_bpf_consume(SHARED_DSQ)) return; if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) { @@ -326,6 +411,8 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) /* Dispatch or advance.
*/ bpf_repeat(BPF_MAX_LOOPS) { + struct task_ctx *tctx; + if (bpf_map_pop_elem(fifo, &pid)) break; @@ -333,13 +420,25 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) if (!p) continue; + if (!(tctx = lookup_task_ctx(p))) { + bpf_task_release(p); + return; + } + + if (tctx->highpri) + __sync_fetch_and_sub(&nr_highpri_queued, 1); + update_core_sched_head_seq(p); __sync_fetch_and_add(&nr_dispatched, 1); + scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0); bpf_task_release(p); + batch--; cpuc->dsp_cnt--; if (!batch || !scx_bpf_dispatch_nr_slots()) { + if (dispatch_highpri(false)) + return; scx_bpf_consume(SHARED_DSQ); return; } @@ -664,6 +763,10 @@ static void dump_shared_dsq(void) static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer) { + bpf_rcu_read_lock(); + dispatch_highpri(true); + bpf_rcu_read_unlock(); + monitor_cpuperf(); if (print_shared_dsq) @@ -685,6 +788,10 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init) if (ret) return ret; + ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1); + if (ret) + return ret; + timer = bpf_map_lookup_elem(&monitor_timer, &key); if (!timer) return -ESRCH; diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index c9ca30d62b2b..ac45a02b4055 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -29,6 +29,7 @@ const char help_fmt[] = " -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n" " -b COUNT Dispatch up to COUNT tasks together\n" " -P Print out DSQ content to trace_pipe every second, use with -b\n" +" -H Boost nice -20 tasks in SHARED_DSQ, use with -b\n" " -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" " -D LEN Set scx_exit_info.dump buffer length\n" " -S Suppress qmap-specific debug dump\n" @@ -63,7 +64,7 @@ int main(int argc, char **argv) skel = SCX_OPS_OPEN(qmap_ops, scx_qmap); - while ((opt = getopt(argc, argv, "s:e:t:T:l:b:Pd:D:Spvh")) != -1) { + while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PHd:D:Spvh")) != -1) { switch (opt) { case 's': skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; @@ -86,6 +87,9 @@ int main(int argc, char **argv) case 'P': skel->rodata->print_shared_dsq = true; break; + case 'H': + skel->rodata->highpri_boosting = true; + break; case 'd': skel->rodata->disallow_tgid = strtol(optarg, NULL, 0); if (skel->rodata->disallow_tgid < 0) @@ -121,6 +125,11 @@ int main(int argc, char **argv) skel->bss->nr_reenqueued, skel->bss->nr_dequeued, skel->bss->nr_core_sched_execed, skel->bss->nr_ddsp_from_enq); + printf(" exp_local=%"PRIu64" exp_remote=%"PRIu64" exp_timer=%"PRIu64" exp_lost=%"PRIu64"\n", + skel->bss->nr_expedited_local, + skel->bss->nr_expedited_remote, + skel->bss->nr_expedited_from_timer, + skel->bss->nr_expedited_lost); if (__COMPAT_has_ksym("scx_bpf_cpuperf_cur")) printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n", skel->bss->cpuperf_min, -- cgit v1.2.3 From 0aa75a2b3fafccc875d260e190b14faf5a856d45 Mon Sep 17 00:00:00 2001 From: zhangjiao Date: Thu, 29 Aug 2024 12:20:08 +0800 Subject: tools/mm: rm thp_swap_allocator_test when make clean Remove the thp_swap_allocator_test binary when running 'make clean'. Link: https://lkml.kernel.org/r/20240829042008.6937-1-zhangjiao2@cmss.chinamobile.com Signed-off-by: zhangjiao Signed-off-by: Andrew Morton --- tools/mm/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/mm/Makefile b/tools/mm/Makefile index 15791c1c5b28..f5725b5c23aa 100644 --- a/tools/mm/Makefile +++ b/tools/mm/Makefile @@ -23,7 +23,7 @@ $(LIBS): $(CC) $(CFLAGS) -o
$@ $< $(LDFLAGS) clean: - $(RM) page-types slabinfo page_owner_sort + $(RM) page-types slabinfo page_owner_sort thp_swap_allocator_test make -C $(LIB_DIR) clean sbindir ?= /usr/sbin -- cgit v1.2.3 From f58817c852e9c9eb8116c24d8271a35159636605 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 6 Sep 2024 20:46:07 +0200 Subject: selftests: mptcp: lib: add time per subtest in TAP output It adds 'time=ms' in the diagnostic data of the TAP output, e.g. ok 1 - pm_netlink: defaults addr list # time=9ms This addition is useful to quickly identify which subtests are taking longer than the others, or longer than expected. Note that the TAP 13 [1], TAP 14 [2] and KTAP [3] specifications define no specific format for showing this time, so let's define one here. Link: https://testanything.org/tap-version-13-specification.html [1] Link: https://testanything.org/tap-version-14-specification.html [2] Link: https://docs.kernel.org/dev-tools/ktap.html [3] Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240906-net-next-mptcp-ksft-subtest-time-v2-1-31d5ee4f3bdf@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_lib.sh | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh index 4578a331041e..975d4d4c862a 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -29,6 +29,7 @@ declare -rx MPTCP_LIB_AF_INET6=10 MPTCP_LIB_SUBTESTS=() MPTCP_LIB_SUBTESTS_DUPLICATED=0 MPTCP_LIB_SUBTEST_FLAKY=0 +MPTCP_LIB_SUBTESTS_LAST_TS_MS= MPTCP_LIB_TEST_COUNTER=0 MPTCP_LIB_TEST_FORMAT="%02u %-50s" MPTCP_LIB_IP_MPTCP=0 @@ -205,6 +206,11 @@ mptcp_lib_kversion_ge() { mptcp_lib_fail_if_expected_feature "kernel version ${1} lower than ${v}" } +mptcp_lib_subtests_last_ts_reset() { + MPTCP_LIB_SUBTESTS_LAST_TS_MS="$(date +%s%3N)" +} +mptcp_lib_subtests_last_ts_reset + __mptcp_lib_result_check_duplicated() { local subtest @@ -219,13 +225,22 @@ __mptcp_lib_result_check_duplicated() { __mptcp_lib_result_add() { local result="${1}" + local time="time=" + local ts_prev_ms shift local id=$((${#MPTCP_LIB_SUBTESTS[@]} + 1)) __mptcp_lib_result_check_duplicated "${*}" - MPTCP_LIB_SUBTESTS+=("${result} ${id} - ${KSFT_TEST}: ${*}") + # avoid adding a second '#' + [[ "${*}" != *"#"* ]] && time="# ${time}" + + ts_prev_ms="${MPTCP_LIB_SUBTESTS_LAST_TS_MS}" + mptcp_lib_subtests_last_ts_reset + time+="$((MPTCP_LIB_SUBTESTS_LAST_TS_MS - ts_prev_ms))ms" + + MPTCP_LIB_SUBTESTS+=("${result} ${id} - ${KSFT_TEST}: ${*} ${time}") } # $1: test name -- cgit v1.2.3 From 1a38cee4bbd0adeb62e11aab429678b1fb4a12ae Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 6 Sep 2024 20:46:08 +0200 Subject: selftests: mptcp: connect: remove time in TAP output It is now added by the MPTCP lib automatically, see the parent commit. The time in the TAP output might be slightly different from the one displayed before, but that's OK.
Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240906-net-next-mptcp-ksft-subtest-time-v2-2-31d5ee4f3bdf@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_connect.sh | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index b77fb7065bfb..f61e2f5870ea 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -431,7 +431,6 @@ do_transfer() local duration duration=$((stop-start)) - result_msg+=" # time=${duration}ms" printf "(duration %05sms) " "${duration}" if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then mptcp_lib_pr_fail "client exit code $retc, server $rets" -- cgit v1.2.3 From d4e192728efc532bc9a93e664f81fac602786450 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 6 Sep 2024 20:46:09 +0200 Subject: selftests: mptcp: reset the last TS before the first test Just to slightly improve the precision of the duration of the first test. In mptcp_join.sh, the last append_prev_results is now done as soon as the last test is over: this will add the last result to the list and get a more precise time for this last test. Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240906-net-next-mptcp-ksft-subtest-time-v2-3-31d5ee4f3bdf@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_connect.sh | 2 ++ tools/testing/selftests/net/mptcp/mptcp_join.sh | 3 ++- tools/testing/selftests/net/mptcp/mptcp_sockopt.sh | 1 + tools/testing/selftests/net/mptcp/pm_netlink.sh | 2 ++ tools/testing/selftests/net/mptcp/simult_flows.sh | 1 + tools/testing/selftests/net/mptcp/userspace_pm.sh | 1 + 6 files changed, 9 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index f61e2f5870ea..49d90c4dbc01 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -847,6 +847,8 @@ stop_if_error() make_file "$cin" "client" make_file "$sin" "server" +mptcp_lib_subtests_last_ts_reset + check_mptcp_disabled stop_if_error "The kernel configuration is not valid for MPTCP" diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 43f8a9bd84c4..3564cd06643c 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3959,9 +3959,11 @@ if [ ${#tests[@]} -eq 0 ]; then tests=("${all_tests_names[@]}") fi +mptcp_lib_subtests_last_ts_reset for subtests in "${tests[@]}"; do "${subtests}" done +append_prev_results if [ ${ret} -ne 0 ]; then echo @@ -3972,7 +3974,6 @@ if [ ${ret} -ne 0 ]; then echo fi -append_prev_results mptcp_lib_result_print_all_tap exit $ret diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh index 68899a303a1a..5e8d5b83e2d0 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh @@ -349,6 +349,7 @@ init make_file "$cin" "client" 1 make_file "$sin" "server" 1 trap cleanup EXIT +mptcp_lib_subtests_last_ts_reset run_tests $ns1 $ns2 10.0.1.1 run_tests $ns1 $ns2 dead:beef:1::1 diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh
b/tools/testing/selftests/net/mptcp/pm_netlink.sh index 2757378b1b13..2e6648a2b2c0 100755 --- a/tools/testing/selftests/net/mptcp/pm_netlink.sh +++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh @@ -137,6 +137,8 @@ check() fi } +mptcp_lib_subtests_last_ts_reset + check "show_endpoints" "" "defaults addr list" default_limits="$(get_limits)" diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh index f74e1c3c126d..8fa77c8e9b65 100755 --- a/tools/testing/selftests/net/mptcp/simult_flows.sh +++ b/tools/testing/selftests/net/mptcp/simult_flows.sh @@ -286,6 +286,7 @@ while getopts "bcdhi" option;do done setup +mptcp_lib_subtests_last_ts_reset run_test 10 10 0 0 "balanced bwidth" run_test 10 10 1 25 "balanced bwidth with unbalanced delay" diff --git a/tools/testing/selftests/net/mptcp/userspace_pm.sh b/tools/testing/selftests/net/mptcp/userspace_pm.sh index 9cb05978269d..3651f73451cf 100755 --- a/tools/testing/selftests/net/mptcp/userspace_pm.sh +++ b/tools/testing/selftests/net/mptcp/userspace_pm.sh @@ -150,6 +150,7 @@ mptcp_lib_events "${ns2}" "${client_evts}" client_evts_pid server_evts=$(mktemp) mptcp_lib_events "${ns1}" "${server_evts}" server_evts_pid sleep 0.5 +mptcp_lib_subtests_last_ts_reset print_title "Init" print_test "Created network namespaces ns1, ns2" -- cgit v1.2.3 From a5b6be42aac0fd3b94adaeec3cda4d847b304fd8 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 6 Sep 2024 20:46:10 +0200 Subject: selftests: mptcp: diag: remove trailing whitespace It doesn't need to be there, and it can cause some issues with TAP parsers expecting only one space around the directive delimiter (#). Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240906-net-next-mptcp-ksft-subtest-time-v2-4-31d5ee4f3bdf@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/diag.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/diag.sh b/tools/testing/selftests/net/mptcp/diag.sh index 776d43a6922d..2bd0c1eb70c5 100755 --- a/tools/testing/selftests/net/mptcp/diag.sh +++ b/tools/testing/selftests/net/mptcp/diag.sh @@ -284,7 +284,7 @@ echo "b" | \ ./mptcp_connect -p 10000 -r 0 -t ${timeout_poll} -w 20 \ 127.0.0.1 >/dev/null & wait_connected $ns 10000 -chk_msk_nr 2 "after MPC handshake " +chk_msk_nr 2 "after MPC handshake" chk_last_time_info 10000 chk_msk_remote_key_nr 2 "....chk remote_key" chk_msk_fallback_nr 0 "....chk no fallback" -- cgit v1.2.3 From a92d1db0c989d4244d3a671d78f291606ed4ce35 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 6 Sep 2024 20:46:11 +0200 Subject: selftests: mptcp: connect: remove duplicated spaces in TAP output It is nice to have a visual alignment in the test output to present the different results, but it makes less sense in the TAP output that is there for computers. It then seems better to remove the duplicated whitespace in the TAP output, especially as it can cause issues with TAP parsers expecting only one space around the directive delimiter (#). While at it, change the variable name (result_msg) to something more explicit.
Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20240906-net-next-mptcp-ksft-subtest-time-v2-5-31d5ee4f3bdf@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_connect.sh | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index 49d90c4dbc01..57325d57e4c6 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -345,9 +345,11 @@ do_transfer() local addr_port addr_port=$(printf "%s:%d" ${connect_addr} ${port}) - local result_msg - result_msg="$(printf "%.3s %-5s -> %.3s (%-20s) %-5s" ${connector_ns} ${cl_proto} ${listener_ns} ${addr_port} ${srv_proto})" - mptcp_lib_print_title "${result_msg}" + local pretty_title + pretty_title="$(printf "%.3s %-5s -> %.3s (%-20s) %-5s" ${connector_ns} ${cl_proto} ${listener_ns} ${addr_port} ${srv_proto})" + mptcp_lib_print_title "${pretty_title}" + + local tap_title="${connector_ns:0:3} ${cl_proto} -> ${listener_ns:0:3} (${addr_port}) ${srv_proto}" if $capture; then local capuser @@ -443,7 +445,7 @@ do_transfer() echo cat "$capout" - mptcp_lib_result_fail "${TEST_GROUP}: ${result_msg}" + mptcp_lib_result_fail "${TEST_GROUP}: ${tap_title}" return 1 fi @@ -543,12 +545,12 @@ do_transfer() if [ $retc -eq 0 ] && [ $rets -eq 0 ]; then mptcp_lib_pr_ok "${extra:1}" - mptcp_lib_result_pass "${TEST_GROUP}: ${result_msg}" + mptcp_lib_result_pass "${TEST_GROUP}: ${tap_title}" else if [ -n "${extra}" ]; then mptcp_lib_print_warn "${extra:1}" fi - mptcp_lib_result_fail "${TEST_GROUP}: ${result_msg}" + mptcp_lib_result_fail "${TEST_GROUP}: ${tap_title}" fi cat "$capout" -- cgit v1.2.3 From 5aa57d9f2d5311f19434d95b2a81610aa263e23b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 5 Sep 2024 12:32:40 -0700 Subject: af_unix: Don't return OOB skb in manage_oob(). syzbot reported use-after-free in unix_stream_recv_urg(). [0] The scenario is 1. send(MSG_OOB) 2. recv(MSG_OOB) -> The consumed OOB remains in the recv queue 3. send(MSG_OOB) 4. recv() -> manage_oob() returns the next skb of the consumed OOB -> This is also OOB, but unix_sk(sk)->oob_skb is not cleared 5. recv(MSG_OOB) -> unix_sk(sk)->oob_skb is used but already freed The recent commit 8594d9b85c07 ("af_unix: Don't call skb_get() for OOB skb.") uncovered the issue. If the OOB skb is consumed and the next skb is peeked in manage_oob(), we still need to check if the skb is OOB. Let's do so by falling back to the following checks in manage_oob() and add a test case to the selftest. Note that we need to add a similar check for SIOCATMARK.
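A rough userspace sketch of the five-step sequence above (error handling elided; it mirrors the ex_oob_oob selftest added below, where, with the fix applied, step 4 returns -EAGAIN and step 5 returns -EINVAL instead of touching the freed skb):

	#include <sys/socket.h>

	int main(void)
	{
		int sk[2];
		char buf[1];

		socketpair(AF_UNIX, SOCK_STREAM, 0, sk);
		send(sk[0], "x", 1, MSG_OOB);		/* 1 */
		recv(sk[1], buf, 1, MSG_OOB);		/* 2: consumed OOB stays queued */
		send(sk[0], "y", 1, MSG_OOB);		/* 3 */
		recv(sk[1], buf, 1, MSG_DONTWAIT);	/* 4: must not return the new OOB */
		recv(sk[1], buf, 1, MSG_OOB);		/* 5: used to hit the UAF */
		return 0;
	}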
[0]: BUG: KASAN: slab-use-after-free in unix_stream_read_actor+0xa6/0xb0 net/unix/af_unix.c:2959 Read of size 4 at addr ffff8880326abcc4 by task syz-executor178/5235 CPU: 0 UID: 0 PID: 5235 Comm: syz-executor178 Not tainted 6.11.0-rc5-syzkaller-00742-gfbdaffe41adc #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/06/2024 Call Trace: __dump_stack lib/dump_stack.c:93 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:119 print_address_description mm/kasan/report.c:377 [inline] print_report+0x169/0x550 mm/kasan/report.c:488 kasan_report+0x143/0x180 mm/kasan/report.c:601 unix_stream_read_actor+0xa6/0xb0 net/unix/af_unix.c:2959 unix_stream_recv_urg+0x1df/0x320 net/unix/af_unix.c:2640 unix_stream_read_generic+0x2456/0x2520 net/unix/af_unix.c:2778 unix_stream_recvmsg+0x22b/0x2c0 net/unix/af_unix.c:2996 sock_recvmsg_nosec net/socket.c:1046 [inline] sock_recvmsg+0x22f/0x280 net/socket.c:1068 ____sys_recvmsg+0x1db/0x470 net/socket.c:2816 ___sys_recvmsg net/socket.c:2858 [inline] __sys_recvmsg+0x2f0/0x3e0 net/socket.c:2888 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f5360d6b4e9 Code: 48 83 c4 28 c3 e8 37 17 00 00 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fff29b3a458 EFLAGS: 00000246 ORIG_RAX: 000000000000002f RAX: ffffffffffffffda RBX: 00007fff29b3a638 RCX: 00007f5360d6b4e9 RDX: 0000000000002001 RSI: 0000000020000640 RDI: 0000000000000003 RBP: 00007f5360dde610 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000001 R13: 00007fff29b3a628 R14: 0000000000000001 R15: 0000000000000001 Allocated by task 5235: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3f/0x80 mm/kasan/common.c:68 unpoison_slab_object mm/kasan/common.c:312 [inline] __kasan_slab_alloc+0x66/0x80 mm/kasan/common.c:338 kasan_slab_alloc include/linux/kasan.h:201 [inline] slab_post_alloc_hook mm/slub.c:3988 [inline] slab_alloc_node mm/slub.c:4037 [inline] kmem_cache_alloc_node_noprof+0x16b/0x320 mm/slub.c:4080 __alloc_skb+0x1c3/0x440 net/core/skbuff.c:667 alloc_skb include/linux/skbuff.h:1320 [inline] alloc_skb_with_frags+0xc3/0x770 net/core/skbuff.c:6528 sock_alloc_send_pskb+0x91a/0xa60 net/core/sock.c:2815 sock_alloc_send_skb include/net/sock.h:1778 [inline] queue_oob+0x108/0x680 net/unix/af_unix.c:2198 unix_stream_sendmsg+0xd24/0xf80 net/unix/af_unix.c:2351 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x221/0x270 net/socket.c:745 ____sys_sendmsg+0x525/0x7d0 net/socket.c:2597 ___sys_sendmsg net/socket.c:2651 [inline] __sys_sendmsg+0x2b0/0x3a0 net/socket.c:2680 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Freed by task 5235: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3f/0x80 mm/kasan/common.c:68 kasan_save_free_info+0x40/0x50 mm/kasan/generic.c:579 poison_slab_object+0xe0/0x150 mm/kasan/common.c:240 __kasan_slab_free+0x37/0x60 mm/kasan/common.c:256 kasan_slab_free include/linux/kasan.h:184 [inline] slab_free_hook mm/slub.c:2252 [inline] slab_free mm/slub.c:4473 [inline] kmem_cache_free+0x145/0x350 mm/slub.c:4548 unix_stream_read_generic+0x1ef6/0x2520 net/unix/af_unix.c:2917 unix_stream_recvmsg+0x22b/0x2c0 net/unix/af_unix.c:2996 sock_recvmsg_nosec 
net/socket.c:1046 [inline] sock_recvmsg+0x22f/0x280 net/socket.c:1068 __sys_recvfrom+0x256/0x3e0 net/socket.c:2255 __do_sys_recvfrom net/socket.c:2273 [inline] __se_sys_recvfrom net/socket.c:2269 [inline] __x64_sys_recvfrom+0xde/0x100 net/socket.c:2269 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f The buggy address belongs to the object at ffff8880326abc80 which belongs to the cache skbuff_head_cache of size 240 The buggy address is located 68 bytes inside of freed 240-byte region [ffff8880326abc80, ffff8880326abd70) The buggy address belongs to the physical page: page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x326ab ksm flags: 0xfff00000000000(node=0|zone=1|lastcpupid=0x7ff) page_type: 0xfdffffff(slab) raw: 00fff00000000000 ffff88801eaee780 ffffea0000b7dc80 dead000000000003 raw: 0000000000000000 00000000800c000c 00000001fdffffff 0000000000000000 page dumped because: kasan: bad access detected page_owner tracks the page as allocated page last allocated via order 0, migratetype Unmovable, gfp_mask 0x52cc0(GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP), pid 4686, tgid 4686 (udevadm), ts 32357469485, free_ts 28829011109 set_page_owner include/linux/page_owner.h:32 [inline] post_alloc_hook+0x1f3/0x230 mm/page_alloc.c:1493 prep_new_page mm/page_alloc.c:1501 [inline] get_page_from_freelist+0x2e4c/0x2f10 mm/page_alloc.c:3439 __alloc_pages_noprof+0x256/0x6c0 mm/page_alloc.c:4695 __alloc_pages_node_noprof include/linux/gfp.h:269 [inline] alloc_pages_node_noprof include/linux/gfp.h:296 [inline] alloc_slab_page+0x5f/0x120 mm/slub.c:2321 allocate_slab+0x5a/0x2f0 mm/slub.c:2484 new_slab mm/slub.c:2537 [inline] ___slab_alloc+0xcd1/0x14b0 mm/slub.c:3723 __slab_alloc+0x58/0xa0 mm/slub.c:3813 __slab_alloc_node mm/slub.c:3866 [inline] slab_alloc_node mm/slub.c:4025 [inline] kmem_cache_alloc_node_noprof+0x1fe/0x320 mm/slub.c:4080 __alloc_skb+0x1c3/0x440 net/core/skbuff.c:667 alloc_skb include/linux/skbuff.h:1320 [inline] alloc_uevent_skb+0x74/0x230 lib/kobject_uevent.c:289 uevent_net_broadcast_untagged lib/kobject_uevent.c:326 [inline] kobject_uevent_net_broadcast+0x2fd/0x580 lib/kobject_uevent.c:410 kobject_uevent_env+0x57d/0x8e0 lib/kobject_uevent.c:608 kobject_synth_uevent+0x4ef/0xae0 lib/kobject_uevent.c:207 uevent_store+0x4b/0x70 drivers/base/bus.c:633 kernfs_fop_write_iter+0x3a1/0x500 fs/kernfs/file.c:334 new_sync_write fs/read_write.c:497 [inline] vfs_write+0xa72/0xc90 fs/read_write.c:590 page last free pid 1 tgid 1 stack trace: reset_page_owner include/linux/page_owner.h:25 [inline] free_pages_prepare mm/page_alloc.c:1094 [inline] free_unref_page+0xd22/0xea0 mm/page_alloc.c:2612 kasan_depopulate_vmalloc_pte+0x74/0x90 mm/kasan/shadow.c:408 apply_to_pte_range mm/memory.c:2797 [inline] apply_to_pmd_range mm/memory.c:2841 [inline] apply_to_pud_range mm/memory.c:2877 [inline] apply_to_p4d_range mm/memory.c:2913 [inline] __apply_to_page_range+0x8a8/0xe50 mm/memory.c:2947 kasan_release_vmalloc+0x9a/0xb0 mm/kasan/shadow.c:525 purge_vmap_node+0x3e3/0x770 mm/vmalloc.c:2208 __purge_vmap_area_lazy+0x708/0xae0 mm/vmalloc.c:2290 _vm_unmap_aliases+0x79d/0x840 mm/vmalloc.c:2885 change_page_attr_set_clr+0x2fe/0xdb0 arch/x86/mm/pat/set_memory.c:1881 change_page_attr_set arch/x86/mm/pat/set_memory.c:1922 [inline] set_memory_nx+0xf2/0x130 arch/x86/mm/pat/set_memory.c:2110 free_init_pages arch/x86/mm/init.c:924 [inline] free_kernel_image_pages arch/x86/mm/init.c:943 [inline] free_initmem+0x79/0x110 
arch/x86/mm/init.c:970 kernel_init+0x31/0x2b0 init/main.c:1476 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Memory state around the buggy address: ffff8880326abb80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff8880326abc00: fb fb fb fb fb fb fc fc fc fc fc fc fc fc fc fc >ffff8880326abc80: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff8880326abd00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fc fc ffff8880326abd80: fc fc fc fc fc fc fc fc fa fb fb fb fb fb fb fb Fixes: 93c99f21db36 ("af_unix: Don't stop recv(MSG_DONTWAIT) if consumed OOB skb is at the head.") Reported-by: syzbot+8811381d455e3e9ec788@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=8811381d455e3e9ec788 Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20240905193240.17565-5-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 9 +++++++-- tools/testing/selftests/net/af_unix/msg_oob.c | 23 +++++++++++++++++++++++ 2 files changed, 30 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 159d78fc3d14..001ccc55ef0f 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2673,7 +2673,8 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, __skb_unlink(read_skb, &sk->sk_receive_queue); } - goto unlock; + if (!skb) + goto unlock; } if (skb != u->oob_skb) @@ -3175,9 +3176,13 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) skb = skb_peek(&sk->sk_receive_queue); if (skb) { struct sk_buff *oob_skb = READ_ONCE(u->oob_skb); + struct sk_buff *next_skb; + + next_skb = skb_peek_next(skb, &sk->sk_receive_queue); if (skb == oob_skb || - (!oob_skb && !unix_skb_len(skb))) + (!unix_skb_len(skb) && + (!oob_skb || next_skb == oob_skb))) answ = 1; } diff --git a/tools/testing/selftests/net/af_unix/msg_oob.c b/tools/testing/selftests/net/af_unix/msg_oob.c index 535eb2c3d7d1..3ed3882a93b8 100644 --- a/tools/testing/selftests/net/af_unix/msg_oob.c +++ b/tools/testing/selftests/net/af_unix/msg_oob.c @@ -525,6 +525,29 @@ TEST_F(msg_oob, ex_oob_drop_2) } } +TEST_F(msg_oob, ex_oob_oob) +{ + sendpair("x", 1, MSG_OOB); + epollpair(true); + siocatmarkpair(true); + + recvpair("x", 1, 1, MSG_OOB); + epollpair(false); + siocatmarkpair(true); + + sendpair("y", 1, MSG_OOB); + epollpair(true); + siocatmarkpair(true); + + recvpair("", -EAGAIN, 1, 0); + epollpair(false); + siocatmarkpair(false); + + recvpair("", -EINVAL, 1, MSG_OOB); + epollpair(false); + siocatmarkpair(false); +} + TEST_F(msg_oob, ex_oob_ahead_break) { sendpair("hello", 5, MSG_OOB); -- cgit v1.2.3 From dbd61921a6adbd231d8ca91537f1e7b62e9fea61 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 5 Sep 2024 19:15:51 -0400 Subject: selftests: support interpreted scripts with ksft_runner.sh Support testcases that are themselves not executable, but need an interpreter to run them. If a test file is not executable, but an executable file ksft_runner.sh exists in the TARGET dir, kselftest will run ./ksft_runner.sh ./$BASENAME_TEST Packetdrill may add hundreds of packetdrill scripts for testing. These scripts must be passed to the packetdrill process. Have kselftest run each test directly, as it already solves common runner requirements like parallel execution and isolation (netns). A previous RFC added a wrapper in between, which would have to reimplement such functionality. 
Link: https://lore.kernel.org/netdev/66d4d97a4cac_3df182941a@willemb.c.googlers.com.notmuch/T/ Signed-off-by: Willem de Bruijn Link: https://patch.msgid.link/20240905231653.2427327-2-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/kselftest/runner.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kselftest/runner.sh b/tools/testing/selftests/kselftest/runner.sh index 74954f6a8f94..2c3c58e65a41 100644 --- a/tools/testing/selftests/kselftest/runner.sh +++ b/tools/testing/selftests/kselftest/runner.sh @@ -111,8 +111,11 @@ run_one() stdbuf="/usr/bin/stdbuf --output=L " fi eval kselftest_cmd_args="\$${kselftest_cmd_args_ref:-}" - cmd="$stdbuf ./$BASENAME_TEST $kselftest_cmd_args" - if [ ! -x "$TEST" ]; then + if [ -x "$TEST" ]; then + cmd="$stdbuf ./$BASENAME_TEST $kselftest_cmd_args" + elif [ -x "./ksft_runner.sh" ]; then + cmd="$stdbuf ./ksft_runner.sh ./$BASENAME_TEST" + else echo "# Warning: file $TEST is not executable" if [ $(head -n 1 "$TEST" | cut -c -2) = "#!" ] -- cgit v1.2.3 From 8a405552fd3b1eefe186e724343e88790f6be832 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 5 Sep 2024 19:15:52 -0400 Subject: selftests/net: integrate packetdrill with ksft Lay the groundwork to import into kselftests the over 150 packetdrill TCP/IP conformance tests on github.com/google/packetdrill. Florian recently added support for packetdrill tests in nf_conntrack, in commit a8a388c2aae49 ("selftests: netfilter: add packetdrill based conntrack tests"). This patch takes a slightly different approach. It relies on ksft_runner.sh to run every *.pkt file in the directory. Any future imports of packetdrill tests should require no additional coding. Just add the *.pkt files. Initially import only two features/directories from github. One with a single script, and one with two. This was the only reason to pick tcp/inq and tcp/md5. The path replaces the directory hierarchy in github with a flat space of files: $(subst /,_,$(wildcard tcp/**/*.pkt)). This is the most straightforward option to integrate with kselftests. The Linked thread reviewed two ways to maintain the hierarchy: TEST_PROGS_RECURSE and PRESERVE_TEST_DIRS. But both introduce significant changes to kselftest infra and with that risk to existing tests. 
Implementation notes: - restore alphabetical order when adding the new directory to tools/testing/selftests/Makefile - imported *.pkt files and support verbatim from the github project, except for - update `source ./defaults.sh` path (to adjust for flat dir) - add SPDX headers - remove one author statement - Acknowledgment: drop an e (checkpatch) Tested: make -C tools/testing/selftests \ TARGETS=net/packetdrill \ run_tests make -C tools/testing/selftests \ TARGETS=net/packetdrill \ install INSTALL_PATH=$KSFT_INSTALL_PATH # in virtme-ng ./run_kselftest.sh -c net/packetdrill ./run_kselftest.sh -t net/packetdrill:tcp_inq_client.pkt Link: https://lore.kernel.org/netdev/20240827193417.2792223-1-willemdebruijn.kernel@gmail.com/ Signed-off-by: Willem de Bruijn Link: https://patch.msgid.link/20240905231653.2427327-3-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/Makefile | 5 +- tools/testing/selftests/net/packetdrill/Makefile | 9 ++++ tools/testing/selftests/net/packetdrill/config | 5 ++ .../testing/selftests/net/packetdrill/defaults.sh | 63 ++++++++++++++++++++++ .../selftests/net/packetdrill/ksft_runner.sh | 41 ++++++++++++++ .../selftests/net/packetdrill/tcp_inq_client.pkt | 51 ++++++++++++++++++ .../selftests/net/packetdrill/tcp_inq_server.pkt | 51 ++++++++++++++++++ .../packetdrill/tcp_md5_md5-only-on-client-ack.pkt | 28 ++++++++++ 8 files changed, 251 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/net/packetdrill/Makefile create mode 100644 tools/testing/selftests/net/packetdrill/config create mode 100755 tools/testing/selftests/net/packetdrill/defaults.sh create mode 100755 tools/testing/selftests/net/packetdrill/ksft_runner.sh create mode 100644 tools/testing/selftests/net/packetdrill/tcp_inq_client.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_inq_server.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_md5_md5-only-on-client-ack.pkt (limited to 'tools') diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index a5f1c0c27dff..3b7df5477317 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -65,10 +65,11 @@ TARGETS += net/af_unix TARGETS += net/forwarding TARGETS += net/hsr TARGETS += net/mptcp -TARGETS += net/openvswitch -TARGETS += net/tcp_ao TARGETS += net/netfilter +TARGETS += net/openvswitch +TARGETS += net/packetdrill TARGETS += net/rds +TARGETS += net/tcp_ao TARGETS += nsfs TARGETS += perf_events TARGETS += pidfd diff --git a/tools/testing/selftests/net/packetdrill/Makefile b/tools/testing/selftests/net/packetdrill/Makefile new file mode 100644 index 000000000000..870f7258dc8d --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 + +TEST_INCLUDES := ksft_runner.sh \ + defaults.sh \ + ../../kselftest/ktap_helpers.sh + +TEST_PROGS := $(wildcard *.pkt) + +include ../../lib.mk diff --git a/tools/testing/selftests/net/packetdrill/config b/tools/testing/selftests/net/packetdrill/config new file mode 100644 index 000000000000..0d402830f18d --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/config @@ -0,0 +1,5 @@ +CONFIG_IPV6=y +CONFIG_NET_SCH_FIFO=y +CONFIG_PROC_SYSCTL=y +CONFIG_TCP_MD5SIG=y +CONFIG_TUN=y diff --git a/tools/testing/selftests/net/packetdrill/defaults.sh b/tools/testing/selftests/net/packetdrill/defaults.sh new file mode 100755 index 000000000000..1095a7b22f44 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/defaults.sh 
@@ -0,0 +1,63 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Set standard production config values that relate to TCP behavior. + +# Flush old cached data (fastopen cookies). +ip tcp_metrics flush all > /dev/null 2>&1 + +# TCP min, default, and max receive and send buffer sizes. +sysctl -q net.ipv4.tcp_rmem="4096 540000 $((15*1024*1024))" +sysctl -q net.ipv4.tcp_wmem="4096 $((256*1024)) 4194304" + +# TCP timestamps. +sysctl -q net.ipv4.tcp_timestamps=1 + +# TCP SYN(ACK) retry thresholds +sysctl -q net.ipv4.tcp_syn_retries=5 +sysctl -q net.ipv4.tcp_synack_retries=5 + +# TCP Forward RTO-Recovery, RFC 5682. +sysctl -q net.ipv4.tcp_frto=2 + +# TCP Selective Acknowledgements (SACK) +sysctl -q net.ipv4.tcp_sack=1 + +# TCP Duplicate Selective Acknowledgements (DSACK) +sysctl -q net.ipv4.tcp_dsack=1 + +# TCP FACK (Forward Acknowledgment) +sysctl -q net.ipv4.tcp_fack=0 + +# TCP reordering degree ("dupthresh" threshold for entering Fast Recovery). +sysctl -q net.ipv4.tcp_reordering=3 + +# TCP congestion control. +sysctl -q net.ipv4.tcp_congestion_control=cubic + +# TCP slow start after idle. +sysctl -q net.ipv4.tcp_slow_start_after_idle=0 + +# TCP RACK and TLP. +sysctl -q net.ipv4.tcp_early_retrans=4 net.ipv4.tcp_recovery=1 + +# TCP method for deciding when to defer sending to accumulate big TSO packets. +sysctl -q net.ipv4.tcp_tso_win_divisor=3 + +# TCP Explicit Congestion Notification (ECN) +sysctl -q net.ipv4.tcp_ecn=0 + +sysctl -q net.ipv4.tcp_pacing_ss_ratio=200 +sysctl -q net.ipv4.tcp_pacing_ca_ratio=120 +sysctl -q net.ipv4.tcp_notsent_lowat=4294967295 > /dev/null 2>&1 + +sysctl -q net.ipv4.tcp_fastopen=0x70403 +sysctl -q net.ipv4.tcp_fastopen_key=a1a1a1a1-b2b2b2b2-c3c3c3c3-d4d4d4d4 + +sysctl -q net.ipv4.tcp_syncookies=1 + +# Override the default qdisc on the tun device. +# Many tests fail with timing errors if the default +# is FQ and that paces their flows. +tc qdisc add dev tun0 root pfifo + diff --git a/tools/testing/selftests/net/packetdrill/ksft_runner.sh b/tools/testing/selftests/net/packetdrill/ksft_runner.sh new file mode 100755 index 000000000000..2f62caccbbbc --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/ksft_runner.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source "$(dirname $(realpath $0))/../../kselftest/ktap_helpers.sh" + +readonly ipv4_args=('--ip_version=ipv4 ' + '--local_ip=192.168.0.1 ' + '--gateway_ip=192.168.0.1 ' + '--netmask_ip=255.255.0.0 ' + '--remote_ip=192.0.2.1 ' + '-D CMSG_LEVEL_IP=SOL_IP ' + '-D CMSG_TYPE_RECVERR=IP_RECVERR ') + +readonly ipv6_args=('--ip_version=ipv6 ' + '--mtu=1520 ' + '--local_ip=fd3d:0a0b:17d6::1 ' + '--gateway_ip=fd3d:0a0b:17d6:8888::1 ' + '--remote_ip=fd3d:fa7b:d17d::1 ' + '-D CMSG_LEVEL_IP=SOL_IPV6 ' + '-D CMSG_TYPE_RECVERR=IPV6_RECVERR ') + +if [ $# -ne 1 ]; then + ktap_exit_fail_msg "usage: $0