diff options
Diffstat (limited to 'tools')
581 files changed, 53841 insertions, 11596 deletions
diff --git a/tools/Makefile b/tools/Makefile index c31cbbd12c45..cb40961a740f 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -14,6 +14,7 @@ help: @echo ' counter - counter tools' @echo ' cpupower - a tool for all things x86 CPU power' @echo ' debugging - tools for debugging' + @echo ' dma - tools for DMA mapping' @echo ' firewire - the userspace part of nosy, an IEEE-1394 traffic sniffer' @echo ' firmware - Firmware tools' @echo ' freefall - laptop accelerometer program for disk protection' @@ -69,7 +70,7 @@ acpi: FORCE cpupower: FORCE $(call descend,power/$@) -counter firewire hv guest bootconfig spi usb virtio mm bpf iio gpio objtool leds wmi firmware debugging tracing: FORCE +counter dma firewire hv guest bootconfig spi usb virtio mm bpf iio gpio objtool leds wmi firmware debugging tracing: FORCE $(call descend,$@) bpf/%: FORCE @@ -122,7 +123,7 @@ kvm_stat: FORCE ynl: FORCE $(call descend,net/ynl) -all: acpi counter cpupower gpio hv firewire \ +all: acpi counter cpupower dma gpio hv firewire \ perf selftests bootconfig spi turbostat usb \ virtio mm bpf x86_energy_perf_policy \ tmon freefall iio objtool kvm_stat wmi \ @@ -134,7 +135,7 @@ acpi_install: cpupower_install: $(call descend,power/$(@:_install=),install) -counter_install firewire_install gpio_install hv_install iio_install perf_install bootconfig_install spi_install usb_install virtio_install mm_install bpf_install objtool_install wmi_install debugging_install tracing_install: +counter_install dma_install firewire_install gpio_install hv_install iio_install perf_install bootconfig_install spi_install usb_install virtio_install mm_install bpf_install objtool_install wmi_install debugging_install tracing_install: $(call descend,$(@:_install=),install) selftests_install: @@ -164,7 +165,7 @@ kvm_stat_install: ynl_install: $(call descend,net/$(@:_install=),install) -install: acpi_install counter_install cpupower_install gpio_install \ +install: acpi_install counter_install cpupower_install dma_install 
gpio_install \ hv_install firewire_install iio_install \ perf_install selftests_install turbostat_install usb_install \ virtio_install mm_install bpf_install x86_energy_perf_policy_install \ @@ -178,7 +179,7 @@ acpi_clean: cpupower_clean: $(call descend,power/cpupower,clean) -counter_clean hv_clean firewire_clean bootconfig_clean spi_clean usb_clean virtio_clean mm_clean wmi_clean bpf_clean iio_clean gpio_clean objtool_clean leds_clean firmware_clean debugging_clean tracing_clean: +counter_clean dma_clean hv_clean firewire_clean bootconfig_clean spi_clean usb_clean virtio_clean mm_clean wmi_clean bpf_clean iio_clean gpio_clean objtool_clean leds_clean firmware_clean debugging_clean tracing_clean: $(call descend,$(@:_clean=),clean) libapi_clean: @@ -224,7 +225,7 @@ build_clean: ynl_clean: $(call descend,net/$(@:_clean=),clean) -clean: acpi_clean counter_clean cpupower_clean hv_clean firewire_clean \ +clean: acpi_clean counter_clean cpupower_clean dma_clean hv_clean firewire_clean \ perf_clean selftests_clean turbostat_clean bootconfig_clean spi_clean usb_clean virtio_clean \ mm_clean bpf_clean iio_clean x86_energy_perf_policy_clean tmon_clean \ freefall_clean build_clean libbpf_clean libsubcmd_clean \ diff --git a/tools/arch/arm64/include/asm/cputype.h b/tools/arch/arm64/include/asm/cputype.h index 139d5e87dc95..b35d954d50c3 100644 --- a/tools/arch/arm64/include/asm/cputype.h +++ b/tools/arch/arm64/include/asm/cputype.h @@ -245,7 +245,7 @@ #define MIDR_FUJITSU_ERRATUM_010001_MASK (~MIDR_CPU_VAR_REV(1, 0)) #define TCR_CLEAR_FUJITSU_ERRATUM_010001 (TCR_NFD1 | TCR_NFD0) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <asm/sysreg.h> @@ -338,6 +338,6 @@ static inline u32 __attribute_const__ read_cpuid_cachetype(void) { return read_cpuid(CTR_EL0); } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif diff --git a/tools/arch/arm64/include/asm/esr.h b/tools/arch/arm64/include/asm/esr.h index bd592ca81571..f3c6403e5ef2 100644 --- 
a/tools/arch/arm64/include/asm/esr.h +++ b/tools/arch/arm64/include/asm/esr.h @@ -141,6 +141,8 @@ #define ESR_ELx_SF (UL(1) << ESR_ELx_SF_SHIFT) #define ESR_ELx_AR_SHIFT (14) #define ESR_ELx_AR (UL(1) << ESR_ELx_AR_SHIFT) +#define ESR_ELx_VNCR_SHIFT (13) +#define ESR_ELx_VNCR (UL(1) << ESR_ELx_VNCR_SHIFT) #define ESR_ELx_CM_SHIFT (8) #define ESR_ELx_CM (UL(1) << ESR_ELx_CM_SHIFT) @@ -385,7 +387,7 @@ #define ESR_ELx_MOPS_ISS_SRCREG(esr) (((esr) & (UL(0x1f) << 5)) >> 5) #define ESR_ELx_MOPS_ISS_SIZEREG(esr) (((esr) & (UL(0x1f) << 0)) >> 0) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <asm/types.h> static inline unsigned long esr_brk_comment(unsigned long esr) @@ -450,6 +452,6 @@ static inline bool esr_iss_is_eretab(unsigned long esr) } const char *esr_get_class_string(unsigned long esr); -#endif /* __ASSEMBLY */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_ESR_H */ diff --git a/tools/arch/arm64/include/asm/gpr-num.h b/tools/arch/arm64/include/asm/gpr-num.h index 05da4a7c5788..a114e4f8209b 100644 --- a/tools/arch/arm64/include/asm/gpr-num.h +++ b/tools/arch/arm64/include/asm/gpr-num.h @@ -2,7 +2,7 @@ #ifndef __ASM_GPR_NUM_H #define __ASM_GPR_NUM_H -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30 .equ .L__gpr_num_x\num, \num @@ -11,7 +11,7 @@ .equ .L__gpr_num_xzr, 31 .equ .L__gpr_num_wzr, 31 -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ #define __DEFINE_ASM_GPR_NUMS \ " .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30\n" \ @@ -21,6 +21,6 @@ " .equ .L__gpr_num_xzr, 31\n" \ " .equ .L__gpr_num_wzr, 31\n" -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_GPR_NUM_H */ diff --git a/tools/arch/arm64/include/asm/sysreg.h b/tools/arch/arm64/include/asm/sysreg.h index 65f2759ea27a..178b7322bf04 100644 --- a/tools/arch/arm64/include/asm/sysreg.h +++ b/tools/arch/arm64/include/asm/sysreg.h @@ -51,7 +51,7 @@ #ifndef 
CONFIG_BROKEN_GAS_INST -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ // The space separator is omitted so that __emit_inst(x) can be parsed as // either an assembler directive or an assembler macro argument. #define __emit_inst(x) .inst(x) @@ -70,11 +70,11 @@ (((x) >> 24) & 0x000000ff)) #endif /* CONFIG_CPU_BIG_ENDIAN */ -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define __emit_inst(x) .long __INSTR_BSWAP(x) -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ #define __emit_inst(x) ".long " __stringify(__INSTR_BSWAP(x)) "\n\t" -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* CONFIG_BROKEN_GAS_INST */ @@ -1078,9 +1078,7 @@ #define GCS_CAP(x) ((((unsigned long)x) & GCS_CAP_ADDR_MASK) | \ GCS_CAP_VALID_TOKEN) -#define ARM64_FEATURE_FIELD_BITS 4 - -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ .macro mrs_s, rt, sreg __emit_inst(0xd5200000|(\sreg)|(.L__gpr_num_\rt)) diff --git a/tools/arch/arm64/include/uapi/asm/kvm.h b/tools/arch/arm64/include/uapi/asm/kvm.h index ed5f3892674c..a792a599b9d6 100644 --- a/tools/arch/arm64/include/uapi/asm/kvm.h +++ b/tools/arch/arm64/include/uapi/asm/kvm.h @@ -31,7 +31,7 @@ #define KVM_SPSR_FIQ 4 #define KVM_NR_SPSR 5 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/psci.h> #include <linux/types.h> #include <asm/ptrace.h> diff --git a/tools/arch/s390/include/uapi/asm/bitsperlong.h b/tools/arch/s390/include/uapi/asm/bitsperlong.h index d2bb620119bf..a226a1686a53 100644 --- a/tools/arch/s390/include/uapi/asm/bitsperlong.h +++ b/tools/arch/s390/include/uapi/asm/bitsperlong.h @@ -2,11 +2,7 @@ #ifndef __ASM_S390_BITSPERLONG_H #define __ASM_S390_BITSPERLONG_H -#ifndef __s390x__ -#define __BITS_PER_LONG 32 -#else #define __BITS_PER_LONG 64 -#endif #include <asm-generic/bitsperlong.h> diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 4091a776e37a..ccc01ad6ff7c 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -320,7 
+320,7 @@ #define X86_FEATURE_FSRS (12*32+11) /* Fast short REP STOSB */ #define X86_FEATURE_FSRC (12*32+12) /* Fast short REP {CMPSB,SCASB} */ #define X86_FEATURE_FRED (12*32+17) /* "fred" Flexible Return and Event Delivery */ -#define X86_FEATURE_LKGS (12*32+18) /* Load "kernel" (userspace) GS */ +#define X86_FEATURE_LKGS (12*32+18) /* Like MOV_GS except MSR_KERNEL_GS_BASE = GS.base */ #define X86_FEATURE_WRMSRNS (12*32+19) /* Non-serializing WRMSR */ #define X86_FEATURE_AMX_FP16 (12*32+21) /* AMX fp16 Support */ #define X86_FEATURE_AVX_IFMA (12*32+23) /* Support for VPMADD52[H,L]UQ */ @@ -407,9 +407,12 @@ #define X86_FEATURE_ENQCMD (16*32+29) /* "enqcmd" ENQCMD and ENQCMDS instructions */ #define X86_FEATURE_SGX_LC (16*32+30) /* "sgx_lc" Software Guard Extensions Launch Control */ -/* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */ +/* + * Linux-defined word for use with scattered/synthetic bits. + */ #define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* "overflow_recov" MCA overflow recovery support */ #define X86_FEATURE_SUCCOR (17*32+ 1) /* "succor" Uncorrectable error containment and recovery */ + #define X86_FEATURE_SMCA (17*32+ 3) /* "smca" Scalable MCA */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ diff --git a/tools/arch/x86/include/asm/insn.h b/tools/arch/x86/include/asm/insn.h index c683d609934b..8f10f2943370 100644 --- a/tools/arch/x86/include/asm/insn.h +++ b/tools/arch/x86/include/asm/insn.h @@ -312,7 +312,6 @@ static inline int insn_offset_immediate(struct insn *insn) /** * for_each_insn_prefix() -- Iterate prefixes in the instruction * @insn: Pointer to struct insn. - * @idx: Index storage. * @prefix: Prefix byte. * * Iterate prefix bytes of given @insn. Each prefix byte is stored in @prefix @@ -321,8 +320,8 @@ static inline int insn_offset_immediate(struct insn *insn) * Since prefixes.nbytes can be bigger than 4 if some prefixes * are repeated, it cannot be used for looping over the prefixes. 
*/ -#define for_each_insn_prefix(insn, idx, prefix) \ - for (idx = 0; idx < ARRAY_SIZE(insn->prefixes.bytes) && (prefix = insn->prefixes.bytes[idx]) != 0; idx++) +#define for_each_insn_prefix(insn, prefix) \ + for (int idx = 0; idx < ARRAY_SIZE(insn->prefixes.bytes) && (prefix = insn->prefixes.bytes[idx]) != 0; idx++) #define POP_SS_OPCODE 0x1f #define MOV_SREG_OPCODE 0x8e diff --git a/tools/arch/x86/tools/gen-cpu-feature-names-x86.awk b/tools/arch/x86/tools/gen-cpu-feature-names-x86.awk new file mode 100644 index 000000000000..cc4c7a3e6c2e --- /dev/null +++ b/tools/arch/x86/tools/gen-cpu-feature-names-x86.awk @@ -0,0 +1,34 @@ +#!/bin/awk -f +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (c) 2025, Oracle and/or its affiliates. +# +# Usage: awk -f gen-cpu-feature-names-x86.awk cpufeatures.h > cpu-feature-names.c +# + +BEGIN { + print "/* cpu feature name array generated from cpufeatures.h */" + print "/* Do not change this code. */" + print + print "static const char *cpu_feature_names[(NCAPINTS+NBUGINTS)*32] = {" + + value_expr = "\\([0-9*+ ]+\\)" +} + +/^#define X86_FEATURE_/ { + if (match($0, value_expr)) { + value = substr($0, RSTART + 1, RLENGTH - 2) + print "\t[" value "] = \"" $2 "\"," + } +} + +/^#define X86_BUG_/ { + if (match($0, value_expr)) { + value = substr($0, RSTART + 1, RLENGTH - 2) + print "\t[NCAPINTS*32+(" value ")] = \"" $2 "\"," + } +} + +END { + print "};" +} diff --git a/tools/bpf/Makefile b/tools/bpf/Makefile index 062bbd6cd048..fd2585af1252 100644 --- a/tools/bpf/Makefile +++ b/tools/bpf/Makefile @@ -32,7 +32,7 @@ FEATURE_TESTS = libbfd disassembler-four-args disassembler-init-styled FEATURE_DISPLAY = libbfd check_feat := 1 -NON_CHECK_FEAT_TARGETS := clean bpftool_clean runqslower_clean resolve_btfids_clean +NON_CHECK_FEAT_TARGETS := clean bpftool_clean resolve_btfids_clean ifdef MAKECMDGOALS ifeq ($(filter-out $(NON_CHECK_FEAT_TARGETS),$(MAKECMDGOALS)),) check_feat := 0 @@ -70,7 +70,7 @@ $(OUTPUT)%.lex.o: $(OUTPUT)%.lex.c PROGS = 
$(OUTPUT)bpf_jit_disasm $(OUTPUT)bpf_dbg $(OUTPUT)bpf_asm -all: $(PROGS) bpftool runqslower +all: $(PROGS) bpftool $(OUTPUT)bpf_jit_disasm: CFLAGS += -DPACKAGE='bpf_jit_disasm' $(OUTPUT)bpf_jit_disasm: $(OUTPUT)bpf_jit_disasm.o @@ -86,7 +86,7 @@ $(OUTPUT)bpf_exp.lex.c: $(OUTPUT)bpf_exp.yacc.c $(OUTPUT)bpf_exp.yacc.o: $(OUTPUT)bpf_exp.yacc.c $(OUTPUT)bpf_exp.lex.o: $(OUTPUT)bpf_exp.lex.c -clean: bpftool_clean runqslower_clean resolve_btfids_clean +clean: bpftool_clean resolve_btfids_clean $(call QUIET_CLEAN, bpf-progs) $(Q)$(RM) -r -- $(OUTPUT)*.o $(OUTPUT)bpf_jit_disasm $(OUTPUT)bpf_dbg \ $(OUTPUT)bpf_asm $(OUTPUT)bpf_exp.yacc.* $(OUTPUT)bpf_exp.lex.* @@ -112,12 +112,6 @@ bpftool_install: bpftool_clean: $(call descend,bpftool,clean) -runqslower: - $(call descend,runqslower) - -runqslower_clean: - $(call descend,runqslower,clean) - resolve_btfids: $(call descend,resolve_btfids) @@ -125,5 +119,4 @@ resolve_btfids_clean: $(call descend,resolve_btfids,clean) .PHONY: all install clean bpftool bpftool_install bpftool_clean \ - runqslower runqslower_clean \ resolve_btfids resolve_btfids_clean diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 252e4c538edb..1af3305ea2b2 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -55,7 +55,8 @@ MAP COMMANDS | | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash** | | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage** | | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage** -| | **task_storage** | **bloom_filter** | **user_ringbuf** | **cgrp_storage** | **arena** } +| | **task_storage** | **bloom_filter** | **user_ringbuf** | **cgrp_storage** | **arena** +| | **insn_array** } DESCRIPTION =========== diff --git a/tools/bpf/bpftool/btf_dumper.c b/tools/bpf/bpftool/btf_dumper.c index ff12628593ae..def297e879f4 100644 
--- a/tools/bpf/bpftool/btf_dumper.c +++ b/tools/bpf/bpftool/btf_dumper.c @@ -590,7 +590,7 @@ static int btf_dumper_do_type(const struct btf_dumper *d, __u32 type_id, case BTF_KIND_DATASEC: return btf_dumper_datasec(d, type_id, data); default: - jsonw_printf(d->jw, "(unsupported-kind"); + jsonw_printf(d->jw, "(unsupported-kind)"); return -EINVAL; } } diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index c9de44a45778..7ebf7dbcfba4 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -1477,7 +1477,8 @@ static int do_help(int argc, char **argv) " devmap | devmap_hash | sockmap | cpumap | xskmap | sockhash |\n" " cgroup_storage | reuseport_sockarray | percpu_cgroup_storage |\n" " queue | stack | sk_storage | struct_ops | ringbuf | inode_storage |\n" - " task_storage | bloom_filter | user_ringbuf | cgrp_storage | arena }\n" + " task_storage | bloom_filter | user_ringbuf | cgrp_storage | arena |\n" + " insn_array }\n" " " HELP_SPEC_OPTIONS " |\n" " {-f|--bpffs} | {-n|--nomount} }\n" "", diff --git a/tools/bpf/bpftool/sign.c b/tools/bpf/bpftool/sign.c index b34f74d210e9..f9b742f4bb10 100644 --- a/tools/bpf/bpftool/sign.c +++ b/tools/bpf/bpftool/sign.c @@ -28,6 +28,12 @@ #define OPEN_SSL_ERR_BUF_LEN 256 +/* Use deprecated in 3.0 ERR_get_error_line_data for openssl < 3 */ +#if !defined(OPENSSL_VERSION_MAJOR) || (OPENSSL_VERSION_MAJOR < 3) +#define ERR_get_error_all(file, line, func, data, flags) \ + ERR_get_error_line_data(file, line, data, flags) +#endif + static void display_openssl_errors(int l) { char buf[OPEN_SSL_ERR_BUF_LEN]; diff --git a/tools/bpf/runqslower/Makefile b/tools/bpf/runqslower/Makefile deleted file mode 100644 index 78a436c4072e..000000000000 --- a/tools/bpf/runqslower/Makefile +++ /dev/null @@ -1,91 +0,0 @@ -# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) -include ../../scripts/Makefile.include - -OUTPUT ?= $(abspath .output)/ - -BPFTOOL_OUTPUT := $(OUTPUT)bpftool/ -DEFAULT_BPFTOOL := 
$(BPFTOOL_OUTPUT)bootstrap/bpftool -BPFTOOL ?= $(DEFAULT_BPFTOOL) -BPF_TARGET_ENDIAN ?= --target=bpf -LIBBPF_SRC := $(abspath ../../lib/bpf) -BPFOBJ_OUTPUT := $(OUTPUT)libbpf/ -BPFOBJ := $(BPFOBJ_OUTPUT)libbpf.a -BPF_DESTDIR := $(BPFOBJ_OUTPUT) -BPF_INCLUDE := $(BPF_DESTDIR)/include -INCLUDES := -I$(OUTPUT) -I$(BPF_INCLUDE) -I$(abspath ../../include/uapi) -CFLAGS := -g -Wall $(CLANG_CROSS_FLAGS) -CFLAGS += $(EXTRA_CFLAGS) -LDFLAGS += $(EXTRA_LDFLAGS) -LDLIBS += -lelf -lz - -# Try to detect best kernel BTF source -KERNEL_REL := $(shell uname -r) -VMLINUX_BTF_PATHS := $(if $(O),$(O)/vmlinux) \ - $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \ - ../../../vmlinux /sys/kernel/btf/vmlinux \ - /boot/vmlinux-$(KERNEL_REL) -VMLINUX_BTF_PATH := $(or $(VMLINUX_BTF),$(firstword \ - $(wildcard $(VMLINUX_BTF_PATHS)))) - -ifneq ($(V),1) -MAKEFLAGS += --no-print-directory -submake_extras := feature_display=0 -endif - -.DELETE_ON_ERROR: - -.PHONY: all clean runqslower libbpf_hdrs -all: runqslower - -runqslower: $(OUTPUT)/runqslower - -clean: - $(call QUIET_CLEAN, runqslower) - $(Q)$(RM) -r $(BPFOBJ_OUTPUT) $(BPFTOOL_OUTPUT) - $(Q)$(RM) $(OUTPUT)*.o $(OUTPUT)*.d - $(Q)$(RM) $(OUTPUT)*.skel.h $(OUTPUT)vmlinux.h - $(Q)$(RM) $(OUTPUT)runqslower - $(Q)$(RM) -r .output - -libbpf_hdrs: $(BPFOBJ) - -$(OUTPUT)/runqslower: $(OUTPUT)/runqslower.o $(BPFOBJ) - $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $^ $(LDLIBS) -o $@ - -$(OUTPUT)/runqslower.o: runqslower.h $(OUTPUT)/runqslower.skel.h \ - $(OUTPUT)/runqslower.bpf.o | libbpf_hdrs - -$(OUTPUT)/runqslower.bpf.o: $(OUTPUT)/vmlinux.h runqslower.h | libbpf_hdrs - -$(OUTPUT)/%.skel.h: $(OUTPUT)/%.bpf.o | $(BPFTOOL) - $(QUIET_GEN)$(BPFTOOL) gen skeleton $< > $@ - -$(OUTPUT)/%.bpf.o: %.bpf.c $(BPFOBJ) | $(OUTPUT) - $(QUIET_GEN)$(CLANG) -g -O2 $(BPF_TARGET_ENDIAN) $(INCLUDES) \ - -c $(filter %.c,$^) -o $@ && \ - $(LLVM_STRIP) -g $@ - -$(OUTPUT)/%.o: %.c | $(OUTPUT) - $(QUIET_CC)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@ - -$(OUTPUT) 
$(BPFOBJ_OUTPUT) $(BPFTOOL_OUTPUT): - $(QUIET_MKDIR)mkdir -p $@ - -$(OUTPUT)/vmlinux.h: $(VMLINUX_BTF_PATH) | $(OUTPUT) $(BPFTOOL) -ifeq ($(VMLINUX_H),) - $(Q)if [ ! -e "$(VMLINUX_BTF_PATH)" ] ; then \ - echo "Couldn't find kernel BTF; set VMLINUX_BTF to" \ - "specify its location." >&2; \ - exit 1;\ - fi - $(QUIET_GEN)$(BPFTOOL) btf dump file $(VMLINUX_BTF_PATH) format c > $@ -else - $(Q)cp "$(VMLINUX_H)" $@ -endif - -$(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(BPFOBJ_OUTPUT) - $(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) OUTPUT=$(BPFOBJ_OUTPUT) \ - DESTDIR=$(BPFOBJ_OUTPUT) prefix= $(abspath $@) install_headers - -$(DEFAULT_BPFTOOL): | $(BPFTOOL_OUTPUT) - $(Q)$(MAKE) $(submake_extras) -C ../bpftool OUTPUT=$(BPFTOOL_OUTPUT) bootstrap diff --git a/tools/bpf/runqslower/runqslower.bpf.c b/tools/bpf/runqslower/runqslower.bpf.c deleted file mode 100644 index fced54a3adf6..000000000000 --- a/tools/bpf/runqslower/runqslower.bpf.c +++ /dev/null @@ -1,106 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -// Copyright (c) 2019 Facebook -#include "vmlinux.h" -#include <bpf/bpf_helpers.h> -#include "runqslower.h" - -#define TASK_RUNNING 0 -#define BPF_F_CURRENT_CPU 0xffffffffULL - -const volatile __u64 min_us = 0; -const volatile pid_t targ_pid = 0; - -struct { - __uint(type, BPF_MAP_TYPE_TASK_STORAGE); - __uint(map_flags, BPF_F_NO_PREALLOC); - __type(key, int); - __type(value, u64); -} start SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); - __uint(key_size, sizeof(u32)); - __uint(value_size, sizeof(u32)); -} events SEC(".maps"); - -/* record enqueue timestamp */ -__always_inline -static int trace_enqueue(struct task_struct *t) -{ - u32 pid = t->pid; - u64 *ptr; - - if (!pid || (targ_pid && targ_pid != pid)) - return 0; - - ptr = bpf_task_storage_get(&start, t, 0, - BPF_LOCAL_STORAGE_GET_F_CREATE); - if (!ptr) - return 0; - - *ptr = bpf_ktime_get_ns(); - return 0; -} - -SEC("tp_btf/sched_wakeup") -int handle__sched_wakeup(u64 
*ctx) -{ - /* TP_PROTO(struct task_struct *p) */ - struct task_struct *p = (void *)ctx[0]; - - return trace_enqueue(p); -} - -SEC("tp_btf/sched_wakeup_new") -int handle__sched_wakeup_new(u64 *ctx) -{ - /* TP_PROTO(struct task_struct *p) */ - struct task_struct *p = (void *)ctx[0]; - - return trace_enqueue(p); -} - -SEC("tp_btf/sched_switch") -int handle__sched_switch(u64 *ctx) -{ - /* TP_PROTO(bool preempt, struct task_struct *prev, - * struct task_struct *next) - */ - struct task_struct *prev = (struct task_struct *)ctx[1]; - struct task_struct *next = (struct task_struct *)ctx[2]; - struct runq_event event = {}; - u64 *tsp, delta_us; - u32 pid; - - /* ivcsw: treat like an enqueue event and store timestamp */ - if (prev->__state == TASK_RUNNING) - trace_enqueue(prev); - - pid = next->pid; - - /* For pid mismatch, save a bpf_task_storage_get */ - if (!pid || (targ_pid && targ_pid != pid)) - return 0; - - /* fetch timestamp and calculate delta */ - tsp = bpf_task_storage_get(&start, next, 0, 0); - if (!tsp) - return 0; /* missed enqueue */ - - delta_us = (bpf_ktime_get_ns() - *tsp) / 1000; - if (min_us && delta_us <= min_us) - return 0; - - event.pid = pid; - event.delta_us = delta_us; - bpf_get_current_comm(&event.task, sizeof(event.task)); - - /* output */ - bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, - &event, sizeof(event)); - - bpf_task_storage_delete(&start, next); - return 0; -} - -char LICENSE[] SEC("license") = "GPL"; diff --git a/tools/bpf/runqslower/runqslower.c b/tools/bpf/runqslower/runqslower.c deleted file mode 100644 index 83c5993a139a..000000000000 --- a/tools/bpf/runqslower/runqslower.c +++ /dev/null @@ -1,171 +0,0 @@ -// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) -// Copyright (c) 2019 Facebook -#include <argp.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <time.h> -#include <bpf/libbpf.h> -#include <bpf/bpf.h> -#include "runqslower.h" -#include "runqslower.skel.h" - -struct env { - pid_t pid; - 
__u64 min_us; - bool verbose; -} env = { - .min_us = 10000, -}; - -const char *argp_program_version = "runqslower 0.1"; -const char *argp_program_bug_address = "<bpf@vger.kernel.org>"; -const char argp_program_doc[] = -"runqslower Trace long process scheduling delays.\n" -" For Linux, uses eBPF, BPF CO-RE, libbpf, BTF.\n" -"\n" -"This script traces high scheduling delays between tasks being\n" -"ready to run and them running on CPU after that.\n" -"\n" -"USAGE: runqslower [-p PID] [min_us]\n" -"\n" -"EXAMPLES:\n" -" runqslower # trace run queue latency higher than 10000 us (default)\n" -" runqslower 1000 # trace run queue latency higher than 1000 us\n" -" runqslower -p 123 # trace pid 123 only\n"; - -static const struct argp_option opts[] = { - { "pid", 'p', "PID", 0, "Process PID to trace"}, - { "verbose", 'v', NULL, 0, "Verbose debug output" }, - {}, -}; - -static error_t parse_arg(int key, char *arg, struct argp_state *state) -{ - static int pos_args; - int pid; - long long min_us; - - switch (key) { - case 'v': - env.verbose = true; - break; - case 'p': - errno = 0; - pid = strtol(arg, NULL, 10); - if (errno || pid <= 0) { - fprintf(stderr, "Invalid PID: %s\n", arg); - argp_usage(state); - } - env.pid = pid; - break; - case ARGP_KEY_ARG: - if (pos_args++) { - fprintf(stderr, - "Unrecognized positional argument: %s\n", arg); - argp_usage(state); - } - errno = 0; - min_us = strtoll(arg, NULL, 10); - if (errno || min_us <= 0) { - fprintf(stderr, "Invalid delay (in us): %s\n", arg); - argp_usage(state); - } - env.min_us = min_us; - break; - default: - return ARGP_ERR_UNKNOWN; - } - return 0; -} - -int libbpf_print_fn(enum libbpf_print_level level, - const char *format, va_list args) -{ - if (level == LIBBPF_DEBUG && !env.verbose) - return 0; - return vfprintf(stderr, format, args); -} - -void handle_event(void *ctx, int cpu, void *data, __u32 data_sz) -{ - const struct runq_event *e = data; - struct tm *tm; - char ts[32]; - time_t t; - - time(&t); - tm = 
localtime(&t); - strftime(ts, sizeof(ts), "%H:%M:%S", tm); - printf("%-8s %-16s %-6d %14llu\n", ts, e->task, e->pid, e->delta_us); -} - -void handle_lost_events(void *ctx, int cpu, __u64 lost_cnt) -{ - printf("Lost %llu events on CPU #%d!\n", lost_cnt, cpu); -} - -int main(int argc, char **argv) -{ - static const struct argp argp = { - .options = opts, - .parser = parse_arg, - .doc = argp_program_doc, - }; - struct perf_buffer *pb = NULL; - struct runqslower_bpf *obj; - int err; - - err = argp_parse(&argp, argc, argv, 0, NULL, NULL); - if (err) - return err; - - libbpf_set_print(libbpf_print_fn); - - /* Use libbpf 1.0 API mode */ - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - - obj = runqslower_bpf__open(); - if (!obj) { - fprintf(stderr, "failed to open and/or load BPF object\n"); - return 1; - } - - /* initialize global data (filtering options) */ - obj->rodata->targ_pid = env.pid; - obj->rodata->min_us = env.min_us; - - err = runqslower_bpf__load(obj); - if (err) { - fprintf(stderr, "failed to load BPF object: %d\n", err); - goto cleanup; - } - - err = runqslower_bpf__attach(obj); - if (err) { - fprintf(stderr, "failed to attach BPF programs\n"); - goto cleanup; - } - - printf("Tracing run queue latency higher than %llu us\n", env.min_us); - printf("%-8s %-16s %-6s %14s\n", "TIME", "COMM", "PID", "LAT(us)"); - - pb = perf_buffer__new(bpf_map__fd(obj->maps.events), 64, - handle_event, handle_lost_events, NULL, NULL); - err = libbpf_get_error(pb); - if (err) { - pb = NULL; - fprintf(stderr, "failed to open perf buffer: %d\n", err); - goto cleanup; - } - - while ((err = perf_buffer__poll(pb, 100)) >= 0) - ; - printf("Error polling perf buffer: %d\n", err); - -cleanup: - perf_buffer__free(pb); - runqslower_bpf__destroy(obj); - - return err != 0; -} diff --git a/tools/bpf/runqslower/runqslower.h b/tools/bpf/runqslower/runqslower.h deleted file mode 100644 index 4f70f07200c2..000000000000 --- a/tools/bpf/runqslower/runqslower.h +++ /dev/null @@ -1,13 +0,0 @@ -/* 
SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ -#ifndef __RUNQSLOWER_H -#define __RUNQSLOWER_H - -#define TASK_COMM_LEN 16 - -struct runq_event { - char task[TASK_COMM_LEN]; - __u64 delta_us; - pid_t pid; -}; - -#endif /* __RUNQSLOWER_H */ diff --git a/tools/build/Build b/tools/build/Build new file mode 100644 index 000000000000..1c7e598e9f59 --- /dev/null +++ b/tools/build/Build @@ -0,0 +1,2 @@ +hostprogs := fixdep +fixdep-y := fixdep.o diff --git a/tools/build/Makefile b/tools/build/Makefile index 63ef21878761..3a5a3808ab2a 100644 --- a/tools/build/Makefile +++ b/tools/build/Makefile @@ -37,5 +37,22 @@ ifneq ($(wildcard $(TMP_O)),) $(Q)$(MAKE) -C feature OUTPUT=$(TMP_O) clean >/dev/null endif -$(OUTPUT)fixdep: $(srctree)/tools/build/fixdep.c - $(QUIET_CC)$(HOSTCC) $(KBUILD_HOSTCFLAGS) $(KBUILD_HOSTLDFLAGS) -o $@ $< +FIXDEP := $(OUTPUT)fixdep +FIXDEP_IN := $(OUTPUT)fixdep-in.o + +# To track fixdep's dependencies properly, fixdep needs to run on itself. +# Build it twice the first time. +$(FIXDEP_IN): FORCE + $(Q)if [ ! 
-f $(FIXDEP) ]; then \ + $(MAKE) $(build)=fixdep HOSTCFLAGS="$(KBUILD_HOSTCFLAGS)"; \ + rm -f $(FIXDEP).o; \ + fi + $(Q)$(MAKE) $(build)=fixdep HOSTCFLAGS="$(KBUILD_HOSTCFLAGS)" + + +$(FIXDEP): $(FIXDEP_IN) + $(QUIET_LINK)$(HOSTCC) $(FIXDEP_IN) $(KBUILD_HOSTLDFLAGS) -o $@ + +FORCE: + +.PHONY: FORCE diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature index 32bbe29fe5f6..300a329bc581 100644 --- a/tools/build/Makefile.feature +++ b/tools/build/Makefile.feature @@ -315,5 +315,7 @@ endef ifeq ($(FEATURE_DISPLAY_DEFERRED),) $(call feature_display_entries) - $(info ) + ifeq ($(feature_display),1) + $(info ) + endif endif diff --git a/tools/dma/.gitignore b/tools/dma/.gitignore new file mode 100644 index 000000000000..94b68cf4147b --- /dev/null +++ b/tools/dma/.gitignore @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only +dma_map_benchmark +include/linux/map_benchmark.h diff --git a/tools/dma/Makefile b/tools/dma/Makefile new file mode 100644 index 000000000000..e4abf37bf020 --- /dev/null +++ b/tools/dma/Makefile @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: GPL-2.0 +include ../scripts/Makefile.include + +bindir ?= /usr/bin + +# This will work when dma is built in tools env. where srctree +# isn't set and when invoked from selftests build, where srctree +# is set to ".". 
building_out_of_srctree is undefined for in srctree +# builds +ifndef building_out_of_srctree +srctree := $(patsubst %/,%,$(dir $(CURDIR))) +srctree := $(patsubst %/,%,$(dir $(srctree))) +endif + +# Do not use make's built-in rules +# (this improves performance and avoids hard-to-debug behaviour); +MAKEFLAGS += -r + +override CFLAGS += -O2 -Wall -g -D_GNU_SOURCE -I$(OUTPUT)include + +ALL_TARGETS := dma_map_benchmark +ALL_PROGRAMS := $(patsubst %,$(OUTPUT)%,$(ALL_TARGETS)) + +all: $(ALL_PROGRAMS) + +export srctree OUTPUT CC LD CFLAGS +include $(srctree)/tools/build/Makefile.include + +# +# We need the following to be outside of kernel tree +# +$(OUTPUT)include/linux/map_benchmark.h: ../../include/uapi/linux/map_benchmark.h + mkdir -p $(OUTPUT)include/linux 2>&1 || true + ln -sf $(CURDIR)/../../include/uapi/linux/map_benchmark.h $@ + +prepare: $(OUTPUT)include/linux/map_benchmark.h + +FORCE: + +DMA_MAP_BENCHMARK = dma_map_benchmark +$(DMA_MAP_BENCHMARK): prepare FORCE + $(CC) $(CFLAGS) $(DMA_MAP_BENCHMARK).c -o $(DMA_MAP_BENCHMARK) + +clean: + rm -f $(ALL_PROGRAMS) + rm -rf $(OUTPUT)include + find $(or $(OUTPUT),.) 
-name '*.o' -delete -o -name '\.*.d' -delete -o -name '\.*.cmd' -delete + +install: $(ALL_PROGRAMS) + install -d -m 755 $(DESTDIR)$(bindir); \ + for program in $(ALL_PROGRAMS); do \ + install $$program $(DESTDIR)$(bindir); \ + done + +.PHONY: all install clean prepare FORCE diff --git a/tools/testing/selftests/dma/config b/tools/dma/config index 6102ee3c43cd..6102ee3c43cd 100644 --- a/tools/testing/selftests/dma/config +++ b/tools/dma/config diff --git a/tools/testing/selftests/dma/dma_map_benchmark.c b/tools/dma/dma_map_benchmark.c index b12f1f9babf8..dd0ed528e6df 100644 --- a/tools/testing/selftests/dma/dma_map_benchmark.c +++ b/tools/dma/dma_map_benchmark.c @@ -10,7 +10,6 @@ #include <unistd.h> #include <sys/ioctl.h> #include <sys/mman.h> -#include <linux/types.h> #include <linux/map_benchmark.h> #define NSEC_PER_MSEC 1000000L @@ -118,7 +117,7 @@ int main(int argc, char **argv) } printf("dma mapping benchmark: threads:%d seconds:%d node:%d dir:%s granule: %d\n", - threads, seconds, node, dir[directions], granule); + threads, seconds, node, directions[dir], granule); printf("average map latency(us):%.1f standard deviation:%.1f\n", map.avg_map_100ns/10.0, map.map_stddev/10.0); printf("average unmap latency(us):%.1f standard deviation:%.1f\n", diff --git a/tools/docs/check-variable-fonts.py b/tools/docs/check-variable-fonts.py new file mode 100755 index 000000000000..958d5a745724 --- /dev/null +++ b/tools/docs/check-variable-fonts.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0-only +# Copyright (C) Akira Yokosawa, 2024 +# +# Ported to Python by (c) Mauro Carvalho Chehab, 2025 +# +# pylint: disable=C0103 + +""" +Detect problematic Noto CJK variable fonts. + +or more details, see .../tools/lib/python/kdoc/latex_fonts.py. 
+""" + +import argparse +import sys +import os.path + +src_dir = os.path.dirname(os.path.realpath(__file__)) +sys.path.insert(0, os.path.join(src_dir, '../lib/python')) + +from kdoc.latex_fonts import LatexFontChecker + +checker = LatexFontChecker() + +parser=argparse.ArgumentParser(description=checker.description(), + formatter_class=argparse.RawTextHelpFormatter) +parser.add_argument("--deny-vf", + help="XDG_CONFIG_HOME dir containing fontconfig/fonts.conf file") + +args=parser.parse_args() + +msg = LatexFontChecker(args.deny_vf).check() +if msg: + print(msg) + +sys.exit(1) diff --git a/tools/docs/checktransupdate.py b/tools/docs/checktransupdate.py new file mode 100755 index 000000000000..e894652369a5 --- /dev/null +++ b/tools/docs/checktransupdate.py @@ -0,0 +1,307 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +""" +This script helps track the translation status of the documentation +in different locales, e.g., zh_CN. More specially, it uses `git log` +commit to find the latest english commit from the translation commit +(order by author date) and the latest english commits from HEAD. If +differences occur, report the file and commits that need to be updated. + +The usage is as follows: +- tools/docs/checktransupdate.py -l zh_CN +This will print all the files that need to be updated or translated in the zh_CN locale. +- tools/docs/checktransupdate.py Documentation/translations/zh_CN/dev-tools/testing-overview.rst +This will only print the status of the specified file. 
+ +The output is something like: +Documentation/dev-tools/kfence.rst +No translation in the locale of zh_CN + +Documentation/translations/zh_CN/dev-tools/testing-overview.rst +commit 42fb9cfd5b18 ("Documentation: dev-tools: Add link to RV docs") +1 commits needs resolving in total +""" + +import os +import re +import time +import logging +from argparse import ArgumentParser, ArgumentTypeError, BooleanOptionalAction +from datetime import datetime + + +def get_origin_path(file_path): + """Get the origin path from the translation path""" + paths = file_path.split("/") + tidx = paths.index("translations") + opaths = paths[:tidx] + opaths += paths[tidx + 2 :] + return "/".join(opaths) + + +def get_latest_commit_from(file_path, commit): + """Get the latest commit from the specified commit for the specified file""" + command = f"git log --pretty=format:%H%n%aD%n%cD%n%n%B {commit} -1 -- {file_path}" + logging.debug(command) + pipe = os.popen(command) + result = pipe.read() + result = result.split("\n") + if len(result) <= 1: + return None + + logging.debug("Result: %s", result[0]) + + return { + "hash": result[0], + "author_date": datetime.strptime(result[1], "%a, %d %b %Y %H:%M:%S %z"), + "commit_date": datetime.strptime(result[2], "%a, %d %b %Y %H:%M:%S %z"), + "message": result[4:], + } + + +def get_origin_from_trans(origin_path, t_from_head): + """Get the latest origin commit from the translation commit""" + o_from_t = get_latest_commit_from(origin_path, t_from_head["hash"]) + while o_from_t is not None and o_from_t["author_date"] > t_from_head["author_date"]: + o_from_t = get_latest_commit_from(origin_path, o_from_t["hash"] + "^") + if o_from_t is not None: + logging.debug("tracked origin commit id: %s", o_from_t["hash"]) + return o_from_t + + +def get_origin_from_trans_smartly(origin_path, t_from_head): + """Get the latest origin commit from the formatted translation commit: + (1) update to commit HASH (TITLE) + (2) Update the translation through commit HASH (TITLE) 
+ """ + # catch flag for 12-bit commit hash + HASH = r'([0-9a-f]{12})' + # pattern 1: contains "update to commit HASH" + pat_update_to = re.compile(rf'update to commit {HASH}') + # pattern 2: contains "Update the translation through commit HASH" + pat_update_translation = re.compile(rf'Update the translation through commit {HASH}') + + origin_commit_hash = None + for line in t_from_head["message"]: + # check if the line matches the first pattern + match = pat_update_to.search(line) + if match: + origin_commit_hash = match.group(1) + break + # check if the line matches the second pattern + match = pat_update_translation.search(line) + if match: + origin_commit_hash = match.group(1) + break + if origin_commit_hash is None: + return None + o_from_t = get_latest_commit_from(origin_path, origin_commit_hash) + if o_from_t is not None: + logging.debug("tracked origin commit id: %s", o_from_t["hash"]) + return o_from_t + + +def get_commits_count_between(opath, commit1, commit2): + """Get the commits count between two commits for the specified file""" + command = f"git log --pretty=format:%H {commit1}...{commit2} -- {opath}" + logging.debug(command) + pipe = os.popen(command) + result = pipe.read().split("\n") + # filter out empty lines + result = list(filter(lambda x: x != "", result)) + return result + + +def pretty_output(commit): + """Pretty print the commit message""" + command = f"git log --pretty='format:%h (\"%s\")' -1 {commit}" + logging.debug(command) + pipe = os.popen(command) + return pipe.read() + + +def valid_commit(commit): + """Check if the commit is valid or not""" + msg = pretty_output(commit) + return "Merge tag" not in msg + +def check_per_file(file_path): + """Check the translation status for the specified file""" + opath = get_origin_path(file_path) + + if not os.path.isfile(opath): + logging.error("Cannot find the origin path for {file_path}") + return + + o_from_head = get_latest_commit_from(opath, "HEAD") + t_from_head = 
get_latest_commit_from(file_path, "HEAD") + + if o_from_head is None or t_from_head is None: + logging.error("Cannot find the latest commit for %s", file_path) + return + + o_from_t = get_origin_from_trans_smartly(opath, t_from_head) + # notice, o_from_t from get_*_smartly() is always more accurate than from get_*() + if o_from_t is None: + o_from_t = get_origin_from_trans(opath, t_from_head) + + if o_from_t is None: + logging.error("Error: Cannot find the latest origin commit for %s", file_path) + return + + if o_from_head["hash"] == o_from_t["hash"]: + logging.debug("No update needed for %s", file_path) + else: + logging.info(file_path) + commits = get_commits_count_between( + opath, o_from_t["hash"], o_from_head["hash"] + ) + count = 0 + for commit in commits: + if valid_commit(commit): + logging.info("commit %s", pretty_output(commit)) + count += 1 + logging.info("%d commits needs resolving in total\n", count) + + +def valid_locales(locale): + """Check if the locale is valid or not""" + script_path = os.path.dirname(os.path.abspath(__file__)) + linux_path = os.path.join(script_path, "../..") + if not os.path.isdir(f"{linux_path}/Documentation/translations/{locale}"): + raise ArgumentTypeError("Invalid locale: {locale}") + return locale + + +def list_files_with_excluding_folders(folder, exclude_folders, include_suffix): + """List all files with the specified suffix in the folder and its subfolders""" + files = [] + stack = [folder] + + while stack: + pwd = stack.pop() + # filter out the exclude folders + if os.path.basename(pwd) in exclude_folders: + continue + # list all files and folders + for item in os.listdir(pwd): + ab_item = os.path.join(pwd, item) + if os.path.isdir(ab_item): + stack.append(ab_item) + else: + if ab_item.endswith(include_suffix): + files.append(ab_item) + + return files + + +class DmesgFormatter(logging.Formatter): + """Custom dmesg logging formatter""" + def format(self, record): + timestamp = time.time() + formatted_time = 
f"[{timestamp:>10.6f}]" + log_message = f"{formatted_time} {record.getMessage()}" + return log_message + + +def config_logging(log_level, log_file="checktransupdate.log"): + """configure logging based on the log level""" + # set up the root logger + logger = logging.getLogger() + logger.setLevel(log_level) + + # Create console handler + console_handler = logging.StreamHandler() + console_handler.setLevel(log_level) + + # Create file handler + file_handler = logging.FileHandler(log_file) + file_handler.setLevel(log_level) + + # Create formatter and add it to the handlers + formatter = DmesgFormatter() + console_handler.setFormatter(formatter) + file_handler.setFormatter(formatter) + + # Add the handler to the logger + logger.addHandler(console_handler) + logger.addHandler(file_handler) + + +def main(): + """Main function of the script""" + script_path = os.path.dirname(os.path.abspath(__file__)) + linux_path = os.path.join(script_path, "../..") + + parser = ArgumentParser(description="Check the translation update") + parser.add_argument( + "-l", + "--locale", + default="zh_CN", + type=valid_locales, + help="Locale to check when files are not specified", + ) + + parser.add_argument( + "--print-missing-translations", + action=BooleanOptionalAction, + default=True, + help="Print files that do not have translations", + ) + + parser.add_argument( + '--log', + default='INFO', + choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], + help='Set the logging level') + + parser.add_argument( + '--logfile', + default='checktransupdate.log', + help='Set the logging file (default: checktransupdate.log)') + + parser.add_argument( + "files", nargs="*", help="Files to check, if not specified, check all files" + ) + args = parser.parse_args() + + # Configure logging based on the --log argument + log_level = getattr(logging, args.log.upper(), logging.INFO) + config_logging(log_level) + + # Get files related to linux path + files = args.files + if len(files) == 0: + offical_files 
= list_files_with_excluding_folders( + os.path.join(linux_path, "Documentation"), ["translations", "output"], "rst" + ) + + for file in offical_files: + # split the path into parts + path_parts = file.split(os.sep) + # find the index of the "Documentation" directory + kindex = path_parts.index("Documentation") + # insert the translations and locale after the Documentation directory + new_path_parts = path_parts[:kindex + 1] + ["translations", args.locale] \ + + path_parts[kindex + 1 :] + # join the path parts back together + new_file = os.sep.join(new_path_parts) + if os.path.isfile(new_file): + files.append(new_file) + else: + if args.print_missing_translations: + logging.info(os.path.relpath(os.path.abspath(file), linux_path)) + logging.info("No translation in the locale of %s\n", args.locale) + + files = list(map(lambda x: os.path.relpath(os.path.abspath(x), linux_path), files)) + + # cd to linux root directory + os.chdir(linux_path) + + for file in files: + check_per_file(file) + + +if __name__ == "__main__": + main() diff --git a/tools/docs/documentation-file-ref-check b/tools/docs/documentation-file-ref-check new file mode 100755 index 000000000000..0cad42f6943b --- /dev/null +++ b/tools/docs/documentation-file-ref-check @@ -0,0 +1,245 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-2.0 +# +# Treewide grep for references to files under Documentation, and report +# non-existing files in stderr. + +use warnings; +use strict; +use Getopt::Long qw(:config no_auto_abbrev); + +# NOTE: only add things here when the file was gone, but the text wants +# to mention a past documentation file, for example, to give credits for +# the original work. +my %false_positives = ( + "Documentation/scsi/scsi_mid_low_api.rst" => "Documentation/Configure.help", + "drivers/vhost/vhost.c" => "Documentation/virtual/lguest/lguest.c", +); + +my $scriptname = $0; +$scriptname =~ s,tools/docs/([^/]+/),$1,; + +# Parse arguments +my $help = 0; +my $fix = 0; +my $warn = 0; + +if (! 
-e ".git") { + printf "Warning: can't check if file exists, as this is not a git tree\n"; + exit 0; +} + +GetOptions( + 'fix' => \$fix, + 'warn' => \$warn, + 'h|help|usage' => \$help, +); + +if ($help != 0) { + print "$scriptname [--help] [--fix]\n"; + exit -1; +} + +# Step 1: find broken references +print "Finding broken references. This may take a while... " if ($fix); + +my %broken_ref; + +my $doc_fix = 0; + +open IN, "git grep ':doc:\`' Documentation/|" + or die "Failed to run git grep"; +while (<IN>) { + next if (!m,^([^:]+):.*\:doc\:\`([^\`]+)\`,); + next if (m,sphinx/,); + + my $file = $1; + my $d = $1; + my $doc_ref = $2; + + my $f = $doc_ref; + + $d =~ s,(.*/).*,$1,; + $f =~ s,.*\<([^\>]+)\>,$1,; + + if ($f =~ m,^/,) { + $f = "$f.rst"; + $f =~ s,^/,Documentation/,; + } else { + $f = "$d$f.rst"; + } + + next if (grep -e, glob("$f")); + + if ($fix && !$doc_fix) { + print STDERR "\nWARNING: Currently, can't fix broken :doc:`` fields\n"; + } + $doc_fix++; + + print STDERR "$file: :doc:`$doc_ref`\n"; +} +close IN; + +open IN, "git grep 'Documentation/'|" + or die "Failed to run git grep"; +while (<IN>) { + next if (!m/^([^:]+):(.*)/); + + my $f = $1; + my $ln = $2; + + # On linux-next, discard the Next/ directory + next if ($f =~ m,^Next/,); + + # Makefiles and scripts contain nasty expressions to parse docs + next if ($f =~ m/Makefile/ || $f =~ m/\.(sh|py|pl|~|rej|org|orig)$/); + + # It doesn't make sense to parse hidden files + next if ($f =~ m#/\.#); + + # Skip this script + next if ($f eq $scriptname); + + # Ignore the dir where documentation will be built + next if ($ln =~ m,\b(\S*)Documentation/output,); + + if ($ln =~ m,\b(\S*)(Documentation/[A-Za-z0-9\_\.\,\~/\*\[\]\?+-]*)(.*),) { + my $prefix = $1; + my $ref = $2; + my $base = $2; + my $extra = $3; + + # some file references are like: + # /usr/src/linux/Documentation/DMA-{API,mapping}.txt + # For now, ignore them + next if ($extra =~ m/^{/); + + # Remove footnotes at the end like: + # 
Documentation/devicetree/dt-object-internal.txt[1] + $ref =~ s/(txt|rst)\[\d+]$/$1/; + + # Remove ending ']' without any '[' + $ref =~ s/\].*// if (!($ref =~ m/\[/)); + + # Remove puntuation marks at the end + $ref =~ s/[\,\.]+$//; + + my $fulref = "$prefix$ref"; + + $fulref =~ s/^(\<file|ref)://; + $fulref =~ s/^[\'\`]+//; + $fulref =~ s,^\$\(.*\)/,,; + $base =~ s,.*/,,; + + # Remove URL false-positives + next if ($fulref =~ m/^http/); + + # Remove sched-pelt false-positive + next if ($fulref =~ m,^Documentation/scheduler/sched-pelt$,); + + # Discard some build examples from Documentation/target/tcm_mod_builder.rst + next if ($fulref =~ m,mnt/sdb/lio-core-2.6.git/Documentation/target,); + + # Check if exists, evaluating wildcards + next if (grep -e, glob("$ref $fulref")); + + # Accept relative Documentation patches for tools/ + if ($f =~ m/tools/) { + my $path = $f; + $path =~ s,(.*)/.*,$1,; + $path =~ s,testing/selftests/bpf,bpf/bpftool,; + next if (grep -e, glob("$path/$ref $path/../$ref $path/$fulref")); + } + + # Discard known false-positives + if (defined($false_positives{$f})) { + next if ($false_positives{$f} eq $fulref); + } + + if ($fix) { + if (!($ref =~ m/(scripts|Kconfig|Kbuild)/)) { + $broken_ref{$ref}++; + } + } elsif ($warn) { + print STDERR "Warning: $f references a file that doesn't exist: $fulref\n"; + } else { + print STDERR "$f: $fulref\n"; + } + } +} +close IN; + +exit 0 if (!$fix); + +# Step 2: Seek for file name alternatives +print "Auto-fixing broken references. 
Please double-check the results\n"; + +foreach my $ref (keys %broken_ref) { + my $new =$ref; + + my $basedir = "."; + # On translations, only seek inside the translations directory + $basedir = $1 if ($ref =~ m,(Documentation/translations/[^/]+),); + + # get just the basename + $new =~ s,.*/,,; + + my $f=""; + + # usual reason for breakage: DT file moved around + if ($ref =~ /devicetree/) { + # usual reason for breakage: DT file renamed to .yaml + if (!$f) { + my $new_ref = $ref; + $new_ref =~ s/\.txt$/.yaml/; + $f=$new_ref if (-f $new_ref); + } + + if (!$f) { + my $search = $new; + $search =~ s,^.*/,,; + $f = qx(find Documentation/devicetree/ -iname "*$search*") if ($search); + if (!$f) { + # Manufacturer name may have changed + $search =~ s/^.*,//; + $f = qx(find Documentation/devicetree/ -iname "*$search*") if ($search); + } + } + } + + # usual reason for breakage: file renamed to .rst + if (!$f) { + $new =~ s/\.txt$/.rst/; + $f=qx(find $basedir -iname $new) if ($new); + } + + # usual reason for breakage: use dash or underline + if (!$f) { + $new =~ s/[-_]/[-_]/g; + $f=qx(find $basedir -iname $new) if ($new); + } + + # Wild guess: seek for the same name on another place + if (!$f) { + $f = qx(find $basedir -iname $new) if ($new); + } + + my @find = split /\s+/, $f; + + if (!$f) { + print STDERR "ERROR: Didn't find a replacement for $ref\n"; + } elsif (scalar(@find) > 1) { + print STDERR "WARNING: Won't auto-replace, as found multiple files close to $ref:\n"; + foreach my $j (@find) { + $j =~ s,^./,,; + print STDERR " $j\n"; + } + } else { + $f = $find[0]; + $f =~ s,^./,,; + print "INFO: Replacing $ref to $f\n"; + foreach my $j (qx(git grep -l $ref)) { + qx(sed "s\@$ref\@$f\@g" -i $j); + } + } +} diff --git a/tools/docs/features-refresh.sh b/tools/docs/features-refresh.sh new file mode 100755 index 000000000000..c2288124e94a --- /dev/null +++ b/tools/docs/features-refresh.sh @@ -0,0 +1,98 @@ +# +# Small script that refreshes the kernel feature support status in 
place. +# + +for F_FILE in Documentation/features/*/*/arch-support.txt; do + F=$(grep "^# Kconfig:" "$F_FILE" | cut -c26-) + + # + # Each feature F is identified by a pair (O, K), where 'O' can + # be either the empty string (for 'nop') or "not" (the logical + # negation operator '!'); other operators are not supported. + # + O="" + K=$F + if [[ "$F" == !* ]]; then + O="not" + K=$(echo $F | sed -e 's/^!//g') + fi + + # + # F := (O, K) is 'valid' iff there is a Kconfig file (for some + # arch) which contains K. + # + # Notice that this definition entails an 'asymmetry' between + # the case 'O = ""' and the case 'O = "not"'. E.g., F may be + # _invalid_ if: + # + # [case 'O = ""'] + # 1) no arch provides support for F, + # 2) K does not exist (e.g., it was renamed/mis-typed); + # + # [case 'O = "not"'] + # 3) all archs provide support for F, + # 4) as in (2). + # + # The rationale for adopting this definition (and, thus, for + # keeping the asymmetry) is: + # + # We want to be able to 'detect' (2) (or (4)). + # + # (1) and (3) may further warn the developers about the fact + # that K can be removed. + # + F_VALID="false" + for ARCH_DIR in arch/*/; do + K_FILES=$(find $ARCH_DIR -name "Kconfig*") + K_GREP=$(grep "$K" $K_FILES) + if [ ! -z "$K_GREP" ]; then + F_VALID="true" + break + fi + done + if [ "$F_VALID" = "false" ]; then + printf "WARNING: '%s' is not a valid Kconfig\n" "$F" + fi + + T_FILE="$F_FILE.tmp" + grep "^#" $F_FILE > $T_FILE + echo " -----------------------" >> $T_FILE + echo " | arch |status|" >> $T_FILE + echo " -----------------------" >> $T_FILE + for ARCH_DIR in arch/*/; do + ARCH=$(echo $ARCH_DIR | sed -e 's/^arch//g' | sed -e 's/\///g') + K_FILES=$(find $ARCH_DIR -name "Kconfig*") + K_GREP=$(grep "$K" $K_FILES) + # + # Arch support status values for (O, K) are updated according + # to the following rules. 
+ # + # - ("", K) is 'supported by a given arch', if there is a + # Kconfig file for that arch which contains K; + # + # - ("not", K) is 'supported by a given arch', if there is + # no Kconfig file for that arch which contains K; + # + # - otherwise: preserve the previous status value (if any), + # default to 'not yet supported'. + # + # Notice that, according these rules, invalid features may be + # updated/modified. + # + if [ "$O" = "" ] && [ ! -z "$K_GREP" ]; then + printf " |%12s: | ok |\n" "$ARCH" >> $T_FILE + elif [ "$O" = "not" ] && [ -z "$K_GREP" ]; then + printf " |%12s: | ok |\n" "$ARCH" >> $T_FILE + else + S=$(grep -v "^#" "$F_FILE" | grep " $ARCH:") + if [ ! -z "$S" ]; then + echo "$S" >> $T_FILE + else + printf " |%12s: | TODO |\n" "$ARCH" \ + >> $T_FILE + fi + fi + done + echo " -----------------------" >> $T_FILE + mv $T_FILE $F_FILE +done diff --git a/tools/docs/find-unused-docs.sh b/tools/docs/find-unused-docs.sh new file mode 100755 index 000000000000..05552dbda5bc --- /dev/null +++ b/tools/docs/find-unused-docs.sh @@ -0,0 +1,62 @@ +#!/bin/bash +# (c) 2017, Jonathan Corbet <corbet@lwn.net> +# sayli karnik <karniksayli1995@gmail.com> +# +# This script detects files with kernel-doc comments for exported functions +# that are not included in documentation. +# +# usage: Run 'tools/docs/find-unused-docs.sh directory' from top level of kernel +# tree. +# +# example: $tools/docs/find-unused-docs.sh drivers/scsi +# +# Licensed under the terms of the GNU GPL License + +if ! [ -d "Documentation" ]; then + echo "Run from top level of kernel tree" + exit 1 +fi + +if [ "$#" -ne 1 ]; then + echo "Usage: tools/docs/find-unused-docs.sh directory" + exit 1 +fi + +if ! [ -d "$1" ]; then + echo "Directory $1 doesn't exist" + exit 1 +fi + +cd "$( dirname "${BASH_SOURCE[0]}" )" +cd .. 
+ +cd Documentation/ + +echo "The following files contain kerneldoc comments for exported functions \ +that are not used in the formatted documentation" + +# FILES INCLUDED + +files_included=($(grep -rHR ".. kernel-doc" --include \*.rst | cut -d " " -f 3)) + +declare -A FILES_INCLUDED + +for each in "${files_included[@]}"; do + FILES_INCLUDED[$each]="$each" + done + +cd .. + +# FILES NOT INCLUDED + +for file in `find $1 -name '*.c'`; do + + if [[ ${FILES_INCLUDED[$file]+_} ]]; then + continue; + fi + str=$(PYTHONDONTWRITEBYTECODE=1 scripts/kernel-doc -export "$file" 2>/dev/null) + if [[ -n "$str" ]]; then + echo "$file" + fi + done + diff --git a/tools/docs/get_abi.py b/tools/docs/get_abi.py new file mode 100755 index 000000000000..2f0b99401f26 --- /dev/null +++ b/tools/docs/get_abi.py @@ -0,0 +1,214 @@ +#!/usr/bin/env python3 +# pylint: disable=R0903 +# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>. +# SPDX-License-Identifier: GPL-2.0 + +""" +Parse ABI documentation and produce results from it. +""" + +import argparse +import logging +import os +import sys + +# Import Python modules + +LIB_DIR = "../lib/python" +SRC_DIR = os.path.dirname(os.path.realpath(__file__)) + +sys.path.insert(0, os.path.join(SRC_DIR, LIB_DIR)) + +from abi.abi_parser import AbiParser # pylint: disable=C0413 +from abi.abi_regex import AbiRegex # pylint: disable=C0413 +from abi.helpers import ABI_DIR, DEBUG_HELP # pylint: disable=C0413 +from abi.system_symbols import SystemSymbols # pylint: disable=C0413 + +# Command line classes + + +REST_DESC = """ +Produce output in ReST format. + +The output is done on two sections: + +- Symbols: show all parsed symbols in alphabetic order; +- Files: cross reference the content of each file with the symbols on it. 
+""" + +class AbiRest: + """Initialize an argparse subparser for rest output""" + + def __init__(self, subparsers): + """Initialize argparse subparsers""" + + parser = subparsers.add_parser("rest", + formatter_class=argparse.RawTextHelpFormatter, + description=REST_DESC) + + parser.add_argument("--enable-lineno", action="store_true", + help="enable lineno") + parser.add_argument("--raw", action="store_true", + help="output text as contained in the ABI files. " + "It not used, output will contain dynamically" + " generated cross references when possible.") + parser.add_argument("--no-file", action="store_true", + help="Don't the files section") + parser.add_argument("--show-hints", help="Show-hints") + + parser.set_defaults(func=self.run) + + def run(self, args): + """Run subparser""" + + parser = AbiParser(args.dir, debug=args.debug) + parser.parse_abi() + parser.check_issues() + + for t in parser.doc(args.raw, not args.no_file): + if args.enable_lineno: + print (f".. LINENO {t[1]}#{t[2]}\n\n") + + print(t[0]) + +class AbiValidate: + """Initialize an argparse subparser for ABI validation""" + + def __init__(self, subparsers): + """Initialize argparse subparsers""" + + parser = subparsers.add_parser("validate", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description="list events") + + parser.set_defaults(func=self.run) + + def run(self, args): + """Run subparser""" + + parser = AbiParser(args.dir, debug=args.debug) + parser.parse_abi() + parser.check_issues() + + +class AbiSearch: + """Initialize an argparse subparser for ABI search""" + + def __init__(self, subparsers): + """Initialize argparse subparsers""" + + parser = subparsers.add_parser("search", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description="Search ABI using a regular expression") + + parser.add_argument("expression", + help="Case-insensitive search pattern for the ABI symbol") + + parser.set_defaults(func=self.run) + + def run(self, args): + """Run subparser""" + + 
parser = AbiParser(args.dir, debug=args.debug) + parser.parse_abi() + parser.search_symbols(args.expression) + +UNDEFINED_DESC=""" +Check undefined ABIs on local machine. + +Read sysfs devnodes and check if the devnodes there are defined inside +ABI documentation. + +The search logic tries to minimize the number of regular expressions to +search per each symbol. + +By default, it runs on a single CPU, as Python support for CPU threads +is still experimental, and multi-process runs on Python is very slow. + +On experimental tests, if the number of ABI symbols to search per devnode +is contained on a limit of ~150 regular expressions, using a single CPU +is a lot faster than using multiple processes. However, if the number of +regular expressions to check is at the order of ~30000, using multiple +CPUs speeds up the check. +""" + +class AbiUndefined: + """ + Initialize an argparse subparser for logic to check undefined ABI at + the current machine's sysfs + """ + + def __init__(self, subparsers): + """Initialize argparse subparsers""" + + parser = subparsers.add_parser("undefined", + formatter_class=argparse.RawTextHelpFormatter, + description=UNDEFINED_DESC) + + parser.add_argument("-S", "--sysfs-dir", default="/sys", + help="directory where sysfs is mounted") + parser.add_argument("-s", "--search-string", + help="search string regular expression to limit symbol search") + parser.add_argument("-H", "--show-hints", action="store_true", + help="Hints about definitions for missing ABI symbols.") + parser.add_argument("-j", "--jobs", "--max-workers", type=int, default=1, + help="If bigger than one, enables multiprocessing.") + parser.add_argument("-c", "--max-chunk-size", type=int, default=50, + help="Maximum number of chunk size") + parser.add_argument("-f", "--found", action="store_true", + help="Also show found items. " + "Helpful to debug the parser."), + parser.add_argument("-d", "--dry-run", action="store_true", + help="Don't actually search for undefined. 
" + "Helpful to debug the parser."), + + parser.set_defaults(func=self.run) + + def run(self, args): + """Run subparser""" + + abi = AbiRegex(args.dir, debug=args.debug, + search_string=args.search_string) + + abi_symbols = SystemSymbols(abi=abi, hints=args.show_hints, + sysfs=args.sysfs_dir) + + abi_symbols.check_undefined_symbols(dry_run=args.dry_run, + found=args.found, + max_workers=args.jobs, + chunk_size=args.max_chunk_size) + + +def main(): + """Main program""" + + parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) + + parser.add_argument("-d", "--debug", type=int, default=0, help="debug level") + parser.add_argument("-D", "--dir", default=ABI_DIR, help=DEBUG_HELP) + + subparsers = parser.add_subparsers() + + AbiRest(subparsers) + AbiValidate(subparsers) + AbiSearch(subparsers) + AbiUndefined(subparsers) + + args = parser.parse_args() + + if args.debug: + level = logging.DEBUG + else: + level = logging.INFO + + logging.basicConfig(level=level, format="[%(levelname)s] %(message)s") + + if "func" in args: + args.func(args) + else: + sys.exit(f"Please specify a valid command for {sys.argv[0]}") + + +# Call main method +if __name__ == "__main__": + main() diff --git a/tools/docs/get_feat.py b/tools/docs/get_feat.py new file mode 100755 index 000000000000..2b5155a1f134 --- /dev/null +++ b/tools/docs/get_feat.py @@ -0,0 +1,225 @@ +#!/usr/bin/env python3 +# pylint: disable=R0902,R0911,R0912,R0914,R0915 +# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>. +# SPDX-License-Identifier: GPL-2.0 + + +""" +Parse the Linux Feature files and produce a ReST book. 
+""" + +import argparse +import os +import subprocess +import sys + +from pprint import pprint + +LIB_DIR = "../../tools/lib/python" +SRC_DIR = os.path.dirname(os.path.realpath(__file__)) + +sys.path.insert(0, os.path.join(SRC_DIR, LIB_DIR)) + +from feat.parse_features import ParseFeature # pylint: disable=C0413 + +SRCTREE = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../..") +DEFAULT_DIR = "Documentation/features" + + +class GetFeature: + """Helper class to parse feature parsing parameters""" + + @staticmethod + def get_current_arch(): + """Detects the current architecture""" + + proc = subprocess.run(["uname", "-m"], check=True, + capture_output=True, text=True) + + arch = proc.stdout.strip() + if arch in ["x86_64", "i386"]: + arch = "x86" + elif arch == "s390x": + arch = "s390" + + return arch + + def run_parser(self, args): + """Execute the feature parser""" + + feat = ParseFeature(args.directory, args.debug, args.enable_fname) + data = feat.parse() + + if args.debug > 2: + pprint(data) + + return feat + + def run_rest(self, args): + """ + Generate tables in ReST format. Three types of tables are + supported, depending on the calling arguments: + + - neither feature nor arch is passed: generates a full matrix; + - arch provided: generates a table of supported tables for the + guiven architecture, eventually filtered by feature; + - only feature provided: generates a table with feature details, + showing what architectures it is implemented. + """ + + feat = self.run_parser(args) + + if args.arch: + rst = feat.output_arch_table(args.arch, args.feat) + elif args.feat: + rst = feat.output_feature(args.feat) + else: + rst = feat.output_matrix() + + print(rst) + + def run_current(self, args): + """ + Instead of using a --arch parameter, get feature for the current + architecture. 
+ """ + + args.arch = self.get_current_arch() + + self.run_rest(args) + + def run_list(self, args): + """ + Generate a list of features for a given architecture, in a format + parseable by other scripts. The output format is not ReST. + """ + + if not args.arch: + args.arch = self.get_current_arch() + + feat = self.run_parser(args) + msg = feat.list_arch_features(args.arch, args.feat) + + print(msg) + + def parse_arch(self, parser): + """Add a --arch parsing argument""" + + parser.add_argument("--arch", + help="Output features for an specific" + " architecture, optionally filtering for a " + "single specific feature.") + + def parse_feat(self, parser): + """Add a --feat parsing argument""" + + parser.add_argument("--feat", "--feature", + help="Output features for a single specific " + "feature.") + + + def current_args(self, subparsers): + """Implementscurrent argparse subparser""" + + parser = subparsers.add_parser("current", + formatter_class=argparse.RawTextHelpFormatter, + description="Output table in ReST " + "compatible ASCII format " + "with features for this " + "machine's architecture") + + self.parse_feat(parser) + parser.set_defaults(func=self.run_current) + + def rest_args(self, subparsers): + """Implement rest argparse subparser""" + + parser = subparsers.add_parser("rest", + formatter_class=argparse.RawTextHelpFormatter, + description="Output table(s) in ReST " + "compatible ASCII format " + "with features in ReST " + "markup language. The " + "output is affected by " + "--arch or --feat/--feature" + " flags.") + + self.parse_arch(parser) + self.parse_feat(parser) + parser.set_defaults(func=self.run_rest) + + def list_args(self, subparsers): + """Implement list argparse subparser""" + + parser = subparsers.add_parser("list", + formatter_class=argparse.RawTextHelpFormatter, + description="List features for this " + "machine's architecture, " + "using an easier to parse " + "format. 
The output is " + "affected by --arch flag.") + + self.parse_arch(parser) + self.parse_feat(parser) + parser.set_defaults(func=self.run_list) + + def validate_args(self, subparsers): + """Implement validate argparse subparser""" + + parser = subparsers.add_parser("validate", + formatter_class=argparse.RawTextHelpFormatter, + description="Validate the contents of " + "the files under " + f"{DEFAULT_DIR}.") + + parser.set_defaults(func=self.run_parser) + + def parser(self): + """ + Create an arparse with common options and several subparsers + """ + parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) + + parser.add_argument("-d", "--debug", action="count", default=0, + help="Put the script in verbose mode, useful for " + "debugging. Can be called multiple times, to " + "increase verbosity.") + + parser.add_argument("--directory", "--dir", default=DEFAULT_DIR, + help="Changes the location of the Feature files. " + f"By default, it uses the {DEFAULT_DIR} " + "directory.") + + parser.add_argument("--enable-fname", action="store_true", + help="Prints the file name of the feature files. " + "This can be used in order to track " + "dependencies during documentation build.") + + subparsers = parser.add_subparsers() + + self.current_args(subparsers) + self.rest_args(subparsers) + self.list_args(subparsers) + self.validate_args(subparsers) + + args = parser.parse_args() + + return args + + +def main(): + """Main program""" + + feat = GetFeature() + + args = feat.parser() + + if "func" in args: + args.func(args) + else: + sys.exit(f"Please specify a valid command for {sys.argv[0]}") + + +# Call main method +if __name__ == "__main__": + main() diff --git a/tools/docs/list-arch.sh b/tools/docs/list-arch.sh new file mode 100755 index 000000000000..96fe83b7058b --- /dev/null +++ b/tools/docs/list-arch.sh @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Small script that visualizes the kernel feature support status +# of an architecture. 
+# +# (If no arguments are given then it will print the host architecture's status.) +# + +ARCH=${1:-$(uname -m | sed 's/x86_64/x86/' | sed 's/i386/x86/' | sed 's/s390x/s390/')} + +$(dirname $0)/get_feat.pl list --arch $ARCH diff --git a/tools/docs/parse-headers.py b/tools/docs/parse-headers.py index bfa4e46a53e3..436acea4c6ca 100755 --- a/tools/docs/parse-headers.py +++ b/tools/docs/parse-headers.py @@ -24,10 +24,13 @@ The optional ``FILE_RULES`` contains a set of rules like: replace define V4L2_EVENT_MD_FL_HAVE_FRAME_SEQ :c:type:`v4l2_event_motion_det` """ -import argparse +import argparse, sys +import os.path -from lib.parse_data_structs import ParseDataStructs -from lib.enrich_formatter import EnrichFormatter +src_dir = os.path.dirname(os.path.realpath(__file__)) +sys.path.insert(0, os.path.join(src_dir, '../lib/python')) +from kdoc.parse_data_structs import ParseDataStructs +from kdoc.enrich_formatter import EnrichFormatter def main(): """Main function""" @@ -47,10 +50,7 @@ def main(): args = parser.parse_args() parser = ParseDataStructs(debug=args.debug) - parser.parse_file(args.file_in) - - if args.file_rules: - parser.process_exceptions(args.file_rules) + parser.parse_file(args.file_in, args.file_rules) parser.debug_print() parser.write_output(args.file_in, args.file_out, args.toc) diff --git a/tools/docs/sphinx-build-wrapper b/tools/docs/sphinx-build-wrapper new file mode 100755 index 000000000000..7a5fcef25429 --- /dev/null +++ b/tools/docs/sphinx-build-wrapper @@ -0,0 +1,864 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2025 Mauro Carvalho Chehab <mchehab+huawei@kernel.org> +# +# pylint: disable=R0902, R0912, R0913, R0914, R0915, R0917, C0103 +# +# Converted from docs Makefile and parallel-wrapper.sh, both under +# GPLv2, copyrighted since 2008 by the following authors: +# +# Akira Yokosawa <akiyks@gmail.com> +# Arnd Bergmann <arnd@arndb.de> +# Breno Leitao <leitao@debian.org> +# Carlos Bilbao <carlos.bilbao@amd.com> +# 
Dave Young <dyoung@redhat.com> +# Donald Hunter <donald.hunter@gmail.com> +# Geert Uytterhoeven <geert+renesas@glider.be> +# Jani Nikula <jani.nikula@intel.com> +# Jan Stancek <jstancek@redhat.com> +# Jonathan Corbet <corbet@lwn.net> +# Joshua Clayton <stillcompiling@gmail.com> +# Kees Cook <keescook@chromium.org> +# Linus Torvalds <torvalds@linux-foundation.org> +# Magnus Damm <damm+renesas@opensource.se> +# Masahiro Yamada <masahiroy@kernel.org> +# Mauro Carvalho Chehab <mchehab+huawei@kernel.org> +# Maxim Cournoyer <maxim.cournoyer@gmail.com> +# Peter Foley <pefoley2@pefoley.com> +# Randy Dunlap <rdunlap@infradead.org> +# Rob Herring <robh@kernel.org> +# Shuah Khan <shuahkh@osg.samsung.com> +# Thorsten Blum <thorsten.blum@toblux.com> +# Tomas Winkler <tomas.winkler@intel.com> + + +""" +Sphinx build wrapper that handles Kernel-specific business rules: + +- it gets the Kernel build environment vars; +- it determines what's the best parallelism; +- it handles SPHINXDIRS + +This tool ensures that MIN_PYTHON_VERSION is satisfied. If version is +below that, it seeks for a new Python version. If found, it re-runs using +the newer version. 
+""" + +import argparse +import locale +import os +import re +import shlex +import shutil +import subprocess +import sys + +from concurrent import futures +from glob import glob + + +LIB_DIR = "../lib/python" +SRC_DIR = os.path.dirname(os.path.realpath(__file__)) + +sys.path.insert(0, os.path.join(SRC_DIR, LIB_DIR)) + +from kdoc.python_version import PythonVersion +from kdoc.latex_fonts import LatexFontChecker +from jobserver import JobserverExec # pylint: disable=C0413,C0411,E0401 + +# +# Some constants +# +VENV_DEFAULT = "sphinx_latest" +MIN_PYTHON_VERSION = PythonVersion("3.7").version +PAPER = ["", "a4", "letter"] + +TARGETS = { + "cleandocs": { "builder": "clean" }, + "linkcheckdocs": { "builder": "linkcheck" }, + "htmldocs": { "builder": "html" }, + "epubdocs": { "builder": "epub", "out_dir": "epub" }, + "texinfodocs": { "builder": "texinfo", "out_dir": "texinfo" }, + "infodocs": { "builder": "texinfo", "out_dir": "texinfo" }, + "mandocs": { "builder": "man", "out_dir": "man" }, + "latexdocs": { "builder": "latex", "out_dir": "latex" }, + "pdfdocs": { "builder": "latex", "out_dir": "latex" }, + "xmldocs": { "builder": "xml", "out_dir": "xml" }, +} + + +# +# SphinxBuilder class +# + +class SphinxBuilder: + """ + Handles a sphinx-build target, adding needed arguments to build + with the Kernel. + """ + + def get_path(self, path, use_cwd=False, abs_path=False): + """ + Ancillary routine to handle patches the right way, as shell does. + + It first expands "~" and "~user". Then, if patch is not absolute, + join self.srctree. Finally, if requested, convert to abspath. 
+ """ + + path = os.path.expanduser(path) + if not path.startswith("/"): + if use_cwd: + base = os.getcwd() + else: + base = self.srctree + + path = os.path.join(base, path) + + if abs_path: + return os.path.abspath(path) + + return path + + def check_rust(self): + """ + Checks if Rust is enabled + """ + self.rustdoc = False + + config = os.path.join(self.srctree, ".config") + + if not os.path.isfile(config): + return + + re_rust = re.compile(r"CONFIG_RUST=(m|y)") + + try: + with open(config, "r", encoding="utf-8") as fp: + for line in fp: + if re_rust.match(line): + self.rustdoc = True + return + + except OSError as e: + print(f"Failed to open {config}", file=sys.stderr) + + def get_sphinx_extra_opts(self, n_jobs): + """ + Get the number of jobs to be used for docs build passed via command + line and desired sphinx verbosity. + + The number of jobs can be on different places: + + 1) It can be passed via "-j" argument; + 2) The SPHINXOPTS="-j8" env var may have "-j"; + 3) if called via GNU make, -j specifies the desired number of jobs. + with GNU makefile, this number is available via POSIX jobserver; + 4) if none of the above is available, it should default to "-jauto", + and let sphinx decide the best value. + """ + + # + # SPHINXOPTS env var, if used, contains extra arguments to be used + # by sphinx-build time. Among them, it may contain sphinx verbosity + # and desired number of parallel jobs. 
+ # + parser = argparse.ArgumentParser() + parser.add_argument('-j', '--jobs', type=int) + parser.add_argument('-q', '--quiet', action='store_true') + + # + # Other sphinx-build arguments go as-is, so place them + # at self.sphinxopts, using shell parser + # + sphinxopts = shlex.split(os.environ.get("SPHINXOPTS", "")) + + # + # Build a list of sphinx args, honoring verbosity here if specified + # + + verbose = self.verbose + sphinx_args, self.sphinxopts = parser.parse_known_args(sphinxopts) + if sphinx_args.quiet is True: + verbose = False + + # + # If the user explicitly sets "-j" at command line, use it. + # Otherwise, pick it from SPHINXOPTS args + # + if n_jobs: + self.n_jobs = n_jobs + elif sphinx_args.jobs: + self.n_jobs = sphinx_args.jobs + else: + self.n_jobs = None + + if not verbose: + self.sphinxopts += ["-q"] + + def __init__(self, builddir, venv=None, verbose=False, n_jobs=None, + interactive=None): + """Initialize internal variables""" + self.venv = venv + self.verbose = None + + # + # Normal variables passed from Kernel's makefile + # + self.kernelversion = os.environ.get("KERNELVERSION", "unknown") + self.kernelrelease = os.environ.get("KERNELRELEASE", "unknown") + self.pdflatex = os.environ.get("PDFLATEX", "xelatex") + + # + # Kernel main Makefile defines a PYTHON3 variable whose default is + # "python3". When set to a different value, it allows running a + # diferent version than the default official python3 package. + # Several distros package python3xx-sphinx packages with newer + # versions of Python and sphinx-build. 
+ # + # Honor such variable different than default + # + self.python = os.environ.get("PYTHON3") + if self.python == "python3": + self.python = None + + if not interactive: + self.latexopts = os.environ.get("LATEXOPTS", "-interaction=batchmode -no-shell-escape") + else: + self.latexopts = os.environ.get("LATEXOPTS", "") + + if not verbose: + verbose = bool(os.environ.get("KBUILD_VERBOSE", "") != "") + + if verbose is not None: + self.verbose = verbose + + # + # Source tree directory. This needs to be at os.environ, as + # Sphinx extensions use it + # + self.srctree = os.environ.get("srctree") + if not self.srctree: + self.srctree = "." + os.environ["srctree"] = self.srctree + + # + # Now that we can expand srctree, get other directories as well + # + self.sphinxbuild = os.environ.get("SPHINXBUILD", "sphinx-build") + self.kerneldoc = self.get_path(os.environ.get("KERNELDOC", + "scripts/kernel-doc.py")) + self.builddir = self.get_path(builddir, use_cwd=True, abs_path=True) + + # + # Get directory locations for LaTeX build toolchain + # + self.pdflatex_cmd = shutil.which(self.pdflatex) + self.latexmk_cmd = shutil.which("latexmk") + + self.env = os.environ.copy() + + self.get_sphinx_extra_opts(n_jobs) + + self.check_rust() + + # + # If venv command line argument is specified, run Sphinx from venv + # + if venv: + bin_dir = os.path.join(venv, "bin") + if not os.path.isfile(os.path.join(bin_dir, "activate")): + sys.exit(f"Venv {venv} not found.") + + # "activate" virtual env + self.env["PATH"] = bin_dir + ":" + self.env["PATH"] + self.env["VIRTUAL_ENV"] = venv + if "PYTHONHOME" in self.env: + del self.env["PYTHONHOME"] + print(f"Setting venv to {venv}") + + def run_sphinx(self, sphinx_build, build_args, *args, **pwargs): + """ + Executes sphinx-build using current python3 command. + + When calling via GNU make, POSIX jobserver is used to tell how + many jobs are still available from a job pool. 
claim all remaining + jobs, as we don't want sphinx-build to run in parallel with other + jobs. + + Despite that, the user may actually force a different value than + the number of available jobs via command line. + + The "with" logic here is used to ensure that the claimed jobs will + be freed once subprocess finishes + """ + + with JobserverExec() as jobserver: + if jobserver.claim: + # + # when GNU make is used, claim available jobs from jobserver + # + n_jobs = str(jobserver.claim) + else: + # + # Otherwise, let sphinx decide by default + # + n_jobs = "auto" + + # + # If explicitly requested via command line, override default + # + if self.n_jobs: + n_jobs = str(self.n_jobs) + + # + # We can't simply call python3 sphinx-build, as OpenSUSE + # Tumbleweed uses an ELF binary file (/usr/bin/alts) to switch + # between different versions of sphinx-build. So, only call it + # prepending "python3.xx" when PYTHON3 variable is not default. + # + if self.python: + cmd = [self.python] + else: + cmd = [] + + cmd += [sphinx_build] + cmd += [f"-j{n_jobs}"] + cmd += build_args + cmd += self.sphinxopts + + if self.verbose: + print(" ".join(cmd)) + + return subprocess.call(cmd, *args, **pwargs) + + def handle_html(self, css, output_dir): + """ + Extra steps for HTML and epub output. 
+ + For such targets, we need to ensure that CSS will be properly + copied to the output _static directory + """ + + if css: + css = os.path.expanduser(css) + if not css.startswith("/"): + css = os.path.join(self.srctree, css) + + static_dir = os.path.join(output_dir, "_static") + os.makedirs(static_dir, exist_ok=True) + + try: + shutil.copy2(css, static_dir) + except (OSError, IOError) as e: + print(f"Warning: Failed to copy CSS: {e}", file=sys.stderr) + + if self.rustdoc: + print("Building rust docs") + if "MAKE" in self.env: + cmd = [self.env["MAKE"]] + else: + cmd = ["make", "LLVM=1"] + + cmd += [ "rustdoc"] + if self.verbose: + print(" ".join(cmd)) + + try: + subprocess.run(cmd, check=True) + except subprocess.CalledProcessError as e: + print(f"Ignored errors when building rustdoc: {e}. Is RUST enabled?", + file=sys.stderr) + + def build_pdf_file(self, latex_cmd, from_dir, path): + """Builds a single pdf file using latex_cmd""" + try: + subprocess.run(latex_cmd + [path], + cwd=from_dir, check=True, env=self.env) + + return True + except subprocess.CalledProcessError: + return False + + def pdf_parallel_build(self, tex_suffix, latex_cmd, tex_files, n_jobs): + """Build PDF files in parallel if possible""" + builds = {} + build_failed = False + max_len = 0 + has_tex = False + + # + # LaTeX PDF error code is almost useless for us: + # any warning makes it non-zero. For kernel doc builds it always return + # non-zero even when build succeeds. So, let's do the best next thing: + # Ignore build errors. At the end, check if all PDF files were built, + # printing a summary with the built ones and returning 0 if all of + # them were actually built. 
+ # + with futures.ThreadPoolExecutor(max_workers=n_jobs) as executor: + jobs = {} + + for from_dir, pdf_dir, entry in tex_files: + name = entry.name + + if not name.endswith(tex_suffix): + continue + + name = name[:-len(tex_suffix)] + has_tex = True + + future = executor.submit(self.build_pdf_file, latex_cmd, + from_dir, entry.path) + jobs[future] = (from_dir, pdf_dir, name) + + for future in futures.as_completed(jobs): + from_dir, pdf_dir, name = jobs[future] + + pdf_name = name + ".pdf" + pdf_from = os.path.join(from_dir, pdf_name) + pdf_to = os.path.join(pdf_dir, pdf_name) + out_name = os.path.relpath(pdf_to, self.builddir) + max_len = max(max_len, len(out_name)) + + try: + success = future.result() + + if success and os.path.exists(pdf_from): + os.rename(pdf_from, pdf_to) + + # + # if verbose, get the name of built PDF file + # + if self.verbose: + builds[out_name] = "SUCCESS" + else: + builds[out_name] = "FAILED" + build_failed = True + except futures.Error as e: + builds[out_name] = f"FAILED ({repr(e)})" + build_failed = True + + # + # Handle case where no .tex files were found + # + if not has_tex: + out_name = "LaTeX files" + max_len = max(max_len, len(out_name)) + builds[out_name] = "FAILED: no .tex files were generated" + build_failed = True + + return builds, build_failed, max_len + + def handle_pdf(self, output_dirs, deny_vf): + """ + Extra steps for PDF output. + + As PDF is handled via a LaTeX output, after building the .tex file, + a new build is needed to create the PDF output from the latex + directory. + """ + builds = {} + max_len = 0 + tex_suffix = ".tex" + tex_files = [] + + # + # Since early 2024, Fedora and openSUSE tumbleweed have started + # deploying variable-font format of "Noto CJK", causing LaTeX + # to break with CJK. Work around it, by denying the variable font + # usage during xelatex build by passing the location of a config + # file with a deny list. + # + # See tools/docs/lib/latex_fonts.py for more details. 
+ # + if deny_vf: + deny_vf = os.path.expanduser(deny_vf) + if os.path.isdir(deny_vf): + self.env["XDG_CONFIG_HOME"] = deny_vf + + for from_dir in output_dirs: + pdf_dir = os.path.join(from_dir, "../pdf") + os.makedirs(pdf_dir, exist_ok=True) + + if self.latexmk_cmd: + latex_cmd = [self.latexmk_cmd, f"-{self.pdflatex}"] + else: + latex_cmd = [self.pdflatex] + + latex_cmd.extend(shlex.split(self.latexopts)) + + # Get a list of tex files to process + with os.scandir(from_dir) as it: + for entry in it: + if entry.name.endswith(tex_suffix): + tex_files.append((from_dir, pdf_dir, entry)) + + # + # When using make, this won't be used, as the number of jobs comes + # from POSIX jobserver. So, this covers the case where build comes + # from command line. On such case, serialize by default, except if + # the user explicitly sets the number of jobs. + # + n_jobs = 1 + + # n_jobs is either an integer or "auto". Only use it if it is a number + if self.n_jobs: + try: + n_jobs = int(self.n_jobs) + except ValueError: + pass + + # + # When using make, jobserver.claim is the number of jobs that were + # used with "-j" and that aren't used by other make targets + # + with JobserverExec() as jobserver: + n_jobs = 1 + + # + # Handle the case when a parameter is passed via command line, + # using it as default, if jobserver doesn't claim anything + # + if self.n_jobs: + try: + n_jobs = int(self.n_jobs) + except ValueError: + pass + + if jobserver.claim: + n_jobs = jobserver.claim + + builds, build_failed, max_len = self.pdf_parallel_build(tex_suffix, + latex_cmd, + tex_files, + n_jobs) + + # + # In verbose mode, print a summary with the build results per file. + # Otherwise, print a single line with all failures, if any. 
+ # In both cases, return code 1 indicates build failures, + # + if self.verbose: + msg = "Summary" + msg += "\n" + "=" * len(msg) + print() + print(msg) + + for pdf_name, pdf_file in builds.items(): + print(f"{pdf_name:<{max_len}}: {pdf_file}") + + print() + if build_failed: + msg = LatexFontChecker().check() + if msg: + print(msg) + + sys.exit("Error: not all PDF files were created.") + + elif build_failed: + n_failures = len(builds) + failures = ", ".join(builds.keys()) + + msg = LatexFontChecker().check() + if msg: + print(msg) + + sys.exit(f"Error: Can't build {n_failures} PDF file(s): {failures}") + + def handle_info(self, output_dirs): + """ + Extra steps for Info output. + + For texinfo generation, an additional make is needed from the + texinfo directory. + """ + + for output_dir in output_dirs: + try: + subprocess.run(["make", "info"], cwd=output_dir, check=True) + except subprocess.CalledProcessError as e: + sys.exit(f"Error generating info docs: {e}") + + def handle_man(self, kerneldoc, docs_dir, src_dir, output_dir): + """ + Create man pages from kernel-doc output + """ + + re_kernel_doc = re.compile(r"^\.\.\s+kernel-doc::\s*(\S+)") + re_man = re.compile(r'^\.TH "[^"]*" (\d+) "([^"]*)"') + + if docs_dir == src_dir: + # + # Pick the entire set of kernel-doc markups from the entire tree + # + kdoc_files = set([self.srctree]) + else: + kdoc_files = set() + + for fname in glob(os.path.join(src_dir, "**"), recursive=True): + if os.path.isfile(fname) and fname.endswith(".rst"): + with open(fname, "r", encoding="utf-8") as in_fp: + data = in_fp.read() + + for line in data.split("\n"): + match = re_kernel_doc.match(line) + if match: + if os.path.isfile(match.group(1)): + kdoc_files.add(match.group(1)) + + if not kdoc_files: + sys.exit(f"Directory {src_dir} doesn't contain kernel-doc tags") + + cmd = [ kerneldoc, "-m" ] + sorted(kdoc_files) + try: + if self.verbose: + print(" ".join(cmd)) + + result = subprocess.run(cmd, stdout=subprocess.PIPE, text= True) + + 
if result.returncode: + print(f"Warning: kernel-doc returned {result.returncode} warnings") + + except (OSError, ValueError, subprocess.SubprocessError) as e: + sys.exit(f"Failed to create man pages for {src_dir}: {repr(e)}") + + fp = None + try: + for line in result.stdout.split("\n"): + match = re_man.match(line) + if not match: + if fp: + fp.write(line + '\n') + continue + + if fp: + fp.close() + + fname = f"{output_dir}/{match.group(2)}.{match.group(1)}" + + if self.verbose: + print(f"Creating {fname}") + fp = open(fname, "w", encoding="utf-8") + fp.write(line + '\n') + finally: + if fp: + fp.close() + + def cleandocs(self, builder): # pylint: disable=W0613 + """Remove documentation output directory""" + shutil.rmtree(self.builddir, ignore_errors=True) + + def build(self, target, sphinxdirs=None, + theme=None, css=None, paper=None, deny_vf=None, + skip_sphinx=False): + """ + Build documentation using Sphinx. This is the core function of this + module. It prepares all arguments required by sphinx-build. 
+ """ + + builder = TARGETS[target]["builder"] + out_dir = TARGETS[target].get("out_dir", "") + + # + # Cleandocs doesn't require sphinx-build + # + if target == "cleandocs": + self.cleandocs(builder) + return + + if theme: + os.environ["DOCS_THEME"] = theme + + # + # Other targets require sphinx-build, so check if it exists + # + if not skip_sphinx: + sphinxbuild = shutil.which(self.sphinxbuild, path=self.env["PATH"]) + if not sphinxbuild and target != "mandocs": + sys.exit(f"Error: {self.sphinxbuild} not found in PATH.\n") + + if target == "pdfdocs": + if not self.pdflatex_cmd and not self.latexmk_cmd: + sys.exit("Error: pdflatex or latexmk required for PDF generation") + + docs_dir = os.path.abspath(os.path.join(self.srctree, "Documentation")) + + # + # Fill in base arguments for Sphinx build + # + kerneldoc = self.kerneldoc + if kerneldoc.startswith(self.srctree): + kerneldoc = os.path.relpath(kerneldoc, self.srctree) + + args = [ "-b", builder, "-c", docs_dir ] + + if builder == "latex": + if not paper: + paper = PAPER[1] + + args.extend(["-D", f"latex_elements.papersize={paper}paper"]) + + if self.rustdoc: + args.extend(["-t", "rustdoc"]) + + if not sphinxdirs: + sphinxdirs = os.environ.get("SPHINXDIRS", ".") + + # + # The sphinx-build tool has a bug: internally, it tries to set + # locale with locale.setlocale(locale.LC_ALL, ''). This causes a + # crash if language is not set. Detect and fix it. + # + try: + locale.setlocale(locale.LC_ALL, '') + except locale.Error: + self.env["LC_ALL"] = "C" + + # + # sphinxdirs can be a list or a whitespace-separated string + # + sphinxdirs_list = [] + for sphinxdir in sphinxdirs: + if isinstance(sphinxdir, list): + sphinxdirs_list += sphinxdir + else: + sphinxdirs_list += sphinxdir.split() + + # + # Step 1: Build each directory in separate. + # + # This is not the best way of handling it, as cross-references between + # them will be broken, but this is what we've been doing since + # the beginning. 
+ # + output_dirs = [] + for sphinxdir in sphinxdirs_list: + src_dir = os.path.join(docs_dir, sphinxdir) + doctree_dir = os.path.join(self.builddir, ".doctrees") + output_dir = os.path.join(self.builddir, sphinxdir, out_dir) + + # + # Make directory names canonical + # + src_dir = os.path.normpath(src_dir) + doctree_dir = os.path.normpath(doctree_dir) + output_dir = os.path.normpath(output_dir) + + os.makedirs(doctree_dir, exist_ok=True) + os.makedirs(output_dir, exist_ok=True) + + output_dirs.append(output_dir) + + build_args = args + [ + "-d", doctree_dir, + "-D", f"kerneldoc_bin={kerneldoc}", + "-D", f"version={self.kernelversion}", + "-D", f"release={self.kernelrelease}", + "-D", f"kerneldoc_srctree={self.srctree}", + src_dir, + output_dir, + ] + + if target == "mandocs": + self.handle_man(kerneldoc, docs_dir, src_dir, output_dir) + elif not skip_sphinx: + try: + result = self.run_sphinx(sphinxbuild, build_args, + env=self.env) + + if result: + sys.exit(f"Build failed: return code: {result}") + + except (OSError, ValueError, subprocess.SubprocessError) as e: + sys.exit(f"Build failed: {repr(e)}") + + # + # Ensure that each html/epub output will have needed static files + # + if target in ["htmldocs", "epubdocs"]: + self.handle_html(css, output_dir) + + # + # Step 2: Some targets (PDF and info) require an extra step once + # sphinx-build finishes + # + if target == "pdfdocs": + self.handle_pdf(output_dirs, deny_vf) + elif target == "infodocs": + self.handle_info(output_dirs) + +def jobs_type(value): + """ + Handle valid values for -j. Accepts Sphinx "-jauto", plus a number + equal or bigger than one. 
+ """ + if value is None: + return None + + if value.lower() == 'auto': + return value.lower() + + try: + if int(value) >= 1: + return value + + raise argparse.ArgumentTypeError(f"Minimum jobs is 1, got {value}") + except ValueError: + raise argparse.ArgumentTypeError(f"Must be 'auto' or positive integer, got {value}") # pylint: disable=W0707 + +def main(): + """ + Main function. The only mandatory argument is the target. If not + specified, the other arguments will use default values if not + specified at os.environ. + """ + parser = argparse.ArgumentParser(description="Kernel documentation builder") + + parser.add_argument("target", choices=list(TARGETS.keys()), + help="Documentation target to build") + parser.add_argument("--sphinxdirs", nargs="+", + help="Specific directories to build") + parser.add_argument("--builddir", default="output", + help="Sphinx configuration file") + + parser.add_argument("--theme", help="Sphinx theme to use") + + parser.add_argument("--css", help="Custom CSS file for HTML/EPUB") + + parser.add_argument("--paper", choices=PAPER, default=PAPER[0], + help="Paper size for LaTeX/PDF output") + + parser.add_argument('--deny-vf', + help="Configuration to deny variable fonts on pdf builds") + + parser.add_argument("-v", "--verbose", action='store_true', + help="place build in verbose mode") + + parser.add_argument('-j', '--jobs', type=jobs_type, + help="Sets number of jobs to use with sphinx-build") + + parser.add_argument('-i', '--interactive', action='store_true', + help="Change latex default to run in interactive mode") + + parser.add_argument('-s', '--skip-sphinx-build', action='store_true', + help="Skip sphinx-build step") + + parser.add_argument("-V", "--venv", nargs='?', const=f'{VENV_DEFAULT}', + default=None, + help=f'If used, run Sphinx from a venv dir (default dir: {VENV_DEFAULT})') + + args = parser.parse_args() + + PythonVersion.check_python(MIN_PYTHON_VERSION, show_alternatives=True, + bail_out=True) + + builder = 
 SphinxBuilder(builddir=args.builddir, venv=args.venv, + verbose=args.verbose, n_jobs=args.jobs, + interactive=args.interactive) + + builder.build(args.target, sphinxdirs=args.sphinxdirs, + theme=args.theme, css=args.css, paper=args.paper, + deny_vf=args.deny_vf, + skip_sphinx=args.skip_sphinx_build) + +if __name__ == "__main__": + main() diff --git a/tools/docs/sphinx-pre-install b/tools/docs/sphinx-pre-install new file mode 100755 index 000000000000..965c9b093a41 --- /dev/null +++ b/tools/docs/sphinx-pre-install @@ -0,0 +1,1543 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0-or-later +# Copyright (c) 2017-2025 Mauro Carvalho Chehab <mchehab+huawei@kernel.org> +# +# pylint: disable=C0103,C0114,C0115,C0116,C0301,C0302 +# pylint: disable=R0902,R0904,R0911,R0912,R0914,R0915,R1705,R1710,E1121 + +# Note: this script requires at least Python 3.6 to run. +# Don't add changes not compatible with it, it is meant to report +# incompatible python versions. + +""" +Dependency checker for Sphinx documentation Kernel build. + +This module provides tools to check for all required dependencies needed to +build documentation using Sphinx, including system packages, Python modules +and LaTeX packages for PDF generation. + +It detects packages for a subset of Linux distributions used by Kernel +maintainers, showing hints and missing dependencies. + +The main class SphinxDependencyChecker handles the dependency checking logic +and provides recommendations for installing missing packages. It supports both +system package installations and Python virtual environments. By default, +system package install is recommended. 
+""" + +import argparse +import locale +import os +import re +import subprocess +import sys +from glob import glob +import os.path + +src_dir = os.path.dirname(os.path.realpath(__file__)) +sys.path.insert(0, os.path.join(src_dir, '../lib/python')) +from kdoc.python_version import PythonVersion + +RECOMMENDED_VERSION = PythonVersion("3.4.3").version +MIN_PYTHON_VERSION = PythonVersion("3.7").version + + +class DepManager: + """ + Manage package dependencies. There are three types of dependencies: + + - System: dependencies required for docs build; + - Python: python dependencies for a native distro Sphinx install; + - PDF: dependencies needed by PDF builds. + + Each dependency can be mandatory or optional. Not installing an optional + dependency won't break the build, but will cause degradation at the + docs output. + """ + + # Internal types of dependencies. Don't use them outside DepManager class. + _SYS_TYPE = 0 + _PHY_TYPE = 1 + _PDF_TYPE = 2 + + # Dependencies visible outside the class. + # The keys are tuple with: (type, is_mandatory flag). + # + # Currently we're not using all optional dep types. Yet, we'll keep all + # possible combinations here. They're not many, and that makes it easier + # if later needed and for the name() method below + + SYSTEM_MANDATORY = (_SYS_TYPE, True) + PYTHON_MANDATORY = (_PHY_TYPE, True) + PDF_MANDATORY = (_PDF_TYPE, True) + + SYSTEM_OPTIONAL = (_SYS_TYPE, False) + PYTHON_OPTIONAL = (_PHY_TYPE, False) + PDF_OPTIONAL = (_PDF_TYPE, False) + + def __init__(self, pdf): + """ + Initialize internal vars: + + - missing: missing dependencies list, containing a distro-independent + name for a missing dependency and its type. + - missing_pkg: ancillary dict containing missing dependencies in + distro namespace, organized by type. + - need: total number of needed dependencies. Never cleaned. + - optional: total number of optional dependencies. Never cleaned. + - pdf: Is PDF support enabled? 
+ """ + self.missing = {} + self.missing_pkg = {} + self.need = 0 + self.optional = 0 + self.pdf = pdf + + @staticmethod + def name(dtype): + """ + Ancillary routine to output a warn/error message reporting + missing dependencies. + """ + if dtype[0] == DepManager._SYS_TYPE: + msg = "build" + elif dtype[0] == DepManager._PHY_TYPE: + msg = "Python" + else: + msg = "PDF" + + if dtype[1]: + return f"ERROR: {msg} mandatory deps missing" + else: + return f"Warning: {msg} optional deps missing" + + @staticmethod + def is_optional(dtype): + """Ancillary routine to report if a dependency is optional""" + return not dtype[1] + + @staticmethod + def is_pdf(dtype): + """Ancillary routine to report if a dependency is for PDF generation""" + if dtype[0] == DepManager._PDF_TYPE: + return True + + return False + + def add_package(self, package, dtype): + """ + Add a package at the self.missing() dictionary. + Doesn't update missing_pkg. + """ + is_optional = DepManager.is_optional(dtype) + self.missing[package] = dtype + if is_optional: + self.optional += 1 + else: + self.need += 1 + + def del_package(self, package): + """ + Remove a package at the self.missing() dictionary. + Doesn't update missing_pkg. + """ + if package in self.missing: + del self.missing[package] + + def clear_deps(self): + """ + Clear dependencies without changing needed/optional. + + This is an ackward way to have a separate section to recommend + a package after system main dependencies. + + TODO: rework the logic to prevent needing it. + """ + + self.missing = {} + self.missing_pkg = {} + + def check_missing(self, progs): + """ + Update self.missing_pkg, using progs dict to convert from the + agnostic package name to distro-specific one. + + Returns an string with the packages to be installed, sorted and + with eventual duplicates removed. 
+ """ + + self.missing_pkg = {} + + for prog, dtype in sorted(self.missing.items()): + # At least on some LTS distros like CentOS 7, texlive doesn't + # provide all packages we need. When such distros are + # detected, we have to disable PDF output. + # + # So, we need to ignore the packages that distros would + # need for LaTeX to work + if DepManager.is_pdf(dtype) and not self.pdf: + self.optional -= 1 + continue + + if not dtype in self.missing_pkg: + self.missing_pkg[dtype] = [] + + self.missing_pkg[dtype].append(progs.get(prog, prog)) + + install = [] + for dtype, pkgs in self.missing_pkg.items(): + install += pkgs + + return " ".join(sorted(set(install))) + + def warn_install(self): + """ + Emit warnings/errors related to missing packages. + """ + + output_msg = "" + + for dtype in sorted(self.missing_pkg.keys()): + progs = " ".join(sorted(set(self.missing_pkg[dtype]))) + + try: + name = DepManager.name(dtype) + output_msg += f'{name}:\t{progs}\n' + except KeyError: + raise KeyError(f"ERROR!!!: invalid dtype for {progs}: {dtype}") + + if output_msg: + print(f"\n{output_msg}") + +class AncillaryMethods: + """ + Ancillary methods that checks for missing dependencies for different + types of types, like binaries, python modules, rpm deps, etc. + """ + + @staticmethod + def which(prog): + """ + Our own implementation of which(). We could instead use + shutil.which(), but this function is simple enough. + Probably faster to use this implementation than to import shutil. + """ + for path in os.environ.get("PATH", "").split(":"): + full_path = os.path.join(path, prog) + if os.access(full_path, os.X_OK): + return full_path + + return None + + @staticmethod + def run(*args, **kwargs): + """ + Excecute a command, hiding its output by default. + Preserve compatibility with older Python versions. 
+ """ + + capture_output = kwargs.pop('capture_output', False) + + if capture_output: + if 'stdout' not in kwargs: + kwargs['stdout'] = subprocess.PIPE + if 'stderr' not in kwargs: + kwargs['stderr'] = subprocess.PIPE + else: + if 'stdout' not in kwargs: + kwargs['stdout'] = subprocess.DEVNULL + if 'stderr' not in kwargs: + kwargs['stderr'] = subprocess.DEVNULL + + # Don't break with older Python versions + if 'text' in kwargs and sys.version_info < (3, 7): + kwargs['universal_newlines'] = kwargs.pop('text') + + return subprocess.run(*args, **kwargs) + +class MissingCheckers(AncillaryMethods): + """ + Contains some ancillary checkers for different types of binaries and + package managers. + """ + + def __init__(self, args, texlive): + """ + Initialize its internal variables + """ + self.pdf = args.pdf + self.virtualenv = args.virtualenv + self.version_check = args.version_check + self.texlive = texlive + + self.min_version = (0, 0, 0) + self.cur_version = (0, 0, 0) + + self.deps = DepManager(self.pdf) + + self.need_symlink = 0 + self.need_sphinx = 0 + + self.verbose_warn_install = 1 + + self.virtenv_dir = "" + self.install = "" + self.python_cmd = "" + + self.virtenv_prefix = ["sphinx_", "Sphinx_" ] + + def check_missing_file(self, files, package, dtype): + """ + Does the file exists? If not, add it to missing dependencies. + """ + for f in files: + if os.path.exists(f): + return + self.deps.add_package(package, dtype) + + def check_program(self, prog, dtype): + """ + Does the program exists and it is at the PATH? + If not, add it to missing dependencies. + """ + found = self.which(prog) + if found: + return found + + self.deps.add_package(prog, dtype) + + return None + + def check_perl_module(self, prog, dtype): + """ + Does perl have a dependency? Is it available? + If not, add it to missing dependencies. 
+ + Right now, we still need Perl for doc build, as it is required + by some tools called at docs or kernel build time, like: + + tools/docs/documentation-file-ref-check + + Also, checkpatch is on Perl. + """ + + # While testing with lxc download template, one of the + # distros (Oracle) didn't have perl - nor even an option to install + # before installing oraclelinux-release-el9 package. + # + # Check it before running an error. If perl is not there, + # add it as a mandatory package, as some parts of the doc builder + # needs it. + if not self.which("perl"): + self.deps.add_package("perl", DepManager.SYSTEM_MANDATORY) + self.deps.add_package(prog, dtype) + return + + try: + self.run(["perl", f"-M{prog}", "-e", "1"], check=True) + except subprocess.CalledProcessError: + self.deps.add_package(prog, dtype) + + def check_python_module(self, module, is_optional=False): + """ + Does a python module exists outside venv? If not, add it to missing + dependencies. + """ + if is_optional: + dtype = DepManager.PYTHON_OPTIONAL + else: + dtype = DepManager.PYTHON_MANDATORY + + try: + self.run([self.python_cmd, "-c", f"import {module}"], check=True) + except subprocess.CalledProcessError: + self.deps.add_package(module, dtype) + + def check_rpm_missing(self, pkgs, dtype): + """ + Does a rpm package exists? If not, add it to missing dependencies. + """ + for prog in pkgs: + try: + self.run(["rpm", "-q", prog], check=True) + except subprocess.CalledProcessError: + self.deps.add_package(prog, dtype) + + def check_pacman_missing(self, pkgs, dtype): + """ + Does a pacman package exists? If not, add it to missing dependencies. + """ + for prog in pkgs: + try: + self.run(["pacman", "-Q", prog], check=True) + except subprocess.CalledProcessError: + self.deps.add_package(prog, dtype) + + def check_missing_tex(self, is_optional=False): + """ + Does a LaTeX package exists? If not, add it to missing dependencies. 
+ """ + if is_optional: + dtype = DepManager.PDF_OPTIONAL + else: + dtype = DepManager.PDF_MANDATORY + + kpsewhich = self.which("kpsewhich") + for prog, package in self.texlive.items(): + + # If kpsewhich is not there, just add it to deps + if not kpsewhich: + self.deps.add_package(package, dtype) + continue + + # Check if the package is needed + try: + result = self.run( + [kpsewhich, prog], stdout=subprocess.PIPE, text=True, check=True + ) + + # Didn't find. Add it + if not result.stdout.strip(): + self.deps.add_package(package, dtype) + + except subprocess.CalledProcessError: + # kpsewhich returned an error. Add it, just in case + self.deps.add_package(package, dtype) + + def get_sphinx_fname(self): + """ + Gets the binary filename for sphinx-build. + """ + if "SPHINXBUILD" in os.environ: + return os.environ["SPHINXBUILD"] + + fname = "sphinx-build" + if self.which(fname): + return fname + + fname = "sphinx-build-3" + if self.which(fname): + self.need_symlink = 1 + return fname + + return "" + + def get_sphinx_version(self, cmd): + """ + Gets sphinx-build version. + """ + env = os.environ.copy() + + # The sphinx-build tool has a bug: internally, it tries to set + # locale with locale.setlocale(locale.LC_ALL, ''). This causes a + # crash if language is not set. Detect and fix it. 
+ try: + locale.setlocale(locale.LC_ALL, '') + except Exception: + env["LC_ALL"] = "C" + env["LANG"] = "C" + + try: + result = self.run([cmd, "--version"], env=env, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, check=True) + except (subprocess.CalledProcessError, FileNotFoundError): + return None + + for line in result.stdout.split("\n"): + match = re.match(r"^sphinx-build\s+([\d\.]+)(?:\+(?:/[\da-f]+)|b\d+)?\s*$", line) + if match: + return PythonVersion.parse_version(match.group(1)) + + match = re.match(r"^Sphinx.*\s+([\d\.]+)\s*$", line) + if match: + return PythonVersion.parse_version(match.group(1)) + + def check_sphinx(self, conf): + """ + Checks Sphinx minimal requirements + """ + try: + with open(conf, "r", encoding="utf-8") as f: + for line in f: + match = re.match(r"^\s*needs_sphinx\s*=\s*[\'\"]([\d\.]+)[\'\"]", line) + if match: + self.min_version = PythonVersion.parse_version(match.group(1)) + break + except IOError: + sys.exit(f"Can't open {conf}") + + if not self.min_version: + sys.exit(f"Can't get needs_sphinx version from {conf}") + + self.virtenv_dir = self.virtenv_prefix[0] + "latest" + + sphinx = self.get_sphinx_fname() + if not sphinx: + self.need_sphinx = 1 + return + + self.cur_version = self.get_sphinx_version(sphinx) + if not self.cur_version: + sys.exit(f"{sphinx} didn't return its version") + + if self.cur_version < self.min_version: + curver = PythonVersion.ver_str(self.cur_version) + minver = PythonVersion.ver_str(self.min_version) + + print(f"ERROR: Sphinx version is {curver}. It should be >= {minver}") + self.need_sphinx = 1 + return + + # On version check mode, just assume Sphinx has all mandatory deps + if self.version_check and self.cur_version >= RECOMMENDED_VERSION: + sys.exit(0) + + def catcheck(self, filename): + """ + Reads a file if it exists, returning as string. + If not found, returns an empty string. 
+ """ + if os.path.exists(filename): + with open(filename, "r", encoding="utf-8") as f: + return f.read().strip() + return "" + + def get_system_release(self): + """ + Determine the system type. There's no unique way that would work + with all distros with a minimal package install. So, several + methods are used here. + + By default, it will use lsb_release function. If not available, it will + fail back to reading the known different places where the distro name + is stored. + + Several modern distros now have /etc/os-release, which usually have + a decent coverage. + """ + + system_release = "" + + if self.which("lsb_release"): + result = self.run(["lsb_release", "-d"], capture_output=True, text=True) + system_release = result.stdout.replace("Description:", "").strip() + + release_files = [ + "/etc/system-release", + "/etc/redhat-release", + "/etc/lsb-release", + "/etc/gentoo-release", + ] + + if not system_release: + for f in release_files: + system_release = self.catcheck(f) + if system_release: + break + + # This seems more common than LSB these days + if not system_release: + os_var = {} + try: + with open("/etc/os-release", "r", encoding="utf-8") as f: + for line in f: + match = re.match(r"^([\w\d\_]+)=\"?([^\"]*)\"?\n", line) + if match: + os_var[match.group(1)] = match.group(2) + + system_release = os_var.get("NAME", "") + if "VERSION_ID" in os_var: + system_release += " " + os_var["VERSION_ID"] + elif "VERSION" in os_var: + system_release += " " + os_var["VERSION"] + except IOError: + pass + + if not system_release: + system_release = self.catcheck("/etc/issue") + + system_release = system_release.strip() + + return system_release + +class SphinxDependencyChecker(MissingCheckers): + """ + Main class for checking Sphinx documentation build dependencies. 
+ + - Check for missing system packages; + - Check for missing Python modules; + - Check for missing LaTeX packages needed by PDF generation; + - Propose Sphinx install via Python Virtual environment; + - Propose Sphinx install via distro-specific package install. + """ + def __init__(self, args): + """Initialize checker variables""" + + # List of required texlive packages on Fedora and OpenSuse + texlive = { + "amsfonts.sty": "texlive-amsfonts", + "amsmath.sty": "texlive-amsmath", + "amssymb.sty": "texlive-amsfonts", + "amsthm.sty": "texlive-amscls", + "anyfontsize.sty": "texlive-anyfontsize", + "atbegshi.sty": "texlive-oberdiek", + "bm.sty": "texlive-tools", + "capt-of.sty": "texlive-capt-of", + "cmap.sty": "texlive-cmap", + "ctexhook.sty": "texlive-ctex", + "ecrm1000.tfm": "texlive-ec", + "eqparbox.sty": "texlive-eqparbox", + "eu1enc.def": "texlive-euenc", + "fancybox.sty": "texlive-fancybox", + "fancyvrb.sty": "texlive-fancyvrb", + "float.sty": "texlive-float", + "fncychap.sty": "texlive-fncychap", + "footnote.sty": "texlive-mdwtools", + "framed.sty": "texlive-framed", + "luatex85.sty": "texlive-luatex85", + "multirow.sty": "texlive-multirow", + "needspace.sty": "texlive-needspace", + "palatino.sty": "texlive-psnfss", + "parskip.sty": "texlive-parskip", + "polyglossia.sty": "texlive-polyglossia", + "tabulary.sty": "texlive-tabulary", + "threeparttable.sty": "texlive-threeparttable", + "titlesec.sty": "texlive-titlesec", + "ucs.sty": "texlive-ucs", + "upquote.sty": "texlive-upquote", + "wrapfig.sty": "texlive-wrapfig", + } + + super().__init__(args, texlive) + + self.need_pip = False + self.rec_sphinx_upgrade = 0 + + self.system_release = self.get_system_release() + self.activate_cmd = "" + + # Some distros may not have a Sphinx shipped package compatible with + # our minimal requirements + self.package_supported = True + + # Recommend a new python version + self.recommend_python = None + + # Certain hints are meant to be shown only once + self.distro_msg = None 
+ + self.latest_avail_ver = (0, 0, 0) + self.venv_ver = (0, 0, 0) + + prefix = os.environ.get("srctree", ".") + "/" + + self.conf = prefix + "Documentation/conf.py" + self.requirement_file = prefix + "Documentation/sphinx/requirements.txt" + + def get_install_progs(self, progs, cmd, extra=None): + """ + Check for missing dependencies using the provided program mapping. + + The actual distro-specific programs are mapped via progs argument. + """ + install = self.deps.check_missing(progs) + + if self.verbose_warn_install: + self.deps.warn_install() + + if not install: + return + + if cmd: + if self.verbose_warn_install: + msg = "You should run:" + else: + msg = "" + + if extra: + msg += "\n\t" + extra.replace("\n", "\n\t") + + return(msg + "\n\tsudo " + cmd + " " + install) + + return None + + # + # Distro-specific hints methods + # + + def give_debian_hints(self): + """ + Provide package installation hints for Debian-based distros. + """ + progs = { + "Pod::Usage": "perl-modules", + "convert": "imagemagick", + "dot": "graphviz", + "ensurepip": "python3-venv", + "python-sphinx": "python3-sphinx", + "rsvg-convert": "librsvg2-bin", + "virtualenv": "virtualenv", + "xelatex": "texlive-xetex", + "yaml": "python3-yaml", + } + + if self.pdf: + pdf_pkgs = { + "fonts-dejavu": [ + "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", + ], + "fonts-noto-cjk": [ + "/usr/share/fonts/noto-cjk/NotoSansCJK-Regular.ttc", + "/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc", + "/usr/share/fonts/opentype/noto/NotoSerifCJK-Regular.ttc", + ], + "tex-gyre": [ + "/usr/share/texmf/tex/latex/tex-gyre/tgtermes.sty" + ], + "texlive-fonts-recommended": [ + "/usr/share/texlive/texmf-dist/fonts/tfm/adobe/zapfding/pzdr.tfm", + ], + "texlive-lang-chinese": [ + "/usr/share/texlive/texmf-dist/tex/latex/ctex/ctexhook.sty", + ], + } + + for package, files in pdf_pkgs.items(): + self.check_missing_file(files, package, DepManager.PDF_MANDATORY) + + self.check_program("dvipng", 
DepManager.PDF_MANDATORY) + + if not self.distro_msg: + self.distro_msg = \ + "Note: ImageMagick is broken on some distros, affecting PDF output. For more details:\n" \ + "\thttps://askubuntu.com/questions/1158894/imagemagick-still-broken-using-with-usr-bin-convert" + + return self.get_install_progs(progs, "apt-get install") + + def give_redhat_hints(self): + """ + Provide package installation hints for RedHat-based distros + (Fedora, RHEL and RHEL-based variants). + """ + progs = { + "Pod::Usage": "perl-Pod-Usage", + "convert": "ImageMagick", + "dot": "graphviz", + "python-sphinx": "python3-sphinx", + "rsvg-convert": "librsvg2-tools", + "virtualenv": "python3-virtualenv", + "xelatex": "texlive-xetex-bin", + "yaml": "python3-pyyaml", + } + + fedora_tex_pkgs = [ + "dejavu-sans-fonts", + "dejavu-sans-mono-fonts", + "dejavu-serif-fonts", + "texlive-collection-fontsrecommended", + "texlive-collection-latex", + "texlive-xecjk", + ] + + fedora = False + rel = None + + match = re.search(r"(release|Linux)\s+(\d+)", self.system_release) + if match: + rel = int(match.group(2)) + + if not rel: + print("Couldn't identify release number") + noto_sans_redhat = None + self.pdf = False + elif re.search("Fedora", self.system_release): + # Fedora 38 and upper use this CJK font + + noto_sans_redhat = "google-noto-sans-cjk-fonts" + fedora = True + else: + # Almalinux, CentOS, RHEL, ... + + # at least up to version 9 (and Fedora < 38), that's the CJK font + noto_sans_redhat = "google-noto-sans-cjk-ttc-fonts" + + progs["virtualenv"] = "python-virtualenv" + + if not rel or rel < 8: + print("ERROR: Distro not supported. Too old?") + return + + # RHEL 8 uses Python 3.6, which is not compatible with + # the build system anymore. Suggest Python 3.11 + if rel == 8: + self.check_program("python3.9", DepManager.SYSTEM_MANDATORY) + progs["python3.9"] = "python39" + progs["yaml"] = "python39-pyyaml" + + self.recommend_python = True + + # There's no python39-sphinx package. 
Only pip is supported + self.package_supported = False + + if not self.distro_msg: + self.distro_msg = \ + "Note: RHEL-based distros typically require extra repositories.\n" \ + "For most, enabling epel and crb are enough:\n" \ + "\tsudo dnf install -y epel-release\n" \ + "\tsudo dnf config-manager --set-enabled crb\n" \ + "Yet, some may have other required repositories. Those commands could be useful:\n" \ + "\tsudo dnf repolist all\n" \ + "\tsudo dnf repoquery --available --info <pkgs>\n" \ + "\tsudo dnf config-manager --set-enabled '*' # enable all - probably not what you want" + + if self.pdf: + pdf_pkgs = [ + "/usr/share/fonts/google-noto-cjk/NotoSansCJK-Regular.ttc", + "/usr/share/fonts/google-noto-sans-cjk-fonts/NotoSansCJK-Regular.ttc", + ] + + self.check_missing_file(pdf_pkgs, noto_sans_redhat, DepManager.PDF_MANDATORY) + + self.check_rpm_missing(fedora_tex_pkgs, DepManager.PDF_MANDATORY) + + self.check_missing_tex(DepManager.PDF_MANDATORY) + + # There's no texlive-ctex on RHEL 8 repositories. This will + # likely affect CJK pdf build only. + if not fedora and rel == 8: + self.deps.del_package("texlive-ctex") + + return self.get_install_progs(progs, "dnf install") + + def give_opensuse_hints(self): + """ + Provide package installation hints for openSUSE-based distros + (Leap and Tumbleweed). 
+ """ + progs = { + "Pod::Usage": "perl-Pod-Usage", + "convert": "ImageMagick", + "dot": "graphviz", + "python-sphinx": "python3-sphinx", + "virtualenv": "python3-virtualenv", + "xelatex": "texlive-xetex-bin texlive-dejavu", + "yaml": "python3-pyyaml", + } + + suse_tex_pkgs = [ + "texlive-babel-english", + "texlive-caption", + "texlive-colortbl", + "texlive-courier", + "texlive-dvips", + "texlive-helvetic", + "texlive-makeindex", + "texlive-metafont", + "texlive-metapost", + "texlive-palatino", + "texlive-preview", + "texlive-times", + "texlive-zapfchan", + "texlive-zapfding", + ] + + progs["latexmk"] = "texlive-latexmk-bin" + + match = re.search(r"(Leap)\s+(\d+).(\d)", self.system_release) + if match: + rel = int(match.group(2)) + + # Leap 15.x uses Python 3.6, which is not compatible with + # the build system anymore. Suggest Python 3.11 + if rel == 15: + if not self.which(self.python_cmd): + self.check_program("python3.11", DepManager.SYSTEM_MANDATORY) + progs["python3.11"] = "python311" + self.recommend_python = True + + progs.update({ + "python-sphinx": "python311-Sphinx python311-Sphinx-latex", + "virtualenv": "python311-virtualenv", + "yaml": "python311-PyYAML", + }) + else: + # Tumbleweed defaults to Python 3.11 + + progs.update({ + "python-sphinx": "python313-Sphinx python313-Sphinx-latex", + "virtualenv": "python313-virtualenv", + "yaml": "python313-PyYAML", + }) + + # FIXME: add support for installing CJK fonts + # + # I tried hard, but was unable to find a way to install + # "Noto Sans CJK SC" on openSUSE + + if self.pdf: + self.check_rpm_missing(suse_tex_pkgs, DepManager.PDF_MANDATORY) + if self.pdf: + self.check_missing_tex() + + return self.get_install_progs(progs, "zypper install --no-recommends") + + def give_mageia_hints(self): + """ + Provide package installation hints for Mageia and OpenMandriva. 
+ """ + progs = { + "Pod::Usage": "perl-Pod-Usage", + "convert": "ImageMagick", + "dot": "graphviz", + "python-sphinx": "python3-sphinx", + "rsvg-convert": "librsvg2", + "virtualenv": "python3-virtualenv", + "xelatex": "texlive", + "yaml": "python3-yaml", + } + + tex_pkgs = [ + "texlive-fontsextra", + "texlive-fonts-asian", + "fonts-ttf-dejavu", + ] + + if re.search(r"OpenMandriva", self.system_release): + packager_cmd = "dnf install" + noto_sans = "noto-sans-cjk-fonts" + tex_pkgs = [ + "texlive-collection-basic", + "texlive-collection-langcjk", + "texlive-collection-fontsextra", + "texlive-collection-fontsrecommended" + ] + + # Tested on OpenMandriva Lx 4.3 + progs["convert"] = "imagemagick" + progs["yaml"] = "python-pyyaml" + progs["python-virtualenv"] = "python-virtualenv" + progs["python-sphinx"] = "python-sphinx" + progs["xelatex"] = "texlive" + + self.check_program("python-virtualenv", DepManager.PYTHON_MANDATORY) + + # On my tests with openMandriva LX 4.0 docker image, upgraded + # to 4.3, python-virtualenv package is broken: it is missing + # ensurepip. Without it, the alternative would be to run: + # python3 -m venv --without-pip ~/sphinx_latest, but running + # pip there won't install sphinx at venv. + # + # Add a note about that. + + if not self.distro_msg: + self.distro_msg = \ + "Notes:\n"\ + "1. for venv, ensurepip could be broken, preventing its install method.\n" \ + "2. 
at least on OpenMandriva LX 4.3, texlive packages seem broken" + + else: + packager_cmd = "urpmi" + noto_sans = "google-noto-sans-cjk-ttc-fonts" + + progs["latexmk"] = "texlive-collection-basic" + + if self.pdf: + pdf_pkgs = [ + "/usr/share/fonts/google-noto-cjk/NotoSansCJK-Regular.ttc", + "/usr/share/fonts/TTF/NotoSans-Regular.ttf", + ] + + self.check_missing_file(pdf_pkgs, noto_sans, DepManager.PDF_MANDATORY) + self.check_rpm_missing(tex_pkgs, DepManager.PDF_MANDATORY) + + return self.get_install_progs(progs, packager_cmd) + + def give_arch_linux_hints(self): + """ + Provide package installation hints for ArchLinux. + """ + progs = { + "convert": "imagemagick", + "dot": "graphviz", + "latexmk": "texlive-core", + "rsvg-convert": "extra/librsvg", + "virtualenv": "python-virtualenv", + "xelatex": "texlive-xetex", + "yaml": "python-yaml", + } + + archlinux_tex_pkgs = [ + "texlive-basic", + "texlive-binextra", + "texlive-core", + "texlive-fontsrecommended", + "texlive-langchinese", + "texlive-langcjk", + "texlive-latexextra", + "ttf-dejavu", + ] + + if self.pdf: + self.check_pacman_missing(archlinux_tex_pkgs, + DepManager.PDF_MANDATORY) + + self.check_missing_file(["/usr/share/fonts/noto-cjk/NotoSansCJK-Regular.ttc"], + "noto-fonts-cjk", + DepManager.PDF_MANDATORY) + + + return self.get_install_progs(progs, "pacman -S") + + def give_gentoo_hints(self): + """ + Provide package installation hints for Gentoo. 
+ """ + texlive_deps = [ + "dev-texlive/texlive-fontsrecommended", + "dev-texlive/texlive-latexextra", + "dev-texlive/texlive-xetex", + "media-fonts/dejavu", + ] + + progs = { + "convert": "media-gfx/imagemagick", + "dot": "media-gfx/graphviz", + "rsvg-convert": "gnome-base/librsvg", + "virtualenv": "dev-python/virtualenv", + "xelatex": " ".join(texlive_deps), + "yaml": "dev-python/pyyaml", + "python-sphinx": "dev-python/sphinx", + } + + if self.pdf: + pdf_pkgs = { + "media-fonts/dejavu": [ + "/usr/share/fonts/dejavu/DejaVuSans.ttf", + ], + "media-fonts/noto-cjk": [ + "/usr/share/fonts/noto-cjk/NotoSansCJKsc-Regular.otf", + "/usr/share/fonts/noto-cjk/NotoSerifCJK-Regular.ttc", + ], + } + for package, files in pdf_pkgs.items(): + self.check_missing_file(files, package, DepManager.PDF_MANDATORY) + + # Handling dependencies is a nightmare, as Gentoo refuses to emerge + # some packages if there's no package.use file describing them. + # To make it worse, compilation flags shall also be present there + # for some packages. If USE is not perfect, error/warning messages + # like those are shown: + # + # !!! The following binary packages have been ignored due to non matching USE: + # + # =media-gfx/graphviz-12.2.1-r1 X pdf -python_single_target_python3_13 qt6 svg + # =media-gfx/graphviz-12.2.1-r1 X pdf python_single_target_python3_12 -python_single_target_python3_13 qt6 svg + # =media-gfx/graphviz-12.2.1-r1 X pdf qt6 svg + # =media-gfx/graphviz-12.2.1-r1 X pdf -python_single_target_python3_10 qt6 svg + # =media-gfx/graphviz-12.2.1-r1 X pdf -python_single_target_python3_10 python_single_target_python3_12 -python_single_target_python3_13 qt6 svg + # =media-fonts/noto-cjk-20190416 X + # =app-text/texlive-core-2024-r1 X cjk -xetex + # =app-text/texlive-core-2024-r1 X -xetex + # =app-text/texlive-core-2024-r1 -xetex + # =dev-libs/zziplib-0.13.79-r1 sdl + # + # And will ignore such packages, installing the remaining ones. 
That + # affects mostly the image extension and PDF generation. + + # Package dependencies and the minimal needed args: + portages = { + "graphviz": "media-gfx/graphviz", + "imagemagick": "media-gfx/imagemagick", + "media-libs": "media-libs/harfbuzz icu", + "media-fonts": "media-fonts/noto-cjk", + "texlive": "app-text/texlive-core xetex", + "zziblib": "dev-libs/zziplib sdl", + } + + extra_cmds = "" + if not self.distro_msg: + self.distro_msg = "Note: Gentoo requires package.use to be adjusted before emerging packages" + + use_base = "/etc/portage/package.use" + files = glob(f"{use_base}/*") + + for fname, portage in portages.items(): + install = False + + while install is False: + if not files: + # No files under package.usage. Install all + install = True + break + + args = portage.split(" ") + + name = args.pop(0) + + cmd = ["grep", "-l", "-E", rf"^{name}\b" ] + files + result = self.run(cmd, stdout=subprocess.PIPE, text=True) + if result.returncode or not result.stdout.strip(): + # File containing portage name not found + install = True + break + + # Ensure that needed USE flags are present + if args: + match_fname = result.stdout.strip() + with open(match_fname, 'r', encoding='utf8', + errors='backslashreplace') as fp: + for line in fp: + for arg in args: + if arg.startswith("-"): + continue + + if not re.search(rf"\s*{arg}\b", line): + # Needed file argument not found + install = True + break + + # Everything looks ok, don't install + break + + # emit a code to setup missing USE + if install: + extra_cmds += (f"sudo su -c 'echo \"{portage}\" > {use_base}/{fname}'\n") + + # Now, we can use emerge and let it respect USE + return self.get_install_progs(progs, + "emerge --ask --changed-use --binpkg-respect-use=y", + extra_cmds) + + def get_install(self): + """ + OS-specific hints logic. Seeks for a hinter. If found, use it to + provide package-manager specific install commands. + + Otherwise, outputs install instructions for the meta-packages. 
+ + Returns a string with the command to be executed to install the + the needed packages, if distro found. Otherwise, return just a + list of packages that require installation. + """ + os_hints = { + re.compile("Red Hat Enterprise Linux"): self.give_redhat_hints, + re.compile("Fedora"): self.give_redhat_hints, + re.compile("AlmaLinux"): self.give_redhat_hints, + re.compile("Amazon Linux"): self.give_redhat_hints, + re.compile("CentOS"): self.give_redhat_hints, + re.compile("openEuler"): self.give_redhat_hints, + re.compile("Oracle Linux Server"): self.give_redhat_hints, + re.compile("Rocky Linux"): self.give_redhat_hints, + re.compile("Springdale Open Enterprise"): self.give_redhat_hints, + + re.compile("Ubuntu"): self.give_debian_hints, + re.compile("Debian"): self.give_debian_hints, + re.compile("Devuan"): self.give_debian_hints, + re.compile("Kali"): self.give_debian_hints, + re.compile("Mint"): self.give_debian_hints, + + re.compile("openSUSE"): self.give_opensuse_hints, + + re.compile("Mageia"): self.give_mageia_hints, + re.compile("OpenMandriva"): self.give_mageia_hints, + + re.compile("Arch Linux"): self.give_arch_linux_hints, + re.compile("Gentoo"): self.give_gentoo_hints, + } + + # If the OS is detected, use per-OS hint logic + for regex, os_hint in os_hints.items(): + if regex.search(self.system_release): + return os_hint() + + # + # Fall-back to generic hint code for other distros + # That's far from ideal, specially for LaTeX dependencies. + # + progs = {"sphinx-build": "sphinx"} + if self.pdf: + self.check_missing_tex() + + self.distro_msg = \ + f"I don't know distro {self.system_release}.\n" \ + "So, I can't provide you a hint with the install procedure.\n" \ + "There are likely missing dependencies." + + return self.get_install_progs(progs, None) + + # + # Common dependencies + # + def deactivate_help(self): + """ + Print a helper message to disable a virtual environment. 
+ """ + + print("\n If you want to exit the virtualenv, you can use:") + print("\tdeactivate") + + def get_virtenv(self): + """ + Give a hint about how to activate an already-existing virtual + environment containing sphinx-build. + + Returns a tuble with (activate_cmd_path, sphinx_version) with + the newest available virtual env. + """ + + cwd = os.getcwd() + + activates = [] + + # Add all sphinx prefixes with possible version numbers + for p in self.virtenv_prefix: + activates += glob(f"{cwd}/{p}[0-9]*/bin/activate") + + activates.sort(reverse=True, key=str.lower) + + # Place sphinx_latest first, if it exists + for p in self.virtenv_prefix: + activates = glob(f"{cwd}/{p}*latest/bin/activate") + activates + + ver = (0, 0, 0) + for f in activates: + # Discard too old Sphinx virtual environments + match = re.search(r"(\d+)\.(\d+)\.(\d+)", f) + if match: + ver = (int(match.group(1)), int(match.group(2)), int(match.group(3))) + + if ver < self.min_version: + continue + + sphinx_cmd = f.replace("activate", "sphinx-build") + if not os.path.isfile(sphinx_cmd): + continue + + ver = self.get_sphinx_version(sphinx_cmd) + + if not ver: + venv_dir = f.replace("/bin/activate", "") + print(f"Warning: virtual environment {venv_dir} is not working.\n" \ + "Python version upgrade? Remove it with:\n\n" \ + "\trm -rf {venv_dir}\n\n") + else: + if self.need_sphinx and ver >= self.min_version: + return (f, ver) + elif PythonVersion.parse_version(ver) > self.cur_version: + return (f, ver) + + return ("", ver) + + def recommend_sphinx_upgrade(self): + """ + Check if Sphinx needs to be upgraded. + + Returns a tuple with the higest available Sphinx version if found. + Otherwise, returns None to indicate either that no upgrade is needed + or no venv was found. 
+ """ + + # Avoid running sphinx-builds from venv if cur_version is good + if self.cur_version and self.cur_version >= RECOMMENDED_VERSION: + self.latest_avail_ver = self.cur_version + return None + + # Get the highest version from sphinx_*/bin/sphinx-build and the + # corresponding command to activate the venv/virtenv + self.activate_cmd, self.venv_ver = self.get_virtenv() + + # Store the highest version from Sphinx existing virtualenvs + if self.activate_cmd and self.venv_ver > self.cur_version: + self.latest_avail_ver = self.venv_ver + else: + if self.cur_version: + self.latest_avail_ver = self.cur_version + else: + self.latest_avail_ver = (0, 0, 0) + + # As we don't know package version of Sphinx, and there's no + # virtual environments, don't check if upgrades are needed + if not self.virtualenv: + if not self.latest_avail_ver: + return None + + return self.latest_avail_ver + + # Either there are already a virtual env or a new one should be created + self.need_pip = True + + if not self.latest_avail_ver: + return None + + # Return if the reason is due to an upgrade or not + if self.latest_avail_ver != (0, 0, 0): + if self.latest_avail_ver < RECOMMENDED_VERSION: + self.rec_sphinx_upgrade = 1 + + return self.latest_avail_ver + + def recommend_package(self): + """ + Recommend installing Sphinx as a distro-specific package. + """ + + print("\n2) As a package with:") + + old_need = self.deps.need + old_optional = self.deps.optional + + self.pdf = False + self.deps.optional = 0 + old_verbose = self.verbose_warn_install + self.verbose_warn_install = 0 + + self.deps.clear_deps() + + self.deps.add_package("python-sphinx", DepManager.PYTHON_MANDATORY) + + cmd = self.get_install() + if cmd: + print(cmd) + + self.deps.need = old_need + self.deps.optional = old_optional + self.verbose_warn_install = old_verbose + + def recommend_sphinx_version(self, virtualenv_cmd): + """ + Provide recommendations for installing or upgrading Sphinx based + on current version. 
+ + The logic here is complex, as it have to deal with different versions: + + - minimal supported version; + - minimal PDF version; + - recommended version. + + It also needs to work fine with both distro's package and + venv/virtualenv + """ + + if self.recommend_python: + cur_ver = sys.version_info[:3] + if cur_ver < MIN_PYTHON_VERSION: + print(f"\nPython version {cur_ver} is incompatible with doc build.\n" \ + "Please upgrade it and re-run.\n") + return + + # Version is OK. Nothing to do. + if self.cur_version != (0, 0, 0) and self.cur_version >= RECOMMENDED_VERSION: + return + + if self.latest_avail_ver: + latest_avail_ver = PythonVersion.ver_str(self.latest_avail_ver) + + if not self.need_sphinx: + # sphinx-build is present and its version is >= $min_version + + # only recommend enabling a newer virtenv version if makes sense. + if self.latest_avail_ver and self.latest_avail_ver > self.cur_version: + print(f"\nYou may also use the newer Sphinx version {latest_avail_ver} with:") + if f"{self.virtenv_prefix}" in os.getcwd(): + print("\tdeactivate") + print(f"\t. {self.activate_cmd}") + self.deactivate_help() + return + + if self.latest_avail_ver and self.latest_avail_ver >= RECOMMENDED_VERSION: + return + + if not self.virtualenv: + # No sphinx either via package or via virtenv. As we can't + # Compare the versions here, just return, recommending the + # user to install it from the package distro. + if not self.latest_avail_ver or self.latest_avail_ver == (0, 0, 0): + return + + # User doesn't want a virtenv recommendation, but he already + # installed one via virtenv with a newer version. + # So, print commands to enable it + if self.latest_avail_ver > self.cur_version: + print(f"\nYou may also use the Sphinx virtualenv version {latest_avail_ver} with:") + if f"{self.virtenv_prefix}" in os.getcwd(): + print("\tdeactivate") + print(f"\t. 
{self.activate_cmd}") + self.deactivate_help() + return + print("\n") + else: + if self.need_sphinx: + self.deps.need += 1 + + # Suggest newer versions if current ones are too old + if self.latest_avail_ver and self.latest_avail_ver >= self.min_version: + if self.latest_avail_ver >= RECOMMENDED_VERSION: + print(f"\nNeed to activate Sphinx (version {latest_avail_ver}) on virtualenv with:") + print(f"\t. {self.activate_cmd}") + self.deactivate_help() + return + + # Version is above the minimal required one, but may be + # below the recommended one. So, print warnings/notes + if self.latest_avail_ver < RECOMMENDED_VERSION: + print(f"Warning: It is recommended at least Sphinx version {RECOMMENDED_VERSION}.") + + # At this point, either it needs Sphinx or upgrade is recommended, + # both via pip + + if self.rec_sphinx_upgrade: + if not self.virtualenv: + print("Instead of install/upgrade Python Sphinx pkg, you could use pip/pypi with:\n\n") + else: + print("To upgrade Sphinx, use:\n\n") + else: + print("\nSphinx needs to be installed either:\n1) via pip/pypi with:\n") + + if not virtualenv_cmd: + print(" Currently not possible.\n") + print(" Please upgrade Python to a newer version and run this script again") + else: + print(f"\t{virtualenv_cmd} {self.virtenv_dir}") + print(f"\t. {self.virtenv_dir}/bin/activate") + print(f"\tpip install -r {self.requirement_file}") + self.deactivate_help() + + if self.package_supported: + self.recommend_package() + + print("\n" \ + " Please note that Sphinx currentlys produce false-positive\n" \ + " warnings when the same name is used for more than one type (functions,\n" \ + " structs, enums,...). This is known Sphinx bug. For more details, see:\n" \ + "\thttps://github.com/sphinx-doc/sphinx/pull/8313") + + def check_needs(self): + """ + Main method that checks needed dependencies and provides + recommendations. 
+ """ + self.python_cmd = sys.executable + + # Check if Sphinx is already accessible from current environment + self.check_sphinx(self.conf) + + if self.system_release: + print(f"Detected OS: {self.system_release}.") + else: + print("Unknown OS") + if self.cur_version != (0, 0, 0): + ver = PythonVersion.ver_str(self.cur_version) + print(f"Sphinx version: {ver}\n") + + # Check the type of virtual env, depending on Python version + virtualenv_cmd = None + + if sys.version_info < MIN_PYTHON_VERSION: + min_ver = ver_str(MIN_PYTHON_VERSION) + print(f"ERROR: at least python {min_ver} is required to build the kernel docs") + self.need_sphinx = 1 + + self.venv_ver = self.recommend_sphinx_upgrade() + + if self.need_pip: + if sys.version_info < MIN_PYTHON_VERSION: + self.need_pip = False + print("Warning: python version is not supported.") + else: + virtualenv_cmd = f"{self.python_cmd} -m venv" + self.check_python_module("ensurepip") + + # Check for needed programs/tools + self.check_perl_module("Pod::Usage", DepManager.SYSTEM_MANDATORY) + + self.check_program("make", DepManager.SYSTEM_MANDATORY) + self.check_program("which", DepManager.SYSTEM_MANDATORY) + + self.check_program("dot", DepManager.SYSTEM_OPTIONAL) + self.check_program("convert", DepManager.SYSTEM_OPTIONAL) + + self.check_python_module("yaml") + + if self.pdf: + self.check_program("xelatex", DepManager.PDF_MANDATORY) + self.check_program("rsvg-convert", DepManager.PDF_MANDATORY) + self.check_program("latexmk", DepManager.PDF_MANDATORY) + + # Do distro-specific checks and output distro-install commands + cmd = self.get_install() + if cmd: + print(cmd) + + # If distro requires some special instructions, print here. + # Please notice that get_install() needs to be called first. 
+ if self.distro_msg: + print("\n" + self.distro_msg) + + if not self.python_cmd: + if self.need == 1: + sys.exit("Can't build as 1 mandatory dependency is missing") + elif self.need: + sys.exit(f"Can't build as {self.need} mandatory dependencies are missing") + + # Check if sphinx-build is called sphinx-build-3 + if self.need_symlink: + sphinx_path = self.which("sphinx-build-3") + if sphinx_path: + print(f"\tsudo ln -sf {sphinx_path} /usr/bin/sphinx-build\n") + + self.recommend_sphinx_version(virtualenv_cmd) + print("") + + if not self.deps.optional: + print("All optional dependencies are met.") + + if self.deps.need == 1: + sys.exit("Can't build as 1 mandatory dependency is missing") + elif self.deps.need: + sys.exit(f"Can't build as {self.deps.need} mandatory dependencies are missing") + + print("Needed package dependencies are met.") + +DESCRIPTION = """ +Process some flags related to Sphinx installation and documentation build. +""" + + +def main(): + """Main function""" + parser = argparse.ArgumentParser(description=DESCRIPTION) + + parser.add_argument( + "--no-virtualenv", + action="store_false", + dest="virtualenv", + help="Recommend installing Sphinx instead of using a virtualenv", + ) + + parser.add_argument( + "--no-pdf", + action="store_false", + dest="pdf", + help="Don't check for dependencies required to build PDF docs", + ) + + parser.add_argument( + "--version-check", + action="store_true", + dest="version_check", + help="If version is compatible, don't check for missing dependencies", + ) + + args = parser.parse_args() + + checker = SphinxDependencyChecker(args) + + PythonVersion.check_python(MIN_PYTHON_VERSION, + bail_out=True, success_on_error=True) + checker.check_needs() + +# Call main if not used as module +if __name__ == "__main__": + main() diff --git a/tools/docs/test_doc_build.py b/tools/docs/test_doc_build.py new file mode 100755 index 000000000000..47b4606569f9 --- /dev/null +++ b/tools/docs/test_doc_build.py @@ -0,0 +1,513 @@ 
+#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab+huawei@kernel.org> +# +# pylint: disable=R0903,R0912,R0913,R0914,R0917,C0301 + +""" +Install minimal supported requirements for different Sphinx versions +and optionally test the build. +""" + +import argparse +import asyncio +import os.path +import shutil +import sys +import time +import subprocess + +# Minimal python version supported by the building system. + +PYTHON = os.path.basename(sys.executable) + +min_python_bin = None + +for i in range(9, 13): + p = f"python3.{i}" + if shutil.which(p): + min_python_bin = p + break + +if not min_python_bin: + min_python_bin = PYTHON + +# Starting from 8.0, Python 3.9 is not supported anymore. +PYTHON_VER_CHANGES = {(8, 0, 0): PYTHON} + +DEFAULT_VERSIONS_TO_TEST = [ + (3, 4, 3), # Minimal supported version + (5, 3, 0), # CentOS Stream 9 / AlmaLinux 9 + (6, 1, 1), # Debian 12 + (7, 2, 1), # openSUSE Leap 15.6 + (7, 2, 6), # Ubuntu 24.04 LTS + (7, 4, 7), # Ubuntu 24.10 + (7, 3, 0), # openSUSE Tumbleweed + (8, 1, 3), # Fedora 42 + (8, 2, 3) # Latest version - covers rolling distros +] + +# Sphinx versions to be installed and their incremental requirements +SPHINX_REQUIREMENTS = { + # Oldest versions we support for each package required by Sphinx 3.4.3 + (3, 4, 3): { + "docutils": "0.16", + "alabaster": "0.7.12", + "babel": "2.8.0", + "certifi": "2020.6.20", + "docutils": "0.16", + "idna": "2.10", + "imagesize": "1.2.0", + "Jinja2": "2.11.2", + "MarkupSafe": "1.1.1", + "packaging": "20.4", + "Pygments": "2.6.1", + "PyYAML": "5.1", + "requests": "2.24.0", + "snowballstemmer": "2.0.0", + "sphinxcontrib-applehelp": "1.0.2", + "sphinxcontrib-devhelp": "1.0.2", + "sphinxcontrib-htmlhelp": "1.0.3", + "sphinxcontrib-jsmath": "1.0.1", + "sphinxcontrib-qthelp": "1.0.3", + "sphinxcontrib-serializinghtml": "1.1.4", + "urllib3": "1.25.9", + }, + + # Update package dependencies to a more modern base. 
The goal here + # is to avoid to many incremental changes for the next entries + (3, 5, 0): { + "alabaster": "0.7.13", + "babel": "2.17.0", + "certifi": "2025.6.15", + "idna": "3.10", + "imagesize": "1.4.1", + "packaging": "25.0", + "Pygments": "2.8.1", + "requests": "2.32.4", + "snowballstemmer": "3.0.1", + "sphinxcontrib-applehelp": "1.0.4", + "sphinxcontrib-htmlhelp": "2.0.1", + "sphinxcontrib-serializinghtml": "1.1.5", + "urllib3": "2.0.0", + }, + + # Starting from here, ensure all docutils versions are covered with + # supported Sphinx versions. Other packages are upgraded only when + # required by pip + (4, 0, 0): { + "PyYAML": "5.1", + }, + (4, 1, 0): { + "docutils": "0.17", + "Pygments": "2.19.1", + "Jinja2": "3.0.3", + "MarkupSafe": "2.0", + }, + (4, 3, 0): {}, + (4, 4, 0): {}, + (4, 5, 0): { + "docutils": "0.17.1", + }, + (5, 0, 0): {}, + (5, 1, 0): {}, + (5, 2, 0): { + "docutils": "0.18", + "Jinja2": "3.1.2", + "MarkupSafe": "2.0", + "PyYAML": "5.3.1", + }, + (5, 3, 0): { + "docutils": "0.18.1", + }, + (6, 0, 0): {}, + (6, 1, 0): {}, + (6, 2, 0): { + "PyYAML": "5.4.1", + }, + (7, 0, 0): {}, + (7, 1, 0): {}, + (7, 2, 0): { + "docutils": "0.19", + "PyYAML": "6.0.1", + "sphinxcontrib-serializinghtml": "1.1.9", + }, + (7, 2, 6): { + "docutils": "0.20", + }, + (7, 3, 0): { + "alabaster": "0.7.14", + "PyYAML": "6.0.1", + "tomli": "2.0.1", + }, + (7, 4, 0): { + "docutils": "0.20.1", + "PyYAML": "6.0.1", + }, + (8, 0, 0): { + "docutils": "0.21", + }, + (8, 1, 0): { + "docutils": "0.21.1", + "PyYAML": "6.0.1", + "sphinxcontrib-applehelp": "1.0.7", + "sphinxcontrib-devhelp": "1.0.6", + "sphinxcontrib-htmlhelp": "2.0.6", + "sphinxcontrib-qthelp": "1.0.6", + }, + (8, 2, 0): { + "docutils": "0.21.2", + "PyYAML": "6.0.1", + "sphinxcontrib-serializinghtml": "1.1.9", + }, +} + + +class AsyncCommands: + """Excecute command synchronously""" + + def __init__(self, fp=None): + + self.stdout = None + self.stderr = None + self.output = None + self.fp = fp + + def log(self, 
out, verbose, is_info=True): + out = out.removesuffix('\n') + + if verbose: + if is_info: + print(out) + else: + print(out, file=sys.stderr) + + if self.fp: + self.fp.write(out + "\n") + + async def _read(self, stream, verbose, is_info): + """Ancillary routine to capture while displaying""" + + while stream is not None: + line = await stream.readline() + if line: + out = line.decode("utf-8", errors="backslashreplace") + self.log(out, verbose, is_info) + if is_info: + self.stdout += out + else: + self.stderr += out + else: + break + + async def run(self, cmd, capture_output=False, check=False, + env=None, verbose=True): + + """ + Execute an arbitrary command, handling errors. + + Please notice that this class is not thread safe + """ + + self.stdout = "" + self.stderr = "" + + self.log("$ " + " ".join(cmd), verbose) + + proc = await asyncio.create_subprocess_exec(cmd[0], + *cmd[1:], + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE) + + # Handle input and output in realtime + await asyncio.gather( + self._read(proc.stdout, verbose, True), + self._read(proc.stderr, verbose, False), + ) + + await proc.wait() + + if check and proc.returncode > 0: + raise subprocess.CalledProcessError(returncode=proc.returncode, + cmd=" ".join(cmd), + output=self.stdout, + stderr=self.stderr) + + if capture_output: + if proc.returncode > 0: + self.log(f"Error {proc.returncode}", verbose=True, is_info=False) + return "" + + return self.output + + ret = subprocess.CompletedProcess(args=cmd, + returncode=proc.returncode, + stdout=self.stdout, + stderr=self.stderr) + + return ret + + +class SphinxVenv: + """ + Installs Sphinx on one virtual env per Sphinx version with a minimal + set of dependencies, adjusting them to each specific version. 
+ """ + + def __init__(self): + """Initialize instance variables""" + + self.built_time = {} + self.first_run = True + + async def _handle_version(self, args, fp, + cur_ver, cur_requirements, python_bin): + """Handle a single Sphinx version""" + + cmd = AsyncCommands(fp) + + ver = ".".join(map(str, cur_ver)) + + if not self.first_run and args.wait_input and args.build: + ret = input("Press Enter to continue or 'a' to abort: ").strip().lower() + if ret == "a": + print("Aborted.") + sys.exit() + else: + self.first_run = False + + venv_dir = f"Sphinx_{ver}" + req_file = f"requirements_{ver}.txt" + + cmd.log(f"\nSphinx {ver} with {python_bin}", verbose=True) + + # Create venv + await cmd.run([python_bin, "-m", "venv", venv_dir], + verbose=args.verbose, check=True) + pip = os.path.join(venv_dir, "bin/pip") + + # Create install list + reqs = [] + for pkg, verstr in cur_requirements.items(): + reqs.append(f"{pkg}=={verstr}") + + reqs.append(f"Sphinx=={ver}") + + await cmd.run([pip, "install"] + reqs, check=True, verbose=args.verbose) + + # Freeze environment + result = await cmd.run([pip, "freeze"], verbose=False, check=True) + + # Pip install succeeded. Write requirements file + if args.req_file: + with open(req_file, "w", encoding="utf-8") as fp: + fp.write(result.stdout) + + if args.build: + start_time = time.time() + + # Prepare a venv environment + env = os.environ.copy() + bin_dir = os.path.join(venv_dir, "bin") + env["PATH"] = bin_dir + ":" + env["PATH"] + env["VIRTUAL_ENV"] = venv_dir + if "PYTHONHOME" in env: + del env["PYTHONHOME"] + + # Test doc build + await cmd.run(["make", "cleandocs"], env=env, check=True) + make = ["make"] + + if args.output: + sphinx_build = os.path.realpath(f"{bin_dir}/sphinx-build") + make += [f"O={args.output}", f"SPHINXBUILD={sphinx_build}"] + + if args.make_args: + make += args.make_args + + make += args.targets + + if args.verbose: + cmd.log(f". 
{bin_dir}/activate", verbose=True) + await cmd.run(make, env=env, check=True, verbose=True) + if args.verbose: + cmd.log("deactivate", verbose=True) + + end_time = time.time() + elapsed_time = end_time - start_time + hours, minutes = divmod(elapsed_time, 3600) + minutes, seconds = divmod(minutes, 60) + + hours = int(hours) + minutes = int(minutes) + seconds = int(seconds) + + self.built_time[ver] = f"{hours:02d}:{minutes:02d}:{seconds:02d}" + + cmd.log(f"Finished doc build for Sphinx {ver}. Elapsed time: {self.built_time[ver]}", verbose=True) + + async def run(self, args): + """ + Navigate though multiple Sphinx versions, handling each of them + on a loop. + """ + + if args.log: + fp = open(args.log, "w", encoding="utf-8") + if not args.verbose: + args.verbose = False + else: + fp = None + if not args.verbose: + args.verbose = True + + cur_requirements = {} + python_bin = min_python_bin + + vers = set(SPHINX_REQUIREMENTS.keys()) | set(args.versions) + + for cur_ver in sorted(vers): + if cur_ver in SPHINX_REQUIREMENTS: + new_reqs = SPHINX_REQUIREMENTS[cur_ver] + cur_requirements.update(new_reqs) + + if cur_ver in PYTHON_VER_CHANGES: # pylint: disable=R1715 + python_bin = PYTHON_VER_CHANGES[cur_ver] + + if cur_ver not in args.versions: + continue + + if args.min_version: + if cur_ver < args.min_version: + continue + + if args.max_version: + if cur_ver > args.max_version: + break + + await self._handle_version(args, fp, cur_ver, cur_requirements, + python_bin) + + if args.build: + cmd = AsyncCommands(fp) + cmd.log("\nSummary:", verbose=True) + for ver, elapsed_time in sorted(self.built_time.items()): + cmd.log(f"\tSphinx {ver} elapsed time: {elapsed_time}", + verbose=True) + + if fp: + fp.close() + +def parse_version(ver_str): + """Convert a version string into a tuple.""" + + return tuple(map(int, ver_str.split("."))) + + +DEFAULT_VERS = " - " +DEFAULT_VERS += "\n - ".join(map(lambda v: f"{v[0]}.{v[1]}.{v[2]}", + DEFAULT_VERSIONS_TO_TEST)) + +SCRIPT = 
os.path.relpath(__file__) + +DESCRIPTION = f""" +This tool allows creating Python virtual environments for different +Sphinx versions that are supported by the Linux Kernel build system. + +Besides creating the virtual environment, it can also test building +the documentation using "make htmldocs" (and/or other doc targets). + +If called without "--versions" argument, it covers the versions shipped +on major distros, plus the lowest supported version: + +{DEFAULT_VERS} + +A typical usage is to run: + + {SCRIPT} -m -l sphinx_builds.log + +This will create one virtual env for the default version set and run +"make htmldocs" for each version, creating a log file with the +excecuted commands on it. + +NOTE: The build time can be very long, specially on old versions. Also, there +is a known bug with Sphinx version 6.0.x: each subprocess uses a lot of +memory. That, together with "-jauto" may cause OOM killer to cause +failures at the doc generation. To minimize the risk, you may use the +"-a" command line parameter to constrain the built directories and/or +reduce the number of threads from "-jauto" to, for instance, "-j4": + + {SCRIPT} -m -V 6.0.1 -a "SPHINXDIRS=process" "SPHINXOPTS='-j4'" + +""" + +MAKE_TARGETS = [ + "htmldocs", + "texinfodocs", + "infodocs", + "latexdocs", + "pdfdocs", + "epubdocs", + "xmldocs", +] + +async def main(): + """Main program""" + + parser = argparse.ArgumentParser(description=DESCRIPTION, + formatter_class=argparse.RawDescriptionHelpFormatter) + + ver_group = parser.add_argument_group("Version range options") + + ver_group.add_argument('-V', '--versions', nargs="*", + default=DEFAULT_VERSIONS_TO_TEST,type=parse_version, + help='Sphinx versions to test') + ver_group.add_argument('--min-version', "--min", type=parse_version, + help='Sphinx minimal version') + ver_group.add_argument('--max-version', "--max", type=parse_version, + help='Sphinx maximum version') + ver_group.add_argument('-f', '--full', action='store_true', + help='Add all 
Sphinx (major,minor) supported versions to the version range') + + build_group = parser.add_argument_group("Build options") + + build_group.add_argument('-b', '--build', action='store_true', + help='Build documentation') + build_group.add_argument('-a', '--make-args', nargs="*", + help='extra arguments for make, like SPHINXDIRS=netlink/specs', + ) + build_group.add_argument('-t', '--targets', nargs="+", choices=MAKE_TARGETS, + default=[MAKE_TARGETS[0]], + help="make build targets. Default: htmldocs.") + build_group.add_argument("-o", '--output', + help="output directory for the make O=OUTPUT") + + other_group = parser.add_argument_group("Other options") + + other_group.add_argument('-r', '--req-file', action='store_true', + help='write a requirements.txt file') + other_group.add_argument('-l', '--log', + help='Log command output on a file') + other_group.add_argument('-v', '--verbose', action='store_true', + help='Verbose all commands') + other_group.add_argument('-i', '--wait-input', action='store_true', + help='Wait for an enter before going to the next version') + + args = parser.parse_args() + + if not args.make_args: + args.make_args = [] + + sphinx_versions = sorted(list(SPHINX_REQUIREMENTS.keys())) + + if args.full: + args.versions += list(SPHINX_REQUIREMENTS.keys()) + + venv = SphinxVenv() + await venv.run(args) + + +# Call main method +if __name__ == "__main__": + asyncio.run(main()) diff --git a/tools/include/linux/interval_tree_generic.h b/tools/include/linux/interval_tree_generic.h index aaa8a0767aa3..c5a2fed49eb0 100644 --- a/tools/include/linux/interval_tree_generic.h +++ b/tools/include/linux/interval_tree_generic.h @@ -77,7 +77,7 @@ ITSTATIC void ITPREFIX ## _remove(ITSTRUCT *node, \ * Cond2: start <= ITLAST(node) \ */ \ \ -static ITSTRUCT * \ +ITSTATIC ITSTRUCT * \ ITPREFIX ## _subtree_search(ITSTRUCT *node, ITTYPE start, ITTYPE last) \ { \ while (true) { \ @@ -104,12 +104,8 @@ ITPREFIX ## _subtree_search(ITSTRUCT *node, ITTYPE start, ITTYPE last) 
\ if (ITSTART(node) <= last) { /* Cond1 */ \ if (start <= ITLAST(node)) /* Cond2 */ \ return node; /* node is leftmost match */ \ - if (node->ITRB.rb_right) { \ - node = rb_entry(node->ITRB.rb_right, \ - ITSTRUCT, ITRB); \ - if (start <= node->ITSUBTREE) \ - continue; \ - } \ + node = rb_entry(node->ITRB.rb_right, ITSTRUCT, ITRB); \ + continue; \ } \ return NULL; /* No match */ \ } \ diff --git a/tools/include/linux/livepatch_external.h b/tools/include/linux/livepatch_external.h new file mode 100644 index 000000000000..138af19b0f5c --- /dev/null +++ b/tools/include/linux/livepatch_external.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * External livepatch interfaces for patch creation tooling + */ + +#ifndef _LINUX_LIVEPATCH_EXTERNAL_H_ +#define _LINUX_LIVEPATCH_EXTERNAL_H_ + +#include <linux/types.h> + +#define KLP_RELOC_SEC_PREFIX ".klp.rela." +#define KLP_SYM_PREFIX ".klp.sym." + +#define __KLP_PRE_PATCH_PREFIX __klp_pre_patch_callback_ +#define __KLP_POST_PATCH_PREFIX __klp_post_patch_callback_ +#define __KLP_PRE_UNPATCH_PREFIX __klp_pre_unpatch_callback_ +#define __KLP_POST_UNPATCH_PREFIX __klp_post_unpatch_callback_ + +#define KLP_PRE_PATCH_PREFIX __stringify(__KLP_PRE_PATCH_PREFIX) +#define KLP_POST_PATCH_PREFIX __stringify(__KLP_POST_PATCH_PREFIX) +#define KLP_PRE_UNPATCH_PREFIX __stringify(__KLP_PRE_UNPATCH_PREFIX) +#define KLP_POST_UNPATCH_PREFIX __stringify(__KLP_POST_UNPATCH_PREFIX) + +struct klp_object; + +typedef int (*klp_pre_patch_t)(struct klp_object *obj); +typedef void (*klp_post_patch_t)(struct klp_object *obj); +typedef void (*klp_pre_unpatch_t)(struct klp_object *obj); +typedef void (*klp_post_unpatch_t)(struct klp_object *obj); + +/** + * struct klp_callbacks - pre/post live-(un)patch callback structure + * @pre_patch: executed before code patching + * @post_patch: executed after code patching + * @pre_unpatch: executed before code unpatching + * @post_unpatch: executed after code unpatching + * @post_unpatch_enabled: flag 
indicating if post-unpatch callback + * should run + * + * All callbacks are optional. Only the pre-patch callback, if provided, + * will be unconditionally executed. If the parent klp_object fails to + * patch for any reason, including a non-zero error status returned from + * the pre-patch callback, no further callbacks will be executed. + */ +struct klp_callbacks { + klp_pre_patch_t pre_patch; + klp_post_patch_t post_patch; + klp_pre_unpatch_t pre_unpatch; + klp_post_unpatch_t post_unpatch; + bool post_unpatch_enabled; +}; + +/* + * 'struct klp_{func,object}_ext' are compact "external" representations of + * 'struct klp_{func,object}'. They are used by objtool for livepatch + * generation. The structs are then read by the livepatch module and converted + * to the real structs before calling klp_enable_patch(). + * + * TODO make these the official API for klp_enable_patch(). That should + * simplify livepatch's interface as well as its data structure lifetime + * management. + */ +struct klp_func_ext { + const char *old_name; + void *new_func; + unsigned long sympos; +}; + +struct klp_object_ext { + const char *name; + struct klp_func_ext *funcs; + struct klp_callbacks callbacks; + unsigned int nr_funcs; +}; + +#endif /* _LINUX_LIVEPATCH_EXTERNAL_H_ */ diff --git a/tools/include/linux/objtool_types.h b/tools/include/linux/objtool_types.h index aceac94632c8..c6def4049b1a 100644 --- a/tools/include/linux/objtool_types.h +++ b/tools/include/linux/objtool_types.h @@ -67,4 +67,6 @@ struct unwind_hint { #define ANNOTYPE_REACHABLE 8 #define ANNOTYPE_NOCFI 9 +#define ANNOTYPE_DATA_SPECIAL 1 + #endif /* _LINUX_OBJTOOL_TYPES_H */ diff --git a/tools/include/linux/string.h b/tools/include/linux/string.h index 8499f509f03e..51ad3cf4fa82 100644 --- a/tools/include/linux/string.h +++ b/tools/include/linux/string.h @@ -44,6 +44,20 @@ static inline bool strstarts(const char *str, const char *prefix) return strncmp(str, prefix, strlen(prefix)) == 0; } +/* + * Checks if a string 
ends with another. + */ +static inline bool str_ends_with(const char *str, const char *substr) +{ + size_t len = strlen(str); + size_t sublen = strlen(substr); + + if (sublen > len) + return false; + + return !strcmp(str + len - sublen, substr); +} + extern char * __must_check skip_spaces(const char *); extern char *strim(char *); diff --git a/tools/include/nolibc/Makefile b/tools/include/nolibc/Makefile index 143c2d2c2ba6..8118e22844f1 100644 --- a/tools/include/nolibc/Makefile +++ b/tools/include/nolibc/Makefile @@ -23,7 +23,7 @@ else Q=@ endif -arch_file := arch-$(ARCH).h +arch_files := arch.h $(wildcard arch-*.h) all_files := \ compiler.h \ crt.h \ @@ -33,6 +33,7 @@ all_files := \ errno.h \ fcntl.h \ getopt.h \ + inttypes.h \ limits.h \ math.h \ nolibc.h \ @@ -56,12 +57,14 @@ all_files := \ sys/random.h \ sys/reboot.h \ sys/resource.h \ + sys/select.h \ sys/stat.h \ sys/syscall.h \ sys/sysmacros.h \ sys/time.h \ sys/timerfd.h \ sys/types.h \ + sys/uio.h \ sys/utsname.h \ sys/wait.h \ time.h \ @@ -79,7 +82,7 @@ help: @echo "Supported targets under nolibc:" @echo " all call \"headers\"" @echo " clean clean the sysroot" - @echo " headers prepare a sysroot in tools/include/nolibc/sysroot" + @echo " headers prepare a multi-arch sysroot in \$${OUTPUT}sysroot" @echo " headers_standalone like \"headers\", and also install kernel headers" @echo " help this help" @echo "" @@ -90,18 +93,11 @@ help: @echo " OUTPUT = $(OUTPUT)" @echo "" +# installs headers for all archs at once. headers: - $(Q)mkdir -p $(OUTPUT)sysroot - $(Q)mkdir -p $(OUTPUT)sysroot/include - $(Q)cp --parents $(all_files) $(OUTPUT)sysroot/include/ - $(Q)if [ "$(ARCH)" = "i386" -o "$(ARCH)" = "x86_64" ]; then \ - cat arch-x86.h; \ - elif [ -e "$(arch_file)" ]; then \ - cat $(arch_file); \ - else \ - echo "Fatal: architecture $(ARCH) not yet supported by nolibc." 
>&2; \ - exit 1; \ - fi > $(OUTPUT)sysroot/include/arch.h + $(Q)mkdir -p "$(OUTPUT)sysroot" + $(Q)mkdir -p "$(OUTPUT)sysroot/include" + $(Q)cp --parents $(arch_files) $(all_files) "$(OUTPUT)sysroot/include/" headers_standalone: headers $(Q)$(MAKE) -C $(srctree) headers diff --git a/tools/include/nolibc/arch-arm.h b/tools/include/nolibc/arch-arm.h index 1f66e7e5a444..251c42579028 100644 --- a/tools/include/nolibc/arch-arm.h +++ b/tools/include/nolibc/arch-arm.h @@ -184,6 +184,7 @@ _arg1; \ }) +#ifndef NOLIBC_NO_RUNTIME /* startup code */ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void) { @@ -193,5 +194,6 @@ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _s ); __nolibc_entrypoint_epilogue(); } +#endif /* NOLIBC_NO_RUNTIME */ #endif /* _NOLIBC_ARCH_ARM_H */ diff --git a/tools/include/nolibc/arch-arm64.h b/tools/include/nolibc/arch-arm64.h index 02a3f74c8ec8..080a55a7144e 100644 --- a/tools/include/nolibc/arch-arm64.h +++ b/tools/include/nolibc/arch-arm64.h @@ -141,6 +141,7 @@ _arg1; \ }) +#ifndef NOLIBC_NO_RUNTIME /* startup code */ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void) { @@ -150,4 +151,5 @@ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _s ); __nolibc_entrypoint_epilogue(); } +#endif /* NOLIBC_NO_RUNTIME */ #endif /* _NOLIBC_ARCH_ARM64_H */ diff --git a/tools/include/nolibc/arch-loongarch.h b/tools/include/nolibc/arch-loongarch.h index 5511705303ea..c894176c3f89 100644 --- a/tools/include/nolibc/arch-loongarch.h +++ b/tools/include/nolibc/arch-loongarch.h @@ -142,6 +142,7 @@ _arg1; \ }) +#ifndef NOLIBC_NO_RUNTIME /* startup code */ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void) { @@ -151,5 +152,6 @@ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _s ); __nolibc_entrypoint_epilogue(); } +#endif /* NOLIBC_NO_RUNTIME */ #endif /* 
_NOLIBC_ARCH_LOONGARCH_H */ diff --git a/tools/include/nolibc/arch-m68k.h b/tools/include/nolibc/arch-m68k.h index 6dac1845f298..2a4fbada5e79 100644 --- a/tools/include/nolibc/arch-m68k.h +++ b/tools/include/nolibc/arch-m68k.h @@ -128,6 +128,7 @@ _num; \ }) +#ifndef NOLIBC_NO_RUNTIME void _start(void); void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void) { @@ -137,5 +138,6 @@ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _s ); __nolibc_entrypoint_epilogue(); } +#endif /* NOLIBC_NO_RUNTIME */ #endif /* _NOLIBC_ARCH_M68K_H */ diff --git a/tools/include/nolibc/arch-mips.h b/tools/include/nolibc/arch-mips.h index 0cbac63b249a..a72506ceec6b 100644 --- a/tools/include/nolibc/arch-mips.h +++ b/tools/include/nolibc/arch-mips.h @@ -245,6 +245,7 @@ #endif /* _ABIO32 */ +#ifndef NOLIBC_NO_RUNTIME /* startup code, note that it's called __start on MIPS */ void __start(void); void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector __start(void) @@ -266,5 +267,6 @@ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector __ ); __nolibc_entrypoint_epilogue(); } +#endif /* NOLIBC_NO_RUNTIME */ #endif /* _NOLIBC_ARCH_MIPS_H */ diff --git a/tools/include/nolibc/arch-powerpc.h b/tools/include/nolibc/arch-powerpc.h index 204564bbcd32..e0c7e0b81f7c 100644 --- a/tools/include/nolibc/arch-powerpc.h +++ b/tools/include/nolibc/arch-powerpc.h @@ -183,6 +183,7 @@ #endif #endif /* !__powerpc64__ */ +#ifndef NOLIBC_NO_RUNTIME /* startup code */ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void) { @@ -215,5 +216,6 @@ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _s #endif __nolibc_entrypoint_epilogue(); } +#endif /* NOLIBC_NO_RUNTIME */ #endif /* _NOLIBC_ARCH_POWERPC_H */ diff --git a/tools/include/nolibc/arch-riscv.h b/tools/include/nolibc/arch-riscv.h index 885383a86c38..1c00cacf57e1 100644 --- 
a/tools/include/nolibc/arch-riscv.h +++ b/tools/include/nolibc/arch-riscv.h @@ -139,6 +139,7 @@ _arg1; \ }) +#ifndef NOLIBC_NO_RUNTIME /* startup code */ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void) { @@ -152,5 +153,6 @@ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _s ); __nolibc_entrypoint_epilogue(); } +#endif /* NOLIBC_NO_RUNTIME */ #endif /* _NOLIBC_ARCH_RISCV_H */ diff --git a/tools/include/nolibc/arch-s390.h b/tools/include/nolibc/arch-s390.h index df4c3cc713ac..74125a254ce3 100644 --- a/tools/include/nolibc/arch-s390.h +++ b/tools/include/nolibc/arch-s390.h @@ -139,22 +139,19 @@ _arg1; \ }) +#ifndef NOLIBC_NO_RUNTIME /* startup code */ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void) { __asm__ volatile ( -#ifdef __s390x__ "lgr %r2, %r15\n" /* save stack pointer to %r2, as arg1 of _start_c */ "aghi %r15, -160\n" /* allocate new stackframe */ -#else - "lr %r2, %r15\n" - "ahi %r15, -96\n" -#endif "xc 0(8,%r15), 0(%r15)\n" /* clear backchain */ "brasl %r14, _start_c\n" /* transfer to c runtime */ ); __nolibc_entrypoint_epilogue(); } +#endif /* NOLIBC_NO_RUNTIME */ struct s390_mmap_arg_struct { unsigned long addr; diff --git a/tools/include/nolibc/arch-sh.h b/tools/include/nolibc/arch-sh.h index a96b8914607e..7a421197d104 100644 --- a/tools/include/nolibc/arch-sh.h +++ b/tools/include/nolibc/arch-sh.h @@ -140,6 +140,7 @@ _ret; \ }) +#ifndef NOLIBC_NO_RUNTIME /* startup code */ void _start_wrapper(void); void __attribute__((weak,noreturn)) __nolibc_entrypoint __no_stack_protector _start_wrapper(void) @@ -158,5 +159,6 @@ void __attribute__((weak,noreturn)) __nolibc_entrypoint __no_stack_protector _st ); __nolibc_entrypoint_epilogue(); } +#endif /* NOLIBC_NO_RUNTIME */ #endif /* _NOLIBC_ARCH_SH_H */ diff --git a/tools/include/nolibc/arch-sparc.h b/tools/include/nolibc/arch-sparc.h index ca420d843e25..2ebb5686e105 100644 --- 
a/tools/include/nolibc/arch-sparc.h +++ b/tools/include/nolibc/arch-sparc.h @@ -152,6 +152,7 @@ _arg1; \ }) +#ifndef NOLIBC_NO_RUNTIME /* startup code */ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void) { @@ -169,6 +170,7 @@ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _s ); __nolibc_entrypoint_epilogue(); } +#endif /* NOLIBC_NO_RUNTIME */ static pid_t getpid(void); diff --git a/tools/include/nolibc/arch-x86.h b/tools/include/nolibc/arch-x86.h index d3efc0c3b8ad..f6c43ac5377b 100644 --- a/tools/include/nolibc/arch-x86.h +++ b/tools/include/nolibc/arch-x86.h @@ -157,6 +157,7 @@ _eax; \ }) +#ifndef NOLIBC_NO_RUNTIME /* startup code */ /* * i386 System V ABI mandates: @@ -176,6 +177,7 @@ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _s ); __nolibc_entrypoint_epilogue(); } +#endif /* NOLIBC_NO_RUNTIME */ #else /* !defined(__x86_64__) */ @@ -323,6 +325,7 @@ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _s _ret; \ }) +#ifndef NOLIBC_NO_RUNTIME /* startup code */ /* * x86-64 System V ABI mandates: @@ -340,6 +343,7 @@ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _s ); __nolibc_entrypoint_epilogue(); } +#endif /* NOLIBC_NO_RUNTIME */ #define NOLIBC_ARCH_HAS_MEMMOVE void *memmove(void *dst, const void *src, size_t len); @@ -351,7 +355,7 @@ void *memcpy(void *dst, const void *src, size_t len); void *memset(void *dst, int c, size_t len); __asm__ ( -".section .text.nolibc_memmove_memcpy\n" +".pushsection .text.nolibc_memmove_memcpy\n" ".weak memmove\n" ".weak memcpy\n" "memmove:\n" @@ -371,8 +375,9 @@ __asm__ ( "rep movsb\n\t" "cld\n\t" "retq\n" +".popsection\n" -".section .text.nolibc_memset\n" +".pushsection .text.nolibc_memset\n" ".weak memset\n" "memset:\n" "xchgl %eax, %esi\n\t" @@ -381,6 +386,7 @@ __asm__ ( "rep stosb\n\t" "popq %rax\n\t" "retq\n" +".popsection\n" ); #endif /* !defined(__x86_64__) */ 
diff --git a/tools/include/nolibc/arch.h b/tools/include/nolibc/arch.h index 426c89198135..a3adaf433f2c 100644 --- a/tools/include/nolibc/arch.h +++ b/tools/include/nolibc/arch.h @@ -3,15 +3,6 @@ * Copyright (C) 2017-2022 Willy Tarreau <w@1wt.eu> */ -/* Below comes the architecture-specific code. For each architecture, we have - * the syscall declarations and the _start code definition. This is the only - * global part. On all architectures the kernel puts everything in the stack - * before jumping to _start just above us, without any return address (_start - * is not a function but an entry point). So at the stack pointer we find argc. - * Then argv[] begins, and ends at the first NULL. Then we have envp which - * starts and ends with a NULL as well. So envp=argv+argc+1. - */ - #ifndef _NOLIBC_ARCH_H #define _NOLIBC_ARCH_H @@ -27,7 +18,7 @@ #include "arch-powerpc.h" #elif defined(__riscv) #include "arch-riscv.h" -#elif defined(__s390x__) || defined(__s390__) +#elif defined(__s390x__) #include "arch-s390.h" #elif defined(__loongarch__) #include "arch-loongarch.h" diff --git a/tools/include/nolibc/compiler.h b/tools/include/nolibc/compiler.h index 369cfb5a0e78..87090bbc53e0 100644 --- a/tools/include/nolibc/compiler.h +++ b/tools/include/nolibc/compiler.h @@ -41,8 +41,8 @@ # define __no_stack_protector __attribute__((__optimize__("-fno-stack-protector"))) #endif /* __nolibc_has_attribute(no_stack_protector) */ -#if __nolibc_has_attribute(fallthrough) -# define __nolibc_fallthrough do { } while (0); __attribute__((fallthrough)) +#if __nolibc_has_attribute(__fallthrough__) +# define __nolibc_fallthrough do { } while (0); __attribute__((__fallthrough__)) #else # define __nolibc_fallthrough do { } while (0) #endif /* __nolibc_has_attribute(fallthrough) */ diff --git a/tools/include/nolibc/crt.h b/tools/include/nolibc/crt.h index 961cfe777c35..d9262998dae9 100644 --- a/tools/include/nolibc/crt.h +++ b/tools/include/nolibc/crt.h @@ -7,6 +7,8 @@ #ifndef _NOLIBC_CRT_H 
#define _NOLIBC_CRT_H +#ifndef NOLIBC_NO_RUNTIME + #include "compiler.h" char **environ __attribute__((weak)); @@ -88,4 +90,5 @@ void _start_c(long *sp) exit(exitcode); } +#endif /* NOLIBC_NO_RUNTIME */ #endif /* _NOLIBC_CRT_H */ diff --git a/tools/include/nolibc/dirent.h b/tools/include/nolibc/dirent.h index 758b95c48e7a..61a122a60327 100644 --- a/tools/include/nolibc/dirent.h +++ b/tools/include/nolibc/dirent.h @@ -86,9 +86,9 @@ int readdir_r(DIR *dirp, struct dirent *entry, struct dirent **result) * readdir() can only return one entry at a time. * Make sure the non-returned ones are not skipped. */ - ret = lseek(fd, ldir->d_off, SEEK_SET); - if (ret == -1) - return errno; + ret = sys_lseek(fd, ldir->d_off, SEEK_SET); + if (ret < 0) + return -ret; entry->d_ino = ldir->d_ino; /* the destination should always be big enough */ diff --git a/tools/include/nolibc/getopt.h b/tools/include/nolibc/getopt.h index 217abb95264b..87565e3b6a33 100644 --- a/tools/include/nolibc/getopt.h +++ b/tools/include/nolibc/getopt.h @@ -78,7 +78,7 @@ int getopt(int argc, char * const argv[], const char *optstring) return '?'; } if (optstring[i] == ':') { - optarg = 0; + optarg = NULL; if (optstring[i + 1] != ':' || __optpos) { optarg = argv[optind++]; if (__optpos) diff --git a/tools/include/nolibc/inttypes.h b/tools/include/nolibc/inttypes.h new file mode 100644 index 000000000000..1977bd74bfeb --- /dev/null +++ b/tools/include/nolibc/inttypes.h @@ -0,0 +1,3 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ + +#include "nolibc.h" diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index d2f5aa085f8e..272dfc961158 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -104,11 +104,13 @@ #include "sys/random.h" #include "sys/reboot.h" #include "sys/resource.h" +#include "sys/select.h" #include "sys/stat.h" #include "sys/syscall.h" #include "sys/sysmacros.h" #include "sys/time.h" #include "sys/timerfd.h" +#include "sys/uio.h" #include 
"sys/utsname.h" #include "sys/wait.h" #include "ctype.h" diff --git a/tools/include/nolibc/stackprotector.h b/tools/include/nolibc/stackprotector.h index c71a2c257177..7123aa056cb0 100644 --- a/tools/include/nolibc/stackprotector.h +++ b/tools/include/nolibc/stackprotector.h @@ -9,6 +9,7 @@ #include "compiler.h" +#ifndef NOLIBC_NO_RUNTIME #if defined(_NOLIBC_STACKPROTECTOR) #include "sys.h" @@ -49,5 +50,6 @@ static __no_stack_protector void __stack_chk_init(void) #else /* !defined(_NOLIBC_STACKPROTECTOR) */ static void __stack_chk_init(void) {} #endif /* defined(_NOLIBC_STACKPROTECTOR) */ +#endif /* NOLIBC_NO_RUNTIME */ #endif /* _NOLIBC_STACKPROTECTOR_H */ diff --git a/tools/include/nolibc/std.h b/tools/include/nolibc/std.h index 2c1ad23b9b5c..392f4dd94158 100644 --- a/tools/include/nolibc/std.h +++ b/tools/include/nolibc/std.h @@ -20,13 +20,13 @@ /* those are commonly provided by sys/types.h */ typedef unsigned int dev_t; -typedef unsigned long ino_t; +typedef uint64_t ino_t; typedef unsigned int mode_t; typedef signed int pid_t; typedef unsigned int uid_t; typedef unsigned int gid_t; typedef unsigned long nlink_t; -typedef signed long off_t; +typedef int64_t off_t; typedef signed long blksize_t; typedef signed long blkcnt_t; typedef __kernel_time_t time_t; diff --git a/tools/include/nolibc/stdio.h b/tools/include/nolibc/stdio.h index 7630234408c5..1f16dab2ac88 100644 --- a/tools/include/nolibc/stdio.h +++ b/tools/include/nolibc/stdio.h @@ -321,11 +321,13 @@ int __nolibc_printf(__nolibc_printf_cb cb, intptr_t state, size_t n, const char if (!outstr) outstr="(null)"; } -#ifndef NOLIBC_IGNORE_ERRNO else if (c == 'm') { +#ifdef NOLIBC_IGNORE_ERRNO + outstr = "unknown error"; +#else outstr = strerror(errno); - } #endif /* NOLIBC_IGNORE_ERRNO */ + } else if (c == '%') { /* queue it verbatim */ continue; @@ -600,7 +602,11 @@ int sscanf(const char *str, const char *format, ...) 
static __attribute__((unused)) void perror(const char *msg) { +#ifdef NOLIBC_IGNORE_ERRNO + fprintf(stderr, "%s%sunknown error\n", (msg && *msg) ? msg : "", (msg && *msg) ? ": " : ""); +#else fprintf(stderr, "%s%serrno=%d\n", (msg && *msg) ? msg : "", (msg && *msg) ? ": " : "", errno); +#endif } static __attribute__((unused)) diff --git a/tools/include/nolibc/stdlib.h b/tools/include/nolibc/stdlib.h index 5fd99a480f82..f184e108ed0a 100644 --- a/tools/include/nolibc/stdlib.h +++ b/tools/include/nolibc/stdlib.h @@ -100,6 +100,7 @@ void free(void *ptr) munmap(heap, heap->len); } +#ifndef NOLIBC_NO_RUNTIME /* getenv() tries to find the environment variable named <name> in the * environment array pointed to by global variable "environ" which must be * declared as a char **, and must be terminated by a NULL (it is recommended @@ -122,6 +123,7 @@ char *getenv(const char *name) } return NULL; } +#endif /* NOLIBC_NO_RUNTIME */ static __attribute__((unused)) void *malloc(size_t len) diff --git a/tools/include/nolibc/string.h b/tools/include/nolibc/string.h index 163a17e7dd38..4000926f44ac 100644 --- a/tools/include/nolibc/string.h +++ b/tools/include/nolibc/string.h @@ -93,6 +93,21 @@ void *memset(void *dst, int b, size_t len) } #endif /* #ifndef NOLIBC_ARCH_HAS_MEMSET */ +#ifndef NOLIBC_ARCH_HAS_MEMCHR +static __attribute__((unused)) +void *memchr(const void *s, int c, size_t len) +{ + char *p = (char *)s; + + while (len--) { + if (*p == (char)c) + return p; + p++; + } + return NULL; +} +#endif /* #ifndef NOLIBC_ARCH_HAS_MEMCHR */ + static __attribute__((unused)) char *strchr(const char *s, int c) { diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h index c5564f57deec..847af1ccbdc9 100644 --- a/tools/include/nolibc/sys.h +++ b/tools/include/nolibc/sys.h @@ -106,7 +106,7 @@ static __attribute__((unused)) void *sbrk(intptr_t inc) { /* first call to find current end */ - void *ret = sys_brk(0); + void *ret = sys_brk(NULL); if (ret && sys_brk(ret + inc) == ret 
+ inc) return ret + inc; @@ -118,6 +118,7 @@ void *sbrk(intptr_t inc) /* * int chdir(const char *path); + * int fchdir(int fildes); */ static __attribute__((unused)) @@ -132,6 +133,18 @@ int chdir(const char *path) return __sysret(sys_chdir(path)); } +static __attribute__((unused)) +int sys_fchdir(int fildes) +{ + return my_syscall1(__NR_fchdir, fildes); +} + +static __attribute__((unused)) +int fchdir(int fildes) +{ + return __sysret(sys_fchdir(fildes)); +} + /* * int chmod(const char *path, mode_t mode); @@ -512,6 +525,7 @@ pid_t gettid(void) return sys_gettid(); } +#ifndef NOLIBC_NO_RUNTIME static unsigned long getauxval(unsigned long key); /* @@ -523,7 +537,7 @@ int getpagesize(void) { return __sysret((int)getauxval(AT_PAGESZ) ?: -ENOENT); } - +#endif /* NOLIBC_NO_RUNTIME */ /* * uid_t getuid(void); @@ -591,23 +605,20 @@ int link(const char *old, const char *new) static __attribute__((unused)) off_t sys_lseek(int fd, off_t offset, int whence) { -#if defined(__NR_lseek) - return my_syscall3(__NR_lseek, fd, offset, whence); -#else +#if defined(__NR_llseek) __kernel_loff_t loff = 0; off_t result; int ret; - /* Only exists on 32bit where nolibc off_t is also 32bit */ - ret = my_syscall5(__NR_llseek, fd, 0, offset, &loff, whence); + ret = my_syscall5(__NR_llseek, fd, offset >> 32, (uint32_t)offset, &loff, whence); if (ret < 0) result = ret; - else if (loff != (off_t)loff) - result = -EOVERFLOW; else result = loff; return result; +#else + return my_syscall3(__NR_lseek, fd, offset, whence); #endif } @@ -756,51 +767,6 @@ int sched_yield(void) /* - * int select(int nfds, fd_set *read_fds, fd_set *write_fds, - * fd_set *except_fds, struct timeval *timeout); - */ - -static __attribute__((unused)) -int sys_select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeval *timeout) -{ -#if defined(__ARCH_WANT_SYS_OLD_SELECT) && !defined(__NR__newselect) - struct sel_arg_struct { - unsigned long n; - fd_set *r, *w, *e; - struct timeval *t; - } arg = { .n = nfds, .r = 
rfds, .w = wfds, .e = efds, .t = timeout }; - return my_syscall1(__NR_select, &arg); -#elif defined(__NR__newselect) - return my_syscall5(__NR__newselect, nfds, rfds, wfds, efds, timeout); -#elif defined(__NR_select) - return my_syscall5(__NR_select, nfds, rfds, wfds, efds, timeout); -#elif defined(__NR_pselect6) - struct timespec t; - - if (timeout) { - t.tv_sec = timeout->tv_sec; - t.tv_nsec = timeout->tv_usec * 1000; - } - return my_syscall6(__NR_pselect6, nfds, rfds, wfds, efds, timeout ? &t : NULL, NULL); -#else - struct __kernel_timespec t; - - if (timeout) { - t.tv_sec = timeout->tv_sec; - t.tv_nsec = timeout->tv_usec * 1000; - } - return my_syscall6(__NR_pselect6_time64, nfds, rfds, wfds, efds, timeout ? &t : NULL, NULL); -#endif -} - -static __attribute__((unused)) -int select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeval *timeout) -{ - return __sysret(sys_select(nfds, rfds, wfds, efds, timeout)); -} - - -/* * int setpgid(pid_t pid, pid_t pgid); */ diff --git a/tools/include/nolibc/sys/auxv.h b/tools/include/nolibc/sys/auxv.h index c52463d6c18d..0e98325e7347 100644 --- a/tools/include/nolibc/sys/auxv.h +++ b/tools/include/nolibc/sys/auxv.h @@ -10,6 +10,8 @@ #ifndef _NOLIBC_SYS_AUXV_H #define _NOLIBC_SYS_AUXV_H +#ifndef NOLIBC_NO_RUNTIME + #include "../crt.h" static __attribute__((unused)) @@ -38,4 +40,5 @@ unsigned long getauxval(unsigned long type) return ret; } +#endif /* NOLIBC_NO_RUNTIME */ #endif /* _NOLIBC_SYS_AUXV_H */ diff --git a/tools/include/nolibc/sys/mman.h b/tools/include/nolibc/sys/mman.h index 5228751b458c..77084ac3405a 100644 --- a/tools/include/nolibc/sys/mman.h +++ b/tools/include/nolibc/sys/mman.h @@ -31,11 +31,6 @@ void *sys_mmap(void *addr, size_t length, int prot, int flags, int fd, } #endif -/* Note that on Linux, MAP_FAILED is -1 so we can use the generic __sysret() - * which returns -1 upon error and still satisfy user land that checks for - * MAP_FAILED. 
- */ - static __attribute__((unused)) void *mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset) { diff --git a/tools/include/nolibc/sys/reboot.h b/tools/include/nolibc/sys/reboot.h index 4a1e435be669..38274c64a722 100644 --- a/tools/include/nolibc/sys/reboot.h +++ b/tools/include/nolibc/sys/reboot.h @@ -28,7 +28,7 @@ ssize_t sys_reboot(int magic1, int magic2, int cmd, void *arg) static __attribute__((unused)) int reboot(int cmd) { - return __sysret(sys_reboot(LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2, cmd, 0)); + return __sysret(sys_reboot(LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2, cmd, NULL)); } #endif /* _NOLIBC_SYS_REBOOT_H */ diff --git a/tools/include/nolibc/sys/select.h b/tools/include/nolibc/sys/select.h new file mode 100644 index 000000000000..2a5619c01277 --- /dev/null +++ b/tools/include/nolibc/sys/select.h @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ + +#include "../nolibc.h" + +#ifndef _NOLIBC_SYS_SELECT_H +#define _NOLIBC_SYS_SELECT_H + +#include <linux/time.h> +#include <linux/unistd.h> + +/* commonly an fd_set represents 256 FDs */ +#ifndef FD_SETSIZE +#define FD_SETSIZE 256 +#endif + +#define FD_SETIDXMASK (8 * sizeof(unsigned long)) +#define FD_SETBITMASK (8 * sizeof(unsigned long)-1) + +/* for select() */ +typedef struct { + unsigned long fds[(FD_SETSIZE + FD_SETBITMASK) / FD_SETIDXMASK]; +} fd_set; + +#define FD_CLR(fd, set) do { \ + fd_set *__set = (set); \ + int __fd = (fd); \ + if (__fd >= 0) \ + __set->fds[__fd / FD_SETIDXMASK] &= \ + ~(1U << (__fd & FD_SETBITMASK)); \ + } while (0) + +#define FD_SET(fd, set) do { \ + fd_set *__set = (set); \ + int __fd = (fd); \ + if (__fd >= 0) \ + __set->fds[__fd / FD_SETIDXMASK] |= \ + 1 << (__fd & FD_SETBITMASK); \ + } while (0) + +#define FD_ISSET(fd, set) ({ \ + fd_set *__set = (set); \ + int __fd = (fd); \ + int __r = 0; \ + if (__fd >= 0) \ + __r = !!(__set->fds[__fd / FD_SETIDXMASK] & \ +1U << (__fd & FD_SETBITMASK)); \ + __r; \ + }) + +#define FD_ZERO(set) 
do { \ + fd_set *__set = (set); \ + int __idx; \ + int __size = (FD_SETSIZE+FD_SETBITMASK) / FD_SETIDXMASK;\ + for (__idx = 0; __idx < __size; __idx++) \ + __set->fds[__idx] = 0; \ + } while (0) + +/* + * int select(int nfds, fd_set *read_fds, fd_set *write_fds, + * fd_set *except_fds, struct timeval *timeout); + */ + +static __attribute__((unused)) +int sys_select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeval *timeout) +{ +#if defined(__ARCH_WANT_SYS_OLD_SELECT) && !defined(__NR__newselect) + struct sel_arg_struct { + unsigned long n; + fd_set *r, *w, *e; + struct timeval *t; + } arg = { .n = nfds, .r = rfds, .w = wfds, .e = efds, .t = timeout }; + return my_syscall1(__NR_select, &arg); +#elif defined(__NR__newselect) + return my_syscall5(__NR__newselect, nfds, rfds, wfds, efds, timeout); +#elif defined(__NR_select) + return my_syscall5(__NR_select, nfds, rfds, wfds, efds, timeout); +#elif defined(__NR_pselect6) + struct timespec t; + + if (timeout) { + t.tv_sec = timeout->tv_sec; + t.tv_nsec = timeout->tv_usec * 1000; + } + return my_syscall6(__NR_pselect6, nfds, rfds, wfds, efds, timeout ? &t : NULL, NULL); +#else + struct __kernel_timespec t; + + if (timeout) { + t.tv_sec = timeout->tv_sec; + t.tv_nsec = timeout->tv_usec * 1000; + } + return my_syscall6(__NR_pselect6_time64, nfds, rfds, wfds, efds, timeout ? 
&t : NULL, NULL); +#endif +} + +static __attribute__((unused)) +int select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeval *timeout) +{ + return __sysret(sys_select(nfds, rfds, wfds, efds, timeout)); +} + + +#endif /* _NOLIBC_SYS_SELECT_H */ diff --git a/tools/include/nolibc/sys/uio.h b/tools/include/nolibc/sys/uio.h new file mode 100644 index 000000000000..7ad42b927d2f --- /dev/null +++ b/tools/include/nolibc/sys/uio.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * uio for NOLIBC + * Copyright (C) 2017-2021 Willy Tarreau <w@1wt.eu> + * Copyright (C) 2025 Intel Corporation + */ + +/* make sure to include all global symbols */ +#include "../nolibc.h" + +#ifndef _NOLIBC_SYS_UIO_H +#define _NOLIBC_SYS_UIO_H + +#include "../sys.h" +#include <linux/uio.h> + + +/* + * ssize_t readv(int fd, const struct iovec *iovec, int count); + */ +static __attribute__((unused)) +ssize_t sys_readv(int fd, const struct iovec *iovec, int count) +{ + return my_syscall3(__NR_readv, fd, iovec, count); +} + +static __attribute__((unused)) +ssize_t readv(int fd, const struct iovec *iovec, int count) +{ + return __sysret(sys_readv(fd, iovec, count)); +} + +/* + * ssize_t writev(int fd, const struct iovec *iovec, int count); + */ +static __attribute__((unused)) +ssize_t sys_writev(int fd, const struct iovec *iovec, int count) +{ + return my_syscall3(__NR_writev, fd, iovec, count); +} + +static __attribute__((unused)) +ssize_t writev(int fd, const struct iovec *iovec, int count) +{ + return __sysret(sys_writev(fd, iovec, count)); +} + + +#endif /* _NOLIBC_SYS_UIO_H */ diff --git a/tools/include/nolibc/sys/wait.h b/tools/include/nolibc/sys/wait.h index 4e66e1f7a03e..9d9319ba92cb 100644 --- a/tools/include/nolibc/sys/wait.h +++ b/tools/include/nolibc/sys/wait.h @@ -65,23 +65,29 @@ pid_t waitpid(pid_t pid, int *status, int options) switch (info.si_code) { case 0: - *status = 0; + if (status) + *status = 0; break; case CLD_EXITED: - *status = 
(info.si_status & 0xff) << 8; + if (status) + *status = (info.si_status & 0xff) << 8; break; case CLD_KILLED: - *status = info.si_status & 0x7f; + if (status) + *status = info.si_status & 0x7f; break; case CLD_DUMPED: - *status = (info.si_status & 0x7f) | 0x80; + if (status) + *status = (info.si_status & 0x7f) | 0x80; break; case CLD_STOPPED: case CLD_TRAPPED: - *status = (info.si_status << 8) + 0x7f; + if (status) + *status = (info.si_status << 8) + 0x7f; break; case CLD_CONTINUED: - *status = 0xffff; + if (status) + *status = 0xffff; break; default: return -1; diff --git a/tools/include/nolibc/time.h b/tools/include/nolibc/time.h index 6c276b8d646a..48e78f8becf9 100644 --- a/tools/include/nolibc/time.h +++ b/tools/include/nolibc/time.h @@ -89,13 +89,11 @@ int sys_clock_settime(clockid_t clockid, struct timespec *tp) { #if defined(__NR_clock_settime) return my_syscall2(__NR_clock_settime, clockid, tp); -#elif defined(__NR_clock_settime64) +#else struct __kernel_timespec ktp; __nolibc_timespec_user_to_kernel(tp, &ktp); return my_syscall2(__NR_clock_settime64, clockid, &ktp); -#else - return __nolibc_enosys(__func__, clockid, tp); #endif } @@ -111,7 +109,7 @@ int sys_clock_nanosleep(clockid_t clockid, int flags, const struct timespec *rqt { #if defined(__NR_clock_nanosleep) return my_syscall4(__NR_clock_nanosleep, clockid, flags, rqtp, rmtp); -#elif defined(__NR_clock_nanosleep_time64) +#else struct __kernel_timespec krqtp, krmtp; int ret; @@ -120,8 +118,6 @@ int sys_clock_nanosleep(clockid_t clockid, int flags, const struct timespec *rqt if (rmtp) __nolibc_timespec_kernel_to_user(&krmtp, rmtp); return ret; -#else - return __nolibc_enosys(__func__, clockid, flags, rqtp, rmtp); #endif } @@ -195,7 +191,7 @@ int sys_timer_gettime(timer_t timerid, struct itimerspec *curr_value) { #if defined(__NR_timer_gettime) return my_syscall2(__NR_timer_gettime, timerid, curr_value); -#elif defined(__NR_timer_gettime64) +#else struct __kernel_itimerspec kcurr_value; int ret; @@ 
-203,8 +199,6 @@ int sys_timer_gettime(timer_t timerid, struct itimerspec *curr_value) __nolibc_timespec_kernel_to_user(&kcurr_value.it_interval, &curr_value->it_interval); __nolibc_timespec_kernel_to_user(&kcurr_value.it_value, &curr_value->it_value); return ret; -#else - return __nolibc_enosys(__func__, timerid, curr_value); #endif } @@ -220,7 +214,7 @@ int sys_timer_settime(timer_t timerid, int flags, { #if defined(__NR_timer_settime) return my_syscall4(__NR_timer_settime, timerid, flags, new_value, old_value); -#elif defined(__NR_timer_settime64) +#else struct __kernel_itimerspec knew_value, kold_value; int ret; @@ -232,8 +226,6 @@ int sys_timer_settime(timer_t timerid, int flags, __nolibc_timespec_kernel_to_user(&kold_value.it_value, &old_value->it_value); } return ret; -#else - return __nolibc_enosys(__func__, timerid, flags, new_value, old_value); #endif } diff --git a/tools/include/nolibc/types.h b/tools/include/nolibc/types.h index 16c6e9ec9451..470a5f77bc0f 100644 --- a/tools/include/nolibc/types.h +++ b/tools/include/nolibc/types.h @@ -70,11 +70,6 @@ #define DT_LNK 0xa #define DT_SOCK 0xc -/* commonly an fd_set represents 256 FDs */ -#ifndef FD_SETSIZE -#define FD_SETSIZE 256 -#endif - /* PATH_MAX and MAXPATHLEN are often used and found with plenty of different * values. 
*/ @@ -115,48 +110,6 @@ #define EXIT_SUCCESS 0 #define EXIT_FAILURE 1 -#define FD_SETIDXMASK (8 * sizeof(unsigned long)) -#define FD_SETBITMASK (8 * sizeof(unsigned long)-1) - -/* for select() */ -typedef struct { - unsigned long fds[(FD_SETSIZE + FD_SETBITMASK) / FD_SETIDXMASK]; -} fd_set; - -#define FD_CLR(fd, set) do { \ - fd_set *__set = (set); \ - int __fd = (fd); \ - if (__fd >= 0) \ - __set->fds[__fd / FD_SETIDXMASK] &= \ - ~(1U << (__fd & FD_SETBITMASK)); \ - } while (0) - -#define FD_SET(fd, set) do { \ - fd_set *__set = (set); \ - int __fd = (fd); \ - if (__fd >= 0) \ - __set->fds[__fd / FD_SETIDXMASK] |= \ - 1 << (__fd & FD_SETBITMASK); \ - } while (0) - -#define FD_ISSET(fd, set) ({ \ - fd_set *__set = (set); \ - int __fd = (fd); \ - int __r = 0; \ - if (__fd >= 0) \ - __r = !!(__set->fds[__fd / FD_SETIDXMASK] & \ -1U << (__fd & FD_SETBITMASK)); \ - __r; \ - }) - -#define FD_ZERO(set) do { \ - fd_set *__set = (set); \ - int __idx; \ - int __size = (FD_SETSIZE+FD_SETBITMASK) / FD_SETIDXMASK;\ - for (__idx = 0; __idx < __size; __idx++) \ - __set->fds[__idx] = 0; \ - } while (0) - /* for getdents64() */ struct linux_dirent64 { uint64_t d_ino; diff --git a/tools/include/nolibc/unistd.h b/tools/include/nolibc/unistd.h index 7405fa2b89ba..bb5e80f3f05d 100644 --- a/tools/include/nolibc/unistd.h +++ b/tools/include/nolibc/unistd.h @@ -54,7 +54,7 @@ int msleep(unsigned int msecs) { struct timeval my_timeval = { msecs / 1000, (msecs % 1000) * 1000 }; - if (sys_select(0, 0, 0, 0, &my_timeval) < 0) + if (sys_select(0, NULL, NULL, NULL, &my_timeval) < 0) return (my_timeval.tv_sec * 1000) + (my_timeval.tv_usec / 1000) + !!(my_timeval.tv_usec % 1000); @@ -67,7 +67,7 @@ unsigned int sleep(unsigned int seconds) { struct timeval my_timeval = { seconds, 0 }; - if (sys_select(0, 0, 0, 0, &my_timeval) < 0) + if (sys_select(0, NULL, NULL, NULL, &my_timeval) < 0) return my_timeval.tv_sec + !!my_timeval.tv_usec; else return 0; @@ -78,7 +78,7 @@ int usleep(unsigned int usecs) { 
struct timeval my_timeval = { usecs / 1000000, usecs % 1000000 }; - return sys_select(0, 0, 0, 0, &my_timeval); + return sys_select(0, NULL, NULL, NULL, &my_timeval); } static __attribute__((unused)) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 6829936d33f5..be7d8e060e10 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1026,6 +1026,7 @@ enum bpf_map_type { BPF_MAP_TYPE_USER_RINGBUF, BPF_MAP_TYPE_CGRP_STORAGE, BPF_MAP_TYPE_ARENA, + BPF_MAP_TYPE_INSN_ARRAY, __MAX_BPF_MAP_TYPE }; @@ -1430,6 +1431,9 @@ enum { /* Do not translate kernel bpf_arena pointers to user pointers */ BPF_F_NO_USER_CONV = (1U << 18), + +/* Enable BPF ringbuf overwrite mode */ + BPF_F_RB_OVERWRITE = (1U << 19), }; /* Flags for BPF_PROG_QUERY. */ @@ -5618,7 +5622,7 @@ union bpf_attr { * Return * *sk* if casting is valid, or **NULL** otherwise. * - * long bpf_dynptr_from_mem(void *data, u32 size, u64 flags, struct bpf_dynptr *ptr) + * long bpf_dynptr_from_mem(void *data, u64 size, u64 flags, struct bpf_dynptr *ptr) * Description * Get a dynptr to local memory *data*. * @@ -5661,7 +5665,7 @@ union bpf_attr { * Return * Nothing. Always succeeds. * - * long bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr *src, u32 offset, u64 flags) + * long bpf_dynptr_read(void *dst, u64 len, const struct bpf_dynptr *src, u64 offset, u64 flags) * Description * Read *len* bytes from *src* into *dst*, starting from *offset* * into *src*. @@ -5671,7 +5675,7 @@ union bpf_attr { * of *src*'s data, -EINVAL if *src* is an invalid dynptr or if * *flags* is not 0. * - * long bpf_dynptr_write(const struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) + * long bpf_dynptr_write(const struct bpf_dynptr *dst, u64 offset, void *src, u64 len, u64 flags) * Description * Write *len* bytes from *src* into *dst*, starting from *offset* * into *dst*. 
@@ -5692,7 +5696,7 @@ union bpf_attr { * is a read-only dynptr or if *flags* is not correct. For skb-type dynptrs, * other errors correspond to errors returned by **bpf_skb_store_bytes**\ (). * - * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len) + * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u64 offset, u64 len) * Description * Get a pointer to the underlying dynptr data. * @@ -6231,6 +6235,7 @@ enum { BPF_RB_RING_SIZE = 1, BPF_RB_CONS_POS = 2, BPF_RB_PROD_POS = 3, + BPF_RB_OVERWRITE_POS = 4, }; /* BPF ring buffer constants */ @@ -7200,6 +7205,7 @@ enum { TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */ TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */ SK_BPF_CB_FLAGS = 1009, /* Get or set sock ops flags in socket */ + SK_BPF_BYPASS_PROT_MEM = 1010, /* Get or Set sk->sk_bypass_prot_mem */ }; enum { @@ -7645,4 +7651,24 @@ enum bpf_kfunc_flags { BPF_F_PAD_ZEROS = (1ULL << 0), }; +/* + * Values of a BPF_MAP_TYPE_INSN_ARRAY entry must be of this type. + * + * Before the map is used the orig_off field should point to an + * instruction inside the program being loaded. The other fields + * must be set to 0. + * + * After the program is loaded, the xlated_off will be adjusted + * by the verifier to point to the index of the original instruction + * in the xlated program. If the instruction is deleted, it will + * be set to (u32)-1. The jitted_off will be set to the corresponding + * offset in the jitted image of the program. 
+ */ +struct bpf_insn_array_value { + __u32 orig_off; + __u32 xlated_off; + __u32 jitted_off; + __u32 :32; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index 48eb49aa03d4..e0b579a1df4f 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/netdev.yaml */ /* YNL-GEN uapi header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _UAPI_LINUX_NETDEV_H #define _UAPI_LINUX_NETDEV_H @@ -80,6 +81,7 @@ enum netdev_qstats_scope { enum netdev_napi_threaded { NETDEV_NAPI_THREADED_DISABLED, NETDEV_NAPI_THREADED_ENABLED, + NETDEV_NAPI_THREADED_BUSY_POLL, }; enum { diff --git a/tools/include/uapi/linux/nsfs.h b/tools/include/uapi/linux/nsfs.h index 33c9b578b3b2..a25e38d1c874 100644 --- a/tools/include/uapi/linux/nsfs.h +++ b/tools/include/uapi/linux/nsfs.h @@ -53,6 +53,76 @@ enum init_ns_ino { TIME_NS_INIT_INO = 0xEFFFFFFAU, NET_NS_INIT_INO = 0xEFFFFFF9U, MNT_NS_INIT_INO = 0xEFFFFFF8U, +#ifdef __KERNEL__ + MNT_NS_ANON_INO = 0xEFFFFFF7U, +#endif }; +struct nsfs_file_handle { + __u64 ns_id; + __u32 ns_type; + __u32 ns_inum; +}; + +#define NSFS_FILE_HANDLE_SIZE_VER0 16 /* sizeof first published struct */ +#define NSFS_FILE_HANDLE_SIZE_LATEST sizeof(struct nsfs_file_handle) /* sizeof latest published struct */ + +enum init_ns_id { + IPC_NS_INIT_ID = 1ULL, + UTS_NS_INIT_ID = 2ULL, + USER_NS_INIT_ID = 3ULL, + PID_NS_INIT_ID = 4ULL, + CGROUP_NS_INIT_ID = 5ULL, + TIME_NS_INIT_ID = 6ULL, + NET_NS_INIT_ID = 7ULL, + MNT_NS_INIT_ID = 8ULL, +#ifdef __KERNEL__ + NS_LAST_INIT_ID = MNT_NS_INIT_ID, +#endif +}; + +enum ns_type { + TIME_NS = (1ULL << 7), /* CLONE_NEWTIME */ + MNT_NS = (1ULL << 17), /* CLONE_NEWNS */ + CGROUP_NS = (1ULL << 25), /* CLONE_NEWCGROUP */ + UTS_NS = (1ULL << 26), /* CLONE_NEWUTS */ + IPC_NS = (1ULL << 27), /* CLONE_NEWIPC */ + USER_NS = (1ULL << 28), /* 
CLONE_NEWUSER */ + PID_NS = (1ULL << 29), /* CLONE_NEWPID */ + NET_NS = (1ULL << 30), /* CLONE_NEWNET */ +}; + +/** + * struct ns_id_req - namespace ID request structure + * @size: size of this structure + * @spare: reserved for future use + * @filter: filter mask + * @ns_id: last namespace id + * @user_ns_id: owning user namespace ID + * + * Structure for passing namespace ID and miscellaneous parameters to + * statns(2) and listns(2). + * + * For statns(2) @param represents the request mask. + * For listns(2) @param represents the last listed mount id (or zero). + */ +struct ns_id_req { + __u32 size; + __u32 spare; + __u64 ns_id; + struct /* listns */ { + __u32 ns_type; + __u32 spare2; + __u64 user_ns_id; + }; +}; + +/* + * Special @user_ns_id value that can be passed to listns() + */ +#define LISTNS_CURRENT_USER 0xffffffffffffffff /* Caller's userns */ + +/* List of all ns_id_req versions. */ +#define NS_ID_REQ_SIZE_VER0 32 /* sizeof first published struct */ + #endif /* __LINUX_NSFS_H */ diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 78a362b80027..d292f96bc06f 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -463,7 +463,9 @@ struct perf_event_attr { inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */ remove_on_exec : 1, /* event is removed from task on exec */ sigtrap : 1, /* send synchronous SIGTRAP on event */ - __reserved_1 : 26; + defer_callchain: 1, /* request PERF_RECORD_CALLCHAIN_DEFERRED records */ + defer_output : 1, /* output PERF_RECORD_CALLCHAIN_DEFERRED records */ + __reserved_1 : 24; union { __u32 wakeup_events; /* wake up every n events */ @@ -1239,6 +1241,22 @@ enum perf_event_type { */ PERF_RECORD_AUX_OUTPUT_HW_ID = 21, + /* + * This user callchain capture was deferred until shortly before + * returning to user space. 
Previous samples would have kernel + * callchains only and they need to be stitched with this to make full + * callchains. + * + * struct { + * struct perf_event_header header; + * u64 cookie; + * u64 nr; + * u64 ips[nr]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_CALLCHAIN_DEFERRED = 22, + PERF_RECORD_MAX, /* non-ABI */ }; @@ -1269,6 +1287,7 @@ enum perf_callchain_context { PERF_CONTEXT_HV = (__u64)-32, PERF_CONTEXT_KERNEL = (__u64)-128, PERF_CONTEXT_USER = (__u64)-512, + PERF_CONTEXT_USER_DEFERRED = (__u64)-640, PERF_CONTEXT_GUEST = (__u64)-2048, PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 339b19797237..b66f5fbfbbb2 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -154,7 +154,7 @@ int bump_rlimit_memlock(void) memlock_bumped = true; - /* zero memlock_rlim_max disables auto-bumping RLIMIT_MEMLOCK */ + /* zero memlock_rlim disables auto-bumping RLIMIT_MEMLOCK */ if (memlock_rlim == 0) return 0; diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 18907f0fcf9f..84a4b0abc8be 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -1061,7 +1061,7 @@ static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf, b if (base_btf) { btf->base_btf = base_btf; btf->start_id = btf__type_cnt(base_btf); - btf->start_str_off = base_btf->hdr->str_len; + btf->start_str_off = base_btf->hdr->str_len + base_btf->start_str_off; } if (is_mmap) { @@ -3901,6 +3901,20 @@ err_out: return err; } +/* + * Calculate type signature hash of TYPEDEF, ignoring referenced type IDs, + * as referenced type IDs equivalence is established separately during type + * graph equivalence check algorithm. 
+ */ +static long btf_hash_typedef(struct btf_type *t) +{ + long h; + + h = hash_combine(0, t->name_off); + h = hash_combine(h, t->info); + return h; +} + static long btf_hash_common(struct btf_type *t) { long h; @@ -3918,6 +3932,13 @@ static bool btf_equal_common(struct btf_type *t1, struct btf_type *t2) t1->size == t2->size; } +/* Check structural compatibility of two TYPEDEF. */ +static bool btf_equal_typedef(struct btf_type *t1, struct btf_type *t2) +{ + return t1->name_off == t2->name_off && + t1->info == t2->info; +} + /* Calculate type signature hash of INT or TAG. */ static long btf_hash_int_decl_tag(struct btf_type *t) { @@ -4844,13 +4865,30 @@ static void btf_dedup_merge_hypot_map(struct btf_dedup *d) } } +static inline long btf_hash_by_kind(struct btf_type *t, __u16 kind) +{ + if (kind == BTF_KIND_TYPEDEF) + return btf_hash_typedef(t); + else + return btf_hash_struct(t); +} + +static inline bool btf_equal_by_kind(struct btf_type *t1, struct btf_type *t2, __u16 kind) +{ + if (kind == BTF_KIND_TYPEDEF) + return btf_equal_typedef(t1, t2); + else + return btf_shallow_equal_struct(t1, t2); +} + /* - * Deduplicate struct/union types. + * Deduplicate struct/union and typedef types. * * For each struct/union type its type signature hash is calculated, taking * into account type's name, size, number, order and names of fields, but * ignoring type ID's referenced from fields, because they might not be deduped - * completely until after reference types deduplication phase. This type hash + * completely until after reference types deduplication phase. For each typedef + * type, the hash is computed based on the type’s name and size. This type hash * is used to iterate over all potential canonical types, sharing same hash. 
* For each canonical candidate we check whether type graphs that they form * (through referenced types in fields and so on) are equivalent using algorithm @@ -4882,18 +4920,20 @@ static int btf_dedup_struct_type(struct btf_dedup *d, __u32 type_id) t = btf_type_by_id(d->btf, type_id); kind = btf_kind(t); - if (kind != BTF_KIND_STRUCT && kind != BTF_KIND_UNION) + if (kind != BTF_KIND_STRUCT && + kind != BTF_KIND_UNION && + kind != BTF_KIND_TYPEDEF) return 0; - h = btf_hash_struct(t); + h = btf_hash_by_kind(t, kind); for_each_dedup_cand(d, hash_entry, h) { __u32 cand_id = hash_entry->value; int eq; /* * Even though btf_dedup_is_equiv() checks for - * btf_shallow_equal_struct() internally when checking two - * structs (unions) for equivalence, we need to guard here + * btf_equal_by_kind() internally when checking two + * structs (unions) or typedefs for equivalence, we need to guard here * from picking matching FWD type as a dedup candidate. * This can happen due to hash collision. In such case just * relying on btf_dedup_is_equiv() would lead to potentially @@ -4901,7 +4941,7 @@ static int btf_dedup_struct_type(struct btf_dedup *d, __u32 type_id) * FWD and compatible STRUCT/UNION are considered equivalent. */ cand_type = btf_type_by_id(d->btf, cand_id); - if (!btf_shallow_equal_struct(t, cand_type)) + if (!btf_equal_by_kind(t, cand_type, kind)) continue; btf_dedup_clear_hypot_map(d); @@ -4939,18 +4979,18 @@ static int btf_dedup_struct_types(struct btf_dedup *d) /* * Deduplicate reference type. * - * Once all primitive and struct/union types got deduplicated, we can easily + * Once all primitive, struct/union and typedef types got deduplicated, we can easily * deduplicate all other (reference) BTF types. This is done in two steps: * * 1. Resolve all referenced type IDs into their canonical type IDs. 
This - * resolution can be done either immediately for primitive or struct/union types - * (because they were deduped in previous two phases) or recursively for + * resolution can be done either immediately for primitive, struct/union, and typedef + * types (because they were deduped in previous two phases) or recursively for * reference types. Recursion will always terminate at either primitive or - * struct/union type, at which point we can "unwind" chain of reference types - * one by one. There is no danger of encountering cycles because in C type - * system the only way to form type cycle is through struct/union, so any chain - * of reference types, even those taking part in a type cycle, will inevitably - * reach struct/union at some point. + * struct/union and typedef types, at which point we can "unwind" chain of reference + * types one by one. There is no danger of encountering cycles in C, as the only way to + * form a type cycle is through struct or union types. Go can form such cycles through + * typedef. Thus, any chain of reference types, even those taking part in a type cycle, + * will inevitably reach a struct/union or typedef type at some point. * * 2. 
Once all referenced type IDs are resolved into canonical ones, BTF type * becomes "stable", in the sense that no further deduplication will cause @@ -4982,7 +5022,6 @@ static int btf_dedup_ref_type(struct btf_dedup *d, __u32 type_id) case BTF_KIND_VOLATILE: case BTF_KIND_RESTRICT: case BTF_KIND_PTR: - case BTF_KIND_TYPEDEF: case BTF_KIND_FUNC: case BTF_KIND_TYPE_TAG: ref_type_id = btf_dedup_ref_type(d, t->type); @@ -5818,7 +5857,7 @@ void btf_set_base_btf(struct btf *btf, const struct btf *base_btf) { btf->base_btf = (struct btf *)base_btf; btf->start_id = btf__type_cnt(base_btf); - btf->start_str_off = base_btf->hdr->str_len; + btf->start_str_off = base_btf->hdr->str_len + base_btf->start_str_off; } int btf__relocate(struct btf *btf, const struct btf *base_btf) diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index ccfd905f03df..cc01494d6210 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -94,6 +94,7 @@ LIBBPF_API struct btf *btf__new_empty(void); * @brief **btf__new_empty_split()** creates an unpopulated BTF object from an * ELF BTF section except with a base BTF on top of which split BTF should be * based + * @param base_btf base BTF object * @return new BTF object instance which has to be eventually freed with * **btf__free()** * @@ -115,6 +116,10 @@ LIBBPF_API struct btf *btf__new_empty_split(struct btf *base_btf); * When that split BTF is loaded against a (possibly changed) base, this * distilled base BTF will help update references to that (possibly changed) * base BTF. + * @param src_btf source split BTF object + * @param new_base_btf pointer to where the new base BTF object pointer will be stored + * @param new_split_btf pointer to where the new split BTF object pointer will be stored + * @return 0 on success; negative error code, otherwise * * Both the new split and its associated new base BTF must be freed by * the caller. 
@@ -264,6 +269,9 @@ LIBBPF_API int btf__dedup(struct btf *btf, const struct btf_dedup_opts *opts); * to base BTF kinds, and verify those references are compatible with * *base_btf*; if they are, *btf* is adjusted such that is re-parented to * *base_btf* and type ids and strings are adjusted to accommodate this. + * @param btf split BTF object to relocate + * @param base_btf base BTF object + * @return 0 on success; negative error code, otherwise * * If successful, 0 is returned and **btf** now has **base_btf** as its * base. diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index dd3b2f57082d..3dc8a8078815 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -190,6 +190,7 @@ static const char * const map_type_name[] = { [BPF_MAP_TYPE_USER_RINGBUF] = "user_ringbuf", [BPF_MAP_TYPE_CGRP_STORAGE] = "cgrp_storage", [BPF_MAP_TYPE_ARENA] = "arena", + [BPF_MAP_TYPE_INSN_ARRAY] = "insn_array", }; static const char * const prog_type_name[] = { @@ -369,6 +370,7 @@ enum reloc_type { RELO_EXTERN_CALL, RELO_SUBPROG_ADDR, RELO_CORE, + RELO_INSN_ARRAY, }; struct reloc_desc { @@ -379,7 +381,16 @@ struct reloc_desc { struct { int map_idx; int sym_off; - int ext_idx; + /* + * The following two fields can be unionized, as the + * ext_idx field is used for extern symbols, and the + * sym_size is used for jump tables, which are never + * extern + */ + union { + int ext_idx; + int sym_size; + }; }; }; }; @@ -421,6 +432,11 @@ struct bpf_sec_def { libbpf_prog_attach_fn_t prog_attach_fn; }; +struct bpf_light_subprog { + __u32 sec_insn_off; + __u32 sub_insn_off; +}; + /* * bpf_prog should be a better name but it has been used in * linux/filter.h. 
@@ -494,6 +510,9 @@ struct bpf_program { __u32 line_info_cnt; __u32 prog_flags; __u8 hash[SHA256_DIGEST_LENGTH]; + + struct bpf_light_subprog *subprogs; + __u32 subprog_cnt; }; struct bpf_struct_ops { @@ -667,6 +686,7 @@ struct elf_state { int symbols_shndx; bool has_st_ops; int arena_data_shndx; + int jumptables_data_shndx; }; struct usdt_manager; @@ -738,6 +758,16 @@ struct bpf_object { void *arena_data; size_t arena_data_sz; + void *jumptables_data; + size_t jumptables_data_sz; + + struct { + struct bpf_program *prog; + int sym_off; + int fd; + } *jumptable_maps; + size_t jumptable_map_cnt; + struct kern_feature_cache *feat_cache; char *token_path; int token_fd; @@ -764,6 +794,7 @@ void bpf_program__unload(struct bpf_program *prog) zfree(&prog->func_info); zfree(&prog->line_info); + zfree(&prog->subprogs); } static void bpf_program__exit(struct bpf_program *prog) @@ -2996,7 +3027,7 @@ static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict, scn = elf_sec_by_idx(obj, obj->efile.btf_maps_shndx); data = elf_sec_data(obj, scn); - if (!scn || !data) { + if (!data) { pr_warn("elf: failed to get %s map definitions for %s\n", MAPS_ELF_SEC, obj->path); return -EINVAL; @@ -3942,6 +3973,13 @@ static int bpf_object__elf_collect(struct bpf_object *obj) } else if (strcmp(name, ARENA_SEC) == 0) { obj->efile.arena_data = data; obj->efile.arena_data_shndx = idx; + } else if (strcmp(name, JUMPTABLES_SEC) == 0) { + obj->jumptables_data = malloc(data->d_size); + if (!obj->jumptables_data) + return -ENOMEM; + memcpy(obj->jumptables_data, data->d_buf, data->d_size); + obj->jumptables_data_sz = data->d_size; + obj->efile.jumptables_data_shndx = idx; } else { pr_info("elf: skipping unrecognized data section(%d) %s\n", idx, name); @@ -4634,6 +4672,16 @@ static int bpf_program__record_reloc(struct bpf_program *prog, return 0; } + /* jump table data relocation */ + if (shdr_idx == obj->efile.jumptables_data_shndx) { + reloc_desc->type = RELO_INSN_ARRAY; + 
reloc_desc->insn_idx = insn_idx; + reloc_desc->map_idx = -1; + reloc_desc->sym_off = sym->st_value; + reloc_desc->sym_size = sym->st_size; + return 0; + } + /* generic map reference relocation */ if (type == LIBBPF_MAP_UNSPEC) { if (!bpf_object__shndx_is_maps(obj, shdr_idx)) { @@ -6144,6 +6192,157 @@ static void poison_kfunc_call(struct bpf_program *prog, int relo_idx, insn->imm = POISON_CALL_KFUNC_BASE + ext_idx; } +static int find_jt_map(struct bpf_object *obj, struct bpf_program *prog, int sym_off) +{ + size_t i; + + for (i = 0; i < obj->jumptable_map_cnt; i++) { + /* + * This might happen that same offset is used for two different + * programs (as jump tables can be the same). However, for + * different programs different maps should be created. + */ + if (obj->jumptable_maps[i].sym_off == sym_off && + obj->jumptable_maps[i].prog == prog) + return obj->jumptable_maps[i].fd; + } + + return -ENOENT; +} + +static int add_jt_map(struct bpf_object *obj, struct bpf_program *prog, int sym_off, int map_fd) +{ + size_t cnt = obj->jumptable_map_cnt; + size_t size = sizeof(obj->jumptable_maps[0]); + void *tmp; + + tmp = libbpf_reallocarray(obj->jumptable_maps, cnt + 1, size); + if (!tmp) + return -ENOMEM; + + obj->jumptable_maps = tmp; + obj->jumptable_maps[cnt].prog = prog; + obj->jumptable_maps[cnt].sym_off = sym_off; + obj->jumptable_maps[cnt].fd = map_fd; + obj->jumptable_map_cnt++; + + return 0; +} + +static int find_subprog_idx(struct bpf_program *prog, int insn_idx) +{ + int i; + + for (i = prog->subprog_cnt - 1; i >= 0; i--) { + if (insn_idx >= prog->subprogs[i].sub_insn_off) + return i; + } + + return -1; +} + +static int create_jt_map(struct bpf_object *obj, struct bpf_program *prog, struct reloc_desc *relo) +{ + const __u32 jt_entry_size = 8; + int sym_off = relo->sym_off; + int jt_size = relo->sym_size; + __u32 max_entries = jt_size / jt_entry_size; + __u32 value_size = sizeof(struct bpf_insn_array_value); + struct bpf_insn_array_value val = {}; + int 
subprog_idx; + int map_fd, err; + __u64 insn_off; + __u64 *jt; + __u32 i; + + map_fd = find_jt_map(obj, prog, sym_off); + if (map_fd >= 0) + return map_fd; + + if (sym_off % jt_entry_size) { + pr_warn("map '.jumptables': jumptable start %d should be multiple of %u\n", + sym_off, jt_entry_size); + return -EINVAL; + } + + if (jt_size % jt_entry_size) { + pr_warn("map '.jumptables': jumptable size %d should be multiple of %u\n", + jt_size, jt_entry_size); + return -EINVAL; + } + + map_fd = bpf_map_create(BPF_MAP_TYPE_INSN_ARRAY, ".jumptables", + 4, value_size, max_entries, NULL); + if (map_fd < 0) + return map_fd; + + if (!obj->jumptables_data) { + pr_warn("map '.jumptables': ELF file is missing jump table data\n"); + err = -EINVAL; + goto err_close; + } + if (sym_off + jt_size > obj->jumptables_data_sz) { + pr_warn("map '.jumptables': jumptables_data size is %zd, trying to access %d\n", + obj->jumptables_data_sz, sym_off + jt_size); + err = -EINVAL; + goto err_close; + } + + subprog_idx = -1; /* main program */ + if (relo->insn_idx < 0 || relo->insn_idx >= prog->insns_cnt) { + pr_warn("map '.jumptables': invalid instruction index %d\n", relo->insn_idx); + err = -EINVAL; + goto err_close; + } + if (prog->subprogs) + subprog_idx = find_subprog_idx(prog, relo->insn_idx); + + jt = (__u64 *)(obj->jumptables_data + sym_off); + for (i = 0; i < max_entries; i++) { + /* + * The offset should be made to be relative to the beginning of + * the main function, not the subfunction. + */ + insn_off = jt[i]/sizeof(struct bpf_insn); + if (subprog_idx >= 0) { + insn_off -= prog->subprogs[subprog_idx].sec_insn_off; + insn_off += prog->subprogs[subprog_idx].sub_insn_off; + } else { + insn_off -= prog->sec_insn_off; + } + + /* + * LLVM-generated jump tables contain u64 records, however + * should contain values that fit in u32. 
+ */ + if (insn_off > UINT32_MAX) { + pr_warn("map '.jumptables': invalid jump table value 0x%llx at offset %d\n", + (long long)jt[i], sym_off + i * jt_entry_size); + err = -EINVAL; + goto err_close; + } + + val.orig_off = insn_off; + err = bpf_map_update_elem(map_fd, &i, &val, 0); + if (err) + goto err_close; + } + + err = bpf_map_freeze(map_fd); + if (err) + goto err_close; + + err = add_jt_map(obj, prog, sym_off, map_fd); + if (err) + goto err_close; + + return map_fd; + +err_close: + close(map_fd); + return err; +} + /* Relocate data references within program code: * - map references; * - global variable references; @@ -6235,6 +6434,20 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog) case RELO_CORE: /* will be handled by bpf_program_record_relos() */ break; + case RELO_INSN_ARRAY: { + int map_fd; + + map_fd = create_jt_map(obj, prog, relo); + if (map_fd < 0) { + pr_warn("prog '%s': relo #%d: can't create jump table: sym_off %u\n", + prog->name, i, relo->sym_off); + return map_fd; + } + insn[0].src_reg = BPF_PSEUDO_MAP_VALUE; + insn->imm = map_fd; + insn->off = 0; + } + break; default: pr_warn("prog '%s': relo #%d: bad relo type %d\n", prog->name, i, relo->type); @@ -6432,36 +6645,62 @@ static int append_subprog_relos(struct bpf_program *main_prog, struct bpf_progra return 0; } +static int save_subprog_offsets(struct bpf_program *main_prog, struct bpf_program *subprog) +{ + size_t size = sizeof(main_prog->subprogs[0]); + int cnt = main_prog->subprog_cnt; + void *tmp; + + tmp = libbpf_reallocarray(main_prog->subprogs, cnt + 1, size); + if (!tmp) + return -ENOMEM; + + main_prog->subprogs = tmp; + main_prog->subprogs[cnt].sec_insn_off = subprog->sec_insn_off; + main_prog->subprogs[cnt].sub_insn_off = subprog->sub_insn_off; + main_prog->subprog_cnt++; + + return 0; +} + static int bpf_object__append_subprog_code(struct bpf_object *obj, struct bpf_program *main_prog, struct bpf_program *subprog) { - struct bpf_insn *insns; - size_t 
new_cnt; - int err; + struct bpf_insn *insns; + size_t new_cnt; + int err; - subprog->sub_insn_off = main_prog->insns_cnt; + subprog->sub_insn_off = main_prog->insns_cnt; - new_cnt = main_prog->insns_cnt + subprog->insns_cnt; - insns = libbpf_reallocarray(main_prog->insns, new_cnt, sizeof(*insns)); - if (!insns) { - pr_warn("prog '%s': failed to realloc prog code\n", main_prog->name); - return -ENOMEM; - } - main_prog->insns = insns; - main_prog->insns_cnt = new_cnt; + new_cnt = main_prog->insns_cnt + subprog->insns_cnt; + insns = libbpf_reallocarray(main_prog->insns, new_cnt, sizeof(*insns)); + if (!insns) { + pr_warn("prog '%s': failed to realloc prog code\n", main_prog->name); + return -ENOMEM; + } + main_prog->insns = insns; + main_prog->insns_cnt = new_cnt; - memcpy(main_prog->insns + subprog->sub_insn_off, subprog->insns, - subprog->insns_cnt * sizeof(*insns)); + memcpy(main_prog->insns + subprog->sub_insn_off, subprog->insns, + subprog->insns_cnt * sizeof(*insns)); - pr_debug("prog '%s': added %zu insns from sub-prog '%s'\n", - main_prog->name, subprog->insns_cnt, subprog->name); + pr_debug("prog '%s': added %zu insns from sub-prog '%s'\n", + main_prog->name, subprog->insns_cnt, subprog->name); - /* The subprog insns are now appended. Append its relos too. */ - err = append_subprog_relos(main_prog, subprog); - if (err) - return err; - return 0; + /* The subprog insns are now appended. Append its relos too. 
*/ + err = append_subprog_relos(main_prog, subprog); + if (err) + return err; + + err = save_subprog_offsets(main_prog, subprog); + if (err) { + pr_warn("prog '%s': failed to add subprog offsets: %s\n", + main_prog->name, errstr(err)); + return err; + } + + return 0; } static int @@ -9228,6 +9467,13 @@ void bpf_object__close(struct bpf_object *obj) zfree(&obj->arena_data); + zfree(&obj->jumptables_data); + obj->jumptables_data_sz = 0; + + for (i = 0; i < obj->jumptable_map_cnt; i++) + close(obj->jumptable_maps[i].fd); + zfree(&obj->jumptable_maps); + free(obj); } @@ -11325,8 +11571,6 @@ static const char *arch_specific_syscall_pfx(void) return "ia32"; #elif defined(__s390x__) return "s390x"; -#elif defined(__s390__) - return "s390"; #elif defined(__arm__) return "arm"; #elif defined(__aarch64__) @@ -12113,8 +12357,6 @@ static const char *arch_specific_lib_paths(void) return "/lib/i386-linux-gnu"; #elif defined(__s390x__) return "/lib/s390x-linux-gnu"; -#elif defined(__s390__) - return "/lib/s390-linux-gnu"; #elif defined(__arm__) && defined(__SOFTFP__) return "/lib/arm-linux-gnueabi"; #elif defined(__arm__) && !defined(__SOFTFP__) @@ -13858,8 +14100,8 @@ int bpf_program__set_attach_target(struct bpf_program *prog, return libbpf_err(-EINVAL); if (attach_prog_fd && !attach_func_name) { - /* remember attach_prog_fd and let bpf_program__load() find - * BTF ID during the program load + /* Store attach_prog_fd. The BTF ID will be resolved later during + * the normal object/program load phase. */ prog->attach_prog_fd = attach_prog_fd; return 0; diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 5118d0a90e24..65e68e964b89 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -448,7 +448,7 @@ LIBBPF_API int bpf_program__pin(struct bpf_program *prog, const char *path); /** * @brief **bpf_program__unpin()** unpins the BPF program from a file - * in the BPFFS specified by a path. This decrements the programs + * in the BPFFS specified by a path. 
This decrements program's in-kernel * reference count. * * The file pinning the BPF program can also be unlinked by a different @@ -481,14 +481,12 @@ LIBBPF_API int bpf_link__pin(struct bpf_link *link, const char *path); /** * @brief **bpf_link__unpin()** unpins the BPF link from a file - * in the BPFFS specified by a path. This decrements the links - * reference count. + * in the BPFFS. This decrements link's in-kernel reference count. * * The file pinning the BPF link can also be unlinked by a different * process in which case this function will return an error. * - * @param prog BPF program to unpin - * @param path file path to the pin in a BPF file system + * @param link BPF link to unpin * @return 0, on success; negative error code, otherwise */ LIBBPF_API int bpf_link__unpin(struct bpf_link *link); @@ -995,8 +993,13 @@ LIBBPF_API __u32 bpf_program__line_info_cnt(const struct bpf_program *prog); * - fentry/fexit/fmod_ret; * - lsm; * - freplace. - * @param prog BPF program to set the attach type for - * @param type attach type to set the BPF map to have + * @param prog BPF program to configure; must be not yet loaded. + * @param attach_prog_fd FD of target BPF program (for freplace/extension). + * If >0 and func name omitted, defers BTF ID resolution. + * @param attach_func_name Target function name. Used either with + * attach_prog_fd to find destination BTF type ID in that BPF program, or + * alone (no attach_prog_fd) to resolve kernel (vmlinux/module) BTF ID. + * Must be provided if attach_prog_fd is 0. * @return error code; or 0 if no error occurred. */ LIBBPF_API int @@ -1098,6 +1101,7 @@ LIBBPF_API __u32 bpf_map__value_size(const struct bpf_map *map); /** * @brief **bpf_map__set_value_size()** sets map value size. 
* @param map the BPF map instance + * @param size the new value size * @return 0, on success; negative error, otherwise * * There is a special case for maps with associated memory-mapped regions, like @@ -1202,7 +1206,7 @@ LIBBPF_API struct bpf_map *bpf_map__inner_map(struct bpf_map *map); * per-CPU values value size has to be aligned up to closest 8 bytes for * alignment reasons, so expected size is: `round_up(value_size, 8) * * libbpf_num_possible_cpus()`. - * @flags extra flags passed to kernel for this operation + * @param flags extra flags passed to kernel for this operation * @return 0, on success; negative error, otherwise * * **bpf_map__lookup_elem()** is high-level equivalent of @@ -1226,7 +1230,7 @@ LIBBPF_API int bpf_map__lookup_elem(const struct bpf_map *map, * per-CPU values value size has to be aligned up to closest 8 bytes for * alignment reasons, so expected size is: `round_up(value_size, 8) * * libbpf_num_possible_cpus()`. - * @flags extra flags passed to kernel for this operation + * @param flags extra flags passed to kernel for this operation * @return 0, on success; negative error, otherwise * * **bpf_map__update_elem()** is high-level equivalent of @@ -1242,7 +1246,7 @@ LIBBPF_API int bpf_map__update_elem(const struct bpf_map *map, * @param map BPF map to delete element from * @param key pointer to memory containing bytes of the key * @param key_sz size in bytes of key data, needs to match BPF map definition's **key_size** - * @flags extra flags passed to kernel for this operation + * @param flags extra flags passed to kernel for this operation * @return 0, on success; negative error, otherwise * * **bpf_map__delete_elem()** is high-level equivalent of @@ -1265,7 +1269,7 @@ LIBBPF_API int bpf_map__delete_elem(const struct bpf_map *map, * per-CPU values value size has to be aligned up to closest 8 bytes for * alignment reasons, so expected size is: `round_up(value_size, 8) * * libbpf_num_possible_cpus()`. 
- * @flags extra flags passed to kernel for this operation + * @param flags extra flags passed to kernel for this operation * @return 0, on success; negative error, otherwise * * **bpf_map__lookup_and_delete_elem()** is high-level equivalent of @@ -1637,6 +1641,7 @@ struct perf_buffer_opts { * @param sample_cb function called on each received data record * @param lost_cb function called when record loss has occurred * @param ctx user-provided extra context passed into *sample_cb* and *lost_cb* + * @param opts optional parameters for the perf buffer, can be null * @return a new instance of struct perf_buffer on success, NULL on error with * *errno* containing an error code */ diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 35b2527bedec..fc59b21b51b5 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -74,6 +74,8 @@ #define ELF64_ST_VISIBILITY(o) ((o) & 0x03) #endif +#define JUMPTABLES_SEC ".jumptables" + #define BTF_INFO_ENC(kind, kind_flag, vlen) \ ((!!(kind_flag) << 31) | ((kind) << 24) | ((vlen) & BTF_MAX_VLEN)) #define BTF_TYPE_ENC(name, info, size_or_type) (name), (info), (size_or_type) diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c index 9dfbe7750f56..bccf4bb747e1 100644 --- a/tools/lib/bpf/libbpf_probes.c +++ b/tools/lib/bpf/libbpf_probes.c @@ -364,6 +364,10 @@ static int probe_map_create(enum bpf_map_type map_type) case BPF_MAP_TYPE_SOCKHASH: case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: break; + case BPF_MAP_TYPE_INSN_ARRAY: + key_size = sizeof(__u32); + value_size = sizeof(struct bpf_insn_array_value); + break; case BPF_MAP_TYPE_UNSPEC: default: return -EOPNOTSUPP; diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c index 56ae77047bc3..f4403e3cf994 100644 --- a/tools/lib/bpf/linker.c +++ b/tools/lib/bpf/linker.c @@ -2025,6 +2025,9 @@ static int linker_append_elf_sym(struct bpf_linker *linker, struct src_obj *obj, obj->sym_map[src_sym_idx] = 
dst_sec->sec_sym_idx; return 0; } + + if (strcmp(src_sec->sec_name, JUMPTABLES_SEC) == 0) + goto add_sym; } if (sym_bind == STB_LOCAL) diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c index c174b4086673..d1524f6f54ae 100644 --- a/tools/lib/bpf/usdt.c +++ b/tools/lib/bpf/usdt.c @@ -1376,8 +1376,6 @@ static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec #elif defined(__s390x__) -/* Do not support __s390__ for now, since user_pt_regs is broken with -m31. */ - static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz) { unsigned int reg; diff --git a/tools/docs/lib/__init__.py b/tools/lib/python/__init__.py index e69de29bb2d1..e69de29bb2d1 100644 --- a/tools/docs/lib/__init__.py +++ b/tools/lib/python/__init__.py diff --git a/tools/lib/python/abi/__init__.py b/tools/lib/python/abi/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 --- /dev/null +++ b/tools/lib/python/abi/__init__.py diff --git a/tools/lib/python/abi/abi_parser.py b/tools/lib/python/abi/abi_parser.py new file mode 100644 index 000000000000..9b8db70067ef --- /dev/null +++ b/tools/lib/python/abi/abi_parser.py @@ -0,0 +1,628 @@ +#!/usr/bin/env python3 +# pylint: disable=R0902,R0903,R0911,R0912,R0913,R0914,R0915,R0917,C0302 +# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>. +# SPDX-License-Identifier: GPL-2.0 + +""" +Parse ABI documentation and produce results from it. 
+""" + +from argparse import Namespace +import logging +import os +import re + +from pprint import pformat +from random import randrange, seed + +# Import Python modules + +from abi.helpers import AbiDebug, ABI_DIR + + +class AbiParser: + """Main class to parse ABI files""" + + TAGS = r"(what|where|date|kernelversion|contact|description|users)" + XREF = r"(?:^|\s|\()(\/(?:sys|config|proc|dev|kvd)\/[^,.:;\)\s]+)(?:[,.:;\)\s]|\Z)" + + def __init__(self, directory, logger=None, + enable_lineno=False, show_warnings=True, debug=0): + """Stores arguments for the class and initialize class vars""" + + self.directory = directory + self.enable_lineno = enable_lineno + self.show_warnings = show_warnings + self.debug = debug + + if not logger: + self.log = logging.getLogger("get_abi") + else: + self.log = logger + + self.data = {} + self.what_symbols = {} + self.file_refs = {} + self.what_refs = {} + + # Ignore files that contain such suffixes + self.ignore_suffixes = (".rej", ".org", ".orig", ".bak", "~") + + # Regular expressions used on parser + self.re_abi_dir = re.compile(r"(.*)" + ABI_DIR) + self.re_tag = re.compile(r"(\S+)(:\s*)(.*)", re.I) + self.re_valid = re.compile(self.TAGS) + self.re_start_spc = re.compile(r"(\s*)(\S.*)") + self.re_whitespace = re.compile(r"^\s+") + + # Regular used on print + self.re_what = re.compile(r"(\/?(?:[\w\-]+\/?){1,2})") + self.re_escape = re.compile(r"([\.\x01-\x08\x0e-\x1f\x21-\x2f\x3a-\x40\x7b-\xff])") + self.re_unprintable = re.compile(r"([\x00-\x2f\x3a-\x40\x5b-\x60\x7b-\xff]+)") + self.re_title_mark = re.compile(r"\n[\-\*\=\^\~]+\n") + self.re_doc = re.compile(r"Documentation/(?!devicetree)(\S+)\.rst") + self.re_abi = re.compile(r"(Documentation/ABI/)([\w\/\-]+)") + self.re_xref_node = re.compile(self.XREF) + + def warn(self, fdata, msg, extra=None): + """Displays a parse error if warning is enabled""" + + if not self.show_warnings: + return + + msg = f"{fdata.fname}:{fdata.ln}: {msg}" + if extra: + msg += "\n\t\t" + extra + + 
self.log.warning(msg) + + def add_symbol(self, what, fname, ln=None, xref=None): + """Create a reference table describing where each 'what' is located""" + + if what not in self.what_symbols: + self.what_symbols[what] = {"file": {}} + + if fname not in self.what_symbols[what]["file"]: + self.what_symbols[what]["file"][fname] = [] + + if ln and ln not in self.what_symbols[what]["file"][fname]: + self.what_symbols[what]["file"][fname].append(ln) + + if xref: + self.what_symbols[what]["xref"] = xref + + def _parse_line(self, fdata, line): + """Parse a single line of an ABI file""" + + new_what = False + new_tag = False + content = None + + match = self.re_tag.match(line) + if match: + new = match.group(1).lower() + sep = match.group(2) + content = match.group(3) + + match = self.re_valid.search(new) + if match: + new_tag = match.group(1) + else: + if fdata.tag == "description": + # New "tag" is actually part of description. + # Don't consider it a tag + new_tag = False + elif fdata.tag != "": + self.warn(fdata, f"tag '{fdata.tag}' is invalid", line) + + if new_tag: + # "where" is Invalid, but was a common mistake. Warn if found + if new_tag == "where": + self.warn(fdata, "tag 'Where' is invalid. 
Should be 'What:' instead") + new_tag = "what" + + if new_tag == "what": + fdata.space = None + + if content not in self.what_symbols: + self.add_symbol(what=content, fname=fdata.fname, ln=fdata.ln) + + if fdata.tag == "what": + fdata.what.append(content.strip("\n")) + else: + if fdata.key: + if "description" not in self.data.get(fdata.key, {}): + self.warn(fdata, f"{fdata.key} doesn't have a description") + + for w in fdata.what: + self.add_symbol(what=w, fname=fdata.fname, + ln=fdata.what_ln, xref=fdata.key) + + fdata.label = content + new_what = True + + key = "abi_" + content.lower() + fdata.key = self.re_unprintable.sub("_", key).strip("_") + + # Avoid duplicated keys but using a defined seed, to make + # the namespace identical if there aren't changes at the + # ABI symbols + seed(42) + + while fdata.key in self.data: + char = randrange(0, 51) + ord("A") + if char > ord("Z"): + char += ord("a") - ord("Z") - 1 + + fdata.key += chr(char) + + if fdata.key and fdata.key not in self.data: + self.data[fdata.key] = { + "what": [content], + "file": [fdata.file_ref], + "path": fdata.ftype, + "line_no": fdata.ln, + } + + fdata.what = self.data[fdata.key]["what"] + + self.what_refs[content] = fdata.key + fdata.tag = new_tag + fdata.what_ln = fdata.ln + + if fdata.nametag["what"]: + t = (content, fdata.key) + if t not in fdata.nametag["symbols"]: + fdata.nametag["symbols"].append(t) + + return + + if fdata.tag and new_tag: + fdata.tag = new_tag + + if new_what: + fdata.label = "" + + if "description" in self.data[fdata.key]: + self.data[fdata.key]["description"] += "\n\n" + + if fdata.file_ref not in self.data[fdata.key]["file"]: + self.data[fdata.key]["file"].append(fdata.file_ref) + + if self.debug == AbiDebug.WHAT_PARSING: + self.log.debug("what: %s", fdata.what) + + if not fdata.what: + self.warn(fdata, "'What:' should come first:", line) + return + + if new_tag == "description": + fdata.space = None + + if content: + sep = sep.replace(":", " ") + + c = " " * 
len(new_tag) + sep + content + c = c.expandtabs() + + match = self.re_start_spc.match(c) + if match: + # Preserve initial spaces for the first line + fdata.space = match.group(1) + content = match.group(2) + "\n" + + self.data[fdata.key][fdata.tag] = content + + return + + # Store any contents before tags at the database + if not fdata.tag and "what" in fdata.nametag: + fdata.nametag["description"] += line + return + + if fdata.tag == "description": + content = line.expandtabs() + + if self.re_whitespace.sub("", content) == "": + self.data[fdata.key][fdata.tag] += "\n" + return + + if fdata.space is None: + match = self.re_start_spc.match(content) + if match: + # Preserve initial spaces for the first line + fdata.space = match.group(1) + + content = match.group(2) + "\n" + else: + if content.startswith(fdata.space): + content = content[len(fdata.space):] + + else: + fdata.space = "" + + if fdata.tag == "what": + w = content.strip("\n") + if w: + self.data[fdata.key][fdata.tag].append(w) + else: + self.data[fdata.key][fdata.tag] += content + return + + content = line.strip() + if fdata.tag: + if fdata.tag == "what": + w = content.strip("\n") + if w: + self.data[fdata.key][fdata.tag].append(w) + else: + self.data[fdata.key][fdata.tag] += "\n" + content.rstrip("\n") + return + + # Everything else is error + if content: + self.warn(fdata, "Unexpected content", line) + + def parse_readme(self, nametag, fname): + """Parse ABI README file""" + + nametag["what"] = ["Introduction"] + nametag["path"] = "README" + with open(fname, "r", encoding="utf8", errors="backslashreplace") as fp: + for line in fp: + match = self.re_tag.match(line) + if match: + new = match.group(1).lower() + + match = self.re_valid.search(new) + if match: + nametag["description"] += "\n:" + line + continue + + nametag["description"] += line + + def parse_file(self, fname, path, basename): + """Parse a single file""" + + ref = f"abi_file_{path}_{basename}" + ref = self.re_unprintable.sub("_", 
ref).strip("_") + + # Store per-file state into a namespace variable. This will be used + # by the per-line parser state machine and by the warning function. + fdata = Namespace + + fdata.fname = fname + fdata.name = basename + + pos = fname.find(ABI_DIR) + if pos > 0: + f = fname[pos:] + else: + f = fname + + fdata.file_ref = (f, ref) + self.file_refs[f] = ref + + fdata.ln = 0 + fdata.what_ln = 0 + fdata.tag = "" + fdata.label = "" + fdata.what = [] + fdata.key = None + fdata.xrefs = None + fdata.space = None + fdata.ftype = path.split("/")[0] + + fdata.nametag = {} + fdata.nametag["what"] = [f"ABI file {path}/{basename}"] + fdata.nametag["type"] = "File" + fdata.nametag["path"] = fdata.ftype + fdata.nametag["file"] = [fdata.file_ref] + fdata.nametag["line_no"] = 1 + fdata.nametag["description"] = "" + fdata.nametag["symbols"] = [] + + self.data[ref] = fdata.nametag + + if self.debug & AbiDebug.WHAT_OPEN: + self.log.debug("Opening file %s", fname) + + if basename == "README": + self.parse_readme(fdata.nametag, fname) + return + + with open(fname, "r", encoding="utf8", errors="backslashreplace") as fp: + for line in fp: + fdata.ln += 1 + + self._parse_line(fdata, line) + + if "description" in fdata.nametag: + fdata.nametag["description"] = fdata.nametag["description"].lstrip("\n") + + if fdata.key: + if "description" not in self.data.get(fdata.key, {}): + self.warn(fdata, f"{fdata.key} doesn't have a description") + + for w in fdata.what: + self.add_symbol(what=w, fname=fname, xref=fdata.key) + + def _parse_abi(self, root=None): + """Internal function to parse documentation ABI recursively""" + + if not root: + root = self.directory + + with os.scandir(root) as obj: + for entry in obj: + name = os.path.join(root, entry.name) + + if entry.is_dir(): + self._parse_abi(name) + continue + + if not entry.is_file(): + continue + + basename = os.path.basename(name) + + if basename.startswith("."): + continue + + if basename.endswith(self.ignore_suffixes): + continue + + 
path = self.re_abi_dir.sub("", os.path.dirname(name)) + + self.parse_file(name, path, basename) + + def parse_abi(self, root=None): + """Parse documentation ABI""" + + self._parse_abi(root) + + if self.debug & AbiDebug.DUMP_ABI_STRUCTS: + self.log.debug(pformat(self.data)) + + def desc_txt(self, desc): + """Print description as found inside ABI files""" + + desc = desc.strip(" \t\n") + + return desc + "\n\n" + + def xref(self, fname): + """ + Converts a Documentation/ABI + basename into a ReST cross-reference + """ + + xref = self.file_refs.get(fname) + if not xref: + return None + else: + return xref + + def desc_rst(self, desc): + """Enrich ReST output by creating cross-references""" + + # Remove title markups from the description + # Having titles inside ABI files will only work if extra + # care would be taken in order to strictly follow the same + # level order for each markup. + desc = self.re_title_mark.sub("\n\n", "\n" + desc) + desc = desc.rstrip(" \t\n").lstrip("\n") + + # Python's regex performance for non-compiled expressions is a lot + # than Perl, as Perl automatically caches them at their + # first usage. Here, we'll need to do the same, as otherwise the + # performance penalty is be high + + new_desc = "" + for d in desc.split("\n"): + if d == "": + new_desc += "\n" + continue + + # Use cross-references for doc files where needed + d = self.re_doc.sub(r":doc:`/\1`", d) + + # Use cross-references for ABI generated docs where needed + matches = self.re_abi.findall(d) + for m in matches: + abi = m[0] + m[1] + + xref = self.file_refs.get(abi) + if not xref: + # This may happen if ABI is on a separate directory, + # like parsing ABI testing and symbol is at stable. 
+ # The proper solution is to move this part of the code + # for it to be inside sphinx/kernel_abi.py + self.log.info("Didn't find ABI reference for '%s'", abi) + else: + new = self.re_escape.sub(r"\\\1", m[1]) + d = re.sub(fr"\b{abi}\b", f":ref:`{new} <{xref}>`", d) + + # Seek for cross reference symbols like /sys/... + # Need to be careful to avoid doing it on a code block + if d[0] not in [" ", "\t"]: + matches = self.re_xref_node.findall(d) + for m in matches: + # Finding ABI here is more complex due to wildcards + xref = self.what_refs.get(m) + if xref: + new = self.re_escape.sub(r"\\\1", m) + d = re.sub(fr"\b{m}\b", f":ref:`{new} <{xref}>`", d) + + new_desc += d + "\n" + + return new_desc + "\n\n" + + def doc(self, output_in_txt=False, show_symbols=True, show_file=True, + filter_path=None): + """Print ABI at stdout""" + + part = None + for key, v in sorted(self.data.items(), + key=lambda x: (x[1].get("type", ""), + x[1].get("what"))): + + wtype = v.get("type", "Symbol") + file_ref = v.get("file") + names = v.get("what", [""]) + + if wtype == "File": + if not show_file: + continue + else: + if not show_symbols: + continue + + if filter_path: + if v.get("path") != filter_path: + continue + + msg = "" + + if wtype != "File": + cur_part = names[0] + if cur_part.find("/") >= 0: + match = self.re_what.match(cur_part) + if match: + symbol = match.group(1).rstrip("/") + cur_part = "Symbols under " + symbol + + if cur_part and cur_part != part: + part = cur_part + msg += part + "\n"+ "-" * len(part) +"\n\n" + + msg += f".. _{key}:\n\n" + + max_len = 0 + for i in range(0, len(names)): # pylint: disable=C0200 + names[i] = "**" + self.re_escape.sub(r"\\\1", names[i]) + "**" + + max_len = max(max_len, len(names[i])) + + msg += "+-" + "-" * max_len + "-+\n" + for name in names: + msg += f"| {name}" + " " * (max_len - len(name)) + " |\n" + msg += "+-" + "-" * max_len + "-+\n" + msg += "\n" + + for ref in file_ref: + if wtype == "File": + msg += f".. 
_{ref[1]}:\n\n" + else: + base = os.path.basename(ref[0]) + msg += f"Defined on file :ref:`{base} <{ref[1]}>`\n\n" + + if wtype == "File": + msg += names[0] +"\n" + "-" * len(names[0]) +"\n\n" + + desc = v.get("description") + if not desc and wtype != "File": + msg += f"DESCRIPTION MISSING for {names[0]}\n\n" + + if desc: + if output_in_txt: + msg += self.desc_txt(desc) + else: + msg += self.desc_rst(desc) + + symbols = v.get("symbols") + if symbols: + msg += "Has the following ABI:\n\n" + + for w, label in symbols: + # Escape special chars from content + content = self.re_escape.sub(r"\\\1", w) + + msg += f"- :ref:`{content} <{label}>`\n\n" + + users = v.get("users") + if users and users.strip(" \t\n"): + users = users.strip("\n").replace('\n', '\n\t') + msg += f"Users:\n\t{users}\n\n" + + ln = v.get("line_no", 1) + + yield (msg, file_ref[0][0], ln) + + def check_issues(self): + """Warn about duplicated ABI entries""" + + for what, v in self.what_symbols.items(): + files = v.get("file") + if not files: + # Should never happen if the parser works properly + self.log.warning("%s doesn't have a file associated", what) + continue + + if len(files) == 1: + continue + + f = [] + for fname, lines in sorted(files.items()): + if not lines: + f.append(f"{fname}") + elif len(lines) == 1: + f.append(f"{fname}:{lines[0]}") + else: + m = fname + "lines " + m += ", ".join(str(x) for x in lines) + f.append(m) + + self.log.warning("%s is defined %d times: %s", what, len(f), "; ".join(f)) + + def search_symbols(self, expr): + """ Searches for ABI symbols """ + + regex = re.compile(expr, re.I) + + found_keys = 0 + for t in sorted(self.data.items(), key=lambda x: [0]): + v = t[1] + + wtype = v.get("type", "") + if wtype == "File": + continue + + for what in v.get("what", [""]): + if regex.search(what): + found_keys += 1 + + kernelversion = v.get("kernelversion", "").strip(" \t\n") + date = v.get("date", "").strip(" \t\n") + contact = v.get("contact", "").strip(" \t\n") + users = 
v.get("users", "").strip(" \t\n") + desc = v.get("description", "").strip(" \t\n") + + files = [] + for f in v.get("file", ()): + files.append(f[0]) + + what = str(found_keys) + ". " + what + title_tag = "-" * len(what) + + print(f"\n{what}\n{title_tag}\n") + + if kernelversion: + print(f"Kernel version:\t\t{kernelversion}") + + if date: + print(f"Date:\t\t\t{date}") + + if contact: + print(f"Contact:\t\t{contact}") + + if users: + print(f"Users:\t\t\t{users}") + + print("Defined on file(s):\t" + ", ".join(files)) + + if desc: + desc = desc.strip("\n") + print(f"\n{desc}\n") + + if not found_keys: + print(f"Regular expression /{expr}/ not found.") diff --git a/tools/lib/python/abi/abi_regex.py b/tools/lib/python/abi/abi_regex.py new file mode 100644 index 000000000000..d5553206de3c --- /dev/null +++ b/tools/lib/python/abi/abi_regex.py @@ -0,0 +1,234 @@ +#!/usr/bin/env python3 +# xxpylint: disable=R0903 +# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>. +# SPDX-License-Identifier: GPL-2.0 + +""" +Convert ABI what into regular expressions +""" + +import re +import sys + +from pprint import pformat + +from abi.abi_parser import AbiParser +from abi.helpers import AbiDebug + +class AbiRegex(AbiParser): + """Extends AbiParser to search ABI nodes with regular expressions""" + + # Escape only ASCII visible characters + escape_symbols = r"([\x21-\x29\x2b-\x2d\x3a-\x40\x5c\x60\x7b-\x7e])" + leave_others = "others" + + # Tuples with regular expressions to be compiled and replacement data + re_whats = [ + # Drop escape characters that might exist + (re.compile("\\\\"), ""), + + # Temporarily escape dot characters + (re.compile(r"\."), "\xf6"), + + # Temporarily change [0-9]+ type of patterns + (re.compile(r"\[0\-9\]\+"), "\xff"), + + # Temporarily change [\d+-\d+] type of patterns + (re.compile(r"\[0\-\d+\]"), "\xff"), + (re.compile(r"\[0:\d+\]"), "\xff"), + (re.compile(r"\[(\d+)\]"), "\xf4\\\\d+\xf5"), + + # Temporarily change [0-9] type of patterns + 
(re.compile(r"\[(\d)\-(\d)\]"), "\xf4\1-\2\xf5"), + + # Handle multiple option patterns + (re.compile(r"[\{\<\[]([\w_]+)(?:[,|]+([\w_]+)){1,}[\}\>\]]"), r"(\1|\2)"), + + # Handle wildcards + (re.compile(r"([^\/])\*"), "\\1\\\\w\xf7"), + (re.compile(r"/\*/"), "/.*/"), + (re.compile(r"/\xf6\xf6\xf6"), "/.*"), + (re.compile(r"\<[^\>]+\>"), "\\\\w\xf7"), + (re.compile(r"\{[^\}]+\}"), "\\\\w\xf7"), + (re.compile(r"\[[^\]]+\]"), "\\\\w\xf7"), + + (re.compile(r"XX+"), "\\\\w\xf7"), + (re.compile(r"([^A-Z])[XYZ]([^A-Z])"), "\\1\\\\w\xf7\\2"), + (re.compile(r"([^A-Z])[XYZ]$"), "\\1\\\\w\xf7"), + (re.compile(r"_[AB]_"), "_\\\\w\xf7_"), + + # Recover [0-9] type of patterns + (re.compile(r"\xf4"), "["), + (re.compile(r"\xf5"), "]"), + + # Remove duplicated spaces + (re.compile(r"\s+"), r" "), + + # Special case: drop comparison as in: + # What: foo = <something> + # (this happens on a few IIO definitions) + (re.compile(r"\s*\=.*$"), ""), + + # Escape all other symbols + (re.compile(escape_symbols), r"\\\1"), + (re.compile(r"\\\\"), r"\\"), + (re.compile(r"\\([\[\]\(\)\|])"), r"\1"), + (re.compile(r"(\d+)\\(-\d+)"), r"\1\2"), + + (re.compile(r"\xff"), r"\\d+"), + + # Special case: IIO ABI which a parenthesis. + (re.compile(r"sqrt(.*)"), r"sqrt(.*)"), + + # Simplify regexes with multiple .* + (re.compile(r"(?:\.\*){2,}"), ""), + + # Recover dot characters + (re.compile(r"\xf6"), "\\."), + # Recover plus characters + (re.compile(r"\xf7"), "+"), + ] + re_has_num = re.compile(r"\\d") + + # Symbol name after escape_chars that are considered a devnode basename + re_symbol_name = re.compile(r"(\w|\\[\.\-\:])+$") + + # List of popular group names to be skipped to minimize regex group size + # Use AbiDebug.SUBGROUP_SIZE to detect those + skip_names = set(["devices", "hwmon"]) + + def regex_append(self, what, new): + """ + Get a search group for a subset of regular expressions. + + As ABI may have thousands of symbols, using a for to search all + regular expressions is at least O(n^2). 
When there are wildcards, + the complexity increases substantially, eventually becoming exponential. + + To avoid spending too much time on them, use a logic to split + them into groups. The smaller the group, the better, as it would + mean that searches will be confined to a small number of regular + expressions. + + The conversion to a regex subset is tricky, as we need something + that can be easily obtained from the sysfs symbol and from the + regular expression. So, we need to discard nodes that have + wildcards. + + If it can't obtain a subgroup, place the regular expression inside + a special group (self.leave_others). + """ + + search_group = None + + for search_group in reversed(new.split("/")): + if not search_group or search_group in self.skip_names: + continue + if self.re_symbol_name.match(search_group): + break + + if not search_group: + search_group = self.leave_others + + if self.debug & AbiDebug.SUBGROUP_MAP: + self.log.debug("%s: mapped as %s", what, search_group) + + try: + if search_group not in self.regex_group: + self.regex_group[search_group] = [] + + self.regex_group[search_group].append(re.compile(new)) + if self.search_string: + if what.find(self.search_string) >= 0: + print(f"What: {what}") + except re.PatternError: + self.log.warning("Ignoring '%s' as it produced an invalid regex:\n" + " '%s'", what, new) + + def get_regexes(self, what): + """ + Given an ABI devnode, return a list of all regular expressions that + may match it, based on the sub-groups created by regex_append() + """ + + re_list = [] + + patches = what.split("/") + patches.reverse() + patches.append(self.leave_others) + + for search_group in patches: + if search_group in self.regex_group: + re_list += self.regex_group[search_group] + + return re_list + + def __init__(self, *args, **kwargs): + """ + Override init method to get verbose argument + """ + + self.regex_group = None + self.search_string = None + self.re_string = None + + if "search_string" in kwargs: + 
self.search_string = kwargs.get("search_string") + del kwargs["search_string"] + + if self.search_string: + + try: + self.re_string = re.compile(self.search_string) + except re.PatternError as e: + msg = f"{self.search_string} is not a valid regular expression" + raise ValueError(msg) from e + + super().__init__(*args, **kwargs) + + def parse_abi(self, *args, **kwargs): + + super().parse_abi(*args, **kwargs) + + self.regex_group = {} + + print("Converting ABI What fields into regexes...", file=sys.stderr) + + for t in sorted(self.data.items(), key=lambda x: x[0]): + v = t[1] + if v.get("type") == "File": + continue + + v["regex"] = [] + + for what in v.get("what", []): + if not what.startswith("/sys"): + continue + + new = what + for r, s in self.re_whats: + try: + new = r.sub(s, new) + except re.PatternError as e: + # Help debugging troubles with new regexes + raise re.PatternError(f"{e}\nwhile re.sub('{r.pattern}', {s}, str)") from e + + v["regex"].append(new) + + if self.debug & AbiDebug.REGEX: + self.log.debug("%-90s <== %s", new, what) + + # Store regex into a subgroup to speedup searches + self.regex_append(what, new) + + if self.debug & AbiDebug.SUBGROUP_DICT: + self.log.debug("%s", pformat(self.regex_group)) + + if self.debug & AbiDebug.SUBGROUP_SIZE: + biggestd_keys = sorted(self.regex_group.keys(), + key= lambda k: len(self.regex_group[k]), + reverse=True) + + print("Top regex subgroups:", file=sys.stderr) + for k in biggestd_keys[:10]: + print(f"{k} has {len(self.regex_group[k])} elements", file=sys.stderr) diff --git a/tools/lib/python/abi/helpers.py b/tools/lib/python/abi/helpers.py new file mode 100644 index 000000000000..639b23e4ca33 --- /dev/null +++ b/tools/lib/python/abi/helpers.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python3 +# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>. 
+# pylint: disable=R0903 +# SPDX-License-Identifier: GPL-2.0 + +""" +Helper classes for ABI parser +""" + +ABI_DIR = "Documentation/ABI/" + + +class AbiDebug: + """Debug levels""" + + WHAT_PARSING = 1 + WHAT_OPEN = 2 + DUMP_ABI_STRUCTS = 4 + UNDEFINED = 8 + REGEX = 16 + SUBGROUP_MAP = 32 + SUBGROUP_DICT = 64 + SUBGROUP_SIZE = 128 + GRAPH = 256 + + +DEBUG_HELP = """ +1 - enable debug parsing logic +2 - enable debug messages on file open +4 - enable debug for ABI parse data +8 - enable extra debug information to identify troubles + with ABI symbols found at the local machine that + weren't found on ABI documentation (used only for + undefined subcommand) +16 - enable debug for what to regex conversion +32 - enable debug for symbol regex subgroups +64 - enable debug for sysfs graph tree variable +""" diff --git a/tools/lib/python/abi/system_symbols.py b/tools/lib/python/abi/system_symbols.py new file mode 100644 index 000000000000..4a2554da217b --- /dev/null +++ b/tools/lib/python/abi/system_symbols.py @@ -0,0 +1,378 @@ +#!/usr/bin/env python3 +# pylint: disable=R0902,R0912,R0914,R0915,R1702 +# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>. +# SPDX-License-Identifier: GPL-2.0 + +""" +Parse ABI documentation and produce results from it. 
+""" + +import os +import re +import sys + +from concurrent import futures +from datetime import datetime +from random import shuffle + +from abi.helpers import AbiDebug + +class SystemSymbols: + """Stores arguments for the class and initialize class vars""" + + def graph_add_file(self, path, link=None): + """ + add a file path to the sysfs graph stored at self.root + """ + + if path in self.files: + return + + name = "" + ref = self.root + for edge in path.split("/"): + name += edge + "/" + if edge not in ref: + ref[edge] = {"__name": [name.rstrip("/")]} + + ref = ref[edge] + + if link and link not in ref["__name"]: + ref["__name"].append(link.rstrip("/")) + + self.files.add(path) + + def print_graph(self, root_prefix="", root=None, level=0): + """Prints a reference tree graph using UTF-8 characters""" + + if not root: + root = self.root + level = 0 + + # Prevent endless traverse + if level > 5: + return + + if level > 0: + prefix = "├──" + last_prefix = "└──" + else: + prefix = "" + last_prefix = "" + + items = list(root.items()) + + names = root.get("__name", []) + for k, edge in items: + if k == "__name": + continue + + if not k: + k = "/" + + if len(names) > 1: + k += " links: " + ",".join(names[1:]) + + if edge == items[-1][1]: + print(root_prefix + last_prefix + k) + p = root_prefix + if level > 0: + p += " " + self.print_graph(p, edge, level + 1) + else: + print(root_prefix + prefix + k) + p = root_prefix + "│ " + self.print_graph(p, edge, level + 1) + + def _walk(self, root): + """ + Walk through sysfs to get all devnodes that aren't ignored. + + By default, uses /sys as sysfs mounting point. If another + directory is used, it replaces them to /sys at the patches. 
+ """ + + with os.scandir(root) as obj: + for entry in obj: + path = os.path.join(root, entry.name) + if self.sysfs: + p = path.replace(self.sysfs, "/sys", count=1) + else: + p = path + + if self.re_ignore.search(p): + return + + # Handle link first to avoid directory recursion + if entry.is_symlink(): + real = os.path.realpath(path) + if not self.sysfs: + self.aliases[path] = real + else: + real = real.replace(self.sysfs, "/sys", count=1) + + # Add absfile location to graph if it doesn't exist + if not self.re_ignore.search(real): + # Add link to the graph + self.graph_add_file(real, p) + + elif entry.is_file(): + self.graph_add_file(p) + + elif entry.is_dir(): + self._walk(path) + + def __init__(self, abi, sysfs="/sys", hints=False): + """ + Initialize internal variables and get a list of all files inside + sysfs that can currently be parsed. + + Please notice that there are several entries on sysfs that aren't + documented as ABI. Ignore those. + + The real paths will be stored under self.files. Aliases will be + stored in separate, as self.aliases. 
+ """ + + self.abi = abi + self.log = abi.log + + if sysfs != "/sys": + self.sysfs = sysfs.rstrip("/") + else: + self.sysfs = None + + self.hints = hints + + self.root = {} + self.aliases = {} + self.files = set() + + dont_walk = [ + # Those require root access and aren't documented at ABI + f"^{sysfs}/kernel/debug", + f"^{sysfs}/kernel/tracing", + f"^{sysfs}/fs/pstore", + f"^{sysfs}/fs/bpf", + f"^{sysfs}/fs/fuse", + + # This is not documented at ABI + f"^{sysfs}/module", + + f"^{sysfs}/fs/cgroup", # this is big and has zero docs under ABI + f"^{sysfs}/firmware", # documented elsewhere: ACPI, DT bindings + "sections|notes", # aren't actually part of ABI + + # kernel-parameters.txt - not easy to parse + "parameters", + ] + + self.re_ignore = re.compile("|".join(dont_walk)) + + print(f"Reading {sysfs} directory contents...", file=sys.stderr) + self._walk(sysfs) + + def check_file(self, refs, found): + """Check missing ABI symbols for a given sysfs file""" + + res_list = [] + + try: + for names in refs: + fname = names[0] + + res = { + "found": False, + "fname": fname, + "msg": "", + } + res_list.append(res) + + re_what = self.abi.get_regexes(fname) + if not re_what: + self.abi.log.warning(f"missing rules for {fname}") + continue + + for name in names: + for r in re_what: + if self.abi.debug & AbiDebug.UNDEFINED: + self.log.debug("check if %s matches '%s'", name, r.pattern) + if r.match(name): + res["found"] = True + if found: + res["msg"] += f" {fname}: regex:\n\t" + continue + + if self.hints and not res["found"]: + res["msg"] += f" {fname} not found. 
Tested regexes:\n" + for r in re_what: + res["msg"] += " " + r.pattern + "\n" + + except KeyboardInterrupt: + pass + + return res_list + + def _ref_interactor(self, root): + """Recursive function to interact over the sysfs tree""" + + for k, v in root.items(): + if isinstance(v, dict): + yield from self._ref_interactor(v) + + if root == self.root or k == "__name": + continue + + if self.abi.re_string: + fname = v["__name"][0] + if self.abi.re_string.search(fname): + yield v + else: + yield v + + + def get_fileref(self, all_refs, chunk_size): + """Interactor to group refs into chunks""" + + n = 0 + refs = [] + + for ref in all_refs: + refs.append(ref) + + n += 1 + if n >= chunk_size: + yield refs + n = 0 + refs = [] + + yield refs + + def check_undefined_symbols(self, max_workers=None, chunk_size=50, + found=None, dry_run=None): + """Seach ABI for sysfs symbols missing documentation""" + + self.abi.parse_abi() + + if self.abi.debug & AbiDebug.GRAPH: + self.print_graph() + + all_refs = [] + for ref in self._ref_interactor(self.root): + all_refs.append(ref["__name"]) + + if dry_run: + print("Would check", file=sys.stderr) + for ref in all_refs: + print(", ".join(ref)) + + return + + print("Starting to search symbols (it may take several minutes):", + file=sys.stderr) + start = datetime.now() + old_elapsed = None + + # Python doesn't support multithreading due to limitations on its + # global lock (GIL). While Python 3.13 finally made GIL optional, + # there are still issues related to it. Also, we want to have + # backward compatibility with older versions of Python. + # + # So, use instead multiprocess. However, Python is very slow passing + # data from/to multiple processes. Also, it may consume lots of memory + # if the data to be shared is not small. So, we need to group workload + # in chunks that are big enough to generate performance gains while + # not being so big that would cause out-of-memory. 
+ + num_refs = len(all_refs) + print(f"Number of references to parse: {num_refs}", file=sys.stderr) + + if not max_workers: + max_workers = os.cpu_count() + elif max_workers > os.cpu_count(): + max_workers = os.cpu_count() + + max_workers = max(max_workers, 1) + + max_chunk_size = int((num_refs + max_workers - 1) / max_workers) + chunk_size = min(chunk_size, max_chunk_size) + chunk_size = max(1, chunk_size) + + if max_workers > 1: + executor = futures.ProcessPoolExecutor + + # Place references in a random order. This may help improving + # performance, by mixing complex/simple expressions when creating + # chunks + shuffle(all_refs) + else: + # Python has a high overhead with processes. When there's just + # one worker, it is faster to not create a new process. + # Yet, User still deserves to have a progress print. So, use + # python's "thread", which is actually a single process, using + # an internal schedule to switch between tasks. No performance + # gains for non-IO tasks, but still it can be quickly interrupted + # from time to time to display progress. 
+ executor = futures.ThreadPoolExecutor + + not_found = [] + f_list = [] + with executor(max_workers=max_workers) as exe: + for refs in self.get_fileref(all_refs, chunk_size): + if refs: + try: + f_list.append(exe.submit(self.check_file, refs, found)) + + except KeyboardInterrupt: + return + + total = len(f_list) + + if not total: + if self.abi.re_string: + print(f"No ABI symbol matches {self.abi.search_string}") + else: + self.abi.log.warning("No ABI symbols found") + return + + print(f"{len(f_list):6d} jobs queued on {max_workers} workers", + file=sys.stderr) + + while f_list: + try: + t = futures.wait(f_list, timeout=1, + return_when=futures.FIRST_COMPLETED) + + done = t[0] + + for fut in done: + res_list = fut.result() + + for res in res_list: + if not res["found"]: + not_found.append(res["fname"]) + if res["msg"]: + print(res["msg"]) + + f_list.remove(fut) + except KeyboardInterrupt: + return + + except RuntimeError as e: + self.abi.log.warning(f"Future: {e}") + break + + if sys.stderr.isatty(): + elapsed = str(datetime.now() - start).split(".", maxsplit=1)[0] + if len(f_list) < total: + elapsed += f" ({total - len(f_list)}/{total} jobs completed). " + if elapsed != old_elapsed: + print(elapsed + "\r", end="", flush=True, + file=sys.stderr) + old_elapsed = elapsed + + elapsed = str(datetime.now() - start).split(".", maxsplit=1)[0] + print(elapsed, file=sys.stderr) + + for f in sorted(not_found): + print(f"{f} not found.") diff --git a/tools/lib/python/feat/parse_features.py b/tools/lib/python/feat/parse_features.py new file mode 100755 index 000000000000..b88c04d3e2fe --- /dev/null +++ b/tools/lib/python/feat/parse_features.py @@ -0,0 +1,494 @@ +#!/usr/bin/env python3 +# pylint: disable=R0902,R0911,R0912,R0914,R0915 +# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>. +# SPDX-License-Identifier: GPL-2.0 + + +""" +Library to parse the Linux Feature files and produce a ReST book. 
+""" + +import os +import re +import sys + +from glob import iglob + + +class ParseFeature: + """ + Parses Documentation/features, allowing to generate ReST documentation + from it. + """ + + h_name = "Feature" + h_kconfig = "Kconfig" + h_description = "Description" + h_subsys = "Subsystem" + h_status = "Status" + h_arch = "Architecture" + + # Sort order for status. Others will be mapped at the end. + status_map = { + "ok": 0, + "TODO": 1, + "N/A": 2, + # The only missing status is "..", which was mapped as "---", + # as this is an special ReST cell value. Let it get the + # default order (99). + } + + def __init__(self, prefix, debug=0, enable_fname=False): + """ + Sets internal variables + """ + + self.prefix = prefix + self.debug = debug + self.enable_fname = enable_fname + + self.data = {} + + # Initial maximum values use just the headers + self.max_size_name = len(self.h_name) + self.max_size_kconfig = len(self.h_kconfig) + self.max_size_description = len(self.h_description) + self.max_size_desc_word = 0 + self.max_size_subsys = len(self.h_subsys) + self.max_size_status = len(self.h_status) + self.max_size_arch = len(self.h_arch) + self.max_size_arch_with_header = self.max_size_arch + self.max_size_arch + self.description_size = 1 + + self.msg = "" + + def emit(self, msg="", end="\n"): + self.msg += msg + end + + def parse_error(self, fname, ln, msg, data=None): + """ + Displays an error message, printing file name and line + """ + + if ln: + fname += f"#{ln}" + + print(f"Warning: file {fname}: {msg}", file=sys.stderr, end="") + + if data: + data = data.rstrip() + print(f":\n\t{data}", file=sys.stderr) + else: + print("", file=sys.stderr) + + def parse_feat_file(self, fname): + """Parses a single arch-support.txt feature file""" + + if os.path.isdir(fname): + return + + base = os.path.basename(fname) + + if base != "arch-support.txt": + if self.debug: + print(f"ignoring {fname}", file=sys.stderr) + return + + subsys = os.path.dirname(fname).split("/")[-2] + 
self.max_size_subsys = max(self.max_size_subsys, len(subsys)) + + feature_name = "" + kconfig = "" + description = "" + comments = "" + arch_table = {} + + if self.debug > 1: + print(f"Opening {fname}", file=sys.stderr) + + if self.enable_fname: + full_fname = os.path.abspath(fname) + self.emit(f".. FILE {full_fname}") + + with open(fname, encoding="utf-8") as f: + for ln, line in enumerate(f, start=1): + line = line.strip() + + match = re.match(r"^\#\s+Feature\s+name:\s*(.*\S)", line) + if match: + feature_name = match.group(1) + + self.max_size_name = max(self.max_size_name, + len(feature_name)) + continue + + match = re.match(r"^\#\s+Kconfig:\s*(.*\S)", line) + if match: + kconfig = match.group(1) + + self.max_size_kconfig = max(self.max_size_kconfig, + len(kconfig)) + continue + + match = re.match(r"^\#\s+description:\s*(.*\S)", line) + if match: + description = match.group(1) + + self.max_size_description = max(self.max_size_description, + len(description)) + + words = re.split(r"\s+", line)[1:] + for word in words: + self.max_size_desc_word = max(self.max_size_desc_word, + len(word)) + + continue + + if re.search(r"^\\s*$", line): + continue + + if re.match(r"^\s*\-+\s*$", line): + continue + + if re.search(r"^\s*\|\s*arch\s*\|\s*status\s*\|\s*$", line): + continue + + match = re.match(r"^\#\s*(.*)$", line) + if match: + comments += match.group(1) + continue + + match = re.match(r"^\s*\|\s*(\S+):\s*\|\s*(\S+)\s*\|\s*$", line) + if match: + arch = match.group(1) + status = match.group(2) + + self.max_size_status = max(self.max_size_status, + len(status)) + self.max_size_arch = max(self.max_size_arch, len(arch)) + + if status == "..": + status = "---" + + arch_table[arch] = status + + continue + + self.parse_error(fname, ln, "Line is invalid", line) + + if not feature_name: + self.parse_error(fname, 0, "Feature name not found") + return + if not subsys: + self.parse_error(fname, 0, "Subsystem not found") + return + if not kconfig: + self.parse_error(fname, 0, 
"Kconfig not found") + return + if not description: + self.parse_error(fname, 0, "Description not found") + return + if not arch_table: + self.parse_error(fname, 0, "Architecture table not found") + return + + self.data[feature_name] = { + "where": fname, + "subsys": subsys, + "kconfig": kconfig, + "description": description, + "comments": comments, + "table": arch_table, + } + + self.max_size_arch_with_header = self.max_size_arch + len(self.h_arch) + + def parse(self): + """Parses all arch-support.txt feature files inside self.prefix""" + + path = os.path.expanduser(self.prefix) + + if self.debug > 2: + print(f"Running parser for {path}") + + example_path = os.path.join(path, "arch-support.txt") + + for fname in iglob(os.path.join(path, "**"), recursive=True): + if fname != example_path: + self.parse_feat_file(fname) + + return self.data + + def output_arch_table(self, arch, feat=None): + """ + Output feature(s) for a given architecture. + """ + + title = f"Feature status on {arch} architecture" + + self.emit("=" * len(title)) + self.emit(title) + self.emit("=" * len(title)) + self.emit() + + self.emit("=" * self.max_size_subsys + " ", end="") + self.emit("=" * self.max_size_name + " ", end="") + self.emit("=" * self.max_size_kconfig + " ", end="") + self.emit("=" * self.max_size_status + " ", end="") + self.emit("=" * self.max_size_description) + + self.emit(f"{self.h_subsys:<{self.max_size_subsys}} ", end="") + self.emit(f"{self.h_name:<{self.max_size_name}} ", end="") + self.emit(f"{self.h_kconfig:<{self.max_size_kconfig}} ", end="") + self.emit(f"{self.h_status:<{self.max_size_status}} ", end="") + self.emit(f"{self.h_description:<{self.max_size_description}}") + + self.emit("=" * self.max_size_subsys + " ", end="") + self.emit("=" * self.max_size_name + " ", end="") + self.emit("=" * self.max_size_kconfig + " ", end="") + self.emit("=" * self.max_size_status + " ", end="") + self.emit("=" * self.max_size_description) + + sorted_features = 
sorted(self.data.keys(), + key=lambda x: (self.data[x]["subsys"], + x.lower())) + + for name in sorted_features: + if feat and name != feat: + continue + + arch_table = self.data[name]["table"] + + if not arch in arch_table: + continue + + self.emit(f"{self.data[name]['subsys']:<{self.max_size_subsys}} ", + end="") + self.emit(f"{name:<{self.max_size_name}} ", end="") + self.emit(f"{self.data[name]['kconfig']:<{self.max_size_kconfig}} ", + end="") + self.emit(f"{arch_table[arch]:<{self.max_size_status}} ", + end="") + self.emit(f"{self.data[name]['description']}") + + self.emit("=" * self.max_size_subsys + " ", end="") + self.emit("=" * self.max_size_name + " ", end="") + self.emit("=" * self.max_size_kconfig + " ", end="") + self.emit("=" * self.max_size_status + " ", end="") + self.emit("=" * self.max_size_description) + + return self.msg + + def output_feature(self, feat): + """ + Output a feature on all architectures + """ + + title = f"Feature {feat}" + + self.emit("=" * len(title)) + self.emit(title) + self.emit("=" * len(title)) + self.emit() + + if not feat in self.data: + return + + if self.data[feat]["subsys"]: + self.emit(f":Subsystem: {self.data[feat]['subsys']}") + if self.data[feat]["kconfig"]: + self.emit(f":Kconfig: {self.data[feat]['kconfig']}") + + desc = self.data[feat]["description"] + desc = desc[0].upper() + desc[1:] + desc = desc.rstrip(". 
\t") + self.emit(f"\n{desc}.\n") + + com = self.data[feat]["comments"].strip() + if com: + self.emit("Comments") + self.emit("--------") + self.emit(f"\n{com}\n") + + self.emit("=" * self.max_size_arch + " ", end="") + self.emit("=" * self.max_size_status) + + self.emit(f"{self.h_arch:<{self.max_size_arch}} ", end="") + self.emit(f"{self.h_status:<{self.max_size_status}}") + + self.emit("=" * self.max_size_arch + " ", end="") + self.emit("=" * self.max_size_status) + + arch_table = self.data[feat]["table"] + for arch in sorted(arch_table.keys()): + self.emit(f"{arch:<{self.max_size_arch}} ", end="") + self.emit(f"{arch_table[arch]:<{self.max_size_status}}") + + self.emit("=" * self.max_size_arch + " ", end="") + self.emit("=" * self.max_size_status) + + return self.msg + + def matrix_lines(self, desc_size, max_size_status, header): + """ + Helper function to split element tables at the output matrix + """ + + if header: + ln_marker = "=" + else: + ln_marker = "-" + + self.emit("+" + ln_marker * self.max_size_name + "+", end="") + self.emit(ln_marker * desc_size, end="") + self.emit("+" + ln_marker * max_size_status + "+") + + def output_matrix(self): + """ + Generates a set of tables, groped by subsystem, containing + what's the feature state on each architecture. 
+ """ + + title = "Feature status on all architectures" + + self.emit("=" * len(title)) + self.emit(title) + self.emit("=" * len(title)) + self.emit() + + desc_title = f"{self.h_kconfig} / {self.h_description}" + + desc_size = self.max_size_kconfig + 4 + if not self.description_size: + desc_size = max(self.max_size_description, desc_size) + else: + desc_size = max(self.description_size, desc_size) + + desc_size = max(self.max_size_desc_word, desc_size, len(desc_title)) + + notcompat = "Not compatible" + self.max_size_status = max(self.max_size_status, len(notcompat)) + + min_status_size = self.max_size_status + self.max_size_arch + 4 + max_size_status = max(min_status_size, self.max_size_status) + + h_status_per_arch = "Status per architecture" + max_size_status = max(max_size_status, len(h_status_per_arch)) + + cur_subsys = None + for name in sorted(self.data.keys(), + key=lambda x: (self.data[x]["subsys"], x.lower())): + if not cur_subsys or cur_subsys != self.data[name]["subsys"]: + if cur_subsys: + self.emit() + + cur_subsys = self.data[name]["subsys"] + + title = f"Subsystem: {cur_subsys}" + self.emit(title) + self.emit("=" * len(title)) + self.emit() + + self.matrix_lines(desc_size, max_size_status, 0) + + self.emit(f"|{self.h_name:<{self.max_size_name}}", end="") + self.emit(f"|{desc_title:<{desc_size}}", end="") + self.emit(f"|{h_status_per_arch:<{max_size_status}}|") + + self.matrix_lines(desc_size, max_size_status, 1) + + lines = [] + descs = [] + cur_status = "" + line = "" + + arch_table = sorted(self.data[name]["table"].items(), + key=lambda x: (self.status_map.get(x[1], 99), + x[0].lower())) + + for arch, status in arch_table: + if status == "---": + status = notcompat + + if status != cur_status: + if line != "": + lines.append(line) + line = "" + line = f"- **{status}**: {arch}" + elif len(line) + len(arch) + 2 < max_size_status: + line += f", {arch}" + else: + lines.append(line) + line = f" {arch}" + cur_status = status + + if line != "": + 
lines.append(line) + + description = self.data[name]["description"] + while len(description) > desc_size: + desc_line = description[:desc_size] + + last_space = desc_line.rfind(" ") + if last_space != -1: + desc_line = desc_line[:last_space] + descs.append(desc_line) + description = description[last_space + 1:] + else: + desc_line = desc_line[:-1] + descs.append(desc_line + "\\") + description = description[len(desc_line):] + + if description: + descs.append(description) + + while len(lines) < 2 + len(descs): + lines.append("") + + for ln, line in enumerate(lines): + col = ["", ""] + + if not ln: + col[0] = name + col[1] = f"``{self.data[name]['kconfig']}``" + else: + if ln >= 2 and descs: + col[1] = descs.pop(0) + + self.emit(f"|{col[0]:<{self.max_size_name}}", end="") + self.emit(f"|{col[1]:<{desc_size}}", end="") + self.emit(f"|{line:<{max_size_status}}|") + + self.matrix_lines(desc_size, max_size_status, 0) + + return self.msg + + def list_arch_features(self, arch, feat): + """ + Print a matrix of kernel feature support for the chosen architecture. 
+ """ + self.emit("#") + self.emit(f"# Kernel feature support matrix of the '{arch}' architecture:") + self.emit("#") + + # Sort by subsystem, then by feature name (case‑insensitive) + for name in sorted(self.data.keys(), + key=lambda n: (self.data[n]["subsys"].lower(), + n.lower())): + if feat and name != feat: + continue + + feature = self.data[name] + arch_table = feature["table"] + status = arch_table.get(arch, "") + status = " " * ((4 - len(status)) // 2) + status + + self.emit(f"{feature['subsys']:>{self.max_size_subsys + 1}}/ ", + end="") + self.emit(f"{name:<{self.max_size_name}}: ", end="") + self.emit(f"{status:<5}| ", end="") + self.emit(f"{feature['kconfig']:>{self.max_size_kconfig}} ", + end="") + self.emit(f"# {feature['description']}") + + return self.msg diff --git a/tools/lib/python/jobserver.py b/tools/lib/python/jobserver.py new file mode 100755 index 000000000000..a24f30ef4fa8 --- /dev/null +++ b/tools/lib/python/jobserver.py @@ -0,0 +1,149 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0+ +# +# pylint: disable=C0103,C0209 +# +# + +""" +Interacts with the POSIX jobserver during the Kernel build time. + +A "normal" jobserver task, like the one initiated by a make subrocess would do: + + - open read/write file descriptors to communicate with the job server; + - ask for one slot by calling: + claim = os.read(reader, 1) + - when the job finshes, call: + os.write(writer, b"+") # os.write(writer, claim) + +Here, the goal is different: This script aims to get the remaining number +of slots available, using all of them to run a command which handle tasks in +parallel. To to that, it has a loop that ends only after there are no +slots left. It then increments the number by one, in order to allow a +call equivalent to make -j$((claim+1)), e.g. having a parent make creating +$claim child to do the actual work. + +The end goal here is to keep the total number of build tasks under the +limit established by the initial make -j$n_proc call. 
+ +See: + https://www.gnu.org/software/make/manual/html_node/POSIX-Jobserver.html#POSIX-Jobserver +""" + +import errno +import os +import subprocess +import sys + +class JobserverExec: + """ + Claim all slots from make using POSIX Jobserver. + + The main methods here are: + - open(): reserves all slots; + - close(): method returns all used slots back to make; + - run(): executes a command setting PARALLELISM=<available slots jobs + 1> + """ + + def __init__(self): + """Initialize internal vars""" + self.claim = 0 + self.jobs = b"" + self.reader = None + self.writer = None + self.is_open = False + + def open(self): + """Reserve all available slots to be claimed later on""" + + if self.is_open: + return + + try: + # Fetch the make environment options. + flags = os.environ["MAKEFLAGS"] + # Look for "--jobserver=R,W" + # Note that GNU Make has used --jobserver-fds and --jobserver-auth + # so this handles all of them. + opts = [x for x in flags.split(" ") if x.startswith("--jobserver")] + + # Parse out R,W file descriptor numbers and set them nonblocking. + # If the MAKEFLAGS variable contains multiple instances of the + # --jobserver-auth= option, the last one is relevant. + fds = opts[-1].split("=", 1)[1] + + # Starting with GNU Make 4.4, named pipes are used for reader + # and writer. + # Example argument: --jobserver-auth=fifo:/tmp/GMfifo8134 + _, _, path = fds.partition("fifo:") + + if path: + self.reader = os.open(path, os.O_RDONLY | os.O_NONBLOCK) + self.writer = os.open(path, os.O_WRONLY) + else: + self.reader, self.writer = [int(x) for x in fds.split(",", 1)] + # Open a private copy of reader to avoid setting nonblocking + # on an unexpecting process with the same reader fd. 
+ self.reader = os.open("/proc/self/fd/%d" % (self.reader), + os.O_RDONLY | os.O_NONBLOCK) + + # Read out as many jobserver slots as possible + while True: + try: + slot = os.read(self.reader, 8) + self.jobs += slot + except (OSError, IOError) as e: + if e.errno == errno.EWOULDBLOCK: + # Stop at the end of the jobserver queue. + break + # If something went wrong, give back the jobs. + if self.jobs: + os.write(self.writer, self.jobs) + raise e + + # Add a bump for our caller's reserveration, since we're just going + # to sit here blocked on our child. + self.claim = len(self.jobs) + 1 + + except (KeyError, IndexError, ValueError, OSError, IOError): + # Any missing environment strings or bad fds should result in just + # not being parallel. + self.claim = None + + self.is_open = True + + def close(self): + """Return all reserved slots to Jobserver""" + + if not self.is_open: + return + + # Return all the reserved slots. + if len(self.jobs): + os.write(self.writer, self.jobs) + + self.is_open = False + + def __enter__(self): + self.open() + return self + + def __exit__(self, exc_type, exc_value, exc_traceback): + self.close() + + def run(self, cmd, *args, **pwargs): + """ + Run a command setting PARALLELISM env variable to the number of + available job slots (claim) + 1, e.g. it will reserve claim slots + to do the actual build work, plus one to monitor its children. + """ + self.open() # Ensure that self.claim is set + + # We can only claim parallelism if there was a jobserver (i.e. a + # top-level "-jN" argument) and there were no other failures. Otherwise + # leave out the environment variable and let the child figure out what + # is best. 
+ if self.claim: + os.environ["PARALLELISM"] = str(self.claim) + + return subprocess.call(cmd, *args, **pwargs) diff --git a/tools/lib/python/kdoc/__init__.py b/tools/lib/python/kdoc/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 --- /dev/null +++ b/tools/lib/python/kdoc/__init__.py diff --git a/tools/docs/lib/enrich_formatter.py b/tools/lib/python/kdoc/enrich_formatter.py index bb171567a4ca..bb171567a4ca 100644 --- a/tools/docs/lib/enrich_formatter.py +++ b/tools/lib/python/kdoc/enrich_formatter.py diff --git a/tools/lib/python/kdoc/kdoc_files.py b/tools/lib/python/kdoc/kdoc_files.py new file mode 100644 index 000000000000..bfe02baf1606 --- /dev/null +++ b/tools/lib/python/kdoc/kdoc_files.py @@ -0,0 +1,294 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>. +# +# pylint: disable=R0903,R0913,R0914,R0917 + +""" +Parse lernel-doc tags on multiple kernel source files. +""" + +import argparse +import logging +import os +import re + +from kdoc.kdoc_parser import KernelDoc +from kdoc.kdoc_output import OutputFormat + + +class GlobSourceFiles: + """ + Parse C source code file names and directories via an Interactor. + """ + + def __init__(self, srctree=None, valid_extensions=None): + """ + Initialize valid extensions with a tuple. + + If not defined, assume default C extensions (.c and .h) + + It would be possible to use python's glob function, but it is + very slow, and it is not interactive. So, it would wait to read all + directories before actually do something. + + So, let's use our own implementation. 
+ """ + + if not valid_extensions: + self.extensions = (".c", ".h") + else: + self.extensions = valid_extensions + + self.srctree = srctree + + def _parse_dir(self, dirname): + """Internal function to parse files recursively""" + + with os.scandir(dirname) as obj: + for entry in obj: + name = os.path.join(dirname, entry.name) + + if entry.is_dir(follow_symlinks=False): + yield from self._parse_dir(name) + + if not entry.is_file(): + continue + + basename = os.path.basename(name) + + if not basename.endswith(self.extensions): + continue + + yield name + + def parse_files(self, file_list, file_not_found_cb): + """ + Define an iterator to parse all source files from file_list, + handling directories if any + """ + + if not file_list: + return + + for fname in file_list: + if self.srctree: + f = os.path.join(self.srctree, fname) + else: + f = fname + + if os.path.isdir(f): + yield from self._parse_dir(f) + elif os.path.isfile(f): + yield f + elif file_not_found_cb: + file_not_found_cb(fname) + + +class KernelFiles(): + """ + Parse kernel-doc tags on multiple kernel source files. + + There are two type of parsers defined here: + - self.parse_file(): parses both kernel-doc markups and + EXPORT_SYMBOL* macros; + - self.process_export_file(): parses only EXPORT_SYMBOL* macros. + """ + + def warning(self, msg): + """Ancillary routine to output a warning and increment error count""" + + self.config.log.warning(msg) + self.errors += 1 + + def error(self, msg): + """Ancillary routine to output an error and increment error count""" + + self.config.log.error(msg) + self.errors += 1 + + def parse_file(self, fname): + """ + Parse a single Kernel source. 
+ """ + + # Prevent parsing the same file twice if results are cached + if fname in self.files: + return + + doc = KernelDoc(self.config, fname) + export_table, entries = doc.parse_kdoc() + + self.export_table[fname] = export_table + + self.files.add(fname) + self.export_files.add(fname) # parse_kdoc() already check exports + + self.results[fname] = entries + + def process_export_file(self, fname): + """ + Parses EXPORT_SYMBOL* macros from a single Kernel source file. + """ + + # Prevent parsing the same file twice if results are cached + if fname in self.export_files: + return + + doc = KernelDoc(self.config, fname) + export_table = doc.parse_export() + + if not export_table: + self.error(f"Error: Cannot check EXPORT_SYMBOL* on {fname}") + export_table = set() + + self.export_table[fname] = export_table + self.export_files.add(fname) + + def file_not_found_cb(self, fname): + """ + Callback to warn if a file was not found. + """ + + self.error(f"Cannot find file {fname}") + + def __init__(self, verbose=False, out_style=None, + werror=False, wreturn=False, wshort_desc=False, + wcontents_before_sections=False, + logger=None): + """ + Initialize startup variables and parse all files + """ + + if not verbose: + verbose = bool(os.environ.get("KBUILD_VERBOSE", 0)) + + if out_style is None: + out_style = OutputFormat() + + if not werror: + kcflags = os.environ.get("KCFLAGS", None) + if kcflags: + match = re.search(r"(\s|^)-Werror(\s|$)/", kcflags) + if match: + werror = True + + # reading this variable is for backwards compat just in case + # someone was calling it with the variable from outside the + # kernel's build system + kdoc_werror = os.environ.get("KDOC_WERROR", None) + if kdoc_werror: + werror = kdoc_werror + + # Some variables are global to the parser logic as a whole as they are + # used to send control configuration to KernelDoc class. As such, + # those variables are read-only inside the KernelDoc. 
+ self.config = argparse.Namespace + + self.config.verbose = verbose + self.config.werror = werror + self.config.wreturn = wreturn + self.config.wshort_desc = wshort_desc + self.config.wcontents_before_sections = wcontents_before_sections + + if not logger: + self.config.log = logging.getLogger("kernel-doc") + else: + self.config.log = logger + + self.config.warning = self.warning + + self.config.src_tree = os.environ.get("SRCTREE", None) + + # Initialize variables that are internal to KernelFiles + + self.out_style = out_style + + self.errors = 0 + self.results = {} + + self.files = set() + self.export_files = set() + self.export_table = {} + + def parse(self, file_list, export_file=None): + """ + Parse all files + """ + + glob = GlobSourceFiles(srctree=self.config.src_tree) + + for fname in glob.parse_files(file_list, self.file_not_found_cb): + self.parse_file(fname) + + for fname in glob.parse_files(export_file, self.file_not_found_cb): + self.process_export_file(fname) + + def out_msg(self, fname, name, arg): + """ + Return output messages from a file name using the output style + filtering. + + If output type was not handled by the styler, return None. + """ + + # NOTE: we can add rules here to filter out unwanted parts, + # although OutputFormat.msg already does that. 
+ + return self.out_style.msg(fname, name, arg) + + def msg(self, enable_lineno=False, export=False, internal=False, + symbol=None, nosymbol=None, no_doc_sections=False, + filenames=None, export_file=None): + """ + Interacts over the kernel-doc results and output messages, + returning kernel-doc markups on each interaction + """ + + self.out_style.set_config(self.config) + + if not filenames: + filenames = sorted(self.results.keys()) + + glob = GlobSourceFiles(srctree=self.config.src_tree) + + for fname in filenames: + function_table = set() + + if internal or export: + if not export_file: + export_file = [fname] + + for f in glob.parse_files(export_file, self.file_not_found_cb): + function_table |= self.export_table[f] + + if symbol: + for s in symbol: + function_table.add(s) + + self.out_style.set_filter(export, internal, symbol, nosymbol, + function_table, enable_lineno, + no_doc_sections) + + msg = "" + if fname not in self.results: + self.config.log.warning("No kernel-doc for file %s", fname) + continue + + symbols = self.results[fname] + self.out_style.set_symbols(symbols) + + for arg in symbols: + m = self.out_msg(fname, arg.name, arg) + + if m is None: + ln = arg.get("ln", 0) + dtype = arg.get('type', "") + + self.config.log.warning("%s:%d Can't handle %s", + fname, ln, dtype) + else: + msg += m + + if msg: + yield fname, msg diff --git a/tools/lib/python/kdoc/kdoc_item.py b/tools/lib/python/kdoc/kdoc_item.py new file mode 100644 index 000000000000..19805301cb2c --- /dev/null +++ b/tools/lib/python/kdoc/kdoc_item.py @@ -0,0 +1,43 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# A class that will, eventually, encapsulate all of the parsed data that we +# then pass into the output modules. 
+# + +class KdocItem: + def __init__(self, name, fname, type, start_line, **other_stuff): + self.name = name + self.fname = fname + self.type = type + self.declaration_start_line = start_line + self.sections = {} + self.sections_start_lines = {} + self.parameterlist = [] + self.parameterdesc_start_lines = [] + self.parameterdescs = {} + self.parametertypes = {} + # + # Just save everything else into our own dict so that the output + # side can grab it directly as before. As we move things into more + # structured data, this will, hopefully, fade away. + # + self.other_stuff = other_stuff + + def get(self, key, default = None): + return self.other_stuff.get(key, default) + + def __getitem__(self, key): + return self.get(key) + + # + # Tracking of section and parameter information. + # + def set_sections(self, sections, start_lines): + self.sections = sections + self.section_start_lines = start_lines + + def set_params(self, names, descs, types, starts): + self.parameterlist = names + self.parameterdescs = descs + self.parametertypes = types + self.parameterdesc_start_lines = starts diff --git a/tools/lib/python/kdoc/kdoc_output.py b/tools/lib/python/kdoc/kdoc_output.py new file mode 100644 index 000000000000..b1aaa7fc3604 --- /dev/null +++ b/tools/lib/python/kdoc/kdoc_output.py @@ -0,0 +1,824 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>. +# +# pylint: disable=C0301,R0902,R0911,R0912,R0913,R0914,R0915,R0917 + +""" +Implement output filters to print kernel-doc documentation. + +The implementation uses a virtual base class (OutputFormat) which +contains dispatches to virtual methods, and some code to filter +out output messages. + +The actual implementation is done on one separate class per each type +of output. Currently, there are output classes for ReST and man/troff. 
+""" + +import os +import re +from datetime import datetime + +from kdoc.kdoc_parser import KernelDoc, type_param +from kdoc.kdoc_re import KernRe + + +function_pointer = KernRe(r"([^\(]*\(\*)\s*\)\s*\(([^\)]*)\)", cache=False) + +# match expressions used to find embedded type information +type_constant = KernRe(r"\b``([^\`]+)``\b", cache=False) +type_constant2 = KernRe(r"\%([-_*\w]+)", cache=False) +type_func = KernRe(r"(\w+)\(\)", cache=False) +type_param_ref = KernRe(r"([\!~\*]?)\@(\w*((\.\w+)|(->\w+))*(\.\.\.)?)", cache=False) + +# Special RST handling for func ptr params +type_fp_param = KernRe(r"\@(\w+)\(\)", cache=False) + +# Special RST handling for structs with func ptr params +type_fp_param2 = KernRe(r"\@(\w+->\S+)\(\)", cache=False) + +type_env = KernRe(r"(\$\w+)", cache=False) +type_enum = KernRe(r"\&(enum\s*([_\w]+))", cache=False) +type_struct = KernRe(r"\&(struct\s*([_\w]+))", cache=False) +type_typedef = KernRe(r"\&(typedef\s*([_\w]+))", cache=False) +type_union = KernRe(r"\&(union\s*([_\w]+))", cache=False) +type_member = KernRe(r"\&([_\w]+)(\.|->)([_\w]+)", cache=False) +type_fallback = KernRe(r"\&([_\w]+)", cache=False) +type_member_func = type_member + KernRe(r"\(\)", cache=False) + + +class OutputFormat: + """ + Base class for OutputFormat. If used as-is, it means that only + warnings will be displayed. + """ + + # output mode. 
+ OUTPUT_ALL = 0 # output all symbols and doc sections + OUTPUT_INCLUDE = 1 # output only specified symbols + OUTPUT_EXPORTED = 2 # output exported symbols + OUTPUT_INTERNAL = 3 # output non-exported symbols + + # Virtual member to be overridden at the inherited classes + highlights = [] + + def __init__(self): + """Declare internal vars and set mode to OUTPUT_ALL""" + + self.out_mode = self.OUTPUT_ALL + self.enable_lineno = None + self.nosymbol = {} + self.symbol = None + self.function_table = None + self.config = None + self.no_doc_sections = False + + self.data = "" + + def set_config(self, config): + """ + Setup global config variables used by both parser and output. + """ + + self.config = config + + def set_filter(self, export, internal, symbol, nosymbol, function_table, + enable_lineno, no_doc_sections): + """ + Initialize filter variables according to the requested mode. + + Only one choice is valid between export, internal and symbol. + + The nosymbol filter can be used on all modes. + """ + + self.enable_lineno = enable_lineno + self.no_doc_sections = no_doc_sections + self.function_table = function_table + + if symbol: + self.out_mode = self.OUTPUT_INCLUDE + elif export: + self.out_mode = self.OUTPUT_EXPORTED + elif internal: + self.out_mode = self.OUTPUT_INTERNAL + else: + self.out_mode = self.OUTPUT_ALL + + if nosymbol: + self.nosymbol = set(nosymbol) + + + def highlight_block(self, block): + """ + Apply the RST highlights to a sub-block of text. + """ + + for r, sub in self.highlights: + block = r.sub(sub, block) + + return block + + def out_warnings(self, args): + """ + Output warnings for identifiers that will be displayed. 
+ """ + + for log_msg in args.warnings: + self.config.warning(log_msg) + + def check_doc(self, name, args): + """Check if DOC should be output""" + + if self.no_doc_sections: + return False + + if name in self.nosymbol: + return False + + if self.out_mode == self.OUTPUT_ALL: + self.out_warnings(args) + return True + + if self.out_mode == self.OUTPUT_INCLUDE: + if name in self.function_table: + self.out_warnings(args) + return True + + return False + + def check_declaration(self, dtype, name, args): + """ + Checks if a declaration should be output or not based on the + filtering criteria. + """ + + if name in self.nosymbol: + return False + + if self.out_mode == self.OUTPUT_ALL: + self.out_warnings(args) + return True + + if self.out_mode in [self.OUTPUT_INCLUDE, self.OUTPUT_EXPORTED]: + if name in self.function_table: + return True + + if self.out_mode == self.OUTPUT_INTERNAL: + if dtype != "function": + self.out_warnings(args) + return True + + if name not in self.function_table: + self.out_warnings(args) + return True + + return False + + def msg(self, fname, name, args): + """ + Handles a single entry from kernel-doc parser + """ + + self.data = "" + + dtype = args.type + + if dtype == "doc": + self.out_doc(fname, name, args) + return self.data + + if not self.check_declaration(dtype, name, args): + return self.data + + if dtype == "function": + self.out_function(fname, name, args) + return self.data + + if dtype == "enum": + self.out_enum(fname, name, args) + return self.data + + if dtype == "typedef": + self.out_typedef(fname, name, args) + return self.data + + if dtype in ["struct", "union"]: + self.out_struct(fname, name, args) + return self.data + + # Warn if some type requires an output logic + self.config.log.warning("doesn't know how to output '%s' block", + dtype) + + return None + + # Virtual methods to be overridden by inherited classes + # At the base class, those do nothing. 
+ def set_symbols(self, symbols): + """Get a list of all symbols from kernel_doc""" + + def out_doc(self, fname, name, args): + """Outputs a DOC block""" + + def out_function(self, fname, name, args): + """Outputs a function""" + + def out_enum(self, fname, name, args): + """Outputs an enum""" + + def out_typedef(self, fname, name, args): + """Outputs a typedef""" + + def out_struct(self, fname, name, args): + """Outputs a struct""" + + +class RestFormat(OutputFormat): + """Consts and functions used by ReST output""" + + highlights = [ + (type_constant, r"``\1``"), + (type_constant2, r"``\1``"), + + # Note: need to escape () to avoid func matching later + (type_member_func, r":c:type:`\1\2\3\\(\\) <\1>`"), + (type_member, r":c:type:`\1\2\3 <\1>`"), + (type_fp_param, r"**\1\\(\\)**"), + (type_fp_param2, r"**\1\\(\\)**"), + (type_func, r"\1()"), + (type_enum, r":c:type:`\1 <\2>`"), + (type_struct, r":c:type:`\1 <\2>`"), + (type_typedef, r":c:type:`\1 <\2>`"), + (type_union, r":c:type:`\1 <\2>`"), + + # in rst this can refer to any type + (type_fallback, r":c:type:`\1`"), + (type_param_ref, r"**\1\2**") + ] + blankline = "\n" + + sphinx_literal = KernRe(r'^[^.].*::$', cache=False) + sphinx_cblock = KernRe(r'^\.\.\ +code-block::', cache=False) + + def __init__(self): + """ + Creates class variables. + + Not really mandatory, but it is a good coding style and makes + pylint happy. + """ + + super().__init__() + self.lineprefix = "" + + def print_lineno(self, ln): + """Outputs a line number""" + + if self.enable_lineno and ln is not None: + ln += 1 + self.data += f".. LINENO {ln}\n" + + def output_highlight(self, args): + """ + Outputs a C symbol that may require being converted to ReST using + the self.highlights variable + """ + + input_text = args + output = "" + in_literal = False + litprefix = "" + block = "" + + for line in input_text.strip("\n").split("\n"): + + # If we're in a literal block, see if we should drop out of it. 
+ # Otherwise, pass the line straight through unmunged. + if in_literal: + if line.strip(): # If the line is not blank + # If this is the first non-blank line in a literal block, + # figure out the proper indent. + if not litprefix: + r = KernRe(r'^(\s*)') + if r.match(line): + litprefix = '^' + r.group(1) + else: + litprefix = "" + + output += line + "\n" + elif not KernRe(litprefix).match(line): + in_literal = False + else: + output += line + "\n" + else: + output += line + "\n" + + # Not in a literal block (or just dropped out) + if not in_literal: + block += line + "\n" + if self.sphinx_literal.match(line) or self.sphinx_cblock.match(line): + in_literal = True + litprefix = "" + output += self.highlight_block(block) + block = "" + + # Handle any remaining block + if block: + output += self.highlight_block(block) + + # Print the output with the line prefix + for line in output.strip("\n").split("\n"): + self.data += self.lineprefix + line + "\n" + + def out_section(self, args, out_docblock=False): + """ + Outputs a block section. + + This could use some work; it's used to output the DOC: sections, and + starts by putting out the name of the doc section itself, but that + tends to duplicate a header already in the template file. + """ + for section, text in args.sections.items(): + # Skip sections that are in the nosymbol_table + if section in self.nosymbol: + continue + + if out_docblock: + if not self.out_mode == self.OUTPUT_INCLUDE: + self.data += f".. 
_{section}:\n\n" + self.data += f'{self.lineprefix}**{section}**\n\n' + else: + self.data += f'{self.lineprefix}**{section}**\n\n' + + self.print_lineno(args.section_start_lines.get(section, 0)) + self.output_highlight(text) + self.data += "\n" + self.data += "\n" + + def out_doc(self, fname, name, args): + if not self.check_doc(name, args): + return + self.out_section(args, out_docblock=True) + + def out_function(self, fname, name, args): + + oldprefix = self.lineprefix + signature = "" + + func_macro = args.get('func_macro', False) + if func_macro: + signature = name + else: + if args.get('functiontype'): + signature = args['functiontype'] + " " + signature += name + " (" + + ln = args.declaration_start_line + count = 0 + for parameter in args.parameterlist: + if count != 0: + signature += ", " + count += 1 + dtype = args.parametertypes.get(parameter, "") + + if function_pointer.search(dtype): + signature += function_pointer.group(1) + parameter + function_pointer.group(3) + else: + signature += dtype + + if not func_macro: + signature += ")" + + self.print_lineno(ln) + if args.get('typedef') or not args.get('functiontype'): + self.data += f".. c:macro:: {name}\n\n" + + if args.get('typedef'): + self.data += " **Typedef**: " + self.lineprefix = "" + self.output_highlight(args.get('purpose', "")) + self.data += "\n\n**Syntax**\n\n" + self.data += f" ``{signature}``\n\n" + else: + self.data += f"``{signature}``\n\n" + else: + self.data += f".. c:function:: {signature}\n\n" + + if not args.get('typedef'): + self.print_lineno(ln) + self.lineprefix = " " + self.output_highlight(args.get('purpose', "")) + self.data += "\n" + + # Put descriptive text into a container (HTML <div>) to help set + # function prototypes apart + self.lineprefix = " " + + if args.parameterlist: + self.data += ".. 
container:: kernelindent\n\n" + self.data += f"{self.lineprefix}**Parameters**\n\n" + + for parameter in args.parameterlist: + parameter_name = KernRe(r'\[.*').sub('', parameter) + dtype = args.parametertypes.get(parameter, "") + + if dtype: + self.data += f"{self.lineprefix}``{dtype}``\n" + else: + self.data += f"{self.lineprefix}``{parameter}``\n" + + self.print_lineno(args.parameterdesc_start_lines.get(parameter_name, 0)) + + self.lineprefix = " " + if parameter_name in args.parameterdescs and \ + args.parameterdescs[parameter_name] != KernelDoc.undescribed: + + self.output_highlight(args.parameterdescs[parameter_name]) + self.data += "\n" + else: + self.data += f"{self.lineprefix}*undescribed*\n\n" + self.lineprefix = " " + + self.out_section(args) + self.lineprefix = oldprefix + + def out_enum(self, fname, name, args): + + oldprefix = self.lineprefix + ln = args.declaration_start_line + + self.data += f"\n\n.. c:enum:: {name}\n\n" + + self.print_lineno(ln) + self.lineprefix = " " + self.output_highlight(args.get('purpose', '')) + self.data += "\n" + + self.data += ".. container:: kernelindent\n\n" + outer = self.lineprefix + " " + self.lineprefix = outer + " " + self.data += f"{outer}**Constants**\n\n" + + for parameter in args.parameterlist: + self.data += f"{outer}``{parameter}``\n" + + if args.parameterdescs.get(parameter, '') != KernelDoc.undescribed: + self.output_highlight(args.parameterdescs[parameter]) + else: + self.data += f"{self.lineprefix}*undescribed*\n\n" + self.data += "\n" + + self.lineprefix = oldprefix + self.out_section(args) + + def out_typedef(self, fname, name, args): + + oldprefix = self.lineprefix + ln = args.declaration_start_line + + self.data += f"\n\n.. 
c:type:: {name}\n\n" + + self.print_lineno(ln) + self.lineprefix = " " + + self.output_highlight(args.get('purpose', '')) + + self.data += "\n" + + self.lineprefix = oldprefix + self.out_section(args) + + def out_struct(self, fname, name, args): + + purpose = args.get('purpose', "") + declaration = args.get('definition', "") + dtype = args.type + ln = args.declaration_start_line + + self.data += f"\n\n.. c:{dtype}:: {name}\n\n" + + self.print_lineno(ln) + + oldprefix = self.lineprefix + self.lineprefix += " " + + self.output_highlight(purpose) + self.data += "\n" + + self.data += ".. container:: kernelindent\n\n" + self.data += f"{self.lineprefix}**Definition**::\n\n" + + self.lineprefix = self.lineprefix + " " + + declaration = declaration.replace("\t", self.lineprefix) + + self.data += f"{self.lineprefix}{dtype} {name}" + ' {' + "\n" + self.data += f"{declaration}{self.lineprefix}" + "};\n\n" + + self.lineprefix = " " + self.data += f"{self.lineprefix}**Members**\n\n" + for parameter in args.parameterlist: + if not parameter or parameter.startswith("#"): + continue + + parameter_name = parameter.split("[", maxsplit=1)[0] + + if args.parameterdescs.get(parameter_name) == KernelDoc.undescribed: + continue + + self.print_lineno(args.parameterdesc_start_lines.get(parameter_name, 0)) + + self.data += f"{self.lineprefix}``{parameter}``\n" + + self.lineprefix = " " + self.output_highlight(args.parameterdescs[parameter_name]) + self.lineprefix = " " + + self.data += "\n" + + self.data += "\n" + + self.lineprefix = oldprefix + self.out_section(args) + + +class ManFormat(OutputFormat): + """Consts and functions used by man pages output""" + + highlights = ( + (type_constant, r"\1"), + (type_constant2, r"\1"), + (type_func, r"\\fB\1\\fP"), + (type_enum, r"\\fI\1\\fP"), + (type_struct, r"\\fI\1\\fP"), + (type_typedef, r"\\fI\1\\fP"), + (type_union, r"\\fI\1\\fP"), + (type_param, r"\\fI\1\\fP"), + (type_param_ref, r"\\fI\1\2\\fP"), + (type_member, r"\\fI\1\2\3\\fP"), + 
(type_fallback, r"\\fI\1\\fP") + ) + blankline = "" + + date_formats = [ + "%a %b %d %H:%M:%S %Z %Y", + "%a %b %d %H:%M:%S %Y", + "%Y-%m-%d", + "%b %d %Y", + "%B %d %Y", + "%m %d %Y", + ] + + def __init__(self, modulename): + """ + Creates class variables. + + Not really mandatory, but it is a good coding style and makes + pylint happy. + """ + + super().__init__() + self.modulename = modulename + self.symbols = [] + + dt = None + tstamp = os.environ.get("KBUILD_BUILD_TIMESTAMP") + if tstamp: + for fmt in self.date_formats: + try: + dt = datetime.strptime(tstamp, fmt) + break + except ValueError: + pass + + if not dt: + dt = datetime.now() + + self.man_date = dt.strftime("%B %Y") + + def arg_name(self, args, name): + """ + Return the name that will be used for the man page. + + As we may have the same name on different namespaces, + prepend the data type for all types except functions and typedefs. + + The doc section is special: it uses the modulename. + """ + + dtype = args.type + + if dtype == "doc": + return self.modulename + + if dtype in ["function", "typedef"]: + return name + + return f"{dtype} {name}" + + def set_symbols(self, symbols): + """ + Get a list of all symbols from kernel_doc. + + Man pages will uses it to add a SEE ALSO section with other + symbols at the same file. + """ + self.symbols = symbols + + def out_tail(self, fname, name, args): + """Adds a tail for all man pages""" + + # SEE ALSO section + self.data += f'.SH "SEE ALSO"' + "\n.PP\n" + self.data += (f"Kernel file \\fB{args.fname}\\fR\n") + if len(self.symbols) >= 2: + cur_name = self.arg_name(args, name) + + related = [] + for arg in self.symbols: + out_name = self.arg_name(arg, arg.name) + + if cur_name == out_name: + continue + + related.append(f"\\fB{out_name}\\fR(9)") + + self.data += ",\n".join(related) + "\n" + + # TODO: does it make sense to add other sections? Maybe + # REPORTING ISSUES? LICENSE? 
+ + def msg(self, fname, name, args): + """ + Handles a single entry from kernel-doc parser. + + Add a tail at the end of man pages output. + """ + super().msg(fname, name, args) + self.out_tail(fname, name, args) + + return self.data + + def output_highlight(self, block): + """ + Outputs a C symbol that may require being highlighted with + self.highlights variable using troff syntax + """ + + contents = self.highlight_block(block) + + if isinstance(contents, list): + contents = "\n".join(contents) + + for line in contents.strip("\n").split("\n"): + line = KernRe(r"^\s*").sub("", line) + if not line: + continue + + if line[0] == ".": + self.data += "\\&" + line + "\n" + else: + self.data += line + "\n" + + def out_doc(self, fname, name, args): + if not self.check_doc(name, args): + return + + out_name = self.arg_name(args, name) + + self.data += f'.TH "{self.modulename}" 9 "{out_name}" "{self.man_date}" "API Manual" LINUX' + "\n" + + for section, text in args.sections.items(): + self.data += f'.SH "{section}"' + "\n" + self.output_highlight(text) + + def out_function(self, fname, name, args): + """output function in man""" + + out_name = self.arg_name(args, name) + + self.data += f'.TH "{name}" 9 "{out_name}" "{self.man_date}" "Kernel Hacker\'s Manual" LINUX' + "\n" + + self.data += ".SH NAME\n" + self.data += f"{name} \\- {args['purpose']}\n" + + self.data += ".SH SYNOPSIS\n" + if args.get('functiontype', ''): + self.data += f'.B "{args["functiontype"]}" {name}' + "\n" + else: + self.data += f'.B "{name}' + "\n" + + count = 0 + parenth = "(" + post = "," + + for parameter in args.parameterlist: + if count == len(args.parameterlist) - 1: + post = ");" + + dtype = args.parametertypes.get(parameter, "") + if function_pointer.match(dtype): + # Pointer-to-function + self.data += f'".BI "{parenth}{function_pointer.group(1)}" " ") ({function_pointer.group(2)}){post}"' + "\n" + else: + dtype = KernRe(r'([^\*])$').sub(r'\1 ', dtype) + + self.data += f'.BI 
"{parenth}{dtype}" "{post}"' + "\n" + count += 1 + parenth = "" + + if args.parameterlist: + self.data += ".SH ARGUMENTS\n" + + for parameter in args.parameterlist: + parameter_name = re.sub(r'\[.*', '', parameter) + + self.data += f'.IP "{parameter}" 12' + "\n" + self.output_highlight(args.parameterdescs.get(parameter_name, "")) + + for section, text in args.sections.items(): + self.data += f'.SH "{section.upper()}"' + "\n" + self.output_highlight(text) + + def out_enum(self, fname, name, args): + out_name = self.arg_name(args, name) + + self.data += f'.TH "{self.modulename}" 9 "{out_name}" "{self.man_date}" "API Manual" LINUX' + "\n" + + self.data += ".SH NAME\n" + self.data += f"enum {name} \\- {args['purpose']}\n" + + self.data += ".SH SYNOPSIS\n" + self.data += f"enum {name}" + " {\n" + + count = 0 + for parameter in args.parameterlist: + self.data += f'.br\n.BI " {parameter}"' + "\n" + if count == len(args.parameterlist) - 1: + self.data += "\n};\n" + else: + self.data += ", \n.br\n" + + count += 1 + + self.data += ".SH Constants\n" + + for parameter in args.parameterlist: + parameter_name = KernRe(r'\[.*').sub('', parameter) + self.data += f'.IP "{parameter}" 12' + "\n" + self.output_highlight(args.parameterdescs.get(parameter_name, "")) + + for section, text in args.sections.items(): + self.data += f'.SH "{section}"' + "\n" + self.output_highlight(text) + + def out_typedef(self, fname, name, args): + module = self.modulename + purpose = args.get('purpose') + out_name = self.arg_name(args, name) + + self.data += f'.TH "{module}" 9 "{out_name}" "{self.man_date}" "API Manual" LINUX' + "\n" + + self.data += ".SH NAME\n" + self.data += f"typedef {name} \\- {purpose}\n" + + for section, text in args.sections.items(): + self.data += f'.SH "{section}"' + "\n" + self.output_highlight(text) + + def out_struct(self, fname, name, args): + module = self.modulename + purpose = args.get('purpose') + definition = args.get('definition') + out_name = self.arg_name(args, 
name) + + self.data += f'.TH "{module}" 9 "{out_name}" "{self.man_date}" "API Manual" LINUX' + "\n" + + self.data += ".SH NAME\n" + self.data += f"{args.type} {name} \\- {purpose}\n" + + # Replace tabs with two spaces and handle newlines + declaration = definition.replace("\t", " ") + declaration = KernRe(r"\n").sub('"\n.br\n.BI "', declaration) + + self.data += ".SH SYNOPSIS\n" + self.data += f"{args.type} {name} " + "{" + "\n.br\n" + self.data += f'.BI "{declaration}\n' + "};\n.br\n\n" + + self.data += ".SH Members\n" + for parameter in args.parameterlist: + if parameter.startswith("#"): + continue + + parameter_name = re.sub(r"\[.*", "", parameter) + + if args.parameterdescs.get(parameter_name) == KernelDoc.undescribed: + continue + + self.data += f'.IP "{parameter}" 12' + "\n" + self.output_highlight(args.parameterdescs.get(parameter_name)) + + for section, text in args.sections.items(): + self.data += f'.SH "{section}"' + "\n" + self.output_highlight(text) diff --git a/tools/lib/python/kdoc/kdoc_parser.py b/tools/lib/python/kdoc/kdoc_parser.py new file mode 100644 index 000000000000..500aafc50032 --- /dev/null +++ b/tools/lib/python/kdoc/kdoc_parser.py @@ -0,0 +1,1670 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>. +# +# pylint: disable=C0301,C0302,R0904,R0912,R0913,R0914,R0915,R0917,R1702 + +""" +kdoc_parser +=========== + +Read a C language source or header FILE and extract embedded +documentation comments +""" + +import sys +import re +from pprint import pformat + +from kdoc.kdoc_re import NestedMatch, KernRe +from kdoc.kdoc_item import KdocItem + +# +# Regular expressions used to parse kernel-doc markups at KernelDoc class. +# +# Let's declare them in lowercase outside any class to make it easier to +# convert from the Perl script. +# +# As those are evaluated at the beginning, no need to cache them +# + +# Allow whitespace at end of comment start. 
+doc_start = KernRe(r'^/\*\*\s*$', cache=False) + +doc_end = KernRe(r'\*/', cache=False) +doc_com = KernRe(r'\s*\*\s*', cache=False) +doc_com_body = KernRe(r'\s*\* ?', cache=False) +doc_decl = doc_com + KernRe(r'(\w+)', cache=False) + +# @params and a strictly limited set of supported section names +# Specifically: +# Match @word: +# @...: +# @{section-name}: +# while trying to not match literal block starts like "example::" +# +known_section_names = 'description|context|returns?|notes?|examples?' +known_sections = KernRe(known_section_names, flags = re.I) +doc_sect = doc_com + \ + KernRe(r'\s*(@[.\w]+|@\.\.\.|' + known_section_names + r')\s*:([^:].*)?$', + flags=re.I, cache=False) + +doc_content = doc_com_body + KernRe(r'(.*)', cache=False) +doc_inline_start = KernRe(r'^\s*/\*\*\s*$', cache=False) +doc_inline_sect = KernRe(r'\s*\*\s*(@\s*[\w][\w\.]*\s*):(.*)', cache=False) +doc_inline_end = KernRe(r'^\s*\*/\s*$', cache=False) +doc_inline_oneline = KernRe(r'^\s*/\*\*\s*(@[\w\s]+):\s*(.*)\s*\*/\s*$', cache=False) + +export_symbol = KernRe(r'^\s*EXPORT_SYMBOL(_GPL)?\s*\(\s*(\w+)\s*\)\s*', cache=False) +export_symbol_ns = KernRe(r'^\s*EXPORT_SYMBOL_NS(_GPL)?\s*\(\s*(\w+)\s*,\s*"\S+"\)\s*', cache=False) + +type_param = KernRe(r"@(\w*((\.\w+)|(->\w+))*(\.\.\.)?)", cache=False) + +# +# Tests for the beginning of a kerneldoc block in its various forms. +# +doc_block = doc_com + KernRe(r'DOC:\s*(.*)?', cache=False) +doc_begin_data = KernRe(r"^\s*\*?\s*(struct|union|enum|typedef)\b\s*(\w*)", cache = False) +doc_begin_func = KernRe(str(doc_com) + # initial " * ' + r"(?:\w+\s*\*\s*)?" + # type (not captured) + r'(?:define\s+)?' + # possible "define" (not captured) + r'(\w+)\s*(?:\(\w*\))?\s*' + # name and optional "(...)" + r'(?:[-:].*)?$', # description (not captured) + cache = False) + +# +# Here begins a long set of transformations to turn structure member prefixes +# and macro invocations into something we can parse and generate kdoc for. 
+# +struct_args_pattern = r'([^,)]+)' + +struct_xforms = [ + # Strip attributes + (KernRe(r"__attribute__\s*\(\([a-z0-9,_\*\s\(\)]*\)\)", flags=re.I | re.S, cache=False), ' '), + (KernRe(r'\s*__aligned\s*\([^;]*\)', re.S), ' '), + (KernRe(r'\s*__counted_by\s*\([^;]*\)', re.S), ' '), + (KernRe(r'\s*__counted_by_(le|be)\s*\([^;]*\)', re.S), ' '), + (KernRe(r'\s*__packed\s*', re.S), ' '), + (KernRe(r'\s*CRYPTO_MINALIGN_ATTR', re.S), ' '), + (KernRe(r'\s*__private', re.S), ' '), + (KernRe(r'\s*__rcu', re.S), ' '), + (KernRe(r'\s*____cacheline_aligned_in_smp', re.S), ' '), + (KernRe(r'\s*____cacheline_aligned', re.S), ' '), + (KernRe(r'\s*__cacheline_group_(begin|end)\([^\)]+\);'), ''), + # + # Unwrap struct_group macros based on this definition: + # __struct_group(TAG, NAME, ATTRS, MEMBERS...) + # which has variants like: struct_group(NAME, MEMBERS...) + # Only MEMBERS arguments require documentation. + # + # Parsing them happens on two steps: + # + # 1. drop struct group arguments that aren't at MEMBERS, + # storing them as STRUCT_GROUP(MEMBERS) + # + # 2. remove STRUCT_GROUP() ancillary macro. + # + # The original logic used to remove STRUCT_GROUP() using an + # advanced regex: + # + # \bSTRUCT_GROUP(\(((?:(?>[^)(]+)|(?1))*)\))[^;]*; + # + # with two patterns that are incompatible with + # Python re module, as it has: + # + # - a recursive pattern: (?1) + # - an atomic grouping: (?>...) + # + # I tried a simpler version: but it didn't work either: + # \bSTRUCT_GROUP\(([^\)]+)\)[^;]*; + # + # As it doesn't properly match the end parenthesis on some cases. + # + # So, a better solution was crafted: there's now a NestedMatch + # class that ensures that delimiters after a search are properly + # matched. So, the implementation to drop STRUCT_GROUP() will be + # handled in separate. 
+ # + (KernRe(r'\bstruct_group\s*\(([^,]*,)', re.S), r'STRUCT_GROUP('), + (KernRe(r'\bstruct_group_attr\s*\(([^,]*,){2}', re.S), r'STRUCT_GROUP('), + (KernRe(r'\bstruct_group_tagged\s*\(([^,]*),([^,]*),', re.S), r'struct \1 \2; STRUCT_GROUP('), + (KernRe(r'\b__struct_group\s*\(([^,]*,){3}', re.S), r'STRUCT_GROUP('), + # + # Replace macros + # + # TODO: use NestedMatch for FOO($1, $2, ...) matches + # + # it is better to also move those to the NestedMatch logic, + # to ensure that parentheses will be properly matched. + # + (KernRe(r'__ETHTOOL_DECLARE_LINK_MODE_MASK\s*\(([^\)]+)\)', re.S), + r'DECLARE_BITMAP(\1, __ETHTOOL_LINK_MODE_MASK_NBITS)'), + (KernRe(r'DECLARE_PHY_INTERFACE_MASK\s*\(([^\)]+)\)', re.S), + r'DECLARE_BITMAP(\1, PHY_INTERFACE_MODE_MAX)'), + (KernRe(r'DECLARE_BITMAP\s*\(' + struct_args_pattern + r',\s*' + struct_args_pattern + r'\)', + re.S), r'unsigned long \1[BITS_TO_LONGS(\2)]'), + (KernRe(r'DECLARE_HASHTABLE\s*\(' + struct_args_pattern + r',\s*' + struct_args_pattern + r'\)', + re.S), r'unsigned long \1[1 << ((\2) - 1)]'), + (KernRe(r'DECLARE_KFIFO\s*\(' + struct_args_pattern + r',\s*' + struct_args_pattern + + r',\s*' + struct_args_pattern + r'\)', re.S), r'\2 *\1'), + (KernRe(r'DECLARE_KFIFO_PTR\s*\(' + struct_args_pattern + r',\s*' + + struct_args_pattern + r'\)', re.S), r'\2 *\1'), + (KernRe(r'(?:__)?DECLARE_FLEX_ARRAY\s*\(' + struct_args_pattern + r',\s*' + + struct_args_pattern + r'\)', re.S), r'\1 \2[]'), + (KernRe(r'DEFINE_DMA_UNMAP_ADDR\s*\(' + struct_args_pattern + r'\)', re.S), r'dma_addr_t \1'), + (KernRe(r'DEFINE_DMA_UNMAP_LEN\s*\(' + struct_args_pattern + r'\)', re.S), r'__u32 \1'), +] +# +# Regexes here are guaranteed to have the end delimiter matching +# the start delimiter. Yet, right now, only one replace group +# is allowed. 
+# +struct_nested_prefixes = [ + (re.compile(r'\bSTRUCT_GROUP\('), r'\1'), +] + +# +# Transforms for function prototypes +# +function_xforms = [ + (KernRe(r"^static +"), ""), + (KernRe(r"^extern +"), ""), + (KernRe(r"^asmlinkage +"), ""), + (KernRe(r"^inline +"), ""), + (KernRe(r"^__inline__ +"), ""), + (KernRe(r"^__inline +"), ""), + (KernRe(r"^__always_inline +"), ""), + (KernRe(r"^noinline +"), ""), + (KernRe(r"^__FORTIFY_INLINE +"), ""), + (KernRe(r"__init +"), ""), + (KernRe(r"__init_or_module +"), ""), + (KernRe(r"__deprecated +"), ""), + (KernRe(r"__flatten +"), ""), + (KernRe(r"__meminit +"), ""), + (KernRe(r"__must_check +"), ""), + (KernRe(r"__weak +"), ""), + (KernRe(r"__sched +"), ""), + (KernRe(r"_noprof"), ""), + (KernRe(r"__always_unused *"), ""), + (KernRe(r"__printf\s*\(\s*\d*\s*,\s*\d*\s*\) +"), ""), + (KernRe(r"__(?:re)?alloc_size\s*\(\s*\d+\s*(?:,\s*\d+\s*)?\) +"), ""), + (KernRe(r"__diagnose_as\s*\(\s*\S+\s*(?:,\s*\d+\s*)*\) +"), ""), + (KernRe(r"DECL_BUCKET_PARAMS\s*\(\s*(\S+)\s*,\s*(\S+)\s*\)"), r"\1, \2"), + (KernRe(r"__attribute_const__ +"), ""), + (KernRe(r"__attribute__\s*\(\((?:[\w\s]+(?:\([^)]*\))?\s*,?)+\)\)\s+"), ""), +] + +# +# Apply a set of transforms to a block of text. +# +def apply_transforms(xforms, text): + for search, subst in xforms: + text = search.sub(subst, text) + return text + +# +# A little helper to get rid of excess white space +# +multi_space = KernRe(r'\s\s+') +def trim_whitespace(s): + return multi_space.sub(' ', s.strip()) + +# +# Remove struct/enum members that have been marked "private". +# +def trim_private_members(text): + # + # First look for a "public:" block that ends a private region, then + # handle the "private until the end" case. + # + text = KernRe(r'/\*\s*private:.*?/\*\s*public:.*?\*/', flags=re.S).sub('', text) + text = KernRe(r'/\*\s*private:.*', flags=re.S).sub('', text) + # + # We needed the comments to do the above, but now we can take them out. 
+ # + return KernRe(r'\s*/\*.*?\*/\s*', flags=re.S).sub('', text).strip() + +class state: + """ + State machine enums + """ + + # Parser states + NORMAL = 0 # normal code + NAME = 1 # looking for function name + DECLARATION = 2 # We have seen a declaration which might not be done + BODY = 3 # the body of the comment + SPECIAL_SECTION = 4 # doc section ending with a blank line + PROTO = 5 # scanning prototype + DOCBLOCK = 6 # documentation block + INLINE_NAME = 7 # gathering doc outside main block + INLINE_TEXT = 8 # reading the body of inline docs + + name = [ + "NORMAL", + "NAME", + "DECLARATION", + "BODY", + "SPECIAL_SECTION", + "PROTO", + "DOCBLOCK", + "INLINE_NAME", + "INLINE_TEXT", + ] + + +SECTION_DEFAULT = "Description" # default section + +class KernelEntry: + + def __init__(self, config, fname, ln): + self.config = config + self.fname = fname + + self._contents = [] + self.prototype = "" + + self.warnings = [] + + self.parameterlist = [] + self.parameterdescs = {} + self.parametertypes = {} + self.parameterdesc_start_lines = {} + + self.section_start_lines = {} + self.sections = {} + + self.anon_struct_union = False + + self.leading_space = None + + self.fname = fname + + # State flags + self.brcount = 0 + self.declaration_start_line = ln + 1 + + # + # Management of section contents + # + def add_text(self, text): + self._contents.append(text) + + def contents(self): + return '\n'.join(self._contents) + '\n' + + # TODO: rename to emit_message after removal of kernel-doc.pl + def emit_msg(self, ln, msg, *, warning=True): + """Emit a message""" + + log_msg = f"{self.fname}:{ln} {msg}" + + if not warning: + self.config.log.info(log_msg) + return + + # Delegate warning output to output logic, as this way it + # will report warnings/info only for symbols that are output + + self.warnings.append(log_msg) + return + + # + # Begin a new section. 
+ # + def begin_section(self, line_no, title = SECTION_DEFAULT, dump = False): + if dump: + self.dump_section(start_new = True) + self.section = title + self.new_start_line = line_no + + def dump_section(self, start_new=True): + """ + Dumps section contents to arrays/hashes intended for that purpose. + """ + # + # If we have accumulated no contents in the default ("description") + # section, don't bother. + # + if self.section == SECTION_DEFAULT and not self._contents: + return + name = self.section + contents = self.contents() + + if type_param.match(name): + name = type_param.group(1) + + self.parameterdescs[name] = contents + self.parameterdesc_start_lines[name] = self.new_start_line + + self.new_start_line = 0 + + else: + if name in self.sections and self.sections[name] != "": + # Only warn on user-specified duplicate section names + if name != SECTION_DEFAULT: + self.emit_msg(self.new_start_line, + f"duplicate section name '{name}'") + # Treat as a new paragraph - add a blank line + self.sections[name] += '\n' + contents + else: + self.sections[name] = contents + self.section_start_lines[name] = self.new_start_line + self.new_start_line = 0 + +# self.config.log.debug("Section: %s : %s", name, pformat(vars(self))) + + if start_new: + self.section = SECTION_DEFAULT + self._contents = [] + +python_warning = False + +class KernelDoc: + """ + Read a C language source or header FILE and extract embedded + documentation comments. 
+ """ + + # Section names + + section_context = "Context" + section_return = "Return" + + undescribed = "-- undescribed --" + + def __init__(self, config, fname): + """Initialize internal variables""" + + self.fname = fname + self.config = config + + # Initial state for the state machines + self.state = state.NORMAL + + # Store entry currently being processed + self.entry = None + + # Place all potential outputs into an array + self.entries = [] + + # + # We need Python 3.7 for its "dicts remember the insertion + # order" guarantee + # + global python_warning + if (not python_warning and + sys.version_info.major == 3 and sys.version_info.minor < 7): + + self.emit_msg(0, + 'Python 3.7 or later is required for correct results') + python_warning = True + + def emit_msg(self, ln, msg, *, warning=True): + """Emit a message""" + + if self.entry: + self.entry.emit_msg(ln, msg, warning=warning) + return + + log_msg = f"{self.fname}:{ln} {msg}" + + if warning: + self.config.log.warning(log_msg) + else: + self.config.log.info(log_msg) + + def dump_section(self, start_new=True): + """ + Dumps section contents to arrays/hashes intended for that purpose. + """ + + if self.entry: + self.entry.dump_section(start_new) + + # TODO: rename it to store_declaration after removal of kernel-doc.pl + def output_declaration(self, dtype, name, **args): + """ + Stores the entry into an entry array. 
+ + The actual output and output filters will be handled elsewhere + """ + + item = KdocItem(name, self.fname, dtype, + self.entry.declaration_start_line, **args) + item.warnings = self.entry.warnings + + # Drop empty sections + # TODO: improve empty sections logic to emit warnings + sections = self.entry.sections + for section in ["Description", "Return"]: + if section in sections and not sections[section].rstrip(): + del sections[section] + item.set_sections(sections, self.entry.section_start_lines) + item.set_params(self.entry.parameterlist, self.entry.parameterdescs, + self.entry.parametertypes, + self.entry.parameterdesc_start_lines) + self.entries.append(item) + + self.config.log.debug("Output: %s:%s = %s", dtype, name, pformat(args)) + + def reset_state(self, ln): + """ + Ancillary routine to create a new entry. It initializes all + variables used by the state machine. + """ + + # + # Flush the warnings out before we proceed further + # + if self.entry and self.entry not in self.entries: + for log_msg in self.entry.warnings: + self.config.log.warning(log_msg) + + self.entry = KernelEntry(self.config, self.fname, ln) + + # State flags + self.state = state.NORMAL + + def push_parameter(self, ln, decl_type, param, dtype, + org_arg, declaration_name): + """ + Store parameters and their descriptions at self.entry. + """ + + if self.entry.anon_struct_union and dtype == "" and param == "}": + return # Ignore the ending }; from anonymous struct/union + + self.entry.anon_struct_union = False + + param = KernRe(r'[\[\)].*').sub('', param, count=1) + + # + # Look at various "anonymous type" cases. 
+ # + if dtype == '': + if param.endswith("..."): + if len(param) > 3: # there is a name provided, use that + param = param[:-3] + if not self.entry.parameterdescs.get(param): + self.entry.parameterdescs[param] = "variable arguments" + + elif (not param) or param == "void": + param = "void" + self.entry.parameterdescs[param] = "no arguments" + + elif param in ["struct", "union"]: + # Handle unnamed (anonymous) union or struct + dtype = param + param = "{unnamed_" + param + "}" + self.entry.parameterdescs[param] = "anonymous\n" + self.entry.anon_struct_union = True + + # Warn if parameter has no description + # (but ignore ones starting with # as these are not parameters + # but inline preprocessor statements) + if param not in self.entry.parameterdescs and not param.startswith("#"): + self.entry.parameterdescs[param] = self.undescribed + + if "." not in param: + if decl_type == 'function': + dname = f"{decl_type} parameter" + else: + dname = f"{decl_type} member" + + self.emit_msg(ln, + f"{dname} '{param}' not described in '{declaration_name}'") + + # Strip spaces from param so that it is one continuous string on + # parameterlist. This fixes a problem where check_sections() + # cannot find a parameter like "addr[6 + 2]" because it actually + # appears as "addr[6", "+", "2]" on the parameter list. + # However, it's better to maintain the param string unchanged for + # output, so just weaken the string compare in check_sections() + # to ignore "[blah" in a parameter string. + + self.entry.parameterlist.append(param) + org_arg = KernRe(r'\s\s+').sub(' ', org_arg) + self.entry.parametertypes[param] = org_arg + + + def create_parameter_list(self, ln, decl_type, args, + splitter, declaration_name): + """ + Creates a list of parameters, storing them at self.entry. 
+ """ + + # temporarily replace all commas inside function pointer definition + arg_expr = KernRe(r'(\([^\),]+),') + while arg_expr.search(args): + args = arg_expr.sub(r"\1#", args) + + for arg in args.split(splitter): + # Ignore argument attributes + arg = KernRe(r'\sPOS0?\s').sub(' ', arg) + + # Strip leading/trailing spaces + arg = arg.strip() + arg = KernRe(r'\s+').sub(' ', arg, count=1) + + if arg.startswith('#'): + # Treat preprocessor directive as a typeless variable just to fill + # corresponding data structures "correctly". Catch it later in + # output_* subs. + + # Treat preprocessor directive as a typeless variable + self.push_parameter(ln, decl_type, arg, "", + "", declaration_name) + # + # The pointer-to-function case. + # + elif KernRe(r'\(.+\)\s*\(').search(arg): + arg = arg.replace('#', ',') + r = KernRe(r'[^\(]+\(\*?\s*' # Everything up to "(*" + r'([\w\[\].]*)' # Capture the name and possible [array] + r'\s*\)') # Make sure the trailing ")" is there + if r.match(arg): + param = r.group(1) + else: + self.emit_msg(ln, f"Invalid param: {arg}") + param = arg + dtype = arg.replace(param, '') + self.push_parameter(ln, decl_type, param, dtype, arg, declaration_name) + # + # The array-of-pointers case. Dig the parameter name out from the middle + # of the declaration. + # + elif KernRe(r'\(.+\)\s*\[').search(arg): + r = KernRe(r'[^\(]+\(\s*\*\s*' # Up to "(" and maybe "*" + r'([\w.]*?)' # The actual pointer name + r'\s*(\[\s*\w+\s*\]\s*)*\)') # The [array portion] + if r.match(arg): + param = r.group(1) + else: + self.emit_msg(ln, f"Invalid param: {arg}") + param = arg + dtype = arg.replace(param, '') + self.push_parameter(ln, decl_type, param, dtype, arg, declaration_name) + elif arg: + # + # Clean up extraneous spaces and split the string at commas; the first + # element of the resulting list will also include the type information. 
+ # + arg = KernRe(r'\s*:\s*').sub(":", arg) + arg = KernRe(r'\s*\[').sub('[', arg) + args = KernRe(r'\s*,\s*').split(arg) + args[0] = re.sub(r'(\*+)\s*', r' \1', args[0]) + # + # args[0] has a string of "type a". If "a" includes an [array] + # declaration, we want to not be fooled by any white space inside + # the brackets, so detect and handle that case specially. + # + r = KernRe(r'^([^[\]]*\s+)(.*)$') + if r.match(args[0]): + args[0] = r.group(2) + dtype = r.group(1) + else: + # No space in args[0]; this seems wrong but preserves previous behavior + dtype = '' + + bitfield_re = KernRe(r'(.*?):(\w+)') + for param in args: + # + # For pointers, shift the star(s) from the variable name to the + # type declaration. + # + r = KernRe(r'^(\*+)\s*(.*)') + if r.match(param): + self.push_parameter(ln, decl_type, r.group(2), + f"{dtype} {r.group(1)}", + arg, declaration_name) + # + # Perform a similar shift for bitfields. + # + elif bitfield_re.search(param): + if dtype != "": # Skip unnamed bit-fields + self.push_parameter(ln, decl_type, bitfield_re.group(1), + f"{dtype}:{bitfield_re.group(2)}", + arg, declaration_name) + else: + self.push_parameter(ln, decl_type, param, dtype, + arg, declaration_name) + + def check_sections(self, ln, decl_name, decl_type): + """ + Check for errors inside sections, emitting warnings if not found + parameters are described. + """ + for section in self.entry.sections: + if section not in self.entry.parameterlist and \ + not known_sections.search(section): + if decl_type == 'function': + dname = f"{decl_type} parameter" + else: + dname = f"{decl_type} member" + self.emit_msg(ln, + f"Excess {dname} '{section}' description in '{decl_name}'") + + def check_return_section(self, ln, declaration_name, return_type): + """ + If the function doesn't return void, warns about the lack of a + return description. 
+ """ + + if not self.config.wreturn: + return + + # Ignore an empty return type (It's a macro) + # Ignore functions with a "void" return type (but not "void *") + if not return_type or KernRe(r'void\s*\w*\s*$').search(return_type): + return + + if not self.entry.sections.get("Return", None): + self.emit_msg(ln, + f"No description found for return value of '{declaration_name}'") + + # + # Split apart a structure prototype; returns (struct|union, name, members) or None + # + def split_struct_proto(self, proto): + type_pattern = r'(struct|union)' + qualifiers = [ + "__attribute__", + "__packed", + "__aligned", + "____cacheline_aligned_in_smp", + "____cacheline_aligned", + ] + definition_body = r'\{(.*)\}\s*' + "(?:" + '|'.join(qualifiers) + ")?" + + r = KernRe(type_pattern + r'\s+(\w+)\s*' + definition_body) + if r.search(proto): + return (r.group(1), r.group(2), r.group(3)) + else: + r = KernRe(r'typedef\s+' + type_pattern + r'\s*' + definition_body + r'\s*(\w+)\s*;') + if r.search(proto): + return (r.group(1), r.group(3), r.group(2)) + return None + # + # Rewrite the members of a structure or union for easier formatting later on. + # Among other things, this function will turn a member like: + # + # struct { inner_members; } foo; + # + # into: + # + # struct foo; inner_members; + # + def rewrite_struct_members(self, members): + # + # Process struct/union members from the most deeply nested outward. The + # trick is in the ^{ below - it prevents a match of an outer struct/union + # until the inner one has been munged (removing the "{" in the process). 
+ # + struct_members = KernRe(r'(struct|union)' # 0: declaration type + r'([^\{\};]+)' # 1: possible name + r'(\{)' + r'([^\{\}]*)' # 3: Contents of declaration + r'(\})' + r'([^\{\};]*)(;)') # 5: Remaining stuff after declaration + tuples = struct_members.findall(members) + while tuples: + for t in tuples: + newmember = "" + oldmember = "".join(t) # Reconstruct the original formatting + dtype, name, lbr, content, rbr, rest, semi = t + # + # Pass through each field name, normalizing the form and formatting. + # + for s_id in rest.split(','): + s_id = s_id.strip() + newmember += f"{dtype} {s_id}; " + # + # Remove bitfield/array/pointer info, getting the bare name. + # + s_id = KernRe(r'[:\[].*').sub('', s_id) + s_id = KernRe(r'^\s*\**(\S+)\s*').sub(r'\1', s_id) + # + # Pass through the members of this inner structure/union. + # + for arg in content.split(';'): + arg = arg.strip() + # + # Look for (type)(*name)(args) - pointer to function + # + r = KernRe(r'^([^\(]+\(\*?\s*)([\w.]*)(\s*\).*)') + if r.match(arg): + dtype, name, extra = r.group(1), r.group(2), r.group(3) + # Pointer-to-function + if not s_id: + # Anonymous struct/union + newmember += f"{dtype}{name}{extra}; " + else: + newmember += f"{dtype}{s_id}.{name}{extra}; " + # + # Otherwise a non-function member. + # + else: + # + # Remove bitmap and array portions and spaces around commas + # + arg = KernRe(r':\s*\d+\s*').sub('', arg) + arg = KernRe(r'\[.*\]').sub('', arg) + arg = KernRe(r'\s*,\s*').sub(',', arg) + # + # Look for a normal decl - "type name[,name...]" + # + r = KernRe(r'(.*)\s+([\S+,]+)') + if r.search(arg): + for name in r.group(2).split(','): + name = KernRe(r'^\s*\**(\S+)\s*').sub(r'\1', name) + if not s_id: + # Anonymous struct/union + newmember += f"{r.group(1)} {name}; " + else: + newmember += f"{r.group(1)} {s_id}.{name}; " + else: + newmember += f"{arg}; " + # + # At the end of the s_id loop, replace the original declaration with + # the munged version. 
+ # + members = members.replace(oldmember, newmember) + # + # End of the tuple loop - search again and see if there are outer members + # that now turn up. + # + tuples = struct_members.findall(members) + return members + + # + # Format the struct declaration into a standard form for inclusion in the + # resulting docs. + # + def format_struct_decl(self, declaration): + # + # Insert newlines, get rid of extra spaces. + # + declaration = KernRe(r'([\{;])').sub(r'\1\n', declaration) + declaration = KernRe(r'\}\s+;').sub('};', declaration) + # + # Format inline enums with each member on its own line. + # + r = KernRe(r'(enum\s+\{[^\}]+),([^\n])') + while r.search(declaration): + declaration = r.sub(r'\1,\n\2', declaration) + # + # Now go through and supply the right number of tabs + # for each line. + # + def_args = declaration.split('\n') + level = 1 + declaration = "" + for clause in def_args: + clause = KernRe(r'\s+').sub(' ', clause.strip(), count=1) + if clause: + if '}' in clause and level > 1: + level -= 1 + if not clause.startswith('#'): + declaration += "\t" * level + declaration += "\t" + clause + "\n" + if "{" in clause and "}" not in clause: + level += 1 + return declaration + + + def dump_struct(self, ln, proto): + """ + Store an entry for a struct or union + """ + # + # Do the basic parse to get the pieces of the declaration. + # + struct_parts = self.split_struct_proto(proto) + if not struct_parts: + self.emit_msg(ln, f"{proto} error: Cannot parse struct or union!") + return + decl_type, declaration_name, members = struct_parts + + if self.entry.identifier != declaration_name: + self.emit_msg(ln, f"expecting prototype for {decl_type} {self.entry.identifier}. " + f"Prototype was for {decl_type} {declaration_name} instead\n") + return + # + # Go through the list of members applying all of our transformations. 
+ # + members = trim_private_members(members) + members = apply_transforms(struct_xforms, members) + + nested = NestedMatch() + for search, sub in struct_nested_prefixes: + members = nested.sub(search, sub, members) + # + # Deal with embedded struct and union members, and drop enums entirely. + # + declaration = members + members = self.rewrite_struct_members(members) + members = re.sub(r'(\{[^\{\}]*\})', '', members) + # + # Output the result and we are done. + # + self.create_parameter_list(ln, decl_type, members, ';', + declaration_name) + self.check_sections(ln, declaration_name, decl_type) + self.output_declaration(decl_type, declaration_name, + definition=self.format_struct_decl(declaration), + purpose=self.entry.declaration_purpose) + + def dump_enum(self, ln, proto): + """ + Stores an enum inside self.entries array. + """ + # + # Strip preprocessor directives. Note that this depends on the + # trailing semicolon we added in process_proto_type(). + # + proto = KernRe(r'#\s*((define|ifdef|if)\s+|endif)[^;]*;', flags=re.S).sub('', proto) + # + # Parse out the name and members of the enum. Typedef form first. + # + r = KernRe(r'typedef\s+enum\s*\{(.*)\}\s*(\w*)\s*;') + if r.search(proto): + declaration_name = r.group(2) + members = trim_private_members(r.group(1)) + # + # Failing that, look for a straight enum + # + else: + r = KernRe(r'enum\s+(\w*)\s*\{(.*)\}') + if r.match(proto): + declaration_name = r.group(1) + members = trim_private_members(r.group(2)) + # + # OK, this isn't going to work. + # + else: + self.emit_msg(ln, f"{proto}: error: Cannot parse enum!") + return + # + # Make sure we found what we were expecting. + # + if self.entry.identifier != declaration_name: + if self.entry.identifier == "": + self.emit_msg(ln, + f"{proto}: wrong kernel-doc identifier on prototype") + else: + self.emit_msg(ln, + f"expecting prototype for enum {self.entry.identifier}. 
" + f"Prototype was for enum {declaration_name} instead") + return + + if not declaration_name: + declaration_name = "(anonymous)" + # + # Parse out the name of each enum member, and verify that we + # have a description for it. + # + member_set = set() + members = KernRe(r'\([^;)]*\)').sub('', members) + for arg in members.split(','): + if not arg: + continue + arg = KernRe(r'^\s*(\w+).*').sub(r'\1', arg) + self.entry.parameterlist.append(arg) + if arg not in self.entry.parameterdescs: + self.entry.parameterdescs[arg] = self.undescribed + self.emit_msg(ln, + f"Enum value '{arg}' not described in enum '{declaration_name}'") + member_set.add(arg) + # + # Ensure that every described member actually exists in the enum. + # + for k in self.entry.parameterdescs: + if k not in member_set: + self.emit_msg(ln, + f"Excess enum value '@{k}' description in '{declaration_name}'") + + self.output_declaration('enum', declaration_name, + purpose=self.entry.declaration_purpose) + + def dump_declaration(self, ln, prototype): + """ + Stores a data declaration inside self.entries array. + """ + + if self.entry.decl_type == "enum": + self.dump_enum(ln, prototype) + elif self.entry.decl_type == "typedef": + self.dump_typedef(ln, prototype) + elif self.entry.decl_type in ["union", "struct"]: + self.dump_struct(ln, prototype) + else: + # This would be a bug + self.emit_message(ln, f'Unknown declaration type: {self.entry.decl_type}') + + def dump_function(self, ln, prototype): + """ + Stores a function or function macro inside self.entries array. + """ + + found = func_macro = False + return_type = '' + decl_type = 'function' + # + # Apply the initial transformations. + # + prototype = apply_transforms(function_xforms, prototype) + # + # If we have a macro, remove the "#define" at the front. 
+ # + new_proto = KernRe(r"^#\s*define\s+").sub("", prototype) + if new_proto != prototype: + prototype = new_proto + # + # Dispense with the simple "#define A B" case here; the key + # is the space after the name of the symbol being defined. + # NOTE that the seemingly misnamed "func_macro" indicates a + # macro *without* arguments. + # + r = KernRe(r'^(\w+)\s+') + if r.search(prototype): + return_type = '' + declaration_name = r.group(1) + func_macro = True + found = True + + # Yes, this truly is vile. We are looking for: + # 1. Return type (may be nothing if we're looking at a macro) + # 2. Function name + # 3. Function parameters. + # + # All the while we have to watch out for function pointer parameters + # (which IIRC is what the two sections are for), C types (these + # regexps don't even start to express all the possibilities), and + # so on. + # + # If you mess with these regexps, it's a good idea to check that + # the following functions' documentation still comes out right: + # - parport_register_device (function pointer parameters) + # - atomic_set (macro) + # - pci_match_device, __copy_to_user (long return type) + + name = r'\w+' + type1 = r'(?:[\w\s]+)?' + type2 = r'(?:[\w\s]+\*+)+' + # + # Attempt to match first on (args) with no internal parentheses; this + # lets us easily filter out __acquires() and other post-args stuff. If + # that fails, just grab the rest of the line to the last closing + # parenthesis. + # + proto_args = r'\(([^\(]*|.*)\)' + # + # (Except for the simple macro case) attempt to split up the prototype + # in the various ways we understand. 
+ # + if not found: + patterns = [ + rf'^()({name})\s*{proto_args}', + rf'^({type1})\s+({name})\s*{proto_args}', + rf'^({type2})\s*({name})\s*{proto_args}', + ] + + for p in patterns: + r = KernRe(p) + if r.match(prototype): + return_type = r.group(1) + declaration_name = r.group(2) + args = r.group(3) + self.create_parameter_list(ln, decl_type, args, ',', + declaration_name) + found = True + break + # + # Parsing done; make sure that things are as we expect. + # + if not found: + self.emit_msg(ln, + f"cannot understand function prototype: '{prototype}'") + return + if self.entry.identifier != declaration_name: + self.emit_msg(ln, f"expecting prototype for {self.entry.identifier}(). " + f"Prototype was for {declaration_name}() instead") + return + self.check_sections(ln, declaration_name, "function") + self.check_return_section(ln, declaration_name, return_type) + # + # Store the result. + # + self.output_declaration(decl_type, declaration_name, + typedef=('typedef' in return_type), + functiontype=return_type, + purpose=self.entry.declaration_purpose, + func_macro=func_macro) + + + def dump_typedef(self, ln, proto): + """ + Stores a typedef inside self.entries array. + """ + # + # We start by looking for function typedefs. + # + typedef_type = r'typedef((?:\s+[\w*]+\b){0,7}\s+(?:\w+\b|\*+))\s*' + typedef_ident = r'\*?\s*(\w\S+)\s*' + typedef_args = r'\s*\((.*)\);' + + typedef1 = KernRe(typedef_type + r'\(' + typedef_ident + r'\)' + typedef_args) + typedef2 = KernRe(typedef_type + typedef_ident + typedef_args) + + # Parse function typedef prototypes + for r in [typedef1, typedef2]: + if not r.match(proto): + continue + + return_type = r.group(1).strip() + declaration_name = r.group(2) + args = r.group(3) + + if self.entry.identifier != declaration_name: + self.emit_msg(ln, + f"expecting prototype for typedef {self.entry.identifier}. 
Prototype was for typedef {declaration_name} instead\n") + return + + self.create_parameter_list(ln, 'function', args, ',', declaration_name) + + self.output_declaration('function', declaration_name, + typedef=True, + functiontype=return_type, + purpose=self.entry.declaration_purpose) + return + # + # Not a function, try to parse a simple typedef. + # + r = KernRe(r'typedef.*\s+(\w+)\s*;') + if r.match(proto): + declaration_name = r.group(1) + + if self.entry.identifier != declaration_name: + self.emit_msg(ln, + f"expecting prototype for typedef {self.entry.identifier}. Prototype was for typedef {declaration_name} instead\n") + return + + self.output_declaration('typedef', declaration_name, + purpose=self.entry.declaration_purpose) + return + + self.emit_msg(ln, "error: Cannot parse typedef!") + + @staticmethod + def process_export(function_set, line): + """ + process EXPORT_SYMBOL* tags + + This method doesn't use any variable from the class, so declare it + with a staticmethod decorator. + """ + + # We support documenting some exported symbols with different + # names. A horrible hack. + suffixes = [ '_noprof' ] + + # Note: it accepts only one EXPORT_SYMBOL* per line, as having + # multiple export lines would violate Kernel coding style. + + if export_symbol.search(line): + symbol = export_symbol.group(2) + elif export_symbol_ns.search(line): + symbol = export_symbol_ns.group(2) + else: + return False + # + # Found an export, trim out any special suffixes + # + for suffix in suffixes: + # Be backward compatible with Python < 3.9 + if symbol.endswith(suffix): + symbol = symbol[:-len(suffix)] + function_set.add(symbol) + return True + + def process_normal(self, ln, line): + """ + STATE_NORMAL: looking for the /** to begin everything. 
+ """ + + if not doc_start.match(line): + return + + # start a new entry + self.reset_state(ln) + + # next line is always the function name + self.state = state.NAME + + def process_name(self, ln, line): + """ + STATE_NAME: Looking for the "name - description" line + """ + # + # Check for a DOC: block and handle them specially. + # + if doc_block.search(line): + + if not doc_block.group(1): + self.entry.begin_section(ln, "Introduction") + else: + self.entry.begin_section(ln, doc_block.group(1)) + + self.entry.identifier = self.entry.section + self.state = state.DOCBLOCK + # + # Otherwise we're looking for a normal kerneldoc declaration line. + # + elif doc_decl.search(line): + self.entry.identifier = doc_decl.group(1) + + # Test for data declaration + if doc_begin_data.search(line): + self.entry.decl_type = doc_begin_data.group(1) + self.entry.identifier = doc_begin_data.group(2) + # + # Look for a function description + # + elif doc_begin_func.search(line): + self.entry.identifier = doc_begin_func.group(1) + self.entry.decl_type = "function" + # + # We struck out. + # + else: + self.emit_msg(ln, + f"This comment starts with '/**', but isn't a kernel-doc comment. Refer to Documentation/doc-guide/kernel-doc.rst\n{line}") + self.state = state.NORMAL + return + # + # OK, set up for a new kerneldoc entry. + # + self.state = state.BODY + self.entry.identifier = self.entry.identifier.strip(" ") + # if there's no @param blocks need to set up default section here + self.entry.begin_section(ln + 1) + # + # Find the description portion, which *should* be there but + # isn't always. 
+ # (We should be able to capture this from the previous parsing - someday) + # + r = KernRe("[-:](.*)") + if r.search(line): + self.entry.declaration_purpose = trim_whitespace(r.group(1)) + self.state = state.DECLARATION + else: + self.entry.declaration_purpose = "" + + if not self.entry.declaration_purpose and self.config.wshort_desc: + self.emit_msg(ln, + f"missing initial short description on line:\n{line}") + + if not self.entry.identifier and self.entry.decl_type != "enum": + self.emit_msg(ln, + f"wrong kernel-doc identifier on line:\n{line}") + self.state = state.NORMAL + + if self.config.verbose: + self.emit_msg(ln, + f"Scanning doc for {self.entry.decl_type} {self.entry.identifier}", + warning=False) + # + # Failed to find an identifier. Emit a warning + # + else: + self.emit_msg(ln, f"Cannot find identifier on line:\n{line}") + + # + # Helper function to determine if a new section is being started. + # + def is_new_section(self, ln, line): + if doc_sect.search(line): + self.state = state.BODY + # + # Pick out the name of our new section, tweaking it if need be. + # + newsection = doc_sect.group(1) + if newsection.lower() == 'description': + newsection = 'Description' + elif newsection.lower() == 'context': + newsection = 'Context' + self.state = state.SPECIAL_SECTION + elif newsection.lower() in ["@return", "@returns", + "return", "returns"]: + newsection = "Return" + self.state = state.SPECIAL_SECTION + elif newsection[0] == '@': + self.state = state.SPECIAL_SECTION + # + # Initialize the contents, and get the new section going. + # + newcontents = doc_sect.group(2) + if not newcontents: + newcontents = "" + self.dump_section() + self.entry.begin_section(ln, newsection) + self.entry.leading_space = None + + self.entry.add_text(newcontents.lstrip()) + return True + return False + + # + # Helper function to detect (and effect) the end of a kerneldoc comment. 
+ # + def is_comment_end(self, ln, line): + if doc_end.search(line): + self.dump_section() + + # Look for doc_com + <text> + doc_end: + r = KernRe(r'\s*\*\s*[a-zA-Z_0-9:.]+\*/') + if r.match(line): + self.emit_msg(ln, f"suspicious ending line: {line}") + + self.entry.prototype = "" + self.entry.new_start_line = ln + 1 + + self.state = state.PROTO + return True + return False + + + def process_decl(self, ln, line): + """ + STATE_DECLARATION: We've seen the beginning of a declaration + """ + if self.is_new_section(ln, line) or self.is_comment_end(ln, line): + return + # + # Look for anything with the " * " line beginning. + # + if doc_content.search(line): + cont = doc_content.group(1) + # + # A blank line means that we have moved out of the declaration + # part of the comment (without any "special section" parameter + # descriptions). + # + if cont == "": + self.state = state.BODY + # + # Otherwise we have more of the declaration section to soak up. + # + else: + self.entry.declaration_purpose = \ + trim_whitespace(self.entry.declaration_purpose + ' ' + cont) + else: + # Unknown line, ignore + self.emit_msg(ln, f"bad line: {line}") + + + def process_special(self, ln, line): + """ + STATE_SPECIAL_SECTION: a section ending with a blank line + """ + # + # If we have hit a blank line (only the " * " marker), then this + # section is done. + # + if KernRe(r"\s*\*\s*$").match(line): + self.entry.begin_section(ln, dump = True) + self.state = state.BODY + return + # + # Not a blank line, look for the other ways to end the section. + # + if self.is_new_section(ln, line) or self.is_comment_end(ln, line): + return + # + # OK, we should have a continuation of the text for this section. + # + if doc_content.search(line): + cont = doc_content.group(1) + # + # If the lines of text after the first in a special section have + # leading white space, we need to trim it out or Sphinx will get + # confused. For the second line (the None case), see what we + # find there and remember it. 
+ # + if self.entry.leading_space is None: + r = KernRe(r'^(\s+)') + if r.match(cont): + self.entry.leading_space = len(r.group(1)) + else: + self.entry.leading_space = 0 + # + # Otherwise, before trimming any leading chars, be *sure* + # that they are white space. We should maybe warn if this + # isn't the case. + # + for i in range(0, self.entry.leading_space): + if cont[i] != " ": + self.entry.leading_space = i + break + # + # Add the trimmed result to the section and we're done. + # + self.entry.add_text(cont[self.entry.leading_space:]) + else: + # Unknown line, ignore + self.emit_msg(ln, f"bad line: {line}") + + def process_body(self, ln, line): + """ + STATE_BODY: the bulk of a kerneldoc comment. + """ + if self.is_new_section(ln, line) or self.is_comment_end(ln, line): + return + + if doc_content.search(line): + cont = doc_content.group(1) + self.entry.add_text(cont) + else: + # Unknown line, ignore + self.emit_msg(ln, f"bad line: {line}") + + def process_inline_name(self, ln, line): + """STATE_INLINE_NAME: beginning of docbook comments within a prototype.""" + + if doc_inline_sect.search(line): + self.entry.begin_section(ln, doc_inline_sect.group(1)) + self.entry.add_text(doc_inline_sect.group(2).lstrip()) + self.state = state.INLINE_TEXT + elif doc_inline_end.search(line): + self.dump_section() + self.state = state.PROTO + elif doc_content.search(line): + self.emit_msg(ln, f"Incorrect use of kernel-doc format: {line}") + self.state = state.PROTO + # else ... ?? + + def process_inline_text(self, ln, line): + """STATE_INLINE_TEXT: docbook comments within a prototype.""" + + if doc_inline_end.search(line): + self.dump_section() + self.state = state.PROTO + elif doc_content.search(line): + self.entry.add_text(doc_content.group(1)) + # else ... ?? 
+ + def syscall_munge(self, ln, proto): # pylint: disable=W0613 + """ + Handle syscall definitions + """ + + is_void = False + + # Strip newlines/CR's + proto = re.sub(r'[\r\n]+', ' ', proto) + + # Check if it's a SYSCALL_DEFINE0 + if 'SYSCALL_DEFINE0' in proto: + is_void = True + + # Replace SYSCALL_DEFINE with correct return type & function name + proto = KernRe(r'SYSCALL_DEFINE.*\(').sub('long sys_', proto) + + r = KernRe(r'long\s+(sys_.*?),') + if r.search(proto): + proto = KernRe(',').sub('(', proto, count=1) + elif is_void: + proto = KernRe(r'\)').sub('(void)', proto, count=1) + + # Now delete all of the odd-numbered commas in the proto + # so that argument types & names don't have a comma between them + count = 0 + length = len(proto) + + if is_void: + length = 0 # skip the loop if is_void + + for ix in range(length): + if proto[ix] == ',': + count += 1 + if count % 2 == 1: + proto = proto[:ix] + ' ' + proto[ix + 1:] + + return proto + + def tracepoint_munge(self, ln, proto): + """ + Handle tracepoint definitions + """ + + tracepointname = None + tracepointargs = None + + # Match tracepoint name based on different patterns + r = KernRe(r'TRACE_EVENT\((.*?),') + if r.search(proto): + tracepointname = r.group(1) + + r = KernRe(r'DEFINE_SINGLE_EVENT\((.*?),') + if r.search(proto): + tracepointname = r.group(1) + + r = KernRe(r'DEFINE_EVENT\((.*?),(.*?),') + if r.search(proto): + tracepointname = r.group(2) + + if tracepointname: + tracepointname = tracepointname.lstrip() + + r = KernRe(r'TP_PROTO\((.*?)\)') + if r.search(proto): + tracepointargs = r.group(1) + + if not tracepointname or not tracepointargs: + self.emit_msg(ln, + f"Unrecognized tracepoint format:\n{proto}\n") + else: + proto = f"static inline void trace_{tracepointname}({tracepointargs})" + self.entry.identifier = f"trace_{self.entry.identifier}" + + return proto + + def process_proto_function(self, ln, line): + """Ancillary routine to process a function prototype""" + + # strip C99-style 
comments to end of line + line = KernRe(r"//.*$", re.S).sub('', line) + # + # Soak up the line's worth of prototype text, stopping at { or ; if present. + # + if KernRe(r'\s*#\s*define').match(line): + self.entry.prototype = line + elif not line.startswith('#'): # skip other preprocessor stuff + r = KernRe(r'([^\{]*)') + if r.match(line): + self.entry.prototype += r.group(1) + " " + # + # If we now have the whole prototype, clean it up and declare victory. + # + if '{' in line or ';' in line or KernRe(r'\s*#\s*define').match(line): + # strip comments and surrounding spaces + self.entry.prototype = KernRe(r'/\*.*\*/').sub('', self.entry.prototype).strip() + # + # Handle self.entry.prototypes for function pointers like: + # int (*pcs_config)(struct foo) + # by turning it into + # int pcs_config(struct foo) + # + r = KernRe(r'^(\S+\s+)\(\s*\*(\S+)\)') + self.entry.prototype = r.sub(r'\1\2', self.entry.prototype) + # + # Handle special declaration syntaxes + # + if 'SYSCALL_DEFINE' in self.entry.prototype: + self.entry.prototype = self.syscall_munge(ln, + self.entry.prototype) + else: + r = KernRe(r'TRACE_EVENT|DEFINE_EVENT|DEFINE_SINGLE_EVENT') + if r.search(self.entry.prototype): + self.entry.prototype = self.tracepoint_munge(ln, + self.entry.prototype) + # + # ... and we're done + # + self.dump_function(ln, self.entry.prototype) + self.reset_state(ln) + + def process_proto_type(self, ln, line): + """Ancillary routine to process a type""" + + # Strip C99-style comments and surrounding whitespace + line = KernRe(r"//.*$", re.S).sub('', line).strip() + if not line: + return # nothing to see here + + # To distinguish preprocessor directive from regular declaration later. 
+ if line.startswith('#'): + line += ";" + # + # Split the declaration on any of { } or ;, and accumulate pieces + # until we hit a semicolon while not inside {brackets} + # + r = KernRe(r'(.*?)([{};])') + for chunk in r.split(line): + if chunk: # Ignore empty matches + self.entry.prototype += chunk + # + # This cries out for a match statement ... someday after we can + # drop Python 3.9 ... + # + if chunk == '{': + self.entry.brcount += 1 + elif chunk == '}': + self.entry.brcount -= 1 + elif chunk == ';' and self.entry.brcount <= 0: + self.dump_declaration(ln, self.entry.prototype) + self.reset_state(ln) + return + # + # We hit the end of the line while still in the declaration; put + # in a space to represent the newline. + # + self.entry.prototype += ' ' + + def process_proto(self, ln, line): + """STATE_PROTO: reading a function/whatever prototype.""" + + if doc_inline_oneline.search(line): + self.entry.begin_section(ln, doc_inline_oneline.group(1)) + self.entry.add_text(doc_inline_oneline.group(2)) + self.dump_section() + + elif doc_inline_start.search(line): + self.state = state.INLINE_NAME + + elif self.entry.decl_type == 'function': + self.process_proto_function(ln, line) + + else: + self.process_proto_type(ln, line) + + def process_docblock(self, ln, line): + """STATE_DOCBLOCK: within a DOC: block.""" + + if doc_end.search(line): + self.dump_section() + self.output_declaration("doc", self.entry.identifier) + self.reset_state(ln) + + elif doc_content.search(line): + self.entry.add_text(doc_content.group(1)) + + def parse_export(self): + """ + Parses EXPORT_SYMBOL* macros from a single Kernel source file. + """ + + export_table = set() + + try: + with open(self.fname, "r", encoding="utf8", + errors="backslashreplace") as fp: + + for line in fp: + self.process_export(export_table, line) + + except IOError: + return None + + return export_table + + # + # The state/action table telling us which function to invoke in + # each state. 
+ # + state_actions = { + state.NORMAL: process_normal, + state.NAME: process_name, + state.BODY: process_body, + state.DECLARATION: process_decl, + state.SPECIAL_SECTION: process_special, + state.INLINE_NAME: process_inline_name, + state.INLINE_TEXT: process_inline_text, + state.PROTO: process_proto, + state.DOCBLOCK: process_docblock, + } + + def parse_kdoc(self): + """ + Open and process each line of a C source file. + The parsing is controlled via a state machine, and the line is passed + to a different process function depending on the state. The process + function may update the state as needed. + + Besides parsing kernel-doc tags, it also parses export symbols. + """ + + prev = "" + prev_ln = None + export_table = set() + + try: + with open(self.fname, "r", encoding="utf8", + errors="backslashreplace") as fp: + for ln, line in enumerate(fp): + + line = line.expandtabs().strip("\n") + + # Group continuation lines on prototypes + if self.state == state.PROTO: + if line.endswith("\\"): + prev += line.rstrip("\\") + if not prev_ln: + prev_ln = ln + continue + + if prev: + ln = prev_ln + line = prev + line + prev = "" + prev_ln = None + + self.config.log.debug("%d %s: %s", + ln, state.name[self.state], + line) + + # This is an optimization over the original script. + # There, when export_file was used for the same file, + # it was read twice. Here, we use the already-existing + # loop to parse exported symbols as well. 
+ # + if (self.state != state.NORMAL) or \ + not self.process_export(export_table, line): + # Hand this line to the appropriate state handler + self.state_actions[self.state](self, ln, line) + + except OSError: + self.config.log.error(f"Error: Cannot open file {self.fname}") + + return export_table, self.entries diff --git a/tools/lib/python/kdoc/kdoc_re.py b/tools/lib/python/kdoc/kdoc_re.py new file mode 100644 index 000000000000..2dfa1bf83d64 --- /dev/null +++ b/tools/lib/python/kdoc/kdoc_re.py @@ -0,0 +1,270 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>. + +""" +Regular expression ancillary classes. + +Those help caching regular expressions and do matching for kernel-doc. +""" + +import re + +# Local cache for regular expressions +re_cache = {} + + +class KernRe: + """ + Helper class to simplify regex declaration and usage. + + It calls re.compile for a given pattern. It also allows adding + regular expressions and define sub at class init time. + + Regular expressions can be cached via an argument, helping to speedup + searches. + """ + + def _add_regex(self, string, flags): + """ + Adds a new regex or reuses it from the cache. + """ + self.regex = re_cache.get(string, None) + if not self.regex: + self.regex = re.compile(string, flags=flags) + if self.cache: + re_cache[string] = self.regex + + def __init__(self, string, cache=True, flags=0): + """ + Compile a regular expression and initialize internal vars. + """ + + self.cache = cache + self.last_match = None + + self._add_regex(string, flags) + + def __str__(self): + """ + Return the regular expression pattern. + """ + return self.regex.pattern + + def __add__(self, other): + """ + Allows adding two regular expressions into one. 
+ """ + + return KernRe(str(self) + str(other), cache=self.cache or other.cache, + flags=self.regex.flags | other.regex.flags) + + def match(self, string): + """ + Handles a re.match storing its results + """ + + self.last_match = self.regex.match(string) + return self.last_match + + def search(self, string): + """ + Handles a re.search storing its results + """ + + self.last_match = self.regex.search(string) + return self.last_match + + def findall(self, string): + """ + Alias to re.findall + """ + + return self.regex.findall(string) + + def split(self, string): + """ + Alias to re.split + """ + + return self.regex.split(string) + + def sub(self, sub, string, count=0): + """ + Alias to re.sub + """ + + return self.regex.sub(sub, string, count=count) + + def group(self, num): + """ + Returns the group results of the last match + """ + + return self.last_match.group(num) + + +class NestedMatch: + """ + Finding nested delimiters is hard with regular expressions. It is + even harder on Python with its normal re module, as there are several + advanced regular expressions that are missing. + + This is the case of this pattern: + + '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;' + + which is used to properly match open/close parentheses of the + string search STRUCT_GROUP(), + + Add a class that counts pairs of delimiters, using it to match and + replace nested expressions. + + The original approach was suggested by: + https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex + + Although I re-implemented it to make it more generic and match 3 types + of delimiters. The logic checks if delimiters are paired. If not, it + will ignore the search string. 
+ """ + + # TODO: make NestedMatch handle multiple match groups + # + # Right now, regular expressions to match it are defined only up to + # the start delimiter, e.g.: + # + # \bSTRUCT_GROUP\( + # + # is similar to: STRUCT_GROUP\((.*)\) + # except that the content inside the match group is delimiter-aligned. + # + # The content inside parentheses is converted into a single replace + # group (e.g. r`\1'). + # + # It would be nice to change such definition to support multiple + # match groups, allowing a regex equivalent to: + # + # FOO\((.*), (.*), (.*)\) + # + # it is probably easier to define it not as a regular expression, but + # with some lexical definition like: + # + # FOO(arg1, arg2, arg3) + + DELIMITER_PAIRS = { + '{': '}', + '(': ')', + '[': ']', + } + + RE_DELIM = re.compile(r'[\{\}\[\]\(\)]') + + def _search(self, regex, line): + """ + Finds paired blocks for a regex that ends with a delimiter. + + The suggestion of using finditer to match pairs came from: + https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex + but I ended using a different implementation to align all three types + of delimiters and seek for an initial regular expression. + + The algorithm seeks for open/close paired delimiters and places them + into a stack, yielding a start/stop position of each match when the + stack is zeroed. + + The algorithm should work fine for properly paired lines, but will + silently ignore end delimiters that precede a start delimiter. + This should be OK for kernel-doc parser, as unaligned delimiters + would cause compilation errors. So, we don't need to raise exceptions + to cover such issues. 
+ """ + + stack = [] + + for match_re in regex.finditer(line): + start = match_re.start() + offset = match_re.end() + + d = line[offset - 1] + if d not in self.DELIMITER_PAIRS: + continue + + end = self.DELIMITER_PAIRS[d] + stack.append(end) + + for match in self.RE_DELIM.finditer(line[offset:]): + pos = match.start() + offset + + d = line[pos] + + if d in self.DELIMITER_PAIRS: + end = self.DELIMITER_PAIRS[d] + + stack.append(end) + continue + + # Does the end delimiter match what is expected? + if stack and d == stack[-1]: + stack.pop() + + if not stack: + yield start, offset, pos + 1 + break + + def search(self, regex, line): + """ + This is similar to re.search: + + It matches a regex that it is followed by a delimiter, + returning occurrences only if all delimiters are paired. + """ + + for t in self._search(regex, line): + + yield line[t[0]:t[2]] + + def sub(self, regex, sub, line, count=0): + """ + This is similar to re.sub: + + It matches a regex that it is followed by a delimiter, + replacing occurrences only if all delimiters are paired. + + if r'\1' is used, it works just like re: it places there the + matched paired data with the delimiter stripped. + + If count is different than zero, it will replace at most count + items. 
+ """ + out = "" + + cur_pos = 0 + n = 0 + + for start, end, pos in self._search(regex, line): + out += line[cur_pos:start] + + # Value, ignoring start/end delimiters + value = line[end:pos - 1] + + # replaces \1 at the sub string, if \1 is used there + new_sub = sub + new_sub = new_sub.replace(r'\1', value) + + out += new_sub + + # Drop end ';' if any + if line[pos] == ';': + pos += 1 + + cur_pos = pos + n += 1 + + if count and count >= n: + break + + # Append the remaining string + l = len(line) + out += line[cur_pos:l] + + return out diff --git a/tools/lib/python/kdoc/latex_fonts.py b/tools/lib/python/kdoc/latex_fonts.py new file mode 100755 index 000000000000..29317f8006ea --- /dev/null +++ b/tools/lib/python/kdoc/latex_fonts.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0-only +# Copyright (C) Akira Yokosawa, 2024 +# +# Ported to Python by (c) Mauro Carvalho Chehab, 2025 + +""" +Detect problematic Noto CJK variable fonts. + +For "make pdfdocs", reports of build errors of translations.pdf started +arriving early 2024 [1, 2]. It turned out that Fedora and openSUSE +tumbleweed have started deploying variable-font [3] format of "Noto CJK" +fonts [4, 5]. For PDF, a LaTeX package named xeCJK is used for CJK +(Chinese, Japanese, Korean) pages. xeCJK requires XeLaTeX/XeTeX, which +does not (and likely never will) understand variable fonts for historical +reasons. + +The build error happens even when both of variable- and non-variable-format +fonts are found on the build system. To make matters worse, Fedora enlists +variable "Noto CJK" fonts in the requirements of langpacks-ja, -ko, -zh_CN, +-zh_TW, etc. Hence developers who have interest in CJK pages are more +likely to encounter the build errors. + +This script is invoked from the error path of "make pdfdocs" and emits +suggestions if variable-font files of "Noto CJK" fonts are in the list of +fonts accessible from XeTeX. 
+ +References: +[1]: https://lore.kernel.org/r/8734tqsrt7.fsf@meer.lwn.net/ +[2]: https://lore.kernel.org/r/1708585803.600323099@f111.i.mail.ru/ +[3]: https://en.wikipedia.org/wiki/Variable_font +[4]: https://fedoraproject.org/wiki/Changes/Noto_CJK_Variable_Fonts +[5]: https://build.opensuse.org/request/show/1157217 + +#=========================================================================== +Workarounds for building translations.pdf +#=========================================================================== + +* Denylist "variable font" Noto CJK fonts. + - Create $HOME/deny-vf/fontconfig/fonts.conf from template below, with + tweaks if necessary. Remove leading "". + - Path of fontconfig/fonts.conf can be overridden by setting an env + variable FONTS_CONF_DENY_VF. + + * Template: +----------------------------------------------------------------- +<?xml version="1.0"?> +<!DOCTYPE fontconfig SYSTEM "urn:fontconfig:fonts.dtd"> +<fontconfig> +<!-- + Ignore variable-font glob (not to break xetex) +--> + <selectfont> + <rejectfont> + <!-- + for Fedora + --> + <glob>/usr/share/fonts/google-noto-*-cjk-vf-fonts</glob> + <!-- + for openSUSE tumbleweed + --> + <glob>/usr/share/fonts/truetype/Noto*CJK*-VF.otf</glob> + </rejectfont> + </selectfont> +</fontconfig> +----------------------------------------------------------------- + + The denylisting is activated for "make pdfdocs". + +* For skipping CJK pages in PDF + - Uninstall texlive-xecjk. + Denylisting is not needed in this case. + +* For printing CJK pages in PDF + - Need non-variable "Noto CJK" fonts. + * Fedora + - google-noto-sans-cjk-fonts + - google-noto-serif-cjk-fonts + * openSUSE tumbleweed + - Non-variable "Noto CJK" fonts are not available as distro packages + as of April, 2024. 
Fetch a set of font files from upstream Noto + CJK Font released at: + https://github.com/notofonts/noto-cjk/tree/main/Sans#super-otc + and at: + https://github.com/notofonts/noto-cjk/tree/main/Serif#super-otc + , then uncompress and deploy them. + - Remember to update fontconfig cache by running fc-cache. + +!!! Caution !!! + Uninstalling "variable font" packages can be dangerous. + They might be depended upon by other packages important for your work. + Denylisting should be less invasive, as it is effective only while + XeLaTeX runs in "make pdfdocs". +""" + +import os +import re +import subprocess +import textwrap +import sys + +class LatexFontChecker: + """ + Detect problems with CJK variable fonts that affect PDF builds for + translations. + """ + + def __init__(self, deny_vf=None): + if not deny_vf: + deny_vf = os.environ.get('FONTS_CONF_DENY_VF', "~/deny-vf") + + self.environ = os.environ.copy() + self.environ['XDG_CONFIG_HOME'] = os.path.expanduser(deny_vf) + + self.re_cjk = re.compile(r"([^:]+):\s*Noto\s+(Sans|Sans Mono|Serif) CJK") + + def description(self): + return __doc__ + + def get_noto_cjk_vf_fonts(self): + """Get Noto CJK fonts""" + + cjk_fonts = set() + cmd = ["fc-list", ":", "file", "family", "variable"] + try: + result = subprocess.run(cmd,stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True, + env=self.environ, + check=True) + + except subprocess.CalledProcessError as exc: + sys.exit(f"Error running fc-list: {repr(exc)}") + + for line in result.stdout.splitlines(): + if 'variable=True' not in line: + continue + + match = self.re_cjk.search(line) + if match: + cjk_fonts.add(match.group(1)) + + return sorted(cjk_fonts) + + def check(self): + """Check for problems with CJK fonts""" + + fonts = textwrap.indent("\n".join(self.get_noto_cjk_vf_fonts()), " ") + if not fonts: + return None + + rel_file = os.path.relpath(__file__, os.getcwd()) + + msg = "=" * 77 + "\n" + msg += 'XeTeX is confused by "variable font" files listed 
below:\n' + msg += fonts + "\n" + msg += textwrap.dedent(f""" + For CJK pages in PDF, they need to be hidden from XeTeX by denylisting. + Or, CJK pages can be skipped by uninstalling texlive-xecjk. + + For more info on denylisting, other options, and variable font, run: + + tools/docs/check-variable-fonts.py -h + """) + msg += "=" * 77 + + return msg diff --git a/tools/docs/lib/parse_data_structs.py b/tools/lib/python/kdoc/parse_data_structs.py index a5aa2e182052..25361996cd20 100755 --- a/tools/docs/lib/parse_data_structs.py +++ b/tools/lib/python/kdoc/parse_data_structs.py @@ -53,11 +53,19 @@ class ParseDataStructs: replace <type> <old_symbol> <new_reference> - Replaces how old_symbol with a new reference. The new_reference can be: + Replaces how old_symbol with a new reference. The new_reference can be: + - A simple symbol name; - A full Sphinx reference. - On both cases, <type> can be: + 3. Namespace rules + + namespace <namespace> + + Sets C namespace to be used during cross-reference generation. Can + be overridden by replace rules. + + On ignore and replace rules, <type> can be: - ioctl: for defines that end with _IO*, e.g. 
ioctl definitions - define: for other defines - symbol: for symbols defined within enums; @@ -71,6 +79,8 @@ class ParseDataStructs: ignore ioctl VIDIOC_ENUM_FMT replace ioctl VIDIOC_DQBUF vidioc_qbuf replace define V4L2_EVENT_MD_FL_HAVE_FRAME_SEQ :c:type:`v4l2_event_motion_det` + + namespace MC """ # Parser regexes with multiple ways to capture enums and structs @@ -140,10 +150,96 @@ class ParseDataStructs: self.symbols = {} + self.namespace = None + self.ignore = [] + self.replace = [] + for symbol_type in self.DEF_SYMBOL_TYPES: self.symbols[symbol_type] = {} - def store_type(self, symbol_type: str, symbol: str, + def read_exceptions(self, fname: str): + if not fname: + return + + name = os.path.basename(fname) + + with open(fname, "r", encoding="utf-8", errors="backslashreplace") as f: + for ln, line in enumerate(f): + ln += 1 + line = line.strip() + if not line or line.startswith("#"): + continue + + # ignore rules + match = re.match(r"^ignore\s+(\w+)\s+(\S+)", line) + + if match: + self.ignore.append((ln, match.group(1), match.group(2))) + continue + + # replace rules + match = re.match(r"^replace\s+(\S+)\s+(\S+)\s+(\S+)", line) + if match: + self.replace.append((ln, match.group(1), match.group(2), + match.group(3))) + continue + + match = re.match(r"^namespace\s+(\S+)", line) + if match: + self.namespace = match.group(1) + continue + + sys.exit(f"{name}:{ln}: invalid line: {line}") + + def apply_exceptions(self): + """ + Process exceptions file with rules to ignore or replace references. 
+ """ + + # Handle ignore rules + for ln, c_type, symbol in self.ignore: + if c_type not in self.DEF_SYMBOL_TYPES: + sys.exit(f"{name}:{ln}: {c_type} is invalid") + + d = self.symbols[c_type] + if symbol in d: + del d[symbol] + + # Handle replace rules + for ln, c_type, old, new in self.replace: + if c_type not in self.DEF_SYMBOL_TYPES: + sys.exit(f"{name}:{ln}: {c_type} is invalid") + + reftype = None + + # Parse reference type when the type is specified + + match = re.match(r"^\:c\:(\w+)\:\`(.+)\`", new) + if match: + reftype = f":c:{match.group(1)}" + new = match.group(2) + else: + match = re.search(r"(\:ref)\:\`(.+)\`", new) + if match: + reftype = match.group(1) + new = match.group(2) + + # If the replacement rule doesn't have a type, get default + if not reftype: + reftype = self.DEF_SYMBOL_TYPES[c_type].get("ref_type") + if not reftype: + reftype = self.DEF_SYMBOL_TYPES[c_type].get("real_type") + + new_ref = f"{reftype}:`{old} <{new}>`" + + # Change self.symbols to use the replacement rule + if old in self.symbols[c_type]: + (_, ln) = self.symbols[c_type][old] + self.symbols[c_type][old] = (new_ref, ln) + else: + print(f"{name}:{ln}: Warning: can't find {old} {c_type}") + + def store_type(self, ln, symbol_type: str, symbol: str, ref_name: str = None, replace_underscores: bool = True): """ Stores a new symbol at self.symbols under symbol_type. 
@@ -157,35 +253,42 @@ class ParseDataStructs: ref_type = defs.get("ref_type") # Determine ref_link based on symbol type - if ref_type: - if symbol_type == "enum": - ref_link = f"{ref_type}:`{symbol}`" - else: - if not ref_name: - ref_name = symbol.lower() + if ref_type or self.namespace: + if not ref_name: + ref_name = symbol.lower() + + # c-type references don't support hash + if ref_type == ":ref" and replace_underscores: + ref_name = ref_name.replace("_", "-") - # c-type references don't support hash - if ref_type == ":ref" and replace_underscores: - ref_name = ref_name.replace("_", "-") + # C domain references may have namespaces + if ref_type.startswith(":c:"): + if self.namespace: + ref_name = f"{self.namespace}.{ref_name}" + if ref_type: ref_link = f"{ref_type}:`{symbol} <{ref_name}>`" + else: + ref_link = f"`{symbol} <{ref_name}>`" else: ref_link = symbol - self.symbols[symbol_type][symbol] = f"{prefix}{ref_link}{suffix}" + self.symbols[symbol_type][symbol] = (f"{prefix}{ref_link}{suffix}", ln) def store_line(self, line): """Stores a line at self.data, properly indented""" line = " " + line.expandtabs() self.data += line.rstrip(" ") - def parse_file(self, file_in: str): + def parse_file(self, file_in: str, exceptions: str = None): """Reads a C source file and get identifiers""" self.data = "" is_enum = False is_comment = False multiline = "" + self.read_exceptions(exceptions) + with open(file_in, "r", encoding="utf-8", errors="backslashreplace") as f: for line_no, line in enumerate(f): @@ -240,20 +343,20 @@ class ParseDataStructs: if is_enum: match = re.match(r"^\s*([_\w][\w\d_]+)\s*[\,=]?", line) if match: - self.store_type("symbol", match.group(1)) + self.store_type(line_no, "symbol", match.group(1)) if "}" in line: is_enum = False continue match = re.match(r"^\s*#\s*define\s+([\w_]+)\s+_IO", line) if match: - self.store_type("ioctl", match.group(1), + self.store_type(line_no, "ioctl", match.group(1), replace_underscores=False) continue match = 
re.match(r"^\s*#\s*define\s+([\w_]+)(\s+|$)", line) if match: - self.store_type("define", match.group(1)) + self.store_type(line_no, "define", match.group(1)) continue match = re.match(r"^\s*typedef\s+([_\w][\w\d_]+)\s+(.*)\s+([_\w][\w\d_]+);", @@ -261,90 +364,23 @@ class ParseDataStructs: if match: name = match.group(2).strip() symbol = match.group(3) - self.store_type("typedef", symbol, ref_name=name) + self.store_type(line_no, "typedef", symbol, ref_name=name) continue for re_enum in self.RE_ENUMS: match = re_enum.match(line) if match: - self.store_type("enum", match.group(1)) + self.store_type(line_no, "enum", match.group(1)) is_enum = True break for re_struct in self.RE_STRUCTS: match = re_struct.match(line) if match: - self.store_type("struct", match.group(1)) + self.store_type(line_no, "struct", match.group(1)) break - def process_exceptions(self, fname: str): - """ - Process exceptions file with rules to ignore or replace references. - """ - if not fname: - return - - name = os.path.basename(fname) - - with open(fname, "r", encoding="utf-8", errors="backslashreplace") as f: - for ln, line in enumerate(f): - ln += 1 - line = line.strip() - if not line or line.startswith("#"): - continue - - # Handle ignore rules - match = re.match(r"^ignore\s+(\w+)\s+(\S+)", line) - if match: - c_type = match.group(1) - symbol = match.group(2) - - if c_type not in self.DEF_SYMBOL_TYPES: - sys.exit(f"{name}:{ln}: {c_type} is invalid") - - d = self.symbols[c_type] - if symbol in d: - del d[symbol] - - continue - - # Handle replace rules - match = re.match(r"^replace\s+(\S+)\s+(\S+)\s+(\S+)", line) - if not match: - sys.exit(f"{name}:{ln}: invalid line: {line}") - - c_type, old, new = match.groups() - - if c_type not in self.DEF_SYMBOL_TYPES: - sys.exit(f"{name}:{ln}: {c_type} is invalid") - - reftype = None - - # Parse reference type when the type is specified - - match = re.match(r"^\:c\:(data|func|macro|type)\:\`(.+)\`", new) - if match: - reftype = f":c:{match.group(1)}" - 
new = match.group(2) - else: - match = re.search(r"(\:ref)\:\`(.+)\`", new) - if match: - reftype = match.group(1) - new = match.group(2) - - # If the replacement rule doesn't have a type, get default - if not reftype: - reftype = self.DEF_SYMBOL_TYPES[c_type].get("ref_type") - if not reftype: - reftype = self.DEF_SYMBOL_TYPES[c_type].get("real_type") - - new_ref = f"{reftype}:`{old} <{new}>`" - - # Change self.symbols to use the replacement rule - if old in self.symbols[c_type]: - self.symbols[c_type][old] = new_ref - else: - print(f"{name}:{ln}: Warning: can't find {old} {c_type}") + self.apply_exceptions() def debug_print(self): """ @@ -360,8 +396,8 @@ class ParseDataStructs: print(f"{c_type}:") - for symbol, ref in sorted(refs.items()): - print(f" {symbol} -> {ref}") + for symbol, (ref, ln) in sorted(refs.items()): + print(f" #{ln:<5d} {symbol} -> {ref}") print() @@ -384,7 +420,7 @@ class ParseDataStructs: # Process all reference types for ref_dict in self.symbols.values(): - for symbol, replacement in ref_dict.items(): + for symbol, (replacement, _) in ref_dict.items(): symbol = re.escape(re.sub(r"([\_\`\*\<\>\&\\\\:\/])", r"\\\1", symbol)) text = re.sub(fr'{start_delim}{symbol}{end_delim}', fr'\1{replacement}\2', text) @@ -397,16 +433,10 @@ class ParseDataStructs: def gen_toc(self): """ - Create a TOC table pointing to each symbol from the header + Create a list of symbols to be part of a TOC contents table """ text = [] - # Add header - text.append(".. 
contents:: Table of Contents") - text.append(" :depth: 2") - text.append(" :local:") - text.append("") - # Sort symbol types per description symbol_descriptions = [] for k, v in self.DEF_SYMBOL_TYPES.items(): @@ -426,8 +456,8 @@ class ParseDataStructs: text.append("") # Sort symbols alphabetically - for symbol, ref in sorted(refs.items()): - text.append(f"* :{ref}:") + for symbol, (ref, ln) in sorted(refs.items()): + text.append(f"- LINENO_{ln}: {ref}") text.append("") # Add empty line between categories diff --git a/tools/lib/python/kdoc/python_version.py b/tools/lib/python/kdoc/python_version.py new file mode 100644 index 000000000000..e83088013db2 --- /dev/null +++ b/tools/lib/python/kdoc/python_version.py @@ -0,0 +1,178 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0-or-later +# Copyright (c) 2017-2025 Mauro Carvalho Chehab <mchehab+huawei@kernel.org> + +""" +Handle Python version check logic. + +Not all Python versions are supported by scripts. Yet, on some cases, +like during documentation build, a newer version of python could be +available. + +This class allows checking if the minimal requirements are followed. + +Better than that, PythonVersion.check_python() not only checks the minimal +requirements, but it automatically switches to a the newest available +Python version if present. + +""" + +import os +import re +import subprocess +import shlex +import sys + +from glob import glob +from textwrap import indent + +class PythonVersion: + """ + Ancillary methods that checks for missing dependencies for different + types of types, like binaries, python modules, rpm deps, etc. 
+ """ + + def __init__(self, version): + """Ïnitialize self.version tuple from a version string""" + self.version = self.parse_version(version) + + @staticmethod + def parse_version(version): + """Convert a major.minor.patch version into a tuple""" + return tuple(int(x) for x in version.split(".")) + + @staticmethod + def ver_str(version): + """Returns a version tuple as major.minor.patch""" + return ".".join([str(x) for x in version]) + + @staticmethod + def cmd_print(cmd, max_len=80): + cmd_line = [] + + for w in cmd: + w = shlex.quote(w) + + if cmd_line: + if not max_len or len(cmd_line[-1]) + len(w) < max_len: + cmd_line[-1] += " " + w + continue + else: + cmd_line[-1] += " \\" + cmd_line.append(w) + else: + cmd_line.append(w) + + return "\n ".join(cmd_line) + + def __str__(self): + """Returns a version tuple as major.minor.patch from self.version""" + return self.ver_str(self.version) + + @staticmethod + def get_python_version(cmd): + """ + Get python version from a Python binary. As we need to detect if + are out there newer python binaries, we can't rely on sys.release here. + """ + + kwargs = {} + if sys.version_info < (3, 7): + kwargs['universal_newlines'] = True + else: + kwargs['text'] = True + + result = subprocess.run([cmd, "--version"], + stdout = subprocess.PIPE, + stderr = subprocess.PIPE, + **kwargs, check=False) + + version = result.stdout.strip() + + match = re.search(r"(\d+\.\d+\.\d+)", version) + if match: + return PythonVersion.parse_version(match.group(1)) + + print(f"Can't parse version {version}") + return (0, 0, 0) + + @staticmethod + def find_python(min_version): + """ + Detect if are out there any python 3.xy version newer than the + current one. + + Note: this routine is limited to up to 2 digits for python3. We + may need to update it one day, hopefully on a distant future. 
+ """ + patterns = [ + "python3.[0-9][0-9]", + "python3.[0-9]", + ] + + python_cmd = [] + + # Seek for a python binary newer than min_version + for path in os.getenv("PATH", "").split(":"): + for pattern in patterns: + for cmd in glob(os.path.join(path, pattern)): + if os.path.isfile(cmd) and os.access(cmd, os.X_OK): + version = PythonVersion.get_python_version(cmd) + if version >= min_version: + python_cmd.append((version, cmd)) + + return sorted(python_cmd, reverse=True) + + @staticmethod + def check_python(min_version, show_alternatives=False, bail_out=False, + success_on_error=False): + """ + Check if the current python binary satisfies our minimal requirement + for Sphinx build. If not, re-run with a newer version if found. + """ + cur_ver = sys.version_info[:3] + if cur_ver >= min_version: + ver = PythonVersion.ver_str(cur_ver) + return + + python_ver = PythonVersion.ver_str(cur_ver) + + available_versions = PythonVersion.find_python(min_version) + if not available_versions: + print(f"ERROR: Python version {python_ver} is not supported anymore\n") + print(" Can't find a new version. This script may fail") + return + + script_path = os.path.abspath(sys.argv[0]) + + # Check possible alternatives + if available_versions: + new_python_cmd = available_versions[0][1] + else: + new_python_cmd = None + + if show_alternatives and available_versions: + print("You could run, instead:") + for _, cmd in available_versions: + args = [cmd, script_path] + sys.argv[1:] + + cmd_str = indent(PythonVersion.cmd_print(args), " ") + print(f"{cmd_str}\n") + + if bail_out: + msg = f"Python {python_ver} not supported. Bailing out" + if success_on_error: + print(msg, file=sys.stderr) + sys.exit(0) + else: + sys.exit(msg) + + print(f"Python {python_ver} not supported. 
Changing to {new_python_cmd}") + + # Restart script using the newer version + args = [new_python_cmd, script_path] + sys.argv[1:] + + try: + os.execv(new_python_cmd, args) + except OSError as e: + sys.exit(f"Failed to restart with {new_python_cmd}: {e}") diff --git a/tools/mm/page_owner_sort.c b/tools/mm/page_owner_sort.c index 880e36df0c11..14c67e9e84c4 100644 --- a/tools/mm/page_owner_sort.c +++ b/tools/mm/page_owner_sort.c @@ -13,6 +13,7 @@ #include <stdio.h> #include <stdlib.h> +#include <stdbool.h> #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> @@ -23,9 +24,6 @@ #include <linux/types.h> #include <getopt.h> -#define bool int -#define true 1 -#define false 0 #define TASK_COMM_LEN 16 struct block_list { @@ -669,14 +667,15 @@ int main(int argc, char **argv) { "pid", required_argument, NULL, 1 }, { "tgid", required_argument, NULL, 2 }, { "name", required_argument, NULL, 3 }, - { "cull", required_argument, NULL, 4 }, - { "sort", required_argument, NULL, 5 }, + { "cull", required_argument, NULL, 4 }, + { "sort", required_argument, NULL, 5 }, + { "help", no_argument, NULL, 'h' }, { 0, 0, 0, 0}, }; compare_flag = COMP_NO_FLAG; - while ((opt = getopt_long(argc, argv, "admnpstP", longopts, NULL)) != -1) + while ((opt = getopt_long(argc, argv, "admnpstPh", longopts, NULL)) != -1) switch (opt) { case 'a': compare_flag |= COMP_ALLOC; @@ -702,6 +701,9 @@ int main(int argc, char **argv) case 'n': compare_flag |= COMP_COMM; break; + case 'h': + usage(); + exit(0); case 1: filter = filter | FILTER_PID; fc.pids = parse_nums_list(optarg, &fc.pids_size); diff --git a/tools/net/sunrpc/xdrgen/generators/__init__.py b/tools/net/sunrpc/xdrgen/generators/__init__.py index b98574a36a4a..e22632cf38fb 100644 --- a/tools/net/sunrpc/xdrgen/generators/__init__.py +++ b/tools/net/sunrpc/xdrgen/generators/__init__.py @@ -2,7 +2,7 @@ """Define a base code generator class""" -import sys +from pathlib import Path from jinja2 import Environment, FileSystemLoader, Template from 
xdr_ast import _XdrAst, Specification, _RpcProgram, _XdrTypeSpecifier @@ -14,8 +14,11 @@ def create_jinja2_environment(language: str, xdr_type: str) -> Environment: """Open a set of templates based on output language""" match language: case "C": + templates_dir = ( + Path(__file__).parent.parent / "templates" / language / xdr_type + ) environment = Environment( - loader=FileSystemLoader(sys.path[0] + "/templates/C/" + xdr_type + "/"), + loader=FileSystemLoader(templates_dir), trim_blocks=True, lstrip_blocks=True, ) @@ -48,9 +51,7 @@ def find_xdr_program_name(root: Specification) -> str: def header_guard_infix(filename: str) -> str: """Extract the header guard infix from the specification filename""" - basename = filename.split("/")[-1] - program = basename.replace(".x", "") - return program.upper() + return Path(filename).stem.upper() def kernel_c_type(spec: _XdrTypeSpecifier) -> str: diff --git a/tools/net/sunrpc/xdrgen/generators/union.py b/tools/net/sunrpc/xdrgen/generators/union.py index 2cca00e279cd..ad1f214ef22a 100644 --- a/tools/net/sunrpc/xdrgen/generators/union.py +++ b/tools/net/sunrpc/xdrgen/generators/union.py @@ -8,7 +8,7 @@ from jinja2 import Environment from generators import SourceGenerator from generators import create_jinja2_environment, get_jinja2_template -from xdr_ast import _XdrBasic, _XdrUnion, _XdrVoid, get_header_name +from xdr_ast import _XdrBasic, _XdrUnion, _XdrVoid, _XdrString, get_header_name from xdr_ast import _XdrDeclaration, _XdrCaseSpec, public_apis, big_endian @@ -40,13 +40,20 @@ def emit_union_case_spec_definition( """Emit a definition for an XDR union's case arm""" if isinstance(node.arm, _XdrVoid): return - assert isinstance(node.arm, _XdrBasic) + if isinstance(node.arm, _XdrString): + type_name = "char *" + classifier = "" + else: + type_name = node.arm.spec.type_name + classifier = node.arm.spec.c_classifier + + assert isinstance(node.arm, (_XdrBasic, _XdrString)) template = get_jinja2_template(environment, "definition", 
"case_spec") print( template.render( name=node.arm.name, - type=node.arm.spec.type_name, - classifier=node.arm.spec.c_classifier, + type=type_name, + classifier=classifier, ) ) @@ -84,6 +91,12 @@ def emit_union_case_spec_decoder( if isinstance(node.arm, _XdrVoid): return + if isinstance(node.arm, _XdrString): + type_name = "char *" + classifier = "" + else: + type_name = node.arm.spec.type_name + classifier = node.arm.spec.c_classifier if big_endian_discriminant: template = get_jinja2_template(environment, "decoder", "case_spec_be") @@ -92,13 +105,13 @@ def emit_union_case_spec_decoder( for case in node.values: print(template.render(case=case)) - assert isinstance(node.arm, _XdrBasic) + assert isinstance(node.arm, (_XdrBasic, _XdrString)) template = get_jinja2_template(environment, "decoder", node.arm.template) print( template.render( name=node.arm.name, - type=node.arm.spec.type_name, - classifier=node.arm.spec.c_classifier, + type=type_name, + classifier=classifier, ) ) @@ -169,7 +182,10 @@ def emit_union_case_spec_encoder( if isinstance(node.arm, _XdrVoid): return - + if isinstance(node.arm, _XdrString): + type_name = "char *" + else: + type_name = node.arm.spec.type_name if big_endian_discriminant: template = get_jinja2_template(environment, "encoder", "case_spec_be") else: @@ -181,7 +197,7 @@ def emit_union_case_spec_encoder( print( template.render( name=node.arm.name, - type=node.arm.spec.type_name, + type=type_name, ) ) diff --git a/tools/net/sunrpc/xdrgen/templates/C/pointer/decoder/close.j2 b/tools/net/sunrpc/xdrgen/templates/C/pointer/decoder/close.j2 index 5bf010665f84..3dbd724d7f17 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/pointer/decoder/close.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/pointer/decoder/close.j2 @@ -1,3 +1,3 @@ {# SPDX-License-Identifier: GPL-2.0 #} return true; -}; +} diff --git a/tools/net/sunrpc/xdrgen/templates/C/pointer/encoder/close.j2 b/tools/net/sunrpc/xdrgen/templates/C/pointer/encoder/close.j2 index 
5bf010665f84..3dbd724d7f17 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/pointer/encoder/close.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/pointer/encoder/close.j2 @@ -1,3 +1,3 @@ {# SPDX-License-Identifier: GPL-2.0 #} return true; -}; +} diff --git a/tools/net/sunrpc/xdrgen/templates/C/struct/decoder/close.j2 b/tools/net/sunrpc/xdrgen/templates/C/struct/decoder/close.j2 index 5bf010665f84..3dbd724d7f17 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/struct/decoder/close.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/struct/decoder/close.j2 @@ -1,3 +1,3 @@ {# SPDX-License-Identifier: GPL-2.0 #} return true; -}; +} diff --git a/tools/net/sunrpc/xdrgen/templates/C/struct/decoder/variable_length_opaque.j2 b/tools/net/sunrpc/xdrgen/templates/C/struct/decoder/variable_length_opaque.j2 index 9a814de54ae8..65698e20d8cd 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/struct/decoder/variable_length_opaque.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/struct/decoder/variable_length_opaque.j2 @@ -2,5 +2,5 @@ {% if annotate %} /* member {{ name }} (variable-length opaque) */ {% endif %} - if (!xdrgen_decode_opaque(xdr, (opaque *)ptr, {{ maxsize }})) + if (!xdrgen_decode_opaque(xdr, &ptr->{{ name }}, {{ maxsize }})) return false; diff --git a/tools/net/sunrpc/xdrgen/templates/C/struct/encoder/close.j2 b/tools/net/sunrpc/xdrgen/templates/C/struct/encoder/close.j2 index 5bf010665f84..3dbd724d7f17 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/struct/encoder/close.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/struct/encoder/close.j2 @@ -1,3 +1,3 @@ {# SPDX-License-Identifier: GPL-2.0 #} return true; -}; +} diff --git a/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/basic.j2 b/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/basic.j2 index da4709403dc9..b215e157dfa7 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/basic.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/basic.j2 @@ -14,4 +14,4 @@ xdrgen_decode_{{ name }}(struct xdr_stream *xdr, 
{{ name }} *ptr) /* (basic) */ {% endif %} return xdrgen_decode_{{ type }}(xdr, ptr); -}; +} diff --git a/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/fixed_length_array.j2 b/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/fixed_length_array.j2 index d7c80e472fe3..c8953719e626 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/fixed_length_array.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/fixed_length_array.j2 @@ -22,4 +22,4 @@ xdrgen_decode_{{ name }}(struct xdr_stream *xdr, {{ classifier }}{{ name }} *ptr return false; } return true; -}; +} diff --git a/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/fixed_length_opaque.j2 b/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/fixed_length_opaque.j2 index bdc7bd24ffb1..c854fc8c74e3 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/fixed_length_opaque.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/fixed_length_opaque.j2 @@ -14,4 +14,4 @@ xdrgen_decode_{{ name }}(struct xdr_stream *xdr, {{ classifier }}{{ name }} *ptr /* (fixed-length opaque) */ {% endif %} return xdr_stream_decode_opaque_fixed(xdr, ptr, {{ size }}) == 0; -}; +} diff --git a/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/string.j2 b/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/string.j2 index 56c5a17d6a70..bcbc1758aae9 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/string.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/string.j2 @@ -14,4 +14,4 @@ xdrgen_decode_{{ name }}(struct xdr_stream *xdr, {{ classifier }}{{ name }} *ptr /* (variable-length string) */ {% endif %} return xdrgen_decode_string(xdr, ptr, {{ maxsize }}); -}; +} diff --git a/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/variable_length_array.j2 b/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/variable_length_array.j2 index e74ffdd98463..a59cc1f38eed 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/variable_length_array.j2 +++ 
b/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/variable_length_array.j2 @@ -23,4 +23,4 @@ xdrgen_decode_{{ name }}(struct xdr_stream *xdr, {{ classifier }}{{ name }} *ptr if (!xdrgen_decode_{{ type }}(xdr, &ptr->element[i])) return false; return true; -}; +} diff --git a/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/variable_length_opaque.j2 b/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/variable_length_opaque.j2 index f28f8b228ad5..eb05f53e1041 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/variable_length_opaque.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/variable_length_opaque.j2 @@ -14,4 +14,4 @@ xdrgen_decode_{{ name }}(struct xdr_stream *xdr, {{ classifier }}{{ name }} *ptr /* (variable-length opaque) */ {% endif %} return xdrgen_decode_opaque(xdr, ptr, {{ maxsize }}); -}; +} diff --git a/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/basic.j2 b/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/basic.j2 index 35effe67e4ef..0d21dd0b723a 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/basic.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/basic.j2 @@ -18,4 +18,4 @@ xdrgen_encode_{{ name }}(struct xdr_stream *xdr, const {{ classifier }}{{ name } /* (basic) */ {% endif %} return xdrgen_encode_{{ type }}(xdr, value); -}; +} diff --git a/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/fixed_length_array.j2 b/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/fixed_length_array.j2 index 95202ad5ad2d..ec8cd6509514 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/fixed_length_array.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/fixed_length_array.j2 @@ -22,4 +22,4 @@ xdrgen_encode_{{ name }}(struct xdr_stream *xdr, const {{ classifier }}{{ name } return false; } return true; -}; +} diff --git a/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/fixed_length_opaque.j2 b/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/fixed_length_opaque.j2 
index 9c66a11b9912..b53fa87e1858 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/fixed_length_opaque.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/fixed_length_opaque.j2 @@ -14,4 +14,4 @@ xdrgen_encode_{{ name }}(struct xdr_stream *xdr, const {{ classifier }}{{ name } /* (fixed-length opaque) */ {% endif %} return xdr_stream_encode_opaque_fixed(xdr, value, {{ size }}) >= 0; -}; +} diff --git a/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/string.j2 b/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/string.j2 index 3d490ff180d0..28b81f1d0bd6 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/string.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/string.j2 @@ -14,4 +14,4 @@ xdrgen_encode_{{ name }}(struct xdr_stream *xdr, const {{ classifier }}{{ name } /* (variable-length string) */ {% endif %} return xdr_stream_encode_opaque(xdr, value.data, value.len) >= 0; -}; +} diff --git a/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/variable_length_array.j2 b/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/variable_length_array.j2 index 2d2384f64918..ff093c281d51 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/variable_length_array.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/variable_length_array.j2 @@ -27,4 +27,4 @@ xdrgen_encode_{{ name }}(struct xdr_stream *xdr, const {{ classifier }}{{ name } {% endif %} return false; return true; -}; +} diff --git a/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/variable_length_opaque.j2 b/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/variable_length_opaque.j2 index 8508f13c95b9..2e89592fa702 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/variable_length_opaque.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/typedef/encoder/variable_length_opaque.j2 @@ -14,4 +14,4 @@ xdrgen_encode_{{ name }}(struct xdr_stream *xdr, const {{ classifier }}{{ name } /* (variable-length opaque) */ {% endif %} return 
xdr_stream_encode_opaque(xdr, value.data, value.len) >= 0; -}; +} diff --git a/tools/net/sunrpc/xdrgen/templates/C/union/declaration/close.j2 b/tools/net/sunrpc/xdrgen/templates/C/union/declaration/close.j2 new file mode 100644 index 000000000000..816291184e8c --- /dev/null +++ b/tools/net/sunrpc/xdrgen/templates/C/union/declaration/close.j2 @@ -0,0 +1,4 @@ +{# SPDX-License-Identifier: GPL-2.0 #} + +bool xdrgen_decode_{{ name }}(struct xdr_stream *xdr, struct {{ name }} *ptr); +bool xdrgen_encode_{{ name }}(struct xdr_stream *xdr, const struct {{ name }} *value); diff --git a/tools/net/sunrpc/xdrgen/templates/C/union/decoder/close.j2 b/tools/net/sunrpc/xdrgen/templates/C/union/decoder/close.j2 index fdc2dfd1843b..39d8d6c5094d 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/union/decoder/close.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/union/decoder/close.j2 @@ -1,4 +1,4 @@ {# SPDX-License-Identifier: GPL-2.0 #} } return true; -}; +} diff --git a/tools/net/sunrpc/xdrgen/templates/C/union/encoder/close.j2 b/tools/net/sunrpc/xdrgen/templates/C/union/encoder/close.j2 index fdc2dfd1843b..39d8d6c5094d 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/union/encoder/close.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/union/encoder/close.j2 @@ -1,4 +1,4 @@ {# SPDX-License-Identifier: GPL-2.0 #} } return true; -}; +} diff --git a/tools/net/sunrpc/xdrgen/templates/C/union/encoder/string.j2 b/tools/net/sunrpc/xdrgen/templates/C/union/encoder/string.j2 new file mode 100644 index 000000000000..2f035a64f1f4 --- /dev/null +++ b/tools/net/sunrpc/xdrgen/templates/C/union/encoder/string.j2 @@ -0,0 +1,6 @@ +{# SPDX-License-Identifier: GPL-2.0 #} +{% if annotate %} + /* member {{ name }} (variable-length string) */ +{% endif %} + if (!xdrgen_encode_string(xdr, ptr->u.{{ name }}, {{ maxsize }})) + return false; diff --git a/tools/net/sunrpc/xdrgen/xdrgen b/tools/net/sunrpc/xdrgen/xdrgen index 43762be39252..3afd0547d67c 100755 --- a/tools/net/sunrpc/xdrgen/xdrgen +++ 
b/tools/net/sunrpc/xdrgen/xdrgen @@ -10,8 +10,13 @@ __license__ = "GPL-2.0 only" __version__ = "0.2" import sys +from pathlib import Path import argparse +_XDRGEN_DIR = Path(__file__).resolve().parent +if str(_XDRGEN_DIR) not in sys.path: + sys.path.insert(0, str(_XDRGEN_DIR)) + from subcmds import definitions from subcmds import declarations from subcmds import lint diff --git a/tools/net/ynl/Makefile b/tools/net/ynl/Makefile index 211df5a93ad9..7736b492f559 100644 --- a/tools/net/ynl/Makefile +++ b/tools/net/ynl/Makefile @@ -12,10 +12,13 @@ endif libdir ?= $(prefix)/$(libdir_relative) includedir ?= $(prefix)/include -SUBDIRS = lib generated samples +SPECDIR=../../../Documentation/netlink/specs + +SUBDIRS = lib generated samples ynltool tests all: $(SUBDIRS) libynl.a +ynltool: | lib generated libynl.a samples: | lib generated libynl.a: | lib generated @echo -e "\tAR $@" @@ -48,5 +51,27 @@ install: libynl.a lib/*.h @echo -e "\tINSTALL pyynl" @pip install --prefix=$(DESTDIR)$(prefix) . @make -C generated install + @make -C tests install + +run_tests: + @$(MAKE) -C tests run_tests + +lint: + yamllint $(SPECDIR) + +schema_check: + @N=1; \ + for spec in $(SPECDIR)/*.yaml ; do \ + NAME=$$(basename $$spec) ; \ + OUTPUT=$$(./pyynl/cli.py --spec $$spec --validate) ; \ + if [ $$? 
-eq 0 ] ; then \ + echo "ok $$N $$NAME schema validation" ; \ + else \ + echo "not ok $$N $$NAME schema validation" ; \ + echo "$$OUTPUT" ; \ + echo ; \ + fi ; \ + N=$$((N+1)) ; \ + done -.PHONY: all clean distclean install $(SUBDIRS) +.PHONY: all clean distclean install run_tests lint schema_check $(SUBDIRS) diff --git a/tools/net/ynl/pyynl/cli.py b/tools/net/ynl/pyynl/cli.py index 8c192e900bd3..af02a5b7e5a2 100755 --- a/tools/net/ynl/pyynl/cli.py +++ b/tools/net/ynl/pyynl/cli.py @@ -7,9 +7,10 @@ import os import pathlib import pprint import sys +import textwrap sys.path.append(pathlib.Path(__file__).resolve().parent.as_posix()) -from lib import YnlFamily, Netlink, NlError +from lib import YnlFamily, Netlink, NlError, SpecFamily sys_schema_dir='/usr/share/ynl' relative_schema_dir='../../../../Documentation/netlink' @@ -39,6 +40,60 @@ class YnlEncoder(json.JSONEncoder): return json.JSONEncoder.default(self, obj) +def print_attr_list(ynl, attr_names, attr_set, indent=2): + """Print a list of attributes with their types and documentation.""" + prefix = ' ' * indent + for attr_name in attr_names: + if attr_name in attr_set.attrs: + attr = attr_set.attrs[attr_name] + attr_info = f'{prefix}- {attr_name}: {attr.type}' + if 'enum' in attr.yaml: + enum_name = attr.yaml['enum'] + attr_info += f" (enum: {enum_name})" + # Print enum values if available + if enum_name in ynl.consts: + const = ynl.consts[enum_name] + enum_values = list(const.entries.keys()) + attr_info += f"\n{prefix} {const.type.capitalize()}: {', '.join(enum_values)}" + + # Show nested attributes reference and recursively display them + nested_set_name = None + if attr.type == 'nest' and 'nested-attributes' in attr.yaml: + nested_set_name = attr.yaml['nested-attributes'] + attr_info += f" -> {nested_set_name}" + + if attr.yaml.get('doc'): + doc_text = textwrap.indent(attr.yaml['doc'], prefix + ' ') + attr_info += f"\n{doc_text}" + print(attr_info) + + # Recursively show nested attributes + if nested_set_name 
in ynl.attr_sets: + nested_set = ynl.attr_sets[nested_set_name] + # Filter out 'unspec' and other unused attrs + nested_names = [n for n in nested_set.attrs.keys() + if nested_set.attrs[n].type != 'unused'] + if nested_names: + print_attr_list(ynl, nested_names, nested_set, indent + 4) + + +def print_mode_attrs(ynl, mode, mode_spec, attr_set, print_request=True): + """Print a given mode (do/dump/event/notify).""" + mode_title = mode.capitalize() + + if print_request and 'request' in mode_spec and 'attributes' in mode_spec['request']: + print(f'\n{mode_title} request attributes:') + print_attr_list(ynl, mode_spec['request']['attributes'], attr_set) + + if 'reply' in mode_spec and 'attributes' in mode_spec['reply']: + print(f'\n{mode_title} reply attributes:') + print_attr_list(ynl, mode_spec['reply']['attributes'], attr_set) + + if 'attributes' in mode_spec: + print(f'\n{mode_title} attributes:') + print_attr_list(ynl, mode_spec['attributes'], attr_set) + + def main(): description = """ YNL CLI utility - a general purpose netlink utility that uses YAML @@ -70,6 +125,9 @@ def main(): group.add_argument('--dump', dest='dump', metavar='DUMP-OPERATION', type=str) group.add_argument('--list-ops', action='store_true') group.add_argument('--list-msgs', action='store_true') + group.add_argument('--list-attrs', dest='list_attrs', metavar='OPERATION', type=str, + help='List attributes for an operation') + group.add_argument('--validate', action='store_true') parser.add_argument('--duration', dest='duration', type=int, help='when subscribed, watch for DURATION seconds') @@ -111,15 +169,25 @@ def main(): if args.family: spec = f"{spec_dir()}/{args.family}.yaml" - if args.schema is None and spec.startswith(sys_schema_dir): - args.schema = '' # disable schema validation when installed - if args.process_unknown is None: - args.process_unknown = True else: spec = args.spec if not os.path.isfile(spec): raise Exception(f"Spec file {spec} does not exist") + if args.validate: + try: + 
SpecFamily(spec, args.schema) + except Exception as error: + print(error) + exit(1) + return + + if args.family: # set behaviour when using installed specs + if args.schema is None and spec.startswith(sys_schema_dir): + args.schema = '' # disable schema validation when installed + if args.process_unknown is None: + args.process_unknown = True + ynl = YnlFamily(spec, args.schema, args.process_unknown, recv_size=args.dbg_small_recv) if args.dbg_small_recv: @@ -135,6 +203,28 @@ def main(): for op_name, op in ynl.msgs.items(): print(op_name, " [", ", ".join(op.modes), "]") + if args.list_attrs: + op = ynl.msgs.get(args.list_attrs) + if not op: + print(f'Operation {args.list_attrs} not found') + exit(1) + + print(f'Operation: {op.name}') + print(op.yaml['doc']) + + for mode in ['do', 'dump', 'event']: + if mode in op.yaml: + print_mode_attrs(ynl, mode, op.yaml[mode], op.attr_set, True) + + if 'notify' in op.yaml: + mode_spec = op.yaml['notify'] + ref_spec = ynl.msgs.get(mode_spec).yaml.get('do') + if ref_spec: + print_mode_attrs(ynl, 'notify', ref_spec, op.attr_set, False) + + if 'mcgrp' in op.yaml: + print(f"\nMulticast group: {op.yaml['mcgrp']}") + try: if args.do: reply = ynl.do(args.do, attrs, args.flags) diff --git a/tools/net/ynl/pyynl/lib/ynl.py b/tools/net/ynl/pyynl/lib/ynl.py index 62383c70ebb9..36d36eb7e3b8 100644 --- a/tools/net/ynl/pyynl/lib/ynl.py +++ b/tools/net/ynl/pyynl/lib/ynl.py @@ -100,12 +100,21 @@ class Netlink: 'bitfield32', 'sint', 'uint']) class NlError(Exception): - def __init__(self, nl_msg): - self.nl_msg = nl_msg - self.error = -nl_msg.error - - def __str__(self): - return f"Netlink error: {os.strerror(self.error)}\n{self.nl_msg}" + def __init__(self, nl_msg): + self.nl_msg = nl_msg + self.error = -nl_msg.error + + def __str__(self): + msg = "Netlink error: " + + extack = self.nl_msg.extack.copy() if self.nl_msg.extack else {} + if 'msg' in extack: + msg += extack['msg'] + ': ' + del extack['msg'] + msg += os.strerror(self.error) + if extack: 
+ msg += ' ' + str(extack) + return msg class ConfigError(Exception): @@ -976,6 +985,15 @@ class YnlFamily(SpecFamily): raw = bytes.fromhex(string) else: raw = int(string, 16) + elif attr_spec.display_hint == 'mac': + # Parse MAC address in format "00:11:22:33:44:55" or "001122334455" + if ':' in string: + mac_bytes = [int(x, 16) for x in string.split(':')] + else: + if len(string) % 2 != 0: + raise Exception(f"Invalid MAC address format: {string}") + mac_bytes = [int(string[i:i+2], 16) for i in range(0, len(string), 2)] + raw = bytes(mac_bytes) else: raise Exception(f"Display hint '{attr_spec.display_hint}' not implemented" f" when parsing '{attr_spec['name']}'") @@ -1039,15 +1057,15 @@ class YnlFamily(SpecFamily): self.check_ntf() def operation_do_attributes(self, name): - """ - For a given operation name, find and return a supported - set of attributes (as a dict). - """ - op = self.find_operation(name) - if not op: - return None - - return op['do']['request']['attributes'].copy() + """ + For a given operation name, find and return a supported + set of attributes (as a dict). 
+ """ + op = self.find_operation(name) + if not op: + return None + + return op['do']['request']['attributes'].copy() def _encode_message(self, op, vals, flags, req_seq): nl_flags = Netlink.NLM_F_REQUEST | Netlink.NLM_F_ACK diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index aadeb3abcad8..b517d0c605ad 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -1205,7 +1205,7 @@ class SubMessage(SpecSubMessage): class Family(SpecFamily): - def __init__(self, file_name, exclude_ops): + def __init__(self, file_name, exclude_ops, fn_prefix): # Added by resolve: self.c_name = None delattr(self, "c_name") @@ -1237,6 +1237,8 @@ class Family(SpecFamily): else: self.uapi_header_name = self.ident_name + self.fn_prefix = fn_prefix if fn_prefix else f'{self.ident_name}-nl' + def resolve(self): self.resolve_up(super()) @@ -2911,12 +2913,12 @@ def print_kernel_op_table_fwd(family, cw, terminate): continue if 'do' in op: - name = c_lower(f"{family.ident_name}-nl-{op_name}-doit") + name = c_lower(f"{family.fn_prefix}-{op_name}-doit") cw.write_func_prot('int', name, ['struct sk_buff *skb', 'struct genl_info *info'], suffix=';') if 'dump' in op: - name = c_lower(f"{family.ident_name}-nl-{op_name}-dumpit") + name = c_lower(f"{family.fn_prefix}-{op_name}-dumpit") cw.write_func_prot('int', name, ['struct sk_buff *skb', 'struct netlink_callback *cb'], suffix=';') cw.nl() @@ -2942,7 +2944,7 @@ def print_kernel_op_table(family, cw): for x in op['dont-validate']])), ) for op_mode in ['do', 'dump']: if op_mode in op: - name = c_lower(f"{family.ident_name}-nl-{op_name}-{op_mode}it") + name = c_lower(f"{family.fn_prefix}-{op_name}-{op_mode}it") members.append((op_mode + 'it', name)) if family.kernel_policy == 'per-op': struct = Struct(family, op['attribute-set'], @@ -2980,7 +2982,7 @@ def print_kernel_op_table(family, cw): members.append(('validate', ' | '.join([c_upper('genl-dont-validate-' + x) for x in dont_validate])), ) - 
name = c_lower(f"{family.ident_name}-nl-{op_name}-{op_mode}it") + name = c_lower(f"{family.fn_prefix}-{op_name}-{op_mode}it") if 'pre' in op[op_mode]: members.append((cb_names[op_mode]['pre'], c_lower(op[op_mode]['pre']))) members.append((op_mode + 'it', name)) @@ -3402,6 +3404,7 @@ def main(): help='Do not overwrite the output file if the new output is identical to the old') parser.add_argument('--exclude-op', action='append', default=[]) parser.add_argument('-o', dest='out_file', type=str, default=None) + parser.add_argument('--function-prefix', dest='fn_prefix', type=str) args = parser.parse_args() if args.header is None: @@ -3410,7 +3413,7 @@ def main(): exclude_ops = [re.compile(expr) for expr in args.exclude_op] try: - parsed = Family(args.spec, exclude_ops) + parsed = Family(args.spec, exclude_ops, args.fn_prefix) if parsed.license != '((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)': print('Spec license:', parsed.license) print('License must be: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)') @@ -3430,11 +3433,16 @@ def main(): cw.p("/* Do not edit directly, auto-generated from: */") cw.p(f"/*\t{spec_kernel} */") cw.p(f"/* YNL-GEN {args.mode} {'header' if args.header else 'source'} */") - if args.exclude_op or args.user_header: + if args.exclude_op or args.user_header or args.fn_prefix: line = '' - line += ' --user-header '.join([''] + args.user_header) - line += ' --exclude-op '.join([''] + args.exclude_op) + if args.user_header: + line += ' --user-header '.join([''] + args.user_header) + if args.exclude_op: + line += ' --exclude-op '.join([''] + args.exclude_op) + if args.fn_prefix: + line += f' --function-prefix {args.fn_prefix}' cw.p(f'/* YNL-ARG{line} */') + cw.p('/* To regenerate run: tools/net/ynl/ynl-regen.sh */') cw.nl() if args.mode == 'uapi': diff --git a/tools/net/ynl/samples/.gitignore b/tools/net/ynl/samples/.gitignore index 7f5fca7682d7..05087ee323ba 100644 --- a/tools/net/ynl/samples/.gitignore +++ 
b/tools/net/ynl/samples/.gitignore @@ -7,3 +7,4 @@ rt-addr rt-link rt-route tc +tc-filter-add diff --git a/tools/net/ynl/samples/Makefile b/tools/net/ynl/samples/Makefile index c9494a564da4..d76cbd41cbb1 100644 --- a/tools/net/ynl/samples/Makefile +++ b/tools/net/ynl/samples/Makefile @@ -19,6 +19,7 @@ include $(wildcard *.d) all: $(BINS) CFLAGS_page-pool=$(CFLAGS_netdev) +CFLAGS_tc-filter-add:=$(CFLAGS_tc) $(BINS): ../lib/ynl.a ../generated/protos.a $(SRCS) @echo -e '\tCC sample $@' diff --git a/tools/net/ynl/samples/page-pool.c b/tools/net/ynl/samples/page-pool.c deleted file mode 100644 index e5d521320fbf..000000000000 --- a/tools/net/ynl/samples/page-pool.c +++ /dev/null @@ -1,149 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#define _GNU_SOURCE - -#include <stdio.h> -#include <string.h> - -#include <ynl.h> - -#include <net/if.h> - -#include "netdev-user.h" - -struct stat { - unsigned int ifc; - - struct { - unsigned int cnt; - size_t refs, bytes; - } live[2]; - - size_t alloc_slow, alloc_fast, recycle_ring, recycle_cache; -}; - -struct stats_array { - unsigned int i, max; - struct stat *s; -}; - -static struct stat *find_ifc(struct stats_array *a, unsigned int ifindex) -{ - unsigned int i; - - for (i = 0; i < a->i; i++) { - if (a->s[i].ifc == ifindex) - return &a->s[i]; - } - - a->i++; - if (a->i == a->max) { - a->max *= 2; - a->s = reallocarray(a->s, a->max, sizeof(*a->s)); - } - a->s[i].ifc = ifindex; - return &a->s[i]; -} - -static void count(struct stat *s, unsigned int l, - struct netdev_page_pool_get_rsp *pp) -{ - s->live[l].cnt++; - if (pp->_present.inflight) - s->live[l].refs += pp->inflight; - if (pp->_present.inflight_mem) - s->live[l].bytes += pp->inflight_mem; -} - -int main(int argc, char **argv) -{ - struct netdev_page_pool_stats_get_list *pp_stats; - struct netdev_page_pool_get_list *pools; - struct stats_array a = {}; - struct ynl_error yerr; - struct ynl_sock *ys; - - ys = ynl_sock_create(&ynl_netdev_family, &yerr); - if (!ys) { - 
fprintf(stderr, "YNL: %s\n", yerr.msg); - return 1; - } - - a.max = 128; - a.s = calloc(a.max, sizeof(*a.s)); - if (!a.s) - goto err_close; - - pools = netdev_page_pool_get_dump(ys); - if (!pools) - goto err_free; - - ynl_dump_foreach(pools, pp) { - struct stat *s = find_ifc(&a, pp->ifindex); - - count(s, 1, pp); - if (pp->_present.detach_time) - count(s, 0, pp); - } - netdev_page_pool_get_list_free(pools); - - pp_stats = netdev_page_pool_stats_get_dump(ys); - if (!pp_stats) - goto err_free; - - ynl_dump_foreach(pp_stats, pp) { - struct stat *s = find_ifc(&a, pp->info.ifindex); - - if (pp->_present.alloc_fast) - s->alloc_fast += pp->alloc_fast; - if (pp->_present.alloc_refill) - s->alloc_fast += pp->alloc_refill; - if (pp->_present.alloc_slow) - s->alloc_slow += pp->alloc_slow; - if (pp->_present.recycle_ring) - s->recycle_ring += pp->recycle_ring; - if (pp->_present.recycle_cached) - s->recycle_cache += pp->recycle_cached; - } - netdev_page_pool_stats_get_list_free(pp_stats); - - for (unsigned int i = 0; i < a.i; i++) { - char ifname[IF_NAMESIZE]; - struct stat *s = &a.s[i]; - const char *name; - double recycle; - - if (!s->ifc) { - name = "<orphan>\t"; - } else { - name = if_indextoname(s->ifc, ifname); - if (name) - printf("%8s", name); - printf("[%u]\t", s->ifc); - } - - printf("page pools: %u (zombies: %u)\n", - s->live[1].cnt, s->live[0].cnt); - printf("\t\trefs: %zu bytes: %zu (refs: %zu bytes: %zu)\n", - s->live[1].refs, s->live[1].bytes, - s->live[0].refs, s->live[0].bytes); - - /* We don't know how many pages are sitting in cache and ring - * so we will under-count the recycling rate a bit. 
- */ - recycle = (double)(s->recycle_ring + s->recycle_cache) / - (s->alloc_fast + s->alloc_slow) * 100; - printf("\t\trecycling: %.1lf%% (alloc: %zu:%zu recycle: %zu:%zu)\n", - recycle, s->alloc_slow, s->alloc_fast, - s->recycle_ring, s->recycle_cache); - } - - ynl_sock_destroy(ys); - return 0; - -err_free: - free(a.s); -err_close: - fprintf(stderr, "YNL: %s\n", ys->err.msg); - ynl_sock_destroy(ys); - return 2; -} diff --git a/tools/net/ynl/samples/tc-filter-add.c b/tools/net/ynl/samples/tc-filter-add.c new file mode 100644 index 000000000000..97871e9e9edc --- /dev/null +++ b/tools/net/ynl/samples/tc-filter-add.c @@ -0,0 +1,335 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <arpa/inet.h> +#include <linux/pkt_sched.h> +#include <linux/tc_act/tc_vlan.h> +#include <linux/tc_act/tc_gact.h> +#include <linux/if_ether.h> +#include <net/if.h> + +#include <ynl.h> + +#include "tc-user.h" + +#define TC_HANDLE (0xFFFF << 16) + +const char *vlan_act_name(struct tc_vlan *p) +{ + switch (p->v_action) { + case TCA_VLAN_ACT_POP: + return "pop"; + case TCA_VLAN_ACT_PUSH: + return "push"; + case TCA_VLAN_ACT_MODIFY: + return "modify"; + default: + break; + } + + return "not supported"; +} + +const char *gact_act_name(struct tc_gact *p) +{ + switch (p->action) { + case TC_ACT_SHOT: + return "drop"; + case TC_ACT_OK: + return "ok"; + case TC_ACT_PIPE: + return "pipe"; + default: + break; + } + + return "not supported"; +} + +static void print_vlan(struct tc_act_vlan_attrs *vlan) +{ + printf("%s ", vlan_act_name(vlan->parms)); + if (vlan->_present.push_vlan_id) + printf("id %u ", vlan->push_vlan_id); + if (vlan->_present.push_vlan_protocol) + printf("protocol %#x ", ntohs(vlan->push_vlan_protocol)); + if (vlan->_present.push_vlan_priority) + printf("priority %u ", vlan->push_vlan_priority); +} + +static void print_gact(struct tc_act_gact_attrs *gact) +{ + struct tc_gact *p = gact->parms; + + printf("%s ", 
gact_act_name(p)); +} + +static void flower_print(struct tc_flower_attrs *flower, const char *kind) +{ + struct tc_act_attrs *a; + unsigned int i; + + printf("%s:\n", kind); + + if (flower->_present.key_vlan_id) + printf(" vlan_id: %u\n", flower->key_vlan_id); + if (flower->_present.key_vlan_prio) + printf(" vlan_prio: %u\n", flower->key_vlan_prio); + if (flower->_present.key_num_of_vlans) + printf(" num_of_vlans: %u\n", flower->key_num_of_vlans); + + for (i = 0; i < flower->_count.act; i++) { + a = &flower->act[i]; + printf("action order: %i %s ", i + 1, a->kind); + if (a->options._present.vlan) + print_vlan(&a->options.vlan); + else if (a->options._present.gact) + print_gact(&a->options.gact); + printf("\n"); + } + printf("\n"); +} + +static void tc_filter_print(struct tc_gettfilter_rsp *f) +{ + struct tc_options_msg *opt = &f->options; + + if (opt->_present.flower) + flower_print(&opt->flower, f->kind); + else if (f->_len.kind) + printf("%s pref %u proto: %#x\n", f->kind, + (f->_hdr.tcm_info >> 16), + ntohs(TC_H_MIN(f->_hdr.tcm_info))); +} + +static int tc_filter_add(struct ynl_sock *ys, int ifi) +{ + struct tc_newtfilter_req *req; + struct tc_act_attrs *acts; + struct tc_vlan p = { + .action = TC_ACT_PIPE, + .v_action = TCA_VLAN_ACT_PUSH + }; + __u16 flags = NLM_F_REQUEST | NLM_F_EXCL | NLM_F_CREATE; + int ret; + + req = tc_newtfilter_req_alloc(); + if (!req) { + fprintf(stderr, "tc_newtfilter_req_alloc failed\n"); + return -1; + } + memset(req, 0, sizeof(*req)); + + acts = tc_act_attrs_alloc(3); + if (!acts) { + fprintf(stderr, "tc_act_attrs_alloc\n"); + tc_newtfilter_req_free(req); + return -1; + } + memset(acts, 0, sizeof(*acts) * 3); + + req->_hdr.tcm_ifindex = ifi; + req->_hdr.tcm_parent = TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS); + req->_hdr.tcm_info = TC_H_MAKE(1 << 16, htons(ETH_P_8021Q)); + req->chain = 0; + + tc_newtfilter_req_set_nlflags(req, flags); + tc_newtfilter_req_set_kind(req, "flower"); + tc_newtfilter_req_set_options_flower_key_vlan_id(req, 
100); + tc_newtfilter_req_set_options_flower_key_vlan_prio(req, 5); + tc_newtfilter_req_set_options_flower_key_num_of_vlans(req, 3); + + __tc_newtfilter_req_set_options_flower_act(req, acts, 3); + + /* Skip action at index 0 because in TC, the action array + * index starts at 1, with each index defining the action's + * order. In contrast, in YNL indexed arrays start at index 0. + */ + tc_act_attrs_set_kind(&acts[1], "vlan"); + tc_act_attrs_set_options_vlan_parms(&acts[1], &p, sizeof(p)); + tc_act_attrs_set_options_vlan_push_vlan_id(&acts[1], 200); + tc_act_attrs_set_kind(&acts[2], "vlan"); + tc_act_attrs_set_options_vlan_parms(&acts[2], &p, sizeof(p)); + tc_act_attrs_set_options_vlan_push_vlan_id(&acts[2], 300); + + tc_newtfilter_req_set_options_flower_flags(req, 0); + tc_newtfilter_req_set_options_flower_key_eth_type(req, htons(0x8100)); + + ret = tc_newtfilter(ys, req); + if (ret) + fprintf(stderr, "tc_newtfilter: %s\n", ys->err.msg); + + tc_newtfilter_req_free(req); + + return ret; +} + +static int tc_filter_show(struct ynl_sock *ys, int ifi) +{ + struct tc_gettfilter_req_dump *req; + struct tc_gettfilter_list *rsp; + + req = tc_gettfilter_req_dump_alloc(); + if (!req) { + fprintf(stderr, "tc_gettfilter_req_dump_alloc failed\n"); + return -1; + } + memset(req, 0, sizeof(*req)); + + req->_hdr.tcm_ifindex = ifi; + req->_hdr.tcm_parent = TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS); + req->_present.chain = 1; + req->chain = 0; + + rsp = tc_gettfilter_dump(ys, req); + tc_gettfilter_req_dump_free(req); + if (!rsp) { + fprintf(stderr, "YNL: %s\n", ys->err.msg); + return -1; + } + + if (ynl_dump_empty(rsp)) + fprintf(stderr, "Error: no filters reported\n"); + else + ynl_dump_foreach(rsp, flt) tc_filter_print(flt); + + tc_gettfilter_list_free(rsp); + + return 0; +} + +static int tc_filter_del(struct ynl_sock *ys, int ifi) +{ + struct tc_deltfilter_req *req; + __u16 flags = NLM_F_REQUEST; + int ret; + + req = tc_deltfilter_req_alloc(); + if (!req) { + fprintf(stderr, 
"tc_deltfilter_req_alloc failed\n"); + return -1; + } + memset(req, 0, sizeof(*req)); + + req->_hdr.tcm_ifindex = ifi; + req->_hdr.tcm_parent = TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS); + req->_hdr.tcm_info = TC_H_MAKE(1 << 16, htons(ETH_P_8021Q)); + tc_deltfilter_req_set_nlflags(req, flags); + + ret = tc_deltfilter(ys, req); + if (ret) + fprintf(stderr, "tc_deltfilter failed: %s\n", ys->err.msg); + + tc_deltfilter_req_free(req); + + return ret; +} + +static int tc_clsact_add(struct ynl_sock *ys, int ifi) +{ + struct tc_newqdisc_req *req; + __u16 flags = NLM_F_REQUEST | NLM_F_EXCL | NLM_F_CREATE; + int ret; + + req = tc_newqdisc_req_alloc(); + if (!req) { + fprintf(stderr, "tc_newqdisc_req_alloc failed\n"); + return -1; + } + memset(req, 0, sizeof(*req)); + + req->_hdr.tcm_ifindex = ifi; + req->_hdr.tcm_parent = TC_H_CLSACT; + req->_hdr.tcm_handle = TC_HANDLE; + tc_newqdisc_req_set_nlflags(req, flags); + tc_newqdisc_req_set_kind(req, "clsact"); + + ret = tc_newqdisc(ys, req); + if (ret) + fprintf(stderr, "tc_newqdisc failed: %s\n", ys->err.msg); + + tc_newqdisc_req_free(req); + + return ret; +} + +static int tc_clsact_del(struct ynl_sock *ys, int ifi) +{ + struct tc_delqdisc_req *req; + __u16 flags = NLM_F_REQUEST; + int ret; + + req = tc_delqdisc_req_alloc(); + if (!req) { + fprintf(stderr, "tc_delqdisc_req_alloc failed\n"); + return -1; + } + memset(req, 0, sizeof(*req)); + + req->_hdr.tcm_ifindex = ifi; + req->_hdr.tcm_parent = TC_H_CLSACT; + req->_hdr.tcm_handle = TC_HANDLE; + tc_delqdisc_req_set_nlflags(req, flags); + + ret = tc_delqdisc(ys, req); + if (ret) + fprintf(stderr, "tc_delqdisc failed: %s\n", ys->err.msg); + + tc_delqdisc_req_free(req); + + return ret; +} + +static int tc_filter_config(struct ynl_sock *ys, int ifi) +{ + int ret = 0; + + if (tc_filter_add(ys, ifi)) + return -1; + + ret = tc_filter_show(ys, ifi); + + if (tc_filter_del(ys, ifi)) + return -1; + + return ret; +} + +int main(int argc, char **argv) +{ + struct ynl_error yerr; + struct 
ynl_sock *ys; + int ifi, ret = 0; + + if (argc < 2) { + fprintf(stderr, "Usage: %s <interface_name>\n", argv[0]); + return 1; + } + ifi = if_nametoindex(argv[1]); + if (!ifi) { + perror("if_nametoindex"); + return 1; + } + + ys = ynl_sock_create(&ynl_tc_family, &yerr); + if (!ys) { + fprintf(stderr, "YNL: %s\n", yerr.msg); + return 1; + } + + if (tc_clsact_add(ys, ifi)) { + ret = 2; + goto err_destroy; + } + + if (tc_filter_config(ys, ifi)) + ret = 3; + + if (tc_clsact_del(ys, ifi)) + ret = 4; + +err_destroy: + ynl_sock_destroy(ys); + return ret; +} diff --git a/tools/net/ynl/tests/Makefile b/tools/net/ynl/tests/Makefile new file mode 100644 index 000000000000..c1df2e001255 --- /dev/null +++ b/tools/net/ynl/tests/Makefile @@ -0,0 +1,32 @@ +# SPDX-License-Identifier: GPL-2.0 +# Makefile for YNL tests + +TESTS := \ + test_ynl_cli.sh \ + test_ynl_ethtool.sh \ +# end of TESTS + +all: $(TESTS) + +run_tests: + @for test in $(TESTS); do \ + ./$$test; \ + done + +install: $(TESTS) + @mkdir -p $(DESTDIR)/usr/bin + @mkdir -p $(DESTDIR)/usr/share/kselftest + @cp ../../../testing/selftests/kselftest/ktap_helpers.sh $(DESTDIR)/usr/share/kselftest/ + @for test in $(TESTS); do \ + name=$$(basename $$test .sh); \ + sed -e 's|^ynl=.*|ynl="ynl"|' \ + -e 's|^ynl_ethtool=.*|ynl_ethtool="ynl-ethtool"|' \ + -e 's|KSELFTEST_KTAP_HELPERS=.*|KSELFTEST_KTAP_HELPERS="/usr/share/kselftest/ktap_helpers.sh"|' \ + $$test > $(DESTDIR)/usr/bin/$$name; \ + chmod +x $(DESTDIR)/usr/bin/$$name; \ + done + +clean distclean: + @# Nothing to clean + +.PHONY: all install clean run_tests diff --git a/tools/net/ynl/tests/config b/tools/net/ynl/tests/config new file mode 100644 index 000000000000..339f1309c03f --- /dev/null +++ b/tools/net/ynl/tests/config @@ -0,0 +1,6 @@ +CONFIG_DUMMY=m +CONFIG_INET_DIAG=y +CONFIG_IPV6=y +CONFIG_NET_NS=y +CONFIG_NETDEVSIM=m +CONFIG_VETH=m diff --git a/tools/net/ynl/tests/test_ynl_cli.sh b/tools/net/ynl/tests/test_ynl_cli.sh new file mode 100755 index 
000000000000..7c0722a08117 --- /dev/null +++ b/tools/net/ynl/tests/test_ynl_cli.sh @@ -0,0 +1,327 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Test YNL CLI functionality + +# Load KTAP test helpers +KSELFTEST_KTAP_HELPERS="$(dirname "$(realpath "$0")")/../../../testing/selftests/kselftest/ktap_helpers.sh" +# shellcheck source=../../../testing/selftests/kselftest/ktap_helpers.sh +source "$KSELFTEST_KTAP_HELPERS" + +# Default ynl path for direct execution, can be overridden by make install +ynl="../pyynl/cli.py" + +readonly NSIM_ID="1338" +readonly NSIM_DEV_NAME="nsim${NSIM_ID}" +readonly VETH_A="veth_a" +readonly VETH_B="veth_b" + +testns="ynl-$(mktemp -u XXXXXX)" +TESTS_NO=0 + +# Test listing available families +cli_list_families() +{ + if $ynl --list-families &>/dev/null; then + ktap_test_pass "YNL CLI list families" + else + ktap_test_fail "YNL CLI list families" + fi +} +TESTS_NO=$((TESTS_NO + 1)) + +# Test netdev family operations (dev-get, queue-get) +cli_netdev_ops() +{ + local dev_output + local ifindex + + ifindex=$(ip netns exec "$testns" cat /sys/class/net/"$NSIM_DEV_NAME"/ifindex 2>/dev/null) + + dev_output=$(ip netns exec "$testns" $ynl --family netdev \ + --do dev-get --json "{\"ifindex\": $ifindex}" 2>/dev/null) + + if ! echo "$dev_output" | grep -q "ifindex"; then + ktap_test_fail "YNL CLI netdev operations (netdev dev-get output missing ifindex)" + return + fi + + if ! ip netns exec "$testns" $ynl --family netdev \ + --dump queue-get --json "{\"ifindex\": $ifindex}" &>/dev/null; then + ktap_test_fail "YNL CLI netdev operations (failed to get netdev queue info)" + return + fi + + ktap_test_pass "YNL CLI netdev operations" +} +TESTS_NO=$((TESTS_NO + 1)) + +# Test ethtool family operations (rings-get, linkinfo-get) +cli_ethtool_ops() +{ + local rings_output + local linkinfo_output + + rings_output=$(ip netns exec "$testns" $ynl --family ethtool \ + --do rings-get --json "{\"header\": {\"dev-name\": \"$NSIM_DEV_NAME\"}}" 2>/dev/null) + + if ! 
echo "$rings_output" | grep -q "header"; then + ktap_test_fail "YNL CLI ethtool operations (ethtool rings-get output missing header)" + return + fi + + linkinfo_output=$(ip netns exec "$testns" $ynl --family ethtool \ + --do linkinfo-get --json "{\"header\": {\"dev-name\": \"$VETH_A\"}}" 2>/dev/null) + + if ! echo "$linkinfo_output" | grep -q "header"; then + ktap_test_fail "YNL CLI ethtool operations (ethtool linkinfo-get output missing header)" + return + fi + + ktap_test_pass "YNL CLI ethtool operations" +} +TESTS_NO=$((TESTS_NO + 1)) + +# Test rt-route family operations +cli_rt_route_ops() +{ + local ifindex + + if ! $ynl --list-families 2>/dev/null | grep -q "rt-route"; then + ktap_test_skip "YNL CLI rt-route operations (rt-route family not available)" + return + fi + + ifindex=$(ip netns exec "$testns" cat /sys/class/net/"$NSIM_DEV_NAME"/ifindex 2>/dev/null) + + # Add route: 192.0.2.0/24 dev $dev scope link + if ! ip netns exec "$testns" $ynl --family rt-route --do newroute --create \ + --json "{\"dst\": \"192.0.2.0\", \"oif\": $ifindex, \"rtm-dst-len\": 24, \"rtm-family\": 2, \"rtm-scope\": 253, \"rtm-type\": 1, \"rtm-protocol\": 3, \"rtm-table\": 254}" &>/dev/null; then + ktap_test_fail "YNL CLI rt-route operations (failed to add route)" + return + fi + + local route_output + route_output=$(ip netns exec "$testns" $ynl --family rt-route --dump getroute 2>/dev/null) + if echo "$route_output" | grep -q "192.0.2.0"; then + ktap_test_pass "YNL CLI rt-route operations" + else + ktap_test_fail "YNL CLI rt-route operations (failed to verify route)" + fi + + ip netns exec "$testns" $ynl --family rt-route --do delroute \ + --json "{\"dst\": \"192.0.2.0\", \"oif\": $ifindex, \"rtm-dst-len\": 24, \"rtm-family\": 2, \"rtm-scope\": 253, \"rtm-type\": 1, \"rtm-protocol\": 3, \"rtm-table\": 254}" &>/dev/null +} +TESTS_NO=$((TESTS_NO + 1)) + +# Test rt-addr family operations +cli_rt_addr_ops() +{ + local ifindex + + if ! 
$ynl --list-families 2>/dev/null | grep -q "rt-addr"; then + ktap_test_skip "YNL CLI rt-addr operations (rt-addr family not available)" + return + fi + + ifindex=$(ip netns exec "$testns" cat /sys/class/net/"$NSIM_DEV_NAME"/ifindex 2>/dev/null) + + if ! ip netns exec "$testns" $ynl --family rt-addr --do newaddr \ + --json "{\"ifa-index\": $ifindex, \"local\": \"192.0.2.100\", \"ifa-prefixlen\": 24, \"ifa-family\": 2}" &>/dev/null; then + ktap_test_fail "YNL CLI rt-addr operations (failed to add address)" + return + fi + + local addr_output + addr_output=$(ip netns exec "$testns" $ynl --family rt-addr --dump getaddr 2>/dev/null) + if echo "$addr_output" | grep -q "192.0.2.100"; then + ktap_test_pass "YNL CLI rt-addr operations" + else + ktap_test_fail "YNL CLI rt-addr operations (failed to verify address)" + fi + + ip netns exec "$testns" $ynl --family rt-addr --do deladdr \ + --json "{\"ifa-index\": $ifindex, \"local\": \"192.0.2.100\", \"ifa-prefixlen\": 24, \"ifa-family\": 2}" &>/dev/null +} +TESTS_NO=$((TESTS_NO + 1)) + +# Test rt-link family operations +cli_rt_link_ops() +{ + if ! $ynl --list-families 2>/dev/null | grep -q "rt-link"; then + ktap_test_skip "YNL CLI rt-link operations (rt-link family not available)" + return + fi + + if ! 
ip netns exec "$testns" $ynl --family rt-link --do newlink --create \ + --json "{\"ifname\": \"dummy0\", \"linkinfo\": {\"kind\": \"dummy\"}}" &>/dev/null; then + ktap_test_fail "YNL CLI rt-link operations (failed to add link)" + return + fi + + local link_output + link_output=$(ip netns exec "$testns" $ynl --family rt-link --dump getlink 2>/dev/null) + if echo "$link_output" | grep -q "$NSIM_DEV_NAME" && echo "$link_output" | grep -q "dummy0"; then + ktap_test_pass "YNL CLI rt-link operations" + else + ktap_test_fail "YNL CLI rt-link operations (failed to verify link)" + fi + + ip netns exec "$testns" $ynl --family rt-link --do dellink \ + --json "{\"ifname\": \"dummy0\"}" &>/dev/null +} +TESTS_NO=$((TESTS_NO + 1)) + +# Test rt-neigh family operations +cli_rt_neigh_ops() +{ + local ifindex + + if ! $ynl --list-families 2>/dev/null | grep -q "rt-neigh"; then + ktap_test_skip "YNL CLI rt-neigh operations (rt-neigh family not available)" + return + fi + + ifindex=$(ip netns exec "$testns" cat /sys/class/net/"$NSIM_DEV_NAME"/ifindex 2>/dev/null) + + # Add neighbor: 192.0.2.1 dev nsim1338 lladdr 11:22:33:44:55:66 PERMANENT + if ! 
ip netns exec "$testns" $ynl --family rt-neigh --do newneigh --create \ + --json "{\"ndm-ifindex\": $ifindex, \"dst\": \"192.0.2.1\", \"lladdr\": \"11:22:33:44:55:66\", \"ndm-family\": 2, \"ndm-state\": 128}" &>/dev/null; then + ktap_test_fail "YNL CLI rt-neigh operations (failed to add neighbor)" + fi + + local neigh_output + neigh_output=$(ip netns exec "$testns" $ynl --family rt-neigh --dump getneigh 2>/dev/null) + if echo "$neigh_output" | grep -q "192.0.2.1"; then + ktap_test_pass "YNL CLI rt-neigh operations" + else + ktap_test_fail "YNL CLI rt-neigh operations (failed to verify neighbor)" + fi + + ip netns exec "$testns" $ynl --family rt-neigh --do delneigh \ + --json "{\"ndm-ifindex\": $ifindex, \"dst\": \"192.0.2.1\", \"lladdr\": \"11:22:33:44:55:66\", \"ndm-family\": 2}" &>/dev/null +} +TESTS_NO=$((TESTS_NO + 1)) + +# Test rt-rule family operations +cli_rt_rule_ops() +{ + if ! $ynl --list-families 2>/dev/null | grep -q "rt-rule"; then + ktap_test_skip "YNL CLI rt-rule operations (rt-rule family not available)" + return + fi + + # Add rule: from 192.0.2.0/24 lookup 100 none + if ! ip netns exec "$testns" $ynl --family rt-rule --do newrule \ + --json "{\"family\": 2, \"src-len\": 24, \"src\": \"192.0.2.0\", \"table\": 100}" &>/dev/null; then + ktap_test_fail "YNL CLI rt-rule operations (failed to add rule)" + return + fi + + local rule_output + rule_output=$(ip netns exec "$testns" $ynl --family rt-rule --dump getrule 2>/dev/null) + if echo "$rule_output" | grep -q "192.0.2.0"; then + ktap_test_pass "YNL CLI rt-rule operations" + else + ktap_test_fail "YNL CLI rt-rule operations (failed to verify rule)" + fi + + ip netns exec "$testns" $ynl --family rt-rule --do delrule \ + --json "{\"family\": 2, \"src-len\": 24, \"src\": \"192.0.2.0\", \"table\": 100}" &>/dev/null +} +TESTS_NO=$((TESTS_NO + 1)) + +# Test nlctrl family operations +cli_nlctrl_ops() +{ + local family_output + + if ! 
family_output=$($ynl --family nlctrl \ + --do getfamily --json "{\"family-name\": \"netdev\"}" 2>/dev/null); then + ktap_test_fail "YNL CLI nlctrl getfamily (failed to get nlctrl family info)" + return + fi + + if ! echo "$family_output" | grep -q "family-name"; then + ktap_test_fail "YNL CLI nlctrl getfamily (nlctrl getfamily output missing family-name)" + return + fi + + if ! echo "$family_output" | grep -q "family-id"; then + ktap_test_fail "YNL CLI nlctrl getfamily (nlctrl getfamily output missing family-id)" + return + fi + + ktap_test_pass "YNL CLI nlctrl getfamily" +} +TESTS_NO=$((TESTS_NO + 1)) + +setup() +{ + modprobe netdevsim &> /dev/null + if ! [ -f /sys/bus/netdevsim/new_device ]; then + ktap_skip_all "netdevsim module not available" + exit "$KSFT_SKIP" + fi + + if ! ip netns add "$testns" 2>/dev/null; then + ktap_skip_all "failed to create test namespace" + exit "$KSFT_SKIP" + fi + + echo "$NSIM_ID 1" | ip netns exec "$testns" tee /sys/bus/netdevsim/new_device >/dev/null 2>&1 || { + ktap_skip_all "failed to create netdevsim device" + exit "$KSFT_SKIP" + } + + local dev + dev=$(ip netns exec "$testns" ls /sys/bus/netdevsim/devices/netdevsim$NSIM_ID/net 2>/dev/null | head -1) + if [[ -z "$dev" ]]; then + ktap_skip_all "failed to find netdevsim device" + exit "$KSFT_SKIP" + fi + + ip -netns "$testns" link set dev "$dev" name "$NSIM_DEV_NAME" 2>/dev/null || { + ktap_skip_all "failed to rename netdevsim device" + exit "$KSFT_SKIP" + } + + ip -netns "$testns" link set dev "$NSIM_DEV_NAME" up 2>/dev/null + + if ! 
ip -n "$testns" link add "$VETH_A" type veth peer name "$VETH_B" 2>/dev/null; then + ktap_skip_all "failed to create veth pair" + exit "$KSFT_SKIP" + fi + + ip -n "$testns" link set "$VETH_A" up 2>/dev/null + ip -n "$testns" link set "$VETH_B" up 2>/dev/null +} + +cleanup() +{ + ip netns exec "$testns" bash -c "echo $NSIM_ID > /sys/bus/netdevsim/del_device" 2>/dev/null || true + ip netns del "$testns" 2>/dev/null || true +} + +# Check if ynl command is available +if ! command -v $ynl &>/dev/null && [[ ! -x $ynl ]]; then + ktap_skip_all "ynl command not found: $ynl" + exit "$KSFT_SKIP" +fi + +trap cleanup EXIT + +ktap_print_header +setup +ktap_set_plan "${TESTS_NO}" + +cli_list_families +cli_netdev_ops +cli_ethtool_ops +cli_rt_route_ops +cli_rt_addr_ops +cli_rt_link_ops +cli_rt_neigh_ops +cli_rt_rule_ops +cli_nlctrl_ops + +ktap_finished diff --git a/tools/net/ynl/tests/test_ynl_ethtool.sh b/tools/net/ynl/tests/test_ynl_ethtool.sh new file mode 100755 index 000000000000..b826269017f4 --- /dev/null +++ b/tools/net/ynl/tests/test_ynl_ethtool.sh @@ -0,0 +1,222 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Test YNL ethtool functionality + +# Load KTAP test helpers +KSELFTEST_KTAP_HELPERS="$(dirname "$(realpath "$0")")/../../../testing/selftests/kselftest/ktap_helpers.sh" +# shellcheck source=../../../testing/selftests/kselftest/ktap_helpers.sh +source "$KSELFTEST_KTAP_HELPERS" + +# Default ynl-ethtool path for direct execution, can be overridden by make install +ynl_ethtool="../pyynl/ethtool.py" + +readonly NSIM_ID="1337" +readonly NSIM_DEV_NAME="nsim${NSIM_ID}" +readonly VETH_A="veth_a" +readonly VETH_B="veth_b" + +testns="ynl-ethtool-$(mktemp -u XXXXXX)" +TESTS_NO=0 + +# Uses veth device as netdevsim doesn't support basic ethtool device info +ethtool_device_info() +{ + local info_output + + info_output=$(ip netns exec "$testns" $ynl_ethtool "$VETH_A" 2>/dev/null) + + if ! 
echo "$info_output" | grep -q "Settings for"; then + ktap_test_fail "YNL ethtool device info (device info output missing expected content)" + return + fi + + ktap_test_pass "YNL ethtool device info" +} +TESTS_NO=$((TESTS_NO + 1)) + +ethtool_statistics() +{ + local stats_output + + stats_output=$(ip netns exec "$testns" $ynl_ethtool --statistics "$NSIM_DEV_NAME" 2>/dev/null) + + if ! echo "$stats_output" | grep -q -E "(NIC statistics|packets|bytes)"; then + ktap_test_fail "YNL ethtool statistics (statistics output missing expected content)" + return + fi + + ktap_test_pass "YNL ethtool statistics" +} +TESTS_NO=$((TESTS_NO + 1)) + +ethtool_ring_params() +{ + local ring_output + + ring_output=$(ip netns exec "$testns" $ynl_ethtool --show-ring "$NSIM_DEV_NAME" 2>/dev/null) + + if ! echo "$ring_output" | grep -q -E "(Ring parameters|RX|TX)"; then + ktap_test_fail "YNL ethtool ring parameters (ring parameters output missing expected content)" + return + fi + + if ! ip netns exec "$testns" $ynl_ethtool --set-ring "$NSIM_DEV_NAME" rx 64 2>/dev/null; then + ktap_test_fail "YNL ethtool ring parameters (set-ring command failed unexpectedly)" + return + fi + + ktap_test_pass "YNL ethtool ring parameters (show/set)" +} +TESTS_NO=$((TESTS_NO + 1)) + +ethtool_coalesce_params() +{ + if ! ip netns exec "$testns" $ynl_ethtool --show-coalesce "$NSIM_DEV_NAME" &>/dev/null; then + ktap_test_fail "YNL ethtool coalesce parameters (failed to get coalesce parameters)" + return + fi + + if ! ip netns exec "$testns" $ynl_ethtool --set-coalesce "$NSIM_DEV_NAME" rx-usecs 50 2>/dev/null; then + ktap_test_fail "YNL ethtool coalesce parameters (set-coalesce command failed unexpectedly)" + return + fi + + ktap_test_pass "YNL ethtool coalesce parameters (show/set)" +} +TESTS_NO=$((TESTS_NO + 1)) + +ethtool_pause_params() +{ + if ! 
ip netns exec "$testns" $ynl_ethtool --show-pause "$NSIM_DEV_NAME" &>/dev/null; then + ktap_test_fail "YNL ethtool pause parameters (failed to get pause parameters)" + return + fi + + if ! ip netns exec "$testns" $ynl_ethtool --set-pause "$NSIM_DEV_NAME" tx 1 rx 1 2>/dev/null; then + ktap_test_fail "YNL ethtool pause parameters (set-pause command failed unexpectedly)" + return + fi + + ktap_test_pass "YNL ethtool pause parameters (show/set)" +} +TESTS_NO=$((TESTS_NO + 1)) + +ethtool_features_info() +{ + local features_output + + features_output=$(ip netns exec "$testns" $ynl_ethtool --show-features "$NSIM_DEV_NAME" 2>/dev/null) + + if ! echo "$features_output" | grep -q -E "(Features|offload)"; then + ktap_test_fail "YNL ethtool features info (features output missing expected content)" + return + fi + + ktap_test_pass "YNL ethtool features info (show/set)" +} +TESTS_NO=$((TESTS_NO + 1)) + +ethtool_channels_info() +{ + local channels_output + + channels_output=$(ip netns exec "$testns" $ynl_ethtool --show-channels "$NSIM_DEV_NAME" 2>/dev/null) + + if ! echo "$channels_output" | grep -q -E "(Channel|Combined|RX|TX)"; then + ktap_test_fail "YNL ethtool channels info (channels output missing expected content)" + return + fi + + if ! ip netns exec "$testns" $ynl_ethtool --set-channels "$NSIM_DEV_NAME" combined-count 1 2>/dev/null; then + ktap_test_fail "YNL ethtool channels info (set-channels command failed unexpectedly)" + return + fi + + ktap_test_pass "YNL ethtool channels info (show/set)" +} +TESTS_NO=$((TESTS_NO + 1)) + +ethtool_time_stamping() +{ + local ts_output + + ts_output=$(ip netns exec "$testns" $ynl_ethtool --show-time-stamping "$NSIM_DEV_NAME" 2>/dev/null) + + if ! 
echo "$ts_output" | grep -q -E "(Time stamping|timestamping|SOF_TIMESTAMPING)"; then + ktap_test_fail "YNL ethtool time stamping (time stamping output missing expected content)" + return + fi + + ktap_test_pass "YNL ethtool time stamping" +} +TESTS_NO=$((TESTS_NO + 1)) + +setup() +{ + modprobe netdevsim &> /dev/null + if ! [ -f /sys/bus/netdevsim/new_device ]; then + ktap_skip_all "netdevsim module not available" + exit "$KSFT_SKIP" + fi + + if ! ip netns add "$testns" 2>/dev/null; then + ktap_skip_all "failed to create test namespace" + exit "$KSFT_SKIP" + fi + + echo "$NSIM_ID 1" | ip netns exec "$testns" tee /sys/bus/netdevsim/new_device >/dev/null 2>&1 || { + ktap_skip_all "failed to create netdevsim device" + exit "$KSFT_SKIP" + } + + local dev + dev=$(ip netns exec "$testns" ls /sys/bus/netdevsim/devices/netdevsim$NSIM_ID/net 2>/dev/null | head -1) + if [[ -z "$dev" ]]; then + ktap_skip_all "failed to find netdevsim device" + exit "$KSFT_SKIP" + fi + + ip -netns "$testns" link set dev "$dev" name "$NSIM_DEV_NAME" 2>/dev/null || { + ktap_skip_all "failed to rename netdevsim device" + exit "$KSFT_SKIP" + } + + ip -netns "$testns" link set dev "$NSIM_DEV_NAME" up 2>/dev/null + + if ! ip -n "$testns" link add "$VETH_A" type veth peer name "$VETH_B" 2>/dev/null; then + ktap_skip_all "failed to create veth pair" + exit "$KSFT_SKIP" + fi + + ip -n "$testns" link set "$VETH_A" up 2>/dev/null + ip -n "$testns" link set "$VETH_B" up 2>/dev/null +} + +cleanup() +{ + ip netns exec "$testns" bash -c "echo $NSIM_ID > /sys/bus/netdevsim/del_device" 2>/dev/null || true + ip netns del "$testns" 2>/dev/null || true +} + +# Check if ynl-ethtool command is available +if ! command -v $ynl_ethtool &>/dev/null && [[ ! 
-x $ynl_ethtool ]]; then + ktap_skip_all "ynl-ethtool command not found: $ynl_ethtool" + exit "$KSFT_SKIP" +fi + +trap cleanup EXIT + +ktap_print_header +setup +ktap_set_plan "${TESTS_NO}" + +ethtool_device_info +ethtool_statistics +ethtool_ring_params +ethtool_coalesce_params +ethtool_pause_params +ethtool_features_info +ethtool_channels_info +ethtool_time_stamping + +ktap_finished diff --git a/tools/net/ynl/ynltool/.gitignore b/tools/net/ynl/ynltool/.gitignore new file mode 100644 index 000000000000..690d399c921a --- /dev/null +++ b/tools/net/ynl/ynltool/.gitignore @@ -0,0 +1,2 @@ +ynltool +*.d diff --git a/tools/net/ynl/ynltool/Makefile b/tools/net/ynl/ynltool/Makefile new file mode 100644 index 000000000000..f5b1de32daa5 --- /dev/null +++ b/tools/net/ynl/ynltool/Makefile @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: GPL-2.0-only + +include ../Makefile.deps + +INSTALL ?= install +prefix ?= /usr + +CC := gcc +CFLAGS := -Wall -Wextra -Werror -O2 +ifeq ("$(DEBUG)","1") + CFLAGS += -g -fsanitize=address -fsanitize=leak -static-libasan +endif +CFLAGS += -I../lib -I../generated -I../../../include/uapi/ + +SRC_VERSION := \ + $(shell make --no-print-directory -sC ../../../.. 
kernelversion || \ + echo "unknown") + +CFLAGS += -DSRC_VERSION='"$(SRC_VERSION)"' + +SRCS := $(wildcard *.c) +OBJS := $(patsubst %.c,$(OUTPUT)%.o,$(SRCS)) + +YNLTOOL := $(OUTPUT)ynltool + +include $(wildcard *.d) + +all: $(YNLTOOL) + +Q = @ + +$(YNLTOOL): ../libynl.a $(OBJS) + $(Q)echo -e "\tLINK $@" + $(Q)$(CC) $(CFLAGS) -o $@ $(OBJS) ../libynl.a -lm + +%.o: %.c ../libynl.a + $(Q)echo -e "\tCC $@" + $(Q)$(COMPILE.c) -MMD -c -o $@ $< + +../libynl.a: + $(Q)$(MAKE) -C ../ + +clean: + rm -f *.o *.d *~ + +distclean: clean + rm -f $(YNLTOOL) + +bindir ?= /usr/bin + +install: $(YNLTOOL) + $(INSTALL) -m 0755 $(YNLTOOL) $(DESTDIR)$(bindir)/$(YNLTOOL) + +.PHONY: all clean distclean +.DEFAULT_GOAL=all diff --git a/tools/net/ynl/ynltool/json_writer.c b/tools/net/ynl/ynltool/json_writer.c new file mode 100644 index 000000000000..c8685e592cd3 --- /dev/null +++ b/tools/net/ynl/ynltool/json_writer.c @@ -0,0 +1,288 @@ +// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-2-Clause) +/* + * Simple streaming JSON writer + * + * This takes care of the annoying bits of JSON syntax like the commas + * after elements + * + * Authors: Stephen Hemminger <stephen@networkplumber.org> + */ + +#include <stdio.h> +#include <stdbool.h> +#include <stdarg.h> +#include <assert.h> +#include <malloc.h> +#include <inttypes.h> +#include <stdint.h> + +#include "json_writer.h" + +struct json_writer { + FILE *out; + unsigned depth; + bool pretty; + char sep; +}; + +static void jsonw_indent(json_writer_t *self) +{ + unsigned i; + for (i = 0; i < self->depth; ++i) + fputs(" ", self->out); +} + +static void jsonw_eol(json_writer_t *self) +{ + if (!self->pretty) + return; + + putc('\n', self->out); + jsonw_indent(self); +} + +static void jsonw_eor(json_writer_t *self) +{ + if (self->sep != '\0') + putc(self->sep, self->out); + self->sep = ','; +} + +static void jsonw_puts(json_writer_t *self, const char *str) +{ + putc('"', self->out); + for (; *str; ++str) + switch (*str) { + case '\t': + fputs("\\t", 
self->out); + break; + case '\n': + fputs("\\n", self->out); + break; + case '\r': + fputs("\\r", self->out); + break; + case '\f': + fputs("\\f", self->out); + break; + case '\b': + fputs("\\b", self->out); + break; + case '\\': + fputs("\\\\", self->out); + break; + case '"': + fputs("\\\"", self->out); + break; + default: + putc(*str, self->out); + } + putc('"', self->out); +} + +json_writer_t *jsonw_new(FILE *f) +{ + json_writer_t *self = malloc(sizeof(*self)); + if (self) { + self->out = f; + self->depth = 0; + self->pretty = false; + self->sep = '\0'; + } + return self; +} + +void jsonw_destroy(json_writer_t **self_p) +{ + json_writer_t *self = *self_p; + + assert(self->depth == 0); + fputs("\n", self->out); + fflush(self->out); + free(self); + *self_p = NULL; +} + +void jsonw_pretty(json_writer_t *self, bool on) +{ + self->pretty = on; +} + +void jsonw_reset(json_writer_t *self) +{ + assert(self->depth == 0); + self->sep = '\0'; +} + +static void jsonw_begin(json_writer_t *self, int c) +{ + jsonw_eor(self); + putc(c, self->out); + ++self->depth; + self->sep = '\0'; +} + +static void jsonw_end(json_writer_t *self, int c) +{ + assert(self->depth > 0); + + --self->depth; + if (self->sep != '\0') + jsonw_eol(self); + putc(c, self->out); + self->sep = ','; +} + +void jsonw_name(json_writer_t *self, const char *name) +{ + jsonw_eor(self); + jsonw_eol(self); + self->sep = '\0'; + jsonw_puts(self, name); + putc(':', self->out); + if (self->pretty) + putc(' ', self->out); +} + +void jsonw_vprintf_enquote(json_writer_t *self, const char *fmt, va_list ap) +{ + jsonw_eor(self); + putc('"', self->out); + vfprintf(self->out, fmt, ap); + putc('"', self->out); +} + +void jsonw_printf(json_writer_t *self, const char *fmt, ...) 
+{ + va_list ap; + + va_start(ap, fmt); + jsonw_eor(self); + vfprintf(self->out, fmt, ap); + va_end(ap); +} + +void jsonw_start_object(json_writer_t *self) +{ + jsonw_begin(self, '{'); +} + +void jsonw_end_object(json_writer_t *self) +{ + jsonw_end(self, '}'); +} + +void jsonw_start_array(json_writer_t *self) +{ + jsonw_begin(self, '['); +} + +void jsonw_end_array(json_writer_t *self) +{ + jsonw_end(self, ']'); +} + +void jsonw_string(json_writer_t *self, const char *value) +{ + jsonw_eor(self); + jsonw_puts(self, value); +} + +void jsonw_bool(json_writer_t *self, bool val) +{ + jsonw_printf(self, "%s", val ? "true" : "false"); +} + +void jsonw_null(json_writer_t *self) +{ + jsonw_printf(self, "null"); +} + +void jsonw_float_fmt(json_writer_t *self, const char *fmt, double num) +{ + jsonw_printf(self, fmt, num); +} + +void jsonw_float(json_writer_t *self, double num) +{ + jsonw_printf(self, "%g", num); +} + +void jsonw_hu(json_writer_t *self, unsigned short num) +{ + jsonw_printf(self, "%hu", num); +} + +void jsonw_uint(json_writer_t *self, uint64_t num) +{ + jsonw_printf(self, "%"PRIu64, num); +} + +void jsonw_lluint(json_writer_t *self, unsigned long long int num) +{ + jsonw_printf(self, "%llu", num); +} + +void jsonw_int(json_writer_t *self, int64_t num) +{ + jsonw_printf(self, "%"PRId64, num); +} + +void jsonw_string_field(json_writer_t *self, const char *prop, const char *val) +{ + jsonw_name(self, prop); + jsonw_string(self, val); +} + +void jsonw_bool_field(json_writer_t *self, const char *prop, bool val) +{ + jsonw_name(self, prop); + jsonw_bool(self, val); +} + +void jsonw_float_field(json_writer_t *self, const char *prop, double val) +{ + jsonw_name(self, prop); + jsonw_float(self, val); +} + +void jsonw_float_field_fmt(json_writer_t *self, + const char *prop, + const char *fmt, + double val) +{ + jsonw_name(self, prop); + jsonw_float_fmt(self, fmt, val); +} + +void jsonw_uint_field(json_writer_t *self, const char *prop, uint64_t num) +{ + 
jsonw_name(self, prop); + jsonw_uint(self, num); +} + +void jsonw_hu_field(json_writer_t *self, const char *prop, unsigned short num) +{ + jsonw_name(self, prop); + jsonw_hu(self, num); +} + +void jsonw_lluint_field(json_writer_t *self, + const char *prop, + unsigned long long int num) +{ + jsonw_name(self, prop); + jsonw_lluint(self, num); +} + +void jsonw_int_field(json_writer_t *self, const char *prop, int64_t num) +{ + jsonw_name(self, prop); + jsonw_int(self, num); +} + +void jsonw_null_field(json_writer_t *self, const char *prop) +{ + jsonw_name(self, prop); + jsonw_null(self); +} diff --git a/tools/net/ynl/ynltool/json_writer.h b/tools/net/ynl/ynltool/json_writer.h new file mode 100644 index 000000000000..0f1e63c88f6a --- /dev/null +++ b/tools/net/ynl/ynltool/json_writer.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Simple streaming JSON writer + * + * This takes care of the annoying bits of JSON syntax like the commas + * after elements + * + * Authors: Stephen Hemminger <stephen@networkplumber.org> + */ + +#ifndef _JSON_WRITER_H_ +#define _JSON_WRITER_H_ + +#include <stdbool.h> +#include <stdint.h> +#include <stdarg.h> +#include <stdio.h> + +/* Opaque class structure */ +typedef struct json_writer json_writer_t; + +/* Create a new JSON stream */ +json_writer_t *jsonw_new(FILE *f); +/* End output to JSON stream */ +void jsonw_destroy(json_writer_t **self_p); + +/* Cause output to have pretty whitespace */ +void jsonw_pretty(json_writer_t *self, bool on); + +/* Reset separator to create new JSON */ +void jsonw_reset(json_writer_t *self); + +/* Add property name */ +void jsonw_name(json_writer_t *self, const char *name); + +/* Add value */ +void __attribute__((format(printf, 2, 0))) jsonw_vprintf_enquote(json_writer_t *self, + const char *fmt, + va_list ap); +void __attribute__((format(printf, 2, 3))) jsonw_printf(json_writer_t *self, + const char *fmt, ...); +void jsonw_string(json_writer_t *self, const char 
*value); +void jsonw_bool(json_writer_t *self, bool value); +void jsonw_float(json_writer_t *self, double number); +void jsonw_float_fmt(json_writer_t *self, const char *fmt, double num); +void jsonw_uint(json_writer_t *self, uint64_t number); +void jsonw_hu(json_writer_t *self, unsigned short number); +void jsonw_int(json_writer_t *self, int64_t number); +void jsonw_null(json_writer_t *self); +void jsonw_lluint(json_writer_t *self, unsigned long long int num); + +/* Useful Combinations of name and value */ +void jsonw_string_field(json_writer_t *self, const char *prop, const char *val); +void jsonw_bool_field(json_writer_t *self, const char *prop, bool value); +void jsonw_float_field(json_writer_t *self, const char *prop, double num); +void jsonw_uint_field(json_writer_t *self, const char *prop, uint64_t num); +void jsonw_hu_field(json_writer_t *self, const char *prop, unsigned short num); +void jsonw_int_field(json_writer_t *self, const char *prop, int64_t num); +void jsonw_null_field(json_writer_t *self, const char *prop); +void jsonw_lluint_field(json_writer_t *self, const char *prop, + unsigned long long int num); +void jsonw_float_field_fmt(json_writer_t *self, const char *prop, + const char *fmt, double val); + +/* Collections */ +void jsonw_start_object(json_writer_t *self); +void jsonw_end_object(json_writer_t *self); + +void jsonw_start_array(json_writer_t *self); +void jsonw_end_array(json_writer_t *self); + +/* Override default exception handling */ +typedef void (jsonw_err_handler_fn)(const char *); + +#endif /* _JSON_WRITER_H_ */ diff --git a/tools/net/ynl/ynltool/main.c b/tools/net/ynl/ynltool/main.c new file mode 100644 index 000000000000..5d0f428eed0a --- /dev/null +++ b/tools/net/ynl/ynltool/main.c @@ -0,0 +1,242 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright (C) 2017-2018 Netronome Systems, Inc. */ +/* Copyright Meta Platforms, Inc. 
and affiliates */ + +#include <ctype.h> +#include <errno.h> +#include <getopt.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <stdarg.h> + +#include "main.h" + +const char *bin_name; +static int last_argc; +static char **last_argv; +static int (*last_do_help)(int argc, char **argv); +json_writer_t *json_wtr; +bool pretty_output; +bool json_output; + +static void __attribute__((noreturn)) clean_and_exit(int i) +{ + if (json_output) + jsonw_destroy(&json_wtr); + + exit(i); +} + +void usage(void) +{ + last_do_help(last_argc - 1, last_argv + 1); + + clean_and_exit(-1); +} + +static int do_help(int argc __attribute__((unused)), + char **argv __attribute__((unused))) +{ + if (json_output) { + jsonw_null(json_wtr); + return 0; + } + + fprintf(stderr, + "Usage: %s [OPTIONS] OBJECT { COMMAND | help }\n" + " %s version\n" + "\n" + " OBJECT := { page-pool | qstats }\n" + " " HELP_SPEC_OPTIONS "\n" + "", + bin_name, bin_name); + + return 0; +} + +static int do_version(int argc __attribute__((unused)), + char **argv __attribute__((unused))) +{ + if (json_output) { + jsonw_start_object(json_wtr); + jsonw_name(json_wtr, "version"); + jsonw_printf(json_wtr, SRC_VERSION); + jsonw_end_object(json_wtr); + } else { + printf("%s " SRC_VERSION "\n", bin_name); + } + return 0; +} + +static const struct cmd commands[] = { + { "help", do_help }, + { "page-pool", do_page_pool }, + { "qstats", do_qstats }, + { "version", do_version }, + { 0 } +}; + +int cmd_select(const struct cmd *cmds, int argc, char **argv, + int (*help)(int argc, char **argv)) +{ + unsigned int i; + + last_argc = argc; + last_argv = argv; + last_do_help = help; + + if (argc < 1 && cmds[0].func) + return cmds[0].func(argc, argv); + + for (i = 0; cmds[i].cmd; i++) { + if (is_prefix(*argv, cmds[i].cmd)) { + if (!cmds[i].func) { + p_err("command '%s' is not available", cmds[i].cmd); + return -1; + } + return cmds[i].func(argc - 1, argv + 1); + } + } + + help(argc - 1, argv + 1); + + return -1; +} + 
+bool is_prefix(const char *pfx, const char *str) +{ + if (!pfx) + return false; + if (strlen(str) < strlen(pfx)) + return false; + + return !memcmp(str, pfx, strlen(pfx)); +} + +/* Last argument MUST be NULL pointer */ +int detect_common_prefix(const char *arg, ...) +{ + unsigned int count = 0; + const char *ref; + char msg[256]; + va_list ap; + + snprintf(msg, sizeof(msg), "ambiguous prefix: '%s' could be '", arg); + va_start(ap, arg); + while ((ref = va_arg(ap, const char *))) { + if (!is_prefix(arg, ref)) + continue; + count++; + if (count > 1) + strncat(msg, "' or '", sizeof(msg) - strlen(msg) - 1); + strncat(msg, ref, sizeof(msg) - strlen(msg) - 1); + } + va_end(ap); + strncat(msg, "'", sizeof(msg) - strlen(msg) - 1); + + if (count >= 2) { + p_err("%s", msg); + return -1; + } + + return 0; +} + +void p_err(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + if (json_output) { + jsonw_start_object(json_wtr); + jsonw_name(json_wtr, "error"); + jsonw_vprintf_enquote(json_wtr, fmt, ap); + jsonw_end_object(json_wtr); + } else { + fprintf(stderr, "Error: "); + vfprintf(stderr, fmt, ap); + fprintf(stderr, "\n"); + } + va_end(ap); +} + +void p_info(const char *fmt, ...) 
+{ + va_list ap; + + if (json_output) + return; + + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + fprintf(stderr, "\n"); + va_end(ap); +} + +int main(int argc, char **argv) +{ + static const struct option options[] = { + { "json", no_argument, NULL, 'j' }, + { "help", no_argument, NULL, 'h' }, + { "pretty", no_argument, NULL, 'p' }, + { "version", no_argument, NULL, 'V' }, + { 0 } + }; + bool version_requested = false; + int opt, ret; + + setlinebuf(stdout); + + last_do_help = do_help; + pretty_output = false; + json_output = false; + bin_name = "ynltool"; + + opterr = 0; + while ((opt = getopt_long(argc, argv, "Vhjp", + options, NULL)) >= 0) { + switch (opt) { + case 'V': + version_requested = true; + break; + case 'h': + return do_help(argc, argv); + case 'p': + pretty_output = true; + /* fall through */ + case 'j': + if (!json_output) { + json_wtr = jsonw_new(stdout); + if (!json_wtr) { + p_err("failed to create JSON writer"); + return -1; + } + json_output = true; + } + jsonw_pretty(json_wtr, pretty_output); + break; + default: + p_err("unrecognized option '%s'", argv[optind - 1]); + if (json_output) + clean_and_exit(-1); + else + usage(); + } + } + + argc -= optind; + argv += optind; + if (argc < 0) + usage(); + + if (version_requested) + ret = do_version(argc, argv); + else + ret = cmd_select(commands, argc, argv, do_help); + + if (json_output) + jsonw_destroy(&json_wtr); + + return ret; +} diff --git a/tools/net/ynl/ynltool/main.h b/tools/net/ynl/ynltool/main.h new file mode 100644 index 000000000000..c7039f9ac55a --- /dev/null +++ b/tools/net/ynl/ynltool/main.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Copyright (C) 2017-2018 Netronome Systems, Inc. */ +/* Copyright Meta Platforms, Inc. 
and affiliates */ + +#ifndef __YNLTOOL_H +#define __YNLTOOL_H + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <errno.h> +#include <string.h> + +#include "json_writer.h" + +#define NEXT_ARG() ({ argc--; argv++; if (argc < 0) usage(); }) +#define NEXT_ARGP() ({ (*argc)--; (*argv)++; if (*argc < 0) usage(); }) +#define BAD_ARG() ({ p_err("what is '%s'?", *argv); -1; }) +#define GET_ARG() ({ argc--; *argv++; }) +#define REQ_ARGS(cnt) \ + ({ \ + int _cnt = (cnt); \ + bool _res; \ + \ + if (argc < _cnt) { \ + p_err("'%s' needs at least %d arguments, %d found", \ + argv[-1], _cnt, argc); \ + _res = false; \ + } else { \ + _res = true; \ + } \ + _res; \ + }) + +#define HELP_SPEC_OPTIONS \ + "OPTIONS := { {-j|--json} [{-p|--pretty}] }" + +extern const char *bin_name; + +extern json_writer_t *json_wtr; +extern bool json_output; +extern bool pretty_output; + +void __attribute__((format(printf, 1, 2))) p_err(const char *fmt, ...); +void __attribute__((format(printf, 1, 2))) p_info(const char *fmt, ...); + +bool is_prefix(const char *pfx, const char *str); +int detect_common_prefix(const char *arg, ...); +void usage(void) __attribute__((noreturn)); + +struct cmd { + const char *cmd; + int (*func)(int argc, char **argv); +}; + +int cmd_select(const struct cmd *cmds, int argc, char **argv, + int (*help)(int argc, char **argv)); + +/* subcommands */ +int do_page_pool(int argc, char **argv); +int do_qstats(int argc, char **argv); + +#endif /* __YNLTOOL_H */ diff --git a/tools/net/ynl/ynltool/page-pool.c b/tools/net/ynl/ynltool/page-pool.c new file mode 100644 index 000000000000..4b24492abab7 --- /dev/null +++ b/tools/net/ynl/ynltool/page-pool.c @@ -0,0 +1,461 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <net/if.h> + +#include <ynl.h> +#include "netdev-user.h" + +#include "main.h" + +struct 
pp_stat { + unsigned int ifc; + + struct { + unsigned int cnt; + size_t refs, bytes; + } live[2]; + + size_t alloc_slow, alloc_fast, recycle_ring, recycle_cache; +}; + +struct pp_stats_array { + unsigned int i, max; + struct pp_stat *s; +}; + +static struct pp_stat *find_ifc(struct pp_stats_array *a, unsigned int ifindex) +{ + unsigned int i; + + for (i = 0; i < a->i; i++) { + if (a->s[i].ifc == ifindex) + return &a->s[i]; + } + + a->i++; + if (a->i == a->max) { + a->max *= 2; + a->s = reallocarray(a->s, a->max, sizeof(*a->s)); + } + a->s[i].ifc = ifindex; + return &a->s[i]; +} + +static void count_pool(struct pp_stat *s, unsigned int l, + struct netdev_page_pool_get_rsp *pp) +{ + s->live[l].cnt++; + if (pp->_present.inflight) + s->live[l].refs += pp->inflight; + if (pp->_present.inflight_mem) + s->live[l].bytes += pp->inflight_mem; +} + +/* We don't know how many pages are sitting in cache and ring + * so we will under-count the recycling rate a bit. + */ +static void print_json_recycling_stats(struct pp_stat *s) +{ + double recycle; + + if (s->alloc_fast + s->alloc_slow) { + recycle = (double)(s->recycle_ring + s->recycle_cache) / + (s->alloc_fast + s->alloc_slow) * 100; + jsonw_float_field(json_wtr, "recycling_pct", recycle); + } + + jsonw_name(json_wtr, "alloc"); + jsonw_start_object(json_wtr); + jsonw_uint_field(json_wtr, "slow", s->alloc_slow); + jsonw_uint_field(json_wtr, "fast", s->alloc_fast); + jsonw_end_object(json_wtr); + + jsonw_name(json_wtr, "recycle"); + jsonw_start_object(json_wtr); + jsonw_uint_field(json_wtr, "ring", s->recycle_ring); + jsonw_uint_field(json_wtr, "cache", s->recycle_cache); + jsonw_end_object(json_wtr); +} + +static void print_plain_recycling_stats(struct pp_stat *s) +{ + double recycle; + + if (s->alloc_fast + s->alloc_slow) { + recycle = (double)(s->recycle_ring + s->recycle_cache) / + (s->alloc_fast + s->alloc_slow) * 100; + printf("recycling: %.1lf%% (alloc: %zu:%zu recycle: %zu:%zu)", + recycle, s->alloc_slow, s->alloc_fast, 
+ s->recycle_ring, s->recycle_cache); + } +} + +static void print_json_stats(struct pp_stats_array *a) +{ + jsonw_start_array(json_wtr); + + for (unsigned int i = 0; i < a->i; i++) { + char ifname[IF_NAMESIZE]; + struct pp_stat *s = &a->s[i]; + const char *name; + + jsonw_start_object(json_wtr); + + if (!s->ifc) { + jsonw_string_field(json_wtr, "ifname", "<orphan>"); + jsonw_uint_field(json_wtr, "ifindex", 0); + } else { + name = if_indextoname(s->ifc, ifname); + if (name) + jsonw_string_field(json_wtr, "ifname", name); + jsonw_uint_field(json_wtr, "ifindex", s->ifc); + } + + jsonw_uint_field(json_wtr, "page_pools", s->live[1].cnt); + jsonw_uint_field(json_wtr, "zombies", s->live[0].cnt); + + jsonw_name(json_wtr, "live"); + jsonw_start_object(json_wtr); + jsonw_uint_field(json_wtr, "refs", s->live[1].refs); + jsonw_uint_field(json_wtr, "bytes", s->live[1].bytes); + jsonw_end_object(json_wtr); + + jsonw_name(json_wtr, "zombie"); + jsonw_start_object(json_wtr); + jsonw_uint_field(json_wtr, "refs", s->live[0].refs); + jsonw_uint_field(json_wtr, "bytes", s->live[0].bytes); + jsonw_end_object(json_wtr); + + if (s->alloc_fast || s->alloc_slow) + print_json_recycling_stats(s); + + jsonw_end_object(json_wtr); + } + + jsonw_end_array(json_wtr); +} + +static void print_plain_stats(struct pp_stats_array *a) +{ + for (unsigned int i = 0; i < a->i; i++) { + char ifname[IF_NAMESIZE]; + struct pp_stat *s = &a->s[i]; + const char *name; + + if (!s->ifc) { + printf("<orphan>\t"); + } else { + name = if_indextoname(s->ifc, ifname); + if (name) + printf("%8s", name); + printf("[%u]\t", s->ifc); + } + + printf("page pools: %u (zombies: %u)\n", + s->live[1].cnt, s->live[0].cnt); + printf("\t\trefs: %zu bytes: %zu (refs: %zu bytes: %zu)\n", + s->live[1].refs, s->live[1].bytes, + s->live[0].refs, s->live[0].bytes); + + if (s->alloc_fast || s->alloc_slow) { + printf("\t\t"); + print_plain_recycling_stats(s); + printf("\n"); + } + } +} + +static bool +find_pool_stat_in_list(struct 
netdev_page_pool_stats_get_list *pp_stats, + __u64 pool_id, struct pp_stat *pstat) +{ + ynl_dump_foreach(pp_stats, pp) { + if (!pp->_present.info || !pp->info._present.id) + continue; + if (pp->info.id != pool_id) + continue; + + memset(pstat, 0, sizeof(*pstat)); + if (pp->_present.alloc_fast) + pstat->alloc_fast = pp->alloc_fast; + if (pp->_present.alloc_refill) + pstat->alloc_fast += pp->alloc_refill; + if (pp->_present.alloc_slow) + pstat->alloc_slow = pp->alloc_slow; + if (pp->_present.recycle_ring) + pstat->recycle_ring = pp->recycle_ring; + if (pp->_present.recycle_cached) + pstat->recycle_cache = pp->recycle_cached; + return true; + } + return false; +} + +static void +print_json_pool_list(struct netdev_page_pool_get_list *pools, + struct netdev_page_pool_stats_get_list *pp_stats, + bool zombies_only) +{ + jsonw_start_array(json_wtr); + + ynl_dump_foreach(pools, pp) { + char ifname[IF_NAMESIZE]; + struct pp_stat pstat; + const char *name; + + if (zombies_only && !pp->_present.detach_time) + continue; + + jsonw_start_object(json_wtr); + + jsonw_uint_field(json_wtr, "id", pp->id); + + if (pp->_present.ifindex) { + name = if_indextoname(pp->ifindex, ifname); + if (name) + jsonw_string_field(json_wtr, "ifname", name); + jsonw_uint_field(json_wtr, "ifindex", pp->ifindex); + } + + if (pp->_present.napi_id) + jsonw_uint_field(json_wtr, "napi_id", pp->napi_id); + + if (pp->_present.inflight) + jsonw_uint_field(json_wtr, "refs", pp->inflight); + + if (pp->_present.inflight_mem) + jsonw_uint_field(json_wtr, "bytes", pp->inflight_mem); + + if (pp->_present.detach_time) + jsonw_uint_field(json_wtr, "detach_time", pp->detach_time); + + if (pp->_present.dmabuf) + jsonw_uint_field(json_wtr, "dmabuf", pp->dmabuf); + + if (find_pool_stat_in_list(pp_stats, pp->id, &pstat) && + (pstat.alloc_fast || pstat.alloc_slow)) + print_json_recycling_stats(&pstat); + + jsonw_end_object(json_wtr); + } + + jsonw_end_array(json_wtr); +} + +static void +print_plain_pool_list(struct 
netdev_page_pool_get_list *pools, + struct netdev_page_pool_stats_get_list *pp_stats, + bool zombies_only) +{ + ynl_dump_foreach(pools, pp) { + char ifname[IF_NAMESIZE]; + struct pp_stat pstat; + const char *name; + + if (zombies_only && !pp->_present.detach_time) + continue; + + printf("pool id: %llu", pp->id); + + if (pp->_present.ifindex) { + name = if_indextoname(pp->ifindex, ifname); + if (name) + printf(" dev: %s", name); + printf("[%u]", pp->ifindex); + } + + if (pp->_present.napi_id) + printf(" napi: %llu", pp->napi_id); + + printf("\n"); + + if (pp->_present.inflight || pp->_present.inflight_mem) { + printf(" inflight:"); + if (pp->_present.inflight) + printf(" %llu pages", pp->inflight); + if (pp->_present.inflight_mem) + printf(" %llu bytes", pp->inflight_mem); + printf("\n"); + } + + if (pp->_present.detach_time) + printf(" detached: %llu\n", pp->detach_time); + + if (pp->_present.dmabuf) + printf(" dmabuf: %u\n", pp->dmabuf); + + if (find_pool_stat_in_list(pp_stats, pp->id, &pstat) && + (pstat.alloc_fast || pstat.alloc_slow)) { + printf(" "); + print_plain_recycling_stats(&pstat); + printf("\n"); + } + } +} + +static void aggregate_device_stats(struct pp_stats_array *a, + struct netdev_page_pool_get_list *pools, + struct netdev_page_pool_stats_get_list *pp_stats) +{ + ynl_dump_foreach(pools, pp) { + struct pp_stat *s = find_ifc(a, pp->ifindex); + + count_pool(s, 1, pp); + if (pp->_present.detach_time) + count_pool(s, 0, pp); + } + + ynl_dump_foreach(pp_stats, pp) { + struct pp_stat *s = find_ifc(a, pp->info.ifindex); + + if (pp->_present.alloc_fast) + s->alloc_fast += pp->alloc_fast; + if (pp->_present.alloc_refill) + s->alloc_fast += pp->alloc_refill; + if (pp->_present.alloc_slow) + s->alloc_slow += pp->alloc_slow; + if (pp->_present.recycle_ring) + s->recycle_ring += pp->recycle_ring; + if (pp->_present.recycle_cached) + s->recycle_cache += pp->recycle_cached; + } +} + +static int do_stats(int argc, char **argv) +{ + struct 
netdev_page_pool_stats_get_list *pp_stats; + struct netdev_page_pool_get_list *pools; + enum { + GROUP_BY_DEVICE, + GROUP_BY_POOL, + } group_by = GROUP_BY_DEVICE; + bool zombies_only = false; + struct pp_stats_array a = {}; + struct ynl_error yerr; + struct ynl_sock *ys; + int ret = 0; + + /* Parse options */ + while (argc > 0) { + if (is_prefix(*argv, "group-by")) { + NEXT_ARG(); + + if (!REQ_ARGS(1)) + return -1; + + if (is_prefix(*argv, "device")) { + group_by = GROUP_BY_DEVICE; + } else if (is_prefix(*argv, "pp") || + is_prefix(*argv, "page-pool") || + is_prefix(*argv, "none")) { + group_by = GROUP_BY_POOL; + } else { + p_err("invalid group-by value '%s'", *argv); + return -1; + } + NEXT_ARG(); + } else if (is_prefix(*argv, "zombies")) { + zombies_only = true; + group_by = GROUP_BY_POOL; + NEXT_ARG(); + } else { + p_err("unknown option '%s'", *argv); + return -1; + } + } + + ys = ynl_sock_create(&ynl_netdev_family, &yerr); + if (!ys) { + p_err("YNL: %s", yerr.msg); + return -1; + } + + pools = netdev_page_pool_get_dump(ys); + if (!pools) { + p_err("failed to get page pools: %s", ys->err.msg); + ret = -1; + goto exit_close; + } + + pp_stats = netdev_page_pool_stats_get_dump(ys); + if (!pp_stats) { + p_err("failed to get page pool stats: %s", ys->err.msg); + ret = -1; + goto exit_free_pp_list; + } + + /* If grouping by pool, print individual pools */ + if (group_by == GROUP_BY_POOL) { + if (json_output) + print_json_pool_list(pools, pp_stats, zombies_only); + else + print_plain_pool_list(pools, pp_stats, zombies_only); + } else { + /* Aggregated stats mode (group-by device) */ + a.max = 64; + a.s = calloc(a.max, sizeof(*a.s)); + if (!a.s) { + p_err("failed to allocate stats array"); + ret = -1; + goto exit_free_stats_list; + } + + aggregate_device_stats(&a, pools, pp_stats); + + if (json_output) + print_json_stats(&a); + else + print_plain_stats(&a); + + free(a.s); + } + +exit_free_stats_list: + netdev_page_pool_stats_get_list_free(pp_stats); +exit_free_pp_list: 
+ netdev_page_pool_get_list_free(pools); +exit_close: + ynl_sock_destroy(ys); + return ret; +} + +static int do_help(int argc __attribute__((unused)), + char **argv __attribute__((unused))) +{ + if (json_output) { + jsonw_null(json_wtr); + return 0; + } + + fprintf(stderr, + "Usage: %s page-pool { COMMAND | help }\n" + " %s page-pool stats [ OPTIONS ]\n" + "\n" + " OPTIONS := { group-by { device | page-pool | none } | zombies }\n" + "\n" + " stats - Display page pool statistics\n" + " stats group-by device - Group statistics by network device (default)\n" + " stats group-by page-pool | pp | none\n" + " - Show individual page pool details (no grouping)\n" + " stats zombies - Show only zombie page pools (detached but with\n" + " pages in flight). Implies group-by page-pool.\n" + "", + bin_name, bin_name); + + return 0; +} + +static const struct cmd page_pool_cmds[] = { + { "help", do_help }, + { "stats", do_stats }, + { 0 } +}; + +int do_page_pool(int argc, char **argv) +{ + return cmd_select(page_pool_cmds, argc, argv, do_help); +} diff --git a/tools/net/ynl/ynltool/qstats.c b/tools/net/ynl/ynltool/qstats.c new file mode 100644 index 000000000000..31fb45709ffa --- /dev/null +++ b/tools/net/ynl/ynltool/qstats.c @@ -0,0 +1,621 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <net/if.h> +#include <math.h> + +#include <ynl.h> +#include "netdev-user.h" + +#include "main.h" + +static enum netdev_qstats_scope scope; /* default - device */ + +struct queue_balance { + unsigned int ifindex; + enum netdev_queue_type type; + unsigned int queue_count; + __u64 *rx_packets; + __u64 *rx_bytes; + __u64 *tx_packets; + __u64 *tx_bytes; +}; + +static void print_json_qstats(struct netdev_qstats_get_list *qstats) +{ + jsonw_start_array(json_wtr); + + ynl_dump_foreach(qstats, qs) { + char ifname[IF_NAMESIZE]; + const char *name; + + jsonw_start_object(json_wtr); + + name = 
if_indextoname(qs->ifindex, ifname); + if (name) + jsonw_string_field(json_wtr, "ifname", name); + jsonw_uint_field(json_wtr, "ifindex", qs->ifindex); + + if (qs->_present.queue_type) + jsonw_string_field(json_wtr, "queue-type", + netdev_queue_type_str(qs->queue_type)); + if (qs->_present.queue_id) + jsonw_uint_field(json_wtr, "queue-id", qs->queue_id); + + if (qs->_present.rx_packets || qs->_present.rx_bytes || + qs->_present.rx_alloc_fail || qs->_present.rx_hw_drops || + qs->_present.rx_csum_complete || qs->_present.rx_hw_gro_packets) { + jsonw_name(json_wtr, "rx"); + jsonw_start_object(json_wtr); + if (qs->_present.rx_packets) + jsonw_uint_field(json_wtr, "packets", qs->rx_packets); + if (qs->_present.rx_bytes) + jsonw_uint_field(json_wtr, "bytes", qs->rx_bytes); + if (qs->_present.rx_alloc_fail) + jsonw_uint_field(json_wtr, "alloc-fail", qs->rx_alloc_fail); + if (qs->_present.rx_hw_drops) + jsonw_uint_field(json_wtr, "hw-drops", qs->rx_hw_drops); + if (qs->_present.rx_hw_drop_overruns) + jsonw_uint_field(json_wtr, "hw-drop-overruns", qs->rx_hw_drop_overruns); + if (qs->_present.rx_hw_drop_ratelimits) + jsonw_uint_field(json_wtr, "hw-drop-ratelimits", qs->rx_hw_drop_ratelimits); + if (qs->_present.rx_csum_complete) + jsonw_uint_field(json_wtr, "csum-complete", qs->rx_csum_complete); + if (qs->_present.rx_csum_unnecessary) + jsonw_uint_field(json_wtr, "csum-unnecessary", qs->rx_csum_unnecessary); + if (qs->_present.rx_csum_none) + jsonw_uint_field(json_wtr, "csum-none", qs->rx_csum_none); + if (qs->_present.rx_csum_bad) + jsonw_uint_field(json_wtr, "csum-bad", qs->rx_csum_bad); + if (qs->_present.rx_hw_gro_packets) + jsonw_uint_field(json_wtr, "hw-gro-packets", qs->rx_hw_gro_packets); + if (qs->_present.rx_hw_gro_bytes) + jsonw_uint_field(json_wtr, "hw-gro-bytes", qs->rx_hw_gro_bytes); + if (qs->_present.rx_hw_gro_wire_packets) + jsonw_uint_field(json_wtr, "hw-gro-wire-packets", qs->rx_hw_gro_wire_packets); + if (qs->_present.rx_hw_gro_wire_bytes) + 
jsonw_uint_field(json_wtr, "hw-gro-wire-bytes", qs->rx_hw_gro_wire_bytes); + jsonw_end_object(json_wtr); + } + + if (qs->_present.tx_packets || qs->_present.tx_bytes || + qs->_present.tx_hw_drops || qs->_present.tx_csum_none || + qs->_present.tx_hw_gso_packets) { + jsonw_name(json_wtr, "tx"); + jsonw_start_object(json_wtr); + if (qs->_present.tx_packets) + jsonw_uint_field(json_wtr, "packets", qs->tx_packets); + if (qs->_present.tx_bytes) + jsonw_uint_field(json_wtr, "bytes", qs->tx_bytes); + if (qs->_present.tx_hw_drops) + jsonw_uint_field(json_wtr, "hw-drops", qs->tx_hw_drops); + if (qs->_present.tx_hw_drop_errors) + jsonw_uint_field(json_wtr, "hw-drop-errors", qs->tx_hw_drop_errors); + if (qs->_present.tx_hw_drop_ratelimits) + jsonw_uint_field(json_wtr, "hw-drop-ratelimits", qs->tx_hw_drop_ratelimits); + if (qs->_present.tx_csum_none) + jsonw_uint_field(json_wtr, "csum-none", qs->tx_csum_none); + if (qs->_present.tx_needs_csum) + jsonw_uint_field(json_wtr, "needs-csum", qs->tx_needs_csum); + if (qs->_present.tx_hw_gso_packets) + jsonw_uint_field(json_wtr, "hw-gso-packets", qs->tx_hw_gso_packets); + if (qs->_present.tx_hw_gso_bytes) + jsonw_uint_field(json_wtr, "hw-gso-bytes", qs->tx_hw_gso_bytes); + if (qs->_present.tx_hw_gso_wire_packets) + jsonw_uint_field(json_wtr, "hw-gso-wire-packets", qs->tx_hw_gso_wire_packets); + if (qs->_present.tx_hw_gso_wire_bytes) + jsonw_uint_field(json_wtr, "hw-gso-wire-bytes", qs->tx_hw_gso_wire_bytes); + if (qs->_present.tx_stop) + jsonw_uint_field(json_wtr, "stop", qs->tx_stop); + if (qs->_present.tx_wake) + jsonw_uint_field(json_wtr, "wake", qs->tx_wake); + jsonw_end_object(json_wtr); + } + + jsonw_end_object(json_wtr); + } + + jsonw_end_array(json_wtr); +} + +static void print_one(bool present, const char *name, unsigned long long val, + int *line) +{ + if (!present) + return; + + if (!*line) { + printf(" "); + ++(*line); + } + + /* Don't waste space on tx- and rx- prefix, its implied by queue type */ + if (scope == 
NETDEV_QSTATS_SCOPE_QUEUE && + (name[0] == 'r' || name[0] == 't') && + name[1] == 'x' && name[2] == '-') + name += 3; + + printf(" %15s: %15llu", name, val); + + if (++(*line) == 3) { + printf("\n"); + *line = 0; + } +} + +static void print_plain_qstats(struct netdev_qstats_get_list *qstats) +{ + ynl_dump_foreach(qstats, qs) { + char ifname[IF_NAMESIZE]; + const char *name; + int n; + + name = if_indextoname(qs->ifindex, ifname); + if (name) + printf("%s", name); + else + printf("ifindex:%u", qs->ifindex); + + if (qs->_present.queue_type && qs->_present.queue_id) + printf("\t%s-%-3u", + netdev_queue_type_str(qs->queue_type), + qs->queue_id); + else + printf("\t "); + + n = 1; + + /* Basic counters */ + print_one(qs->_present.rx_packets, "rx-packets", qs->rx_packets, &n); + print_one(qs->_present.rx_bytes, "rx-bytes", qs->rx_bytes, &n); + print_one(qs->_present.tx_packets, "tx-packets", qs->tx_packets, &n); + print_one(qs->_present.tx_bytes, "tx-bytes", qs->tx_bytes, &n); + + /* RX error/drop counters */ + print_one(qs->_present.rx_alloc_fail, "rx-alloc-fail", + qs->rx_alloc_fail, &n); + print_one(qs->_present.rx_hw_drops, "rx-hw-drops", + qs->rx_hw_drops, &n); + print_one(qs->_present.rx_hw_drop_overruns, "rx-hw-drop-overruns", + qs->rx_hw_drop_overruns, &n); + print_one(qs->_present.rx_hw_drop_ratelimits, "rx-hw-drop-ratelimits", + qs->rx_hw_drop_ratelimits, &n); + + /* RX checksum counters */ + print_one(qs->_present.rx_csum_complete, "rx-csum-complete", + qs->rx_csum_complete, &n); + print_one(qs->_present.rx_csum_unnecessary, "rx-csum-unnecessary", + qs->rx_csum_unnecessary, &n); + print_one(qs->_present.rx_csum_none, "rx-csum-none", + qs->rx_csum_none, &n); + print_one(qs->_present.rx_csum_bad, "rx-csum-bad", + qs->rx_csum_bad, &n); + + /* RX GRO counters */ + print_one(qs->_present.rx_hw_gro_packets, "rx-hw-gro-packets", + qs->rx_hw_gro_packets, &n); + print_one(qs->_present.rx_hw_gro_bytes, "rx-hw-gro-bytes", + qs->rx_hw_gro_bytes, &n); + 
print_one(qs->_present.rx_hw_gro_wire_packets, "rx-hw-gro-wire-packets", + qs->rx_hw_gro_wire_packets, &n); + print_one(qs->_present.rx_hw_gro_wire_bytes, "rx-hw-gro-wire-bytes", + qs->rx_hw_gro_wire_bytes, &n); + + /* TX error/drop counters */ + print_one(qs->_present.tx_hw_drops, "tx-hw-drops", + qs->tx_hw_drops, &n); + print_one(qs->_present.tx_hw_drop_errors, "tx-hw-drop-errors", + qs->tx_hw_drop_errors, &n); + print_one(qs->_present.tx_hw_drop_ratelimits, "tx-hw-drop-ratelimits", + qs->tx_hw_drop_ratelimits, &n); + + /* TX checksum counters */ + print_one(qs->_present.tx_csum_none, "tx-csum-none", + qs->tx_csum_none, &n); + print_one(qs->_present.tx_needs_csum, "tx-needs-csum", + qs->tx_needs_csum, &n); + + /* TX GSO counters */ + print_one(qs->_present.tx_hw_gso_packets, "tx-hw-gso-packets", + qs->tx_hw_gso_packets, &n); + print_one(qs->_present.tx_hw_gso_bytes, "tx-hw-gso-bytes", + qs->tx_hw_gso_bytes, &n); + print_one(qs->_present.tx_hw_gso_wire_packets, "tx-hw-gso-wire-packets", + qs->tx_hw_gso_wire_packets, &n); + print_one(qs->_present.tx_hw_gso_wire_bytes, "tx-hw-gso-wire-bytes", + qs->tx_hw_gso_wire_bytes, &n); + + /* TX queue control */ + print_one(qs->_present.tx_stop, "tx-stop", qs->tx_stop, &n); + print_one(qs->_present.tx_wake, "tx-wake", qs->tx_wake, &n); + + if (n) + printf("\n"); + } +} + +static int do_show(int argc, char **argv) +{ + struct netdev_qstats_get_list *qstats; + struct netdev_qstats_get_req *req; + struct ynl_error yerr; + struct ynl_sock *ys; + int ret = 0; + + /* Parse options */ + while (argc > 0) { + if (is_prefix(*argv, "scope") || is_prefix(*argv, "group-by")) { + NEXT_ARG(); + + if (!REQ_ARGS(1)) + return -1; + + if (is_prefix(*argv, "queue")) { + scope = NETDEV_QSTATS_SCOPE_QUEUE; + } else if (is_prefix(*argv, "device")) { + scope = 0; + } else { + p_err("invalid scope value '%s'", *argv); + return -1; + } + NEXT_ARG(); + } else { + p_err("unknown option '%s'", *argv); + return -1; + } + } + + ys = 
ynl_sock_create(&ynl_netdev_family, &yerr); + if (!ys) { + p_err("YNL: %s", yerr.msg); + return -1; + } + + req = netdev_qstats_get_req_alloc(); + if (!req) { + p_err("failed to allocate qstats request"); + ret = -1; + goto exit_close; + } + + if (scope) + netdev_qstats_get_req_set_scope(req, scope); + + qstats = netdev_qstats_get_dump(ys, req); + netdev_qstats_get_req_free(req); + if (!qstats) { + p_err("failed to get queue stats: %s", ys->err.msg); + ret = -1; + goto exit_close; + } + + /* Print the stats as returned by the kernel */ + if (json_output) + print_json_qstats(qstats); + else + print_plain_qstats(qstats); + + netdev_qstats_get_list_free(qstats); +exit_close: + ynl_sock_destroy(ys); + return ret; +} + +static void compute_stats(__u64 *values, unsigned int count, + double *mean, double *stddev, __u64 *min, __u64 *max) +{ + double sum = 0.0, variance = 0.0; + unsigned int i; + + *min = ~0ULL; + *max = 0; + + if (count == 0) { + *mean = 0; + *stddev = 0; + *min = 0; + return; + } + + for (i = 0; i < count; i++) { + sum += values[i]; + if (values[i] < *min) + *min = values[i]; + if (values[i] > *max) + *max = values[i]; + } + + *mean = sum / count; + + if (count > 1) { + for (i = 0; i < count; i++) { + double diff = values[i] - *mean; + + variance += diff * diff; + } + *stddev = sqrt(variance / (count - 1)); + } else { + *stddev = 0; + } +} + +static void print_balance_stats(const char *name, enum netdev_queue_type type, + __u64 *values, unsigned int count) +{ + double mean, stddev, cv, ns; + __u64 min, max; + + if ((name[0] == 'r' && type != NETDEV_QUEUE_TYPE_RX) || + (name[0] == 't' && type != NETDEV_QUEUE_TYPE_TX)) + return; + + compute_stats(values, count, &mean, &stddev, &min, &max); + + cv = mean > 0 ? (stddev / mean) * 100.0 : 0.0; + ns = min + max > 0 ? 
(double)2 * (max - min) / (max + min) * 100 : 0.0; + + printf(" %-12s: cv=%.1f%% ns=%.1f%% stddev=%.0f\n", + name, cv, ns, stddev); + printf(" %-12s min=%llu max=%llu mean=%.0f\n", + "", min, max, mean); +} + +static void +print_balance_stats_json(const char *name, enum netdev_queue_type type, + __u64 *values, unsigned int count) +{ + double mean, stddev, cv, ns; + __u64 min, max; + + if ((name[0] == 'r' && type != NETDEV_QUEUE_TYPE_RX) || + (name[0] == 't' && type != NETDEV_QUEUE_TYPE_TX)) + return; + + compute_stats(values, count, &mean, &stddev, &min, &max); + + cv = mean > 0 ? (stddev / mean) * 100.0 : 0.0; + ns = min + max > 0 ? (double)2 * (max - min) / (max + min) * 100 : 0.0; + + jsonw_name(json_wtr, name); + jsonw_start_object(json_wtr); + jsonw_uint_field(json_wtr, "queue-count", count); + jsonw_uint_field(json_wtr, "min", min); + jsonw_uint_field(json_wtr, "max", max); + jsonw_float_field(json_wtr, "mean", mean); + jsonw_float_field(json_wtr, "stddev", stddev); + jsonw_float_field(json_wtr, "coefficient-of-variation", cv); + jsonw_float_field(json_wtr, "normalized-spread", ns); + jsonw_end_object(json_wtr); +} + +static int cmp_ifindex_type(const void *a, const void *b) +{ + const struct netdev_qstats_get_rsp *qa = a; + const struct netdev_qstats_get_rsp *qb = b; + + if (qa->ifindex != qb->ifindex) + return qa->ifindex - qb->ifindex; + if (qa->queue_type != qb->queue_type) + return qa->queue_type - qb->queue_type; + return qa->queue_id - qb->queue_id; +} + +static int do_balance(int argc, char **argv __attribute__((unused))) +{ + struct netdev_qstats_get_list *qstats; + struct netdev_qstats_get_req *req; + struct netdev_qstats_get_rsp **sorted; + struct ynl_error yerr; + struct ynl_sock *ys; + unsigned int count = 0; + unsigned int i, j; + int ret = 0; + + if (argc > 0) { + p_err("balance command takes no arguments"); + return -1; + } + + ys = ynl_sock_create(&ynl_netdev_family, &yerr); + if (!ys) { + p_err("YNL: %s", yerr.msg); + return -1; + } + + req 
= netdev_qstats_get_req_alloc(); + if (!req) { + p_err("failed to allocate qstats request"); + ret = -1; + goto exit_close; + } + + /* Always use queue scope for balance analysis */ + netdev_qstats_get_req_set_scope(req, NETDEV_QSTATS_SCOPE_QUEUE); + + qstats = netdev_qstats_get_dump(ys, req); + netdev_qstats_get_req_free(req); + if (!qstats) { + p_err("failed to get queue stats: %s", ys->err.msg); + ret = -1; + goto exit_close; + } + + /* Count and sort queues */ + ynl_dump_foreach(qstats, qs) + count++; + + if (count == 0) { + if (json_output) + jsonw_start_array(json_wtr); + else + printf("No queue statistics available\n"); + goto exit_free_qstats; + } + + sorted = calloc(count, sizeof(*sorted)); + if (!sorted) { + p_err("failed to allocate sorted array"); + ret = -1; + goto exit_free_qstats; + } + + i = 0; + ynl_dump_foreach(qstats, qs) + sorted[i++] = qs; + + qsort(sorted, count, sizeof(*sorted), cmp_ifindex_type); + + if (json_output) + jsonw_start_array(json_wtr); + + /* Process each device/queue-type combination */ + i = 0; + while (i < count) { + __u64 *rx_packets, *rx_bytes, *tx_packets, *tx_bytes; + enum netdev_queue_type type = sorted[i]->queue_type; + unsigned int ifindex = sorted[i]->ifindex; + unsigned int queue_count = 0; + char ifname[IF_NAMESIZE]; + const char *name; + + /* Count queues for this device/type */ + for (j = i; j < count && sorted[j]->ifindex == ifindex && + sorted[j]->queue_type == type; j++) + queue_count++; + + /* Skip if no packets/bytes (inactive queues) */ + if (!sorted[i]->_present.rx_packets && + !sorted[i]->_present.rx_bytes && + !sorted[i]->_present.tx_packets && + !sorted[i]->_present.tx_bytes) + goto next_ifc; + + /* Allocate arrays for statistics */ + rx_packets = calloc(queue_count, sizeof(*rx_packets)); + rx_bytes = calloc(queue_count, sizeof(*rx_bytes)); + tx_packets = calloc(queue_count, sizeof(*tx_packets)); + tx_bytes = calloc(queue_count, sizeof(*tx_bytes)); + + if (!rx_packets || !rx_bytes || !tx_packets || 
!tx_bytes) { + p_err("failed to allocate statistics arrays"); + free(rx_packets); + free(rx_bytes); + free(tx_packets); + free(tx_bytes); + ret = -1; + goto exit_free_sorted; + } + + /* Collect statistics */ + for (j = 0; j < queue_count; j++) { + rx_packets[j] = sorted[i + j]->_present.rx_packets ? + sorted[i + j]->rx_packets : 0; + rx_bytes[j] = sorted[i + j]->_present.rx_bytes ? + sorted[i + j]->rx_bytes : 0; + tx_packets[j] = sorted[i + j]->_present.tx_packets ? + sorted[i + j]->tx_packets : 0; + tx_bytes[j] = sorted[i + j]->_present.tx_bytes ? + sorted[i + j]->tx_bytes : 0; + } + + name = if_indextoname(ifindex, ifname); + + if (json_output) { + jsonw_start_object(json_wtr); + if (name) + jsonw_string_field(json_wtr, "ifname", name); + jsonw_uint_field(json_wtr, "ifindex", ifindex); + jsonw_string_field(json_wtr, "queue-type", + netdev_queue_type_str(type)); + + print_balance_stats_json("rx-packets", type, + rx_packets, queue_count); + print_balance_stats_json("rx-bytes", type, + rx_bytes, queue_count); + print_balance_stats_json("tx-packets", type, + tx_packets, queue_count); + print_balance_stats_json("tx-bytes", type, + tx_bytes, queue_count); + + jsonw_end_object(json_wtr); + } else { + if (name) + printf("%s", name); + else + printf("ifindex:%u", ifindex); + printf(" %s %d queues:\n", + netdev_queue_type_str(type), queue_count); + + print_balance_stats("rx-packets", type, + rx_packets, queue_count); + print_balance_stats("rx-bytes", type, + rx_bytes, queue_count); + print_balance_stats("tx-packets", type, + tx_packets, queue_count); + print_balance_stats("tx-bytes", type, + tx_bytes, queue_count); + printf("\n"); + } + + free(rx_packets); + free(rx_bytes); + free(tx_packets); + free(tx_bytes); + +next_ifc: + i += queue_count; + } + + if (json_output) + jsonw_end_array(json_wtr); + +exit_free_sorted: + free(sorted); +exit_free_qstats: + netdev_qstats_get_list_free(qstats); +exit_close: + ynl_sock_destroy(ys); + return ret; +} + +static int do_help(int argc 
__attribute__((unused)), + char **argv __attribute__((unused))) +{ + if (json_output) { + jsonw_null(json_wtr); + return 0; + } + + fprintf(stderr, + "Usage: %s qstats { COMMAND | help }\n" + " %s qstats [ show ] [ OPTIONS ]\n" + " %s qstats balance\n" + "\n" + " OPTIONS := { scope queue | group-by { device | queue } }\n" + "\n" + " show - Display queue statistics (default)\n" + " Statistics are aggregated for the entire device.\n" + " show scope queue - Display per-queue statistics\n" + " show group-by device - Display device-aggregated statistics (default)\n" + " show group-by queue - Display per-queue statistics\n" + " balance - Analyze traffic distribution balance.\n" + "", + bin_name, bin_name, bin_name); + + return 0; +} + +static const struct cmd qstats_cmds[] = { + { "show", do_show }, + { "balance", do_balance }, + { "help", do_help }, + { 0 } +}; + +int do_qstats(int argc, char **argv) +{ + return cmd_select(qstats_cmds, argc, argv, do_help); +} diff --git a/tools/objtool/.gitignore b/tools/objtool/.gitignore index 4faa4dd72f35..73d883128511 100644 --- a/tools/objtool/.gitignore +++ b/tools/objtool/.gitignore @@ -1,5 +1,8 @@ # SPDX-License-Identifier: GPL-2.0-only +arch/x86/lib/cpu-feature-names.c arch/x86/lib/inat-tables.c /objtool +feature +FEATURE-DUMP.objtool fixdep libsubcmd/ diff --git a/tools/objtool/Build b/tools/objtool/Build index a3cdf8af6635..600da051af12 100644 --- a/tools/objtool/Build +++ b/tools/objtool/Build @@ -8,13 +8,17 @@ objtool-y += builtin-check.o objtool-y += elf.o objtool-y += objtool.o -objtool-$(BUILD_ORC) += orc_gen.o -objtool-$(BUILD_ORC) += orc_dump.o +objtool-$(BUILD_DISAS) += disas.o +objtool-$(BUILD_DISAS) += trace.o + +objtool-$(BUILD_ORC) += orc_gen.o orc_dump.o +objtool-$(BUILD_KLP) += builtin-klp.o klp-diff.o klp-post-link.o objtool-y += libstring.o objtool-y += libctype.o objtool-y += str_error_r.o objtool-y += librbtree.o +objtool-y += signal.o $(OUTPUT)libstring.o: ../lib/string.c FORCE $(call rule_mkdir) diff 
--git a/tools/objtool/Makefile b/tools/objtool/Makefile index 8c20361dd100..ad6e1ec706ce 100644 --- a/tools/objtool/Makefile +++ b/tools/objtool/Makefile @@ -2,6 +2,28 @@ include ../scripts/Makefile.include include ../scripts/Makefile.arch +ifeq ($(SRCARCH),x86) + BUILD_ORC := y + ARCH_HAS_KLP := y +endif + +ifeq ($(SRCARCH),loongarch) + BUILD_ORC := y +endif + +ifeq ($(ARCH_HAS_KLP),y) + HAVE_XXHASH = $(shell printf "$(pound)include <xxhash.h>\nXXH3_state_t *state;int main() {}" | \ + $(HOSTCC) -xc - -o /dev/null -lxxhash 2> /dev/null && echo y || echo n) + ifeq ($(HAVE_XXHASH),y) + BUILD_KLP := y + LIBXXHASH_CFLAGS := $(shell $(HOSTPKG_CONFIG) libxxhash --cflags 2>/dev/null) \ + -DBUILD_KLP + LIBXXHASH_LIBS := $(shell $(HOSTPKG_CONFIG) libxxhash --libs 2>/dev/null || echo -lxxhash) + endif +endif + +export BUILD_ORC BUILD_KLP + ifeq ($(srctree),) srctree := $(patsubst %/,%,$(dir $(CURDIR))) srctree := $(patsubst %/,%,$(dir $(srctree))) @@ -23,6 +45,11 @@ LIBELF_LIBS := $(shell $(HOSTPKG_CONFIG) libelf --libs 2>/dev/null || echo -lel all: $(OBJTOOL) +WARNINGS := -Werror -Wall -Wextra -Wmissing-prototypes \ + -Wmissing-declarations -Wwrite-strings \ + -Wno-implicit-fallthrough -Wno-sign-compare \ + -Wno-unused-parameter + INCLUDES := -I$(srctree)/tools/include \ -I$(srctree)/tools/include/uapi \ -I$(srctree)/tools/arch/$(HOSTARCH)/include/uapi \ @@ -30,11 +57,11 @@ INCLUDES := -I$(srctree)/tools/include \ -I$(srctree)/tools/objtool/include \ -I$(srctree)/tools/objtool/arch/$(SRCARCH)/include \ -I$(LIBSUBCMD_OUTPUT)/include -# Note, EXTRA_WARNINGS here was determined for CC and not HOSTCC, it -# is passed here to match a legacy behavior. 
-WARNINGS := $(EXTRA_WARNINGS) -Wno-switch-default -Wno-switch-enum -Wno-packed -Wno-nested-externs -OBJTOOL_CFLAGS := -Werror $(WARNINGS) $(KBUILD_HOSTCFLAGS) -g $(INCLUDES) $(LIBELF_FLAGS) -OBJTOOL_LDFLAGS := $(LIBELF_LIBS) $(LIBSUBCMD) $(KBUILD_HOSTLDFLAGS) + +OBJTOOL_CFLAGS := -std=gnu11 -fomit-frame-pointer -O2 -g $(WARNINGS) \ + $(INCLUDES) $(LIBELF_FLAGS) $(LIBXXHASH_CFLAGS) $(HOSTCFLAGS) + +OBJTOOL_LDFLAGS := $(LIBSUBCMD) $(LIBELF_LIBS) $(LIBXXHASH_LIBS) $(HOSTLDFLAGS) # Allow old libelf to be used: elfshdr := $(shell echo '$(pound)include <libelf.h>' | $(HOSTCC) $(OBJTOOL_CFLAGS) -x c -E - 2>/dev/null | grep elf_getshdr) @@ -43,20 +70,32 @@ OBJTOOL_CFLAGS += $(if $(elfshdr),,-DLIBELF_USE_DEPRECATED) # Always want host compilation. HOST_OVERRIDES := CC="$(HOSTCC)" LD="$(HOSTLD)" AR="$(HOSTAR)" -AWK = awk -MKDIR = mkdir +# +# To support disassembly, objtool needs libopcodes which is provided +# with libbdf (binutils-dev or binutils-devel package). +# +FEATURE_USER = .objtool +FEATURE_TESTS = libbfd disassembler-init-styled +FEATURE_DISPLAY = +include $(srctree)/tools/build/Makefile.feature + +ifeq ($(feature-disassembler-init-styled), 1) + OBJTOOL_CFLAGS += -DDISASM_INIT_STYLED +endif -BUILD_ORC := n +BUILD_DISAS := n -ifeq ($(SRCARCH),x86) - BUILD_ORC := y +ifeq ($(feature-libbfd),1) + BUILD_DISAS := y + OBJTOOL_CFLAGS += -DDISAS -DPACKAGE="objtool" + OBJTOOL_LDFLAGS += -lopcodes endif -ifeq ($(SRCARCH),loongarch) - BUILD_ORC := y -endif +export BUILD_DISAS + +AWK = awk +MKDIR = mkdir -export BUILD_ORC export srctree OUTPUT CFLAGS SRCARCH AWK include $(srctree)/tools/build/Makefile.include @@ -86,7 +125,10 @@ $(LIBSUBCMD)-clean: clean: $(LIBSUBCMD)-clean $(call QUIET_CLEAN, objtool) $(RM) $(OBJTOOL) $(Q)find $(OUTPUT) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete + $(Q)$(RM) $(OUTPUT)arch/x86/lib/cpu-feature-names.c $(OUTPUT)fixdep $(Q)$(RM) $(OUTPUT)arch/x86/lib/inat-tables.c $(OUTPUT)fixdep + $(Q)$(RM) -- 
$(OUTPUT)FEATURE-DUMP.objtool + $(Q)$(RM) -r -- $(OUTPUT)feature FORCE: diff --git a/tools/objtool/arch/loongarch/decode.c b/tools/objtool/arch/loongarch/decode.c index 2e555c4060c5..6cd288150f49 100644 --- a/tools/objtool/arch/loongarch/decode.c +++ b/tools/objtool/arch/loongarch/decode.c @@ -1,13 +1,25 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include <string.h> #include <objtool/check.h> +#include <objtool/disas.h> #include <objtool/warn.h> #include <asm/inst.h> #include <asm/orc_types.h> #include <linux/objtool_types.h> #include <arch/elf.h> -int arch_ftrace_match(char *name) +const char *arch_reg_name[CFI_NUM_REGS] = { + "zero", "ra", "tp", "sp", + "a0", "a1", "a2", "a3", + "a4", "a5", "a6", "a7", + "t0", "t1", "t2", "t3", + "t4", "t5", "t6", "t7", + "t8", "u0", "fp", "s0", + "s1", "s2", "s3", "s4", + "s5", "s6", "s7", "s8" +}; + +int arch_ftrace_match(const char *name) { return !strcmp(name, "_mcount"); } @@ -17,9 +29,9 @@ unsigned long arch_jump_destination(struct instruction *insn) return insn->offset + (insn->immediate << 2); } -unsigned long arch_dest_reloc_offset(int addend) +s64 arch_insn_adjusted_addend(struct instruction *insn, struct reloc *reloc) { - return addend; + return reloc_addend(reloc); } bool arch_pc_relative_reloc(struct reloc *reloc) @@ -414,3 +426,14 @@ unsigned long arch_jump_table_sym_offset(struct reloc *reloc, struct reloc *tabl return reloc->sym->offset + reloc_addend(reloc); } } + +#ifdef DISAS + +int arch_disas_info_init(struct disassemble_info *dinfo) +{ + return disas_info_init(dinfo, bfd_arch_loongarch, + bfd_mach_loongarch32, bfd_mach_loongarch64, + NULL); +} + +#endif /* DISAS */ diff --git a/tools/objtool/arch/loongarch/orc.c b/tools/objtool/arch/loongarch/orc.c index b58c5ff443c9..ffd3a3c858ae 100644 --- a/tools/objtool/arch/loongarch/orc.c +++ b/tools/objtool/arch/loongarch/orc.c @@ -5,7 +5,6 @@ #include <objtool/check.h> #include <objtool/orc.h> #include <objtool/warn.h> -#include <objtool/endianness.h> int 
init_orc_entry(struct orc_entry *orc, struct cfi_state *cfi, struct instruction *insn) { diff --git a/tools/objtool/arch/loongarch/special.c b/tools/objtool/arch/loongarch/special.c index a80b75f7b061..aba774109437 100644 --- a/tools/objtool/arch/loongarch/special.c +++ b/tools/objtool/arch/loongarch/special.c @@ -194,3 +194,8 @@ struct reloc *arch_find_switch_table(struct objtool_file *file, return rodata_reloc; } + +const char *arch_cpu_feature_name(int feature_number) +{ + return NULL; +} diff --git a/tools/objtool/arch/powerpc/decode.c b/tools/objtool/arch/powerpc/decode.c index c851c51d4bd3..e534ac1123b3 100644 --- a/tools/objtool/arch/powerpc/decode.c +++ b/tools/objtool/arch/powerpc/decode.c @@ -3,20 +3,32 @@ #include <stdio.h> #include <stdlib.h> #include <objtool/check.h> +#include <objtool/disas.h> #include <objtool/elf.h> #include <objtool/arch.h> #include <objtool/warn.h> #include <objtool/builtin.h> -#include <objtool/endianness.h> -int arch_ftrace_match(char *name) +const char *arch_reg_name[CFI_NUM_REGS] = { + "r0", "sp", "r2", "r3", + "r4", "r5", "r6", "r7", + "r8", "r9", "r10", "r11", + "r12", "r13", "r14", "r15", + "r16", "r17", "r18", "r19", + "r20", "r21", "r22", "r23", + "r24", "r25", "r26", "r27", + "r28", "r29", "r30", "r31", + "ra" +}; + +int arch_ftrace_match(const char *name) { return !strcmp(name, "_mcount"); } -unsigned long arch_dest_reloc_offset(int addend) +s64 arch_insn_adjusted_addend(struct instruction *insn, struct reloc *reloc) { - return addend; + return reloc_addend(reloc); } bool arch_callee_saved_reg(unsigned char reg) @@ -128,3 +140,14 @@ unsigned int arch_reloc_size(struct reloc *reloc) return 8; } } + +#ifdef DISAS + +int arch_disas_info_init(struct disassemble_info *dinfo) +{ + return disas_info_init(dinfo, bfd_arch_powerpc, + bfd_mach_ppc, bfd_mach_ppc64, + NULL); +} + +#endif /* DISAS */ diff --git a/tools/objtool/arch/powerpc/special.c b/tools/objtool/arch/powerpc/special.c index 51610689abf7..8f9bf61ca089 100644 --- 
a/tools/objtool/arch/powerpc/special.c +++ b/tools/objtool/arch/powerpc/special.c @@ -18,3 +18,8 @@ struct reloc *arch_find_switch_table(struct objtool_file *file, { exit(-1); } + +const char *arch_cpu_feature_name(int feature_number) +{ + return NULL; +} diff --git a/tools/objtool/arch/x86/Build b/tools/objtool/arch/x86/Build index 3dedb2fd8f3a..febee0b8ee0b 100644 --- a/tools/objtool/arch/x86/Build +++ b/tools/objtool/arch/x86/Build @@ -1,5 +1,5 @@ -objtool-y += special.o objtool-y += decode.o +objtool-y += special.o objtool-y += orc.o inat_tables_script = ../arch/x86/tools/gen-insn-attr-x86.awk @@ -12,3 +12,14 @@ $(OUTPUT)arch/x86/lib/inat-tables.c: $(inat_tables_script) $(inat_tables_maps) $(OUTPUT)arch/x86/decode.o: $(OUTPUT)arch/x86/lib/inat-tables.c CFLAGS_decode.o += -I$(OUTPUT)arch/x86/lib + +cpu_features = ../arch/x86/include/asm/cpufeatures.h +cpu_features_script = ../arch/x86/tools/gen-cpu-feature-names-x86.awk + +$(OUTPUT)arch/x86/lib/cpu-feature-names.c: $(cpu_features_script) $(cpu_features) + $(call rule_mkdir) + $(Q)$(call echo-cmd,gen)$(AWK) -f $(cpu_features_script) $(cpu_features) > $@ + +$(OUTPUT)arch/x86/special.o: $(OUTPUT)arch/x86/lib/cpu-feature-names.c + +CFLAGS_special.o += -I$(OUTPUT)arch/x86/lib diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 0ad5cc70ecbe..f4af82508228 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -16,14 +16,22 @@ #include <asm/orc_types.h> #include <objtool/check.h> +#include <objtool/disas.h> #include <objtool/elf.h> #include <objtool/arch.h> #include <objtool/warn.h> -#include <objtool/endianness.h> #include <objtool/builtin.h> #include <arch/elf.h> -int arch_ftrace_match(char *name) +const char *arch_reg_name[CFI_NUM_REGS] = { + "rax", "rcx", "rdx", "rbx", + "rsp", "rbp", "rsi", "rdi", + "r8", "r9", "r10", "r11", + "r12", "r13", "r14", "r15", + "ra" +}; + +int arch_ftrace_match(const char *name) { return !strcmp(name, "__fentry__"); } @@ 
-68,9 +76,65 @@ bool arch_callee_saved_reg(unsigned char reg) } } -unsigned long arch_dest_reloc_offset(int addend) +/* Undo the effects of __pa_symbol() if necessary */ +static unsigned long phys_to_virt(unsigned long pa) +{ + s64 va = pa; + + if (va > 0) + va &= ~(0x80000000); + + return va; +} + +s64 arch_insn_adjusted_addend(struct instruction *insn, struct reloc *reloc) +{ + s64 addend = reloc_addend(reloc); + + if (arch_pc_relative_reloc(reloc)) + addend += insn->offset + insn->len - reloc_offset(reloc); + + return phys_to_virt(addend); +} + +static void scan_for_insn(struct section *sec, unsigned long offset, + unsigned long *insn_off, unsigned int *insn_len) { - return addend + 4; + unsigned long o = 0; + struct insn insn; + + while (1) { + + insn_decode(&insn, sec->data->d_buf + o, sec_size(sec) - o, + INSN_MODE_64); + + if (o + insn.length > offset) { + *insn_off = o; + *insn_len = insn.length; + return; + } + + o += insn.length; + } +} + +u64 arch_adjusted_addend(struct reloc *reloc) +{ + unsigned int type = reloc_type(reloc); + s64 addend = reloc_addend(reloc); + unsigned long insn_off; + unsigned int insn_len; + + if (type == R_X86_64_PLT32) + return addend + 4; + + if (type != R_X86_64_PC32 || !is_text_sec(reloc->sec->base)) + return addend; + + scan_for_insn(reloc->sec->base, reloc_offset(reloc), + &insn_off, &insn_len); + + return addend + insn_off + insn_len - reloc_offset(reloc); } unsigned long arch_jump_destination(struct instruction *insn) @@ -189,15 +253,6 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec op2 = ins.opcode.bytes[1]; op3 = ins.opcode.bytes[2]; - /* - * XXX hack, decoder is buggered and thinks 0xea is 7 bytes long. 
- */ - if (op1 == 0xea) { - insn->len = 1; - insn->type = INSN_BUG; - return 0; - } - if (ins.rex_prefix.nbytes) { rex = ins.rex_prefix.bytes[0]; rex_w = X86_REX_W(rex) >> 3; @@ -503,6 +558,12 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec break; case 0x90: + if (rex_b) /* XCHG %r8, %rax */ + break; + + if (prefix == 0xf3) /* REP NOP := PAUSE */ + break; + insn->type = INSN_NOP; break; @@ -556,13 +617,14 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec } else if (op2 == 0x0b || op2 == 0xb9) { - /* ud2 */ + /* ud2, ud1 */ insn->type = INSN_BUG; - } else if (op2 == 0x0d || op2 == 0x1f) { + } else if (op2 == 0x1f) { - /* nopl/nopw */ - insn->type = INSN_NOP; + /* 0f 1f /0 := NOPL */ + if (modrm_reg == 0) + insn->type = INSN_NOP; } else if (op2 == 0x1e) { @@ -692,6 +754,10 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec insn->type = INSN_SYSRET; break; + case 0xd6: /* udb */ + insn->type = INSN_BUG; + break; + case 0xe0: /* loopne */ case 0xe1: /* loope */ case 0xe2: /* loop */ @@ -892,3 +958,14 @@ bool arch_absolute_reloc(struct elf *elf, struct reloc *reloc) return false; } } + +#ifdef DISAS + +int arch_disas_info_init(struct disassemble_info *dinfo) +{ + return disas_info_init(dinfo, bfd_arch_i386, + bfd_mach_i386_i386, bfd_mach_x86_64, + "att"); +} + +#endif /* DISAS */ diff --git a/tools/objtool/arch/x86/orc.c b/tools/objtool/arch/x86/orc.c index 7176b9ec5b05..735e150ca6b7 100644 --- a/tools/objtool/arch/x86/orc.c +++ b/tools/objtool/arch/x86/orc.c @@ -5,7 +5,6 @@ #include <objtool/check.h> #include <objtool/orc.h> #include <objtool/warn.h> -#include <objtool/endianness.h> int init_orc_entry(struct orc_entry *orc, struct cfi_state *cfi, struct instruction *insn) { diff --git a/tools/objtool/arch/x86/special.c b/tools/objtool/arch/x86/special.c index 06ca4a2659a4..e817a3fff449 100644 --- a/tools/objtool/arch/x86/special.c +++ 
b/tools/objtool/arch/x86/special.c @@ -4,6 +4,10 @@ #include <objtool/special.h> #include <objtool/builtin.h> #include <objtool/warn.h> +#include <asm/cpufeatures.h> + +/* cpu feature name array generated from cpufeatures.h */ +#include "cpu-feature-names.c" void arch_handle_alternative(struct special_alt *alt) { @@ -89,7 +93,7 @@ struct reloc *arch_find_switch_table(struct objtool_file *file, /* look for a relocation which references .rodata */ text_reloc = find_reloc_by_dest_range(file->elf, insn->sec, insn->offset, insn->len); - if (!text_reloc || text_reloc->sym->type != STT_SECTION || + if (!text_reloc || !is_sec_sym(text_reloc->sym) || !text_reloc->sym->sec->rodata) return NULL; @@ -134,3 +138,9 @@ struct reloc *arch_find_switch_table(struct objtool_file *file, *table_size = 0; return rodata_reloc; } + +const char *arch_cpu_feature_name(int feature_number) +{ + return (feature_number < ARRAY_SIZE(cpu_feature_names)) ? + cpu_feature_names[feature_number] : NULL; +} diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index 0f6b197cfcb0..b780df513715 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -73,35 +73,41 @@ static int parse_hacks(const struct option *opt, const char *str, int unset) static const struct option check_options[] = { OPT_GROUP("Actions:"), + OPT_BOOLEAN(0, "checksum", &opts.checksum, "generate per-function checksums"), + OPT_BOOLEAN(0, "cfi", &opts.cfi, "annotate kernel control flow integrity (kCFI) function preambles"), + OPT_STRING_OPTARG('d', "disas", &opts.disas, "function-pattern", "disassemble functions", "*"), OPT_CALLBACK_OPTARG('h', "hacks", NULL, NULL, "jump_label,noinstr,skylake", "patch toolchain bugs/limitations", parse_hacks), - OPT_BOOLEAN('i', "ibt", &opts.ibt, "validate and annotate IBT"), - OPT_BOOLEAN('m', "mcount", &opts.mcount, "annotate mcount/fentry calls for ftrace"), - OPT_BOOLEAN('n', "noinstr", &opts.noinstr, "validate noinstr rules"), - OPT_BOOLEAN(0, "orc", 
&opts.orc, "generate ORC metadata"), - OPT_BOOLEAN('r', "retpoline", &opts.retpoline, "validate and annotate retpoline usage"), - OPT_BOOLEAN(0, "rethunk", &opts.rethunk, "validate and annotate rethunk usage"), - OPT_BOOLEAN(0, "unret", &opts.unret, "validate entry unret placement"), - OPT_INTEGER(0, "prefix", &opts.prefix, "generate prefix symbols"), - OPT_BOOLEAN('l', "sls", &opts.sls, "validate straight-line-speculation mitigations"), - OPT_BOOLEAN('s', "stackval", &opts.stackval, "validate frame pointer rules"), - OPT_BOOLEAN('t', "static-call", &opts.static_call, "annotate static calls"), - OPT_BOOLEAN('u', "uaccess", &opts.uaccess, "validate uaccess rules for SMAP"), - OPT_BOOLEAN(0 , "cfi", &opts.cfi, "annotate kernel control flow integrity (kCFI) function preambles"), - OPT_BOOLEAN(0 , "noabs", &opts.noabs, "reject absolute references in allocatable sections"), - OPT_CALLBACK_OPTARG(0, "dump", NULL, NULL, "orc", "dump metadata", parse_dump), + OPT_BOOLEAN('i', "ibt", &opts.ibt, "validate and annotate IBT"), + OPT_BOOLEAN('m', "mcount", &opts.mcount, "annotate mcount/fentry calls for ftrace"), + OPT_BOOLEAN(0, "noabs", &opts.noabs, "reject absolute references in allocatable sections"), + OPT_BOOLEAN('n', "noinstr", &opts.noinstr, "validate noinstr rules"), + OPT_BOOLEAN(0, "orc", &opts.orc, "generate ORC metadata"), + OPT_BOOLEAN('r', "retpoline", &opts.retpoline, "validate and annotate retpoline usage"), + OPT_BOOLEAN(0, "rethunk", &opts.rethunk, "validate and annotate rethunk usage"), + OPT_BOOLEAN(0, "unret", &opts.unret, "validate entry unret placement"), + OPT_INTEGER(0, "prefix", &opts.prefix, "generate prefix symbols"), + OPT_BOOLEAN('l', "sls", &opts.sls, "validate straight-line-speculation mitigations"), + OPT_BOOLEAN('s', "stackval", &opts.stackval, "validate frame pointer rules"), + OPT_BOOLEAN('t', "static-call", &opts.static_call, "annotate static calls"), + OPT_BOOLEAN('u', "uaccess", &opts.uaccess, "validate uaccess rules for SMAP"), + 
OPT_CALLBACK_OPTARG(0, "dump", NULL, NULL, "orc", "dump metadata", parse_dump), OPT_GROUP("Options:"), - OPT_BOOLEAN(0, "backtrace", &opts.backtrace, "unwind on error"), - OPT_BOOLEAN(0, "dry-run", &opts.dryrun, "don't write modifications"), - OPT_BOOLEAN(0, "link", &opts.link, "object is a linked object"), - OPT_BOOLEAN(0, "module", &opts.module, "object is part of a kernel module"), - OPT_BOOLEAN(0, "mnop", &opts.mnop, "nop out mcount call sites"), - OPT_BOOLEAN(0, "no-unreachable", &opts.no_unreachable, "skip 'unreachable instruction' warnings"), - OPT_STRING('o', "output", &opts.output, "file", "output file name"), - OPT_BOOLEAN(0, "sec-address", &opts.sec_address, "print section addresses in warnings"), - OPT_BOOLEAN(0, "stats", &opts.stats, "print statistics"), - OPT_BOOLEAN('v', "verbose", &opts.verbose, "verbose warnings"), - OPT_BOOLEAN(0, "Werror", &opts.werror, "return error on warnings"), + OPT_BOOLEAN(0, "backtrace", &opts.backtrace, "unwind on error"), + OPT_BOOLEAN(0, "backup", &opts.backup, "create backup (.orig) file on warning/error"), + OPT_STRING(0, "debug-checksum", &opts.debug_checksum, "funcs", "enable checksum debug output"), + OPT_BOOLEAN(0, "dry-run", &opts.dryrun, "don't write modifications"), + OPT_BOOLEAN(0, "link", &opts.link, "object is a linked object"), + OPT_BOOLEAN(0, "module", &opts.module, "object is part of a kernel module"), + OPT_BOOLEAN(0, "mnop", &opts.mnop, "nop out mcount call sites"), + OPT_BOOLEAN(0, "no-unreachable", &opts.no_unreachable, "skip 'unreachable instruction' warnings"), + OPT_STRING('o', "output", &opts.output, "file", "output file name"), + OPT_BOOLEAN(0, "sec-address", &opts.sec_address, "print section addresses in warnings"), + OPT_BOOLEAN(0, "stats", &opts.stats, "print statistics"), + OPT_STRING(0, "trace", &opts.trace, "func", "trace function validation"), + OPT_BOOLEAN('v', "verbose", &opts.verbose, "verbose warnings"), + OPT_BOOLEAN(0, "werror", &opts.werror, "return error on warnings"), + 
OPT_BOOLEAN(0, "wide", &opts.wide, "wide output"), OPT_END(), }; @@ -159,7 +165,21 @@ static bool opts_valid(void) return false; } - if (opts.hack_jump_label || +#ifndef BUILD_KLP + if (opts.checksum) { + ERROR("--checksum not supported; install xxhash-devel/libxxhash-dev (version >= 0.8) and recompile"); + return false; + } +#endif + + if (opts.debug_checksum && !opts.checksum) { + ERROR("--debug-checksum requires --checksum"); + return false; + } + + if (opts.checksum || + opts.disas || + opts.hack_jump_label || opts.hack_noinstr || opts.ibt || opts.mcount || @@ -243,15 +263,12 @@ static void save_argv(int argc, const char **argv) ERROR_GLIBC("strdup(%s)", argv[i]); exit(1); } - }; + } } -void print_args(void) +int make_backup(void) { - char *backup = NULL; - - if (opts.output || opts.dryrun) - goto print; + char *backup; /* * Make a backup before kbuild deletes the file so the error @@ -260,33 +277,32 @@ void print_args(void) backup = malloc(strlen(objname) + strlen(ORIG_SUFFIX) + 1); if (!backup) { ERROR_GLIBC("malloc"); - goto print; + return 1; } strcpy(backup, objname); strcat(backup, ORIG_SUFFIX); - if (copy_file(objname, backup)) { - backup = NULL; - goto print; - } + if (copy_file(objname, backup)) + return 1; -print: /* - * Print the cmdline args to make it easier to recreate. If '--output' - * wasn't used, add it to the printed args with the backup as input. + * Print the cmdline args to make it easier to recreate. 
*/ + fprintf(stderr, "%s", orig_argv[0]); for (int i = 1; i < orig_argc; i++) { char *arg = orig_argv[i]; - if (backup && !strcmp(arg, objname)) + /* Modify the printed args to use the backup */ + if (!opts.output && !strcmp(arg, objname)) fprintf(stderr, " %s -o %s", backup, objname); else fprintf(stderr, " %s", arg); } fprintf(stderr, "\n"); + return 0; } int objtool_run(int argc, const char **argv) @@ -332,5 +348,5 @@ int objtool_run(int argc, const char **argv) if (!opts.dryrun && file->elf->changed && elf_write(file->elf)) return 1; - return 0; + return elf_close(file->elf); } diff --git a/tools/objtool/builtin-klp.c b/tools/objtool/builtin-klp.c new file mode 100644 index 000000000000..56d5a5b92f72 --- /dev/null +++ b/tools/objtool/builtin-klp.c @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <subcmd/parse-options.h> +#include <string.h> +#include <stdlib.h> +#include <objtool/builtin.h> +#include <objtool/objtool.h> +#include <objtool/klp.h> + +struct subcmd { + const char *name; + const char *description; + int (*fn)(int, const char **); +}; + +static struct subcmd subcmds[] = { + { "diff", "Generate binary diff of two object files", cmd_klp_diff, }, + { "post-link", "Finalize klp symbols/relocs after module linking", cmd_klp_post_link, }, +}; + +static void cmd_klp_usage(void) +{ + fprintf(stderr, "usage: objtool klp <subcommand> [<options>]\n\n"); + fprintf(stderr, "Subcommands:\n"); + + for (int i = 0; i < ARRAY_SIZE(subcmds); i++) { + struct subcmd *cmd = &subcmds[i]; + + fprintf(stderr, " %s\t%s\n", cmd->name, cmd->description); + } + + exit(1); +} + +int cmd_klp(int argc, const char **argv) +{ + argc--; + argv++; + + if (!argc) + cmd_klp_usage(); + + if (argc) { + for (int i = 0; i < ARRAY_SIZE(subcmds); i++) { + struct subcmd *cmd = &subcmds[i]; + + if (!strcmp(cmd->name, argv[0])) + return cmd->fn(argc, argv); + } + } + + cmd_klp_usage(); + return 0; +} diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 
9004fbc06769..3f7999317f4d 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -3,6 +3,8 @@ * Copyright (C) 2015-2017 Josh Poimboeuf <jpoimboe@redhat.com> */ +#define _GNU_SOURCE /* memmem() */ +#include <fnmatch.h> #include <string.h> #include <stdlib.h> #include <inttypes.h> @@ -11,10 +13,13 @@ #include <objtool/builtin.h> #include <objtool/cfi.h> #include <objtool/arch.h> +#include <objtool/disas.h> #include <objtool/check.h> #include <objtool/special.h> +#include <objtool/trace.h> #include <objtool/warn.h> -#include <objtool/endianness.h> +#include <objtool/checksum.h> +#include <objtool/util.h> #include <linux/objtool_types.h> #include <linux/hashtable.h> @@ -22,11 +27,6 @@ #include <linux/static_call_types.h> #include <linux/string.h> -struct alternative { - struct alternative *next; - struct instruction *insn; -}; - static unsigned long nr_cfi, nr_cfi_reused, nr_cfi_cache; static struct cfi_init_state initial_func_cfi; @@ -34,6 +34,10 @@ static struct cfi_state init_cfi; static struct cfi_state func_cfi; static struct cfi_state force_undefined_cfi; +struct disas_context *objtool_disas_ctx; + +size_t sym_name_max_len; + struct instruction *find_insn(struct objtool_file *file, struct section *sec, unsigned long offset) { @@ -106,7 +110,7 @@ static struct instruction *prev_insn_same_sym(struct objtool_file *file, #define for_each_insn(file, insn) \ for (struct section *__sec, *__fake = (struct section *)1; \ __fake; __fake = NULL) \ - for_each_sec(file, __sec) \ + for_each_sec(file->elf, __sec) \ sec_for_each_insn(file, __sec, insn) #define func_for_each_insn(file, func, insn) \ @@ -131,15 +135,6 @@ static struct instruction *prev_insn_same_sym(struct objtool_file *file, for (insn = next_insn_same_sec(file, insn); insn; \ insn = next_insn_same_sec(file, insn)) -static inline struct symbol *insn_call_dest(struct instruction *insn) -{ - if (insn->type == INSN_JUMP_DYNAMIC || - insn->type == INSN_CALL_DYNAMIC) - return NULL; - - return 
insn->_call_dest; -} - static inline struct reloc *insn_jump_table(struct instruction *insn) { if (insn->type == INSN_JUMP_DYNAMIC || @@ -186,20 +181,6 @@ static bool is_sibling_call(struct instruction *insn) } /* - * Checks if a string ends with another. - */ -static bool str_ends_with(const char *s, const char *sub) -{ - const int slen = strlen(s); - const int sublen = strlen(sub); - - if (sublen > slen) - return 0; - - return !memcmp(s + slen - sublen, sub, sublen); -} - -/* * Checks if a function is a Rust "noreturn" one. */ static bool is_rust_noreturn(const struct symbol *func) @@ -262,7 +243,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func, if (!func) return false; - if (func->bind == STB_GLOBAL || func->bind == STB_WEAK) { + if (!is_local_sym(func)) { if (is_rust_noreturn(func)) return true; @@ -271,7 +252,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func, return true; } - if (func->bind == STB_WEAK) + if (is_weak_sym(func)) return false; if (!func->len) @@ -431,14 +412,13 @@ static int decode_instructions(struct objtool_file *file) struct symbol *func; unsigned long offset; struct instruction *insn; - int ret; - for_each_sec(file, sec) { + for_each_sec(file->elf, sec) { struct instruction *insns = NULL; u8 prev_len = 0; u8 idx = 0; - if (!(sec->sh.sh_flags & SHF_EXECINSTR)) + if (!is_text_sec(sec)) continue; if (strcmp(sec->name, ".altinstr_replacement") && @@ -461,9 +441,9 @@ static int decode_instructions(struct objtool_file *file) if (!strcmp(sec->name, ".init.text") && !opts.module) sec->init = true; - for (offset = 0; offset < sec->sh.sh_size; offset += insn->len) { + for (offset = 0; offset < sec_size(sec); offset += insn->len) { if (!insns || idx == INSN_CHUNK_MAX) { - insns = calloc(sizeof(*insn), INSN_CHUNK_SIZE); + insns = calloc(INSN_CHUNK_SIZE, sizeof(*insn)); if (!insns) { ERROR_GLIBC("calloc"); return -1; @@ -480,11 +460,8 @@ static int decode_instructions(struct 
objtool_file *file) insn->offset = offset; insn->prev_len = prev_len; - ret = arch_decode_instruction(file, sec, offset, - sec->sh.sh_size - offset, - insn); - if (ret) - return ret; + if (arch_decode_instruction(file, sec, offset, sec_size(sec) - offset, insn)) + return -1; prev_len = insn->len; @@ -501,12 +478,12 @@ static int decode_instructions(struct objtool_file *file) } sec_for_each_sym(sec, func) { - if (func->type != STT_NOTYPE && func->type != STT_FUNC) + if (!is_notype_sym(func) && !is_func_sym(func)) continue; - if (func->offset == sec->sh.sh_size) { + if (func->offset == sec_size(sec)) { /* Heuristic: likely an "end" symbol */ - if (func->type == STT_NOTYPE) + if (is_notype_sym(func)) continue; ERROR("%s(): STT_FUNC at end of section", func->name); return -1; @@ -522,7 +499,7 @@ static int decode_instructions(struct objtool_file *file) sym_for_each_insn(file, func, insn) { insn->sym = func; - if (func->type == STT_FUNC && + if (is_func_sym(func) && insn->type == INSN_ENDBR && list_empty(&insn->call_node)) { if (insn->offset == func->offset) { @@ -566,7 +543,7 @@ static int add_pv_ops(struct objtool_file *file, const char *symname) idx = (reloc_offset(reloc) - sym->offset) / sizeof(unsigned long); func = reloc->sym; - if (func->type == STT_SECTION) + if (is_sec_sym(func)) func = find_symbol_by_offset(reloc->sym->sec, reloc_addend(reloc)); if (!func) { @@ -600,7 +577,7 @@ static int init_pv_ops(struct objtool_file *file) }; const char *pv_ops; struct symbol *sym; - int idx, nr, ret; + int idx, nr; if (!opts.noinstr) return 0; @@ -612,7 +589,7 @@ static int init_pv_ops(struct objtool_file *file) return 0; nr = sym->len / sizeof(unsigned long); - file->pv_ops = calloc(sizeof(struct pv_state), nr); + file->pv_ops = calloc(nr, sizeof(struct pv_state)); if (!file->pv_ops) { ERROR_GLIBC("calloc"); return -1; @@ -622,14 +599,27 @@ static int init_pv_ops(struct objtool_file *file) INIT_LIST_HEAD(&file->pv_ops[idx].targets); for (idx = 0; (pv_ops = 
pv_ops_tables[idx]); idx++) { - ret = add_pv_ops(file, pv_ops); - if (ret) - return ret; + if (add_pv_ops(file, pv_ops)) + return -1; } return 0; } +static bool is_livepatch_module(struct objtool_file *file) +{ + struct section *sec; + + if (!opts.module) + return false; + + sec = find_section_by_name(file->elf, ".modinfo"); + if (!sec) + return false; + + return memmem(sec->data->d_buf, sec_size(sec), "\0livepatch=Y", 12); +} + static int create_static_call_sections(struct objtool_file *file) { struct static_call_site *site; @@ -641,8 +631,14 @@ static int create_static_call_sections(struct objtool_file *file) sec = find_section_by_name(file->elf, ".static_call_sites"); if (sec) { - INIT_LIST_HEAD(&file->static_call_list); - WARN("file already has .static_call_sites section, skipping"); + /* + * Livepatch modules may have already extracted the static call + * site entries to take advantage of vmlinux static call + * privileges. + */ + if (!file->klp) + WARN("file already has .static_call_sites section, skipping"); + return 0; } @@ -686,7 +682,7 @@ static int create_static_call_sections(struct objtool_file *file) key_sym = find_symbol_by_name(file->elf, tmp); if (!key_sym) { - if (!opts.module) { + if (!opts.module || file->klp) { ERROR("static_call: can't find static_call_key symbol: %s", tmp); return -1; } @@ -829,7 +825,7 @@ static int create_ibt_endbr_seal_sections(struct objtool_file *file) struct symbol *sym = insn->sym; *site = 0; - if (opts.module && sym && sym->type == STT_FUNC && + if (opts.module && sym && is_func_sym(sym) && insn->offset == sym->offset && (!strcmp(sym->name, "init_module") || !strcmp(sym->name, "cleanup_module"))) { @@ -857,14 +853,13 @@ static int create_cfi_sections(struct objtool_file *file) sec = find_section_by_name(file->elf, ".cfi_sites"); if (sec) { - INIT_LIST_HEAD(&file->call_list); WARN("file already has .cfi_sites section, skipping"); return 0; } idx = 0; - for_each_sym(file, sym) { - if (sym->type != STT_FUNC) + 
for_each_sym(file->elf, sym) { + if (!is_func_sym(sym)) continue; if (strncmp(sym->name, "__cfi_", 6)) @@ -879,8 +874,8 @@ static int create_cfi_sections(struct objtool_file *file) return -1; idx = 0; - for_each_sym(file, sym) { - if (sym->type != STT_FUNC) + for_each_sym(file->elf, sym) { + if (!is_func_sym(sym)) continue; if (strncmp(sym->name, "__cfi_", 6)) @@ -906,8 +901,13 @@ static int create_mcount_loc_sections(struct objtool_file *file) sec = find_section_by_name(file->elf, "__mcount_loc"); if (sec) { - INIT_LIST_HEAD(&file->mcount_loc_list); - WARN("file already has __mcount_loc section, skipping"); + /* + * Livepatch modules have already extracted their __mcount_loc + * entries to cover the !CONFIG_FTRACE_MCOUNT_USE_OBJTOOL case. + */ + if (!file->klp) + WARN("file already has __mcount_loc section, skipping"); + return 0; } @@ -951,7 +951,6 @@ static int create_direct_call_sections(struct objtool_file *file) sec = find_section_by_name(file->elf, ".call_sites"); if (sec) { - INIT_LIST_HEAD(&file->call_list); WARN("file already has .call_sites section, skipping"); return 0; } @@ -982,6 +981,59 @@ static int create_direct_call_sections(struct objtool_file *file) return 0; } +#ifdef BUILD_KLP +static int create_sym_checksum_section(struct objtool_file *file) +{ + struct section *sec; + struct symbol *sym; + unsigned int idx = 0; + struct sym_checksum *checksum; + size_t entsize = sizeof(struct sym_checksum); + + sec = find_section_by_name(file->elf, ".discard.sym_checksum"); + if (sec) { + if (!opts.dryrun) + WARN("file already has .discard.sym_checksum section, skipping"); + + return 0; + } + + for_each_sym(file->elf, sym) + if (sym->csum.checksum) + idx++; + + if (!idx) + return 0; + + sec = elf_create_section_pair(file->elf, ".discard.sym_checksum", entsize, + idx, idx); + if (!sec) + return -1; + + idx = 0; + for_each_sym(file->elf, sym) { + if (!sym->csum.checksum) + continue; + + if (!elf_init_reloc(file->elf, sec->rsec, idx, idx * entsize, + sym, 0, 
R_TEXT64)) + return -1; + + checksum = (struct sym_checksum *)sec->data->d_buf + idx; + checksum->addr = 0; /* reloc */ + checksum->checksum = sym->csum.checksum; + + mark_sec_changed(file->elf, sec, true); + + idx++; + } + + return 0; +} +#else +static int create_sym_checksum_section(struct objtool_file *file) { return -EINVAL; } +#endif + /* * Warnings shouldn't be reported for ignored functions. */ @@ -1433,9 +1485,14 @@ static void add_return_call(struct objtool_file *file, struct instruction *insn, } static bool is_first_func_insn(struct objtool_file *file, - struct instruction *insn, struct symbol *sym) + struct instruction *insn) { - if (insn->offset == sym->offset) + struct symbol *func = insn_func(insn); + + if (!func) + return false; + + if (insn->offset == func->offset) return true; /* Allow direct CALL/JMP past ENDBR */ @@ -1443,7 +1500,7 @@ static bool is_first_func_insn(struct objtool_file *file, struct instruction *prev = prev_insn_same_sym(file, insn); if (prev && prev->type == INSN_ENDBR && - insn->offset == sym->offset + prev->len) + insn->offset == func->offset + prev->len) return true; } @@ -1451,44 +1508,22 @@ static bool is_first_func_insn(struct objtool_file *file, } /* - * A sibling call is a tail-call to another symbol -- to differentiate from a - * recursive tail-call which is to the same symbol. - */ -static bool jump_is_sibling_call(struct objtool_file *file, - struct instruction *from, struct instruction *to) -{ - struct symbol *fs = from->sym; - struct symbol *ts = to->sym; - - /* Not a sibling call if from/to a symbol hole */ - if (!fs || !ts) - return false; - - /* Not a sibling call if not targeting the start of a symbol. */ - if (!is_first_func_insn(file, to, ts)) - return false; - - /* Disallow sibling calls into STT_NOTYPE */ - if (ts->type == STT_NOTYPE) - return false; - - /* Must not be self to be a sibling */ - return fs->pfunc != ts->pfunc; -} - -/* * Find the destination instructions for all jumps. 
*/ static int add_jump_destinations(struct objtool_file *file) { - struct instruction *insn, *jump_dest; + struct instruction *insn; struct reloc *reloc; - struct section *dest_sec; - unsigned long dest_off; - int ret; for_each_insn(file, insn) { struct symbol *func = insn_func(insn); + struct instruction *dest_insn; + struct section *dest_sec; + struct symbol *dest_sym; + unsigned long dest_off; + + if (!is_static_jump(insn)) + continue; if (insn->jump_dest) { /* @@ -1497,53 +1532,53 @@ static int add_jump_destinations(struct objtool_file *file) */ continue; } - if (!is_static_jump(insn)) - continue; reloc = insn_reloc(file, insn); if (!reloc) { dest_sec = insn->sec; dest_off = arch_jump_destination(insn); - } else if (reloc->sym->type == STT_SECTION) { - dest_sec = reloc->sym->sec; - dest_off = arch_dest_reloc_offset(reloc_addend(reloc)); - } else if (reloc->sym->retpoline_thunk) { - ret = add_retpoline_call(file, insn); - if (ret) - return ret; - continue; - } else if (reloc->sym->return_thunk) { - add_return_call(file, insn, true); - continue; - } else if (func) { - /* - * External sibling call or internal sibling call with - * STT_FUNC reloc. 
- */ - ret = add_call_dest(file, insn, reloc->sym, true); - if (ret) - return ret; - continue; - } else if (reloc->sym->sec->idx) { - dest_sec = reloc->sym->sec; - dest_off = reloc->sym->sym.st_value + - arch_dest_reloc_offset(reloc_addend(reloc)); + dest_sym = dest_sec->sym; } else { - /* non-func asm code jumping to another file */ - continue; + dest_sym = reloc->sym; + if (is_undef_sym(dest_sym)) { + if (dest_sym->retpoline_thunk) { + if (add_retpoline_call(file, insn)) + return -1; + continue; + } + + if (dest_sym->return_thunk) { + add_return_call(file, insn, true); + continue; + } + + /* External symbol */ + if (func) { + /* External sibling call */ + if (add_call_dest(file, insn, dest_sym, true)) + return -1; + continue; + } + + /* Non-func asm code jumping to external symbol */ + continue; + } + + dest_sec = dest_sym->sec; + dest_off = dest_sym->offset + arch_insn_adjusted_addend(insn, reloc); } - jump_dest = find_insn(file, dest_sec, dest_off); - if (!jump_dest) { + dest_insn = find_insn(file, dest_sec, dest_off); + if (!dest_insn) { struct symbol *sym = find_symbol_by_offset(dest_sec, dest_off); /* - * This is a special case for retbleed_untrain_ret(). - * It jumps to __x86_return_thunk(), but objtool - * can't find the thunk's starting RET - * instruction, because the RET is also in the - * middle of another instruction. Objtool only - * knows about the outer instruction. + * retbleed_untrain_ret() jumps to + * __x86_return_thunk(), but objtool can't find + * the thunk's starting RET instruction, + * because the RET is also in the middle of + * another instruction. Objtool only knows + * about the outer instruction. */ if (sym && sym->embedded_insn) { add_return_call(file, insn, false); @@ -1551,76 +1586,52 @@ static int add_jump_destinations(struct objtool_file *file) } /* - * GCOV/KCOV dead code can jump to the end of the - * function/section. + * GCOV/KCOV dead code can jump to the end of + * the function/section. 
*/ if (file->ignore_unreachables && func && dest_sec == insn->sec && dest_off == func->offset + func->len) continue; - ERROR_INSN(insn, "can't find jump dest instruction at %s+0x%lx", - dest_sec->name, dest_off); + ERROR_INSN(insn, "can't find jump dest instruction at %s", + offstr(dest_sec, dest_off)); return -1; } - /* - * An intra-TU jump in retpoline.o might not have a relocation - * for its jump dest, in which case the above - * add_{retpoline,return}_call() didn't happen. - */ - if (jump_dest->sym && jump_dest->offset == jump_dest->sym->offset) { - if (jump_dest->sym->retpoline_thunk) { - ret = add_retpoline_call(file, insn); - if (ret) - return ret; - continue; - } - if (jump_dest->sym->return_thunk) { - add_return_call(file, insn, true); - continue; - } + if (!dest_sym || is_sec_sym(dest_sym)) { + dest_sym = dest_insn->sym; + if (!dest_sym) + goto set_jump_dest; } - /* - * Cross-function jump. - */ - if (func && insn_func(jump_dest) && func != insn_func(jump_dest)) { + if (dest_sym->retpoline_thunk && dest_insn->offset == dest_sym->offset) { + if (add_retpoline_call(file, insn)) + return -1; + continue; + } - /* - * For GCC 8+, create parent/child links for any cold - * subfunctions. This is _mostly_ redundant with a - * similar initialization in read_symbols(). - * - * If a function has aliases, we want the *first* such - * function in the symbol table to be the subfunction's - * parent. In that case we overwrite the - * initialization done in read_symbols(). - * - * However this code can't completely replace the - * read_symbols() code because this doesn't detect the - * case where the parent function's only reference to a - * subfunction is through a jump table. 
- */ - if (!strstr(func->name, ".cold") && - strstr(insn_func(jump_dest)->name, ".cold")) { - func->cfunc = insn_func(jump_dest); - insn_func(jump_dest)->pfunc = func; - } + if (dest_sym->return_thunk && dest_insn->offset == dest_sym->offset) { + add_return_call(file, insn, true); + continue; } - if (jump_is_sibling_call(file, insn, jump_dest)) { - /* - * Internal sibling call without reloc or with - * STT_SECTION reloc. - */ - ret = add_call_dest(file, insn, insn_func(jump_dest), true); - if (ret) - return ret; + if (!insn->sym || insn->sym->pfunc == dest_sym->pfunc) + goto set_jump_dest; + + /* + * Internal cross-function jump. + */ + + if (is_first_func_insn(file, dest_insn)) { + /* Internal sibling call */ + if (add_call_dest(file, insn, dest_sym, true)) + return -1; continue; } - insn->jump_dest = jump_dest; +set_jump_dest: + insn->jump_dest = dest_insn; } return 0; @@ -1646,7 +1657,6 @@ static int add_call_destinations(struct objtool_file *file) unsigned long dest_off; struct symbol *dest; struct reloc *reloc; - int ret; for_each_insn(file, insn) { struct symbol *func = insn_func(insn); @@ -1658,9 +1668,8 @@ static int add_call_destinations(struct objtool_file *file) dest_off = arch_jump_destination(insn); dest = find_call_destination(insn->sec, dest_off); - ret = add_call_dest(file, insn, dest, false); - if (ret) - return ret; + if (add_call_dest(file, insn, dest, false)) + return -1; if (func && func->ignore) continue; @@ -1670,13 +1679,13 @@ static int add_call_destinations(struct objtool_file *file) return -1; } - if (func && insn_call_dest(insn)->type != STT_FUNC) { + if (func && !is_func_sym(insn_call_dest(insn))) { ERROR_INSN(insn, "unsupported call to non-function"); return -1; } - } else if (reloc->sym->type == STT_SECTION) { - dest_off = arch_dest_reloc_offset(reloc_addend(reloc)); + } else if (is_sec_sym(reloc->sym)) { + dest_off = arch_insn_adjusted_addend(insn, reloc); dest = find_call_destination(reloc->sym->sec, dest_off); if (!dest) { 
ERROR_INSN(insn, "can't find call dest symbol at %s+0x%lx", @@ -1684,19 +1693,16 @@ static int add_call_destinations(struct objtool_file *file) return -1; } - ret = add_call_dest(file, insn, dest, false); - if (ret) - return ret; + if (add_call_dest(file, insn, dest, false)) + return -1; } else if (reloc->sym->retpoline_thunk) { - ret = add_retpoline_call(file, insn); - if (ret) - return ret; + if (add_retpoline_call(file, insn)) + return -1; } else { - ret = add_call_dest(file, insn, reloc->sym, false); - if (ret) - return ret; + if (add_call_dest(file, insn, reloc->sym, false)) + return -1; } } @@ -1745,6 +1751,7 @@ static int handle_group_alt(struct objtool_file *file, orig_alt_group->last_insn = last_orig_insn; orig_alt_group->nop = NULL; orig_alt_group->ignore = orig_insn->ignore_alts; + orig_alt_group->feature = 0; } else { if (orig_alt_group->last_insn->offset + orig_alt_group->last_insn->len - orig_alt_group->first_insn->offset != special_alt->orig_len) { @@ -1784,6 +1791,7 @@ static int handle_group_alt(struct objtool_file *file, nop->type = INSN_NOP; nop->sym = orig_insn->sym; nop->alt_group = new_alt_group; + nop->fake = 1; } if (!special_alt->new_len) { @@ -1848,6 +1856,7 @@ end: new_alt_group->nop = nop; new_alt_group->ignore = (*new_insn)->ignore_alts; new_alt_group->cfi = orig_alt_group->cfi; + new_alt_group->feature = special_alt->feature; return 0; } @@ -1912,8 +1921,9 @@ static int add_special_section_alts(struct objtool_file *file) struct list_head special_alts; struct instruction *orig_insn, *new_insn; struct special_alt *special_alt, *tmp; + enum alternative_type alt_type; struct alternative *alt; - int ret; + struct alternative *a; if (special_get_alts(file->elf, &special_alts)) return -1; @@ -1945,16 +1955,18 @@ static int add_special_section_alts(struct objtool_file *file) continue; } - ret = handle_group_alt(file, special_alt, orig_insn, - &new_insn); - if (ret) - return ret; + if (handle_group_alt(file, special_alt, orig_insn, &new_insn)) 
+ return -1; + + alt_type = ALT_TYPE_INSTRUCTIONS; } else if (special_alt->jump_or_nop) { - ret = handle_jump_alt(file, special_alt, orig_insn, - &new_insn); - if (ret) - return ret; + if (handle_jump_alt(file, special_alt, orig_insn, &new_insn)) + return -1; + + alt_type = ALT_TYPE_JUMP_TABLE; + } else { + alt_type = ALT_TYPE_EX_TABLE; } alt = calloc(1, sizeof(*alt)); @@ -1964,8 +1976,20 @@ static int add_special_section_alts(struct objtool_file *file) } alt->insn = new_insn; - alt->next = orig_insn->alts; - orig_insn->alts = alt; + alt->type = alt_type; + alt->next = NULL; + + /* + * Store alternatives in the same order they have been + * defined. + */ + if (!orig_insn->alts) { + orig_insn->alts = alt; + } else { + for (a = orig_insn->alts; a->next; a = a->next) + ; + a->next = alt; + } list_del(&special_alt->list); free(special_alt); @@ -2142,15 +2166,13 @@ static int add_func_jump_tables(struct objtool_file *file, struct symbol *func) { struct instruction *insn; - int ret; func_for_each_insn(file, func, insn) { if (!insn_jump_table(insn)) continue; - ret = add_jump_table(file, insn); - if (ret) - return ret; + if (add_jump_table(file, insn)) + return -1; } return 0; @@ -2164,19 +2186,17 @@ static int add_func_jump_tables(struct objtool_file *file, static int add_jump_table_alts(struct objtool_file *file) { struct symbol *func; - int ret; if (!file->rodata) return 0; - for_each_sym(file, func) { - if (func->type != STT_FUNC) + for_each_sym(file->elf, func) { + if (!is_func_sym(func) || func->alias != func) continue; mark_func_jump_tables(file, func); - ret = add_func_jump_tables(file, func); - if (ret) - return ret; + if (add_func_jump_tables(file, func)) + return -1; } return 0; @@ -2210,14 +2230,14 @@ static int read_unwind_hints(struct objtool_file *file) return -1; } - if (sec->sh.sh_size % sizeof(struct unwind_hint)) { + if (sec_size(sec) % sizeof(struct unwind_hint)) { ERROR("struct unwind_hint size mismatch"); return -1; } file->hints = true; - for (i = 
0; i < sec->sh.sh_size / sizeof(struct unwind_hint); i++) { + for (i = 0; i < sec_size(sec) / sizeof(struct unwind_hint); i++) { hint = (struct unwind_hint *)sec->data->d_buf + i; reloc = find_reloc_by_dest(file->elf, sec, i * sizeof(*hint)); @@ -2226,14 +2246,7 @@ static int read_unwind_hints(struct objtool_file *file) return -1; } - if (reloc->sym->type == STT_SECTION) { - offset = reloc_addend(reloc); - } else if (reloc->sym->local_label) { - offset = reloc->sym->offset; - } else { - ERROR("unexpected relocation symbol type in %s", sec->rsec->name); - return -1; - } + offset = reloc->sym->offset + reloc_addend(reloc); insn = find_insn(file, reloc->sym->sec, offset); if (!insn) { @@ -2262,7 +2275,7 @@ static int read_unwind_hints(struct objtool_file *file) if (hint->type == UNWIND_HINT_TYPE_REGS_PARTIAL) { struct symbol *sym = find_symbol_by_offset(insn->sec, insn->offset); - if (sym && sym->bind == STB_GLOBAL) { + if (sym && is_global_sym(sym)) { if (opts.ibt && insn->type != INSN_ENDBR && !insn->noendbr) { ERROR_INSN(insn, "UNWIND_HINT_IRET_REGS without ENDBR"); return -1; @@ -2300,7 +2313,7 @@ static int read_annotate(struct objtool_file *file, struct instruction *insn; struct reloc *reloc; uint64_t offset; - int type, ret; + int type; sec = find_section_by_name(file->elf, ".discard.annotate_insn"); if (!sec) @@ -2318,10 +2331,13 @@ static int read_annotate(struct objtool_file *file, sec->sh.sh_entsize = 8; } - for_each_reloc(sec->rsec, reloc) { - type = *(u32 *)(sec->data->d_buf + (reloc_idx(reloc) * sec->sh.sh_entsize) + 4); - type = bswap_if_needed(file->elf, type); + if (sec_num_entries(sec) != sec_num_entries(sec->rsec)) { + ERROR("bad .discard.annotate_insn section: missing relocs"); + return -1; + } + for_each_reloc(sec->rsec, reloc) { + type = annotype(file->elf, sec, reloc); offset = reloc->sym->offset + reloc_addend(reloc); insn = find_insn(file, reloc->sym->sec, offset); @@ -2330,9 +2346,8 @@ static int read_annotate(struct objtool_file *file, 
return -1; } - ret = func(file, type, insn); - if (ret < 0) - return ret; + if (func(file, type, insn)) + return -1; } return 0; @@ -2471,12 +2486,13 @@ static bool is_profiling_func(const char *name) static int classify_symbols(struct objtool_file *file) { struct symbol *func; + size_t len; - for_each_sym(file, func) { - if (func->type == STT_NOTYPE && strstarts(func->name, ".L")) + for_each_sym(file->elf, func) { + if (is_notype_sym(func) && strstarts(func->name, ".L")) func->local_label = true; - if (func->bind != STB_GLOBAL) + if (!is_global_sym(func)) continue; if (!strncmp(func->name, STATIC_CALL_TRAMP_PREFIX_STR, @@ -2497,6 +2513,10 @@ static int classify_symbols(struct objtool_file *file) if (is_profiling_func(func->name)) func->profiling_func = true; + + len = strlen(func->name); + if (len > sym_name_max_len) + sym_name_max_len = len; } return 0; @@ -2517,7 +2537,7 @@ static void mark_rodata(struct objtool_file *file) * * .rodata.str1.* sections are ignored; they don't contain jump tables. */ - for_each_sec(file, sec) { + for_each_sec(file->elf, sec) { if ((!strncmp(sec->name, ".rodata", 7) && !strstr(sec->name, ".str1.")) || !strncmp(sec->name, ".data.rel.ro", 12)) { @@ -2529,78 +2549,115 @@ static void mark_rodata(struct objtool_file *file) file->rodata = found; } +static void mark_holes(struct objtool_file *file) +{ + struct instruction *insn; + bool in_hole = false; + + if (!opts.link) + return; + + /* + * Whole archive runs might encounter dead code from weak symbols. + * This is where the linker will have dropped the weak symbol in + * favour of a regular symbol, but leaves the code in place. + */ + for_each_insn(file, insn) { + if (insn->sym || !find_symbol_hole_containing(insn->sec, insn->offset)) { + in_hole = false; + continue; + } + + /* Skip function padding and pfx code */ + if (!in_hole && insn->type == INSN_NOP) + continue; + + in_hole = true; + insn->hole = 1; + + /* + * If this hole jumps to a .cold function, mark it ignore. 
+ */ + if (insn->jump_dest) { + struct symbol *dest_func = insn_func(insn->jump_dest); + + if (dest_func && dest_func->cold) + dest_func->ignore = true; + } + } +} + +static bool validate_branch_enabled(void) +{ + return opts.stackval || + opts.orc || + opts.uaccess || + opts.checksum; +} + static int decode_sections(struct objtool_file *file) { - int ret; + file->klp = is_livepatch_module(file); mark_rodata(file); - ret = init_pv_ops(file); - if (ret) - return ret; + if (init_pv_ops(file)) + return -1; /* * Must be before add_{jump_call}_destination. */ - ret = classify_symbols(file); - if (ret) - return ret; + if (classify_symbols(file)) + return -1; - ret = decode_instructions(file); - if (ret) - return ret; + if (decode_instructions(file)) + return -1; - ret = add_ignores(file); - if (ret) - return ret; + if (add_ignores(file)) + return -1; add_uaccess_safe(file); - ret = read_annotate(file, __annotate_early); - if (ret) - return ret; + if (read_annotate(file, __annotate_early)) + return -1; /* * Must be before add_jump_destinations(), which depends on 'func' * being set for alternatives, to enable proper sibling call detection. */ - if (opts.stackval || opts.orc || opts.uaccess || opts.noinstr) { - ret = add_special_section_alts(file); - if (ret) - return ret; + if (validate_branch_enabled() || opts.noinstr || opts.hack_jump_label || opts.disas) { + if (add_special_section_alts(file)) + return -1; } - ret = add_jump_destinations(file); - if (ret) - return ret; + if (add_jump_destinations(file)) + return -1; /* * Must be before add_call_destination(); it changes INSN_CALL to * INSN_JUMP. 
*/ - ret = read_annotate(file, __annotate_ifc); - if (ret) - return ret; + if (read_annotate(file, __annotate_ifc)) + return -1; - ret = add_call_destinations(file); - if (ret) - return ret; + if (add_call_destinations(file)) + return -1; - ret = add_jump_table_alts(file); - if (ret) - return ret; + if (add_jump_table_alts(file)) + return -1; - ret = read_unwind_hints(file); - if (ret) - return ret; + if (read_unwind_hints(file)) + return -1; + + /* Must be after add_jump_destinations() */ + mark_holes(file); /* * Must be after add_call_destinations() such that it can override * dead_end_function() marks. */ - ret = read_annotate(file, __annotate_late); - if (ret) - return ret; + if (read_annotate(file, __annotate_late)) + return -1; return 0; } @@ -3225,18 +3282,19 @@ static int propagate_alt_cfi(struct objtool_file *file, struct instruction *insn return 0; } -static int handle_insn_ops(struct instruction *insn, - struct instruction *next_insn, - struct insn_state *state) +static int noinline handle_insn_ops(struct instruction *insn, + struct instruction *next_insn, + struct insn_state *state) { + struct insn_state prev_state __maybe_unused = *state; struct stack_op *op; - int ret; + int ret = 0; for (op = insn->stack_ops; op; op = op->next) { ret = update_cfi_state(insn, next_insn, &state->cfi, op); if (ret) - return ret; + goto done; if (!opts.uaccess || !insn->alt_group) continue; @@ -3246,7 +3304,8 @@ static int handle_insn_ops(struct instruction *insn, state->uaccess_stack = 1; } else if (state->uaccess_stack >> 31) { WARN_INSN(insn, "PUSHF stack exhausted"); - return 1; + ret = 1; + goto done; } state->uaccess_stack <<= 1; state->uaccess_stack |= state->uaccess; @@ -3262,7 +3321,10 @@ static int handle_insn_ops(struct instruction *insn, } } - return 0; +done: + TRACE_INSN_STATE(insn, &prev_state, state); + + return ret; } static bool insn_cfi_match(struct instruction *insn, struct cfi_state *cfi2) @@ -3354,7 +3416,7 @@ static bool pv_call_dest(struct 
objtool_file *file, struct instruction *insn) if (!reloc || strcmp(reloc->sym->name, "pv_ops")) return false; - idx = (arch_dest_reloc_offset(reloc_addend(reloc)) / sizeof(void *)); + idx = arch_insn_adjusted_addend(insn, reloc) / sizeof(void *); if (file->pv_ops[idx].clean) return true; @@ -3520,8 +3582,10 @@ static bool skip_alt_group(struct instruction *insn) return false; /* ANNOTATE_IGNORE_ALTERNATIVE */ - if (insn->alt_group->ignore) + if (insn->alt_group->ignore) { + TRACE_ALT(insn, "alt group ignored"); return true; + } /* * For NOP patched with CLAC/STAC, only follow the latter to avoid @@ -3543,258 +3607,398 @@ static bool skip_alt_group(struct instruction *insn) return alt_insn->type == INSN_CLAC || alt_insn->type == INSN_STAC; } -/* - * Follow the branch starting at the given instruction, and recursively follow - * any other branches (jumps). Meanwhile, track the frame pointer state at - * each instruction and validate all the rules described in - * tools/objtool/Documentation/objtool.txt. 
- */ -static int validate_branch(struct objtool_file *file, struct symbol *func, - struct instruction *insn, struct insn_state state) +static int checksum_debug_init(struct objtool_file *file) { - struct alternative *alt; - struct instruction *next_insn, *prev_insn = NULL; - struct section *sec; - u8 visited; - int ret; + char *dup, *s; - if (func && func->ignore) + if (!opts.debug_checksum) return 0; - sec = insn->sec; + dup = strdup(opts.debug_checksum); + if (!dup) { + ERROR_GLIBC("strdup"); + return -1; + } - while (1) { - next_insn = next_insn_to_validate(file, insn); + s = dup; + while (*s) { + struct symbol *func; + char *comma; - if (func && insn_func(insn) && func != insn_func(insn)->pfunc) { - /* Ignore KCFI type preambles, which always fall through */ - if (!strncmp(func->name, "__cfi_", 6) || - !strncmp(func->name, "__pfx_", 6) || - !strncmp(func->name, "__pi___cfi_", 11) || - !strncmp(func->name, "__pi___pfx_", 11)) - return 0; + comma = strchr(s, ','); + if (comma) + *comma = '\0'; - if (file->ignore_unreachables) - return 0; + func = find_symbol_by_name(file->elf, s); + if (!func || !is_func_sym(func)) + WARN("--debug-checksum: can't find '%s'", s); + else + func->debug_checksum = 1; - WARN("%s() falls through to next function %s()", - func->name, insn_func(insn)->name); - func->warned = 1; + if (!comma) + break; - return 1; - } + s = comma + 1; + } - visited = VISITED_BRANCH << state.uaccess; - if (insn->visited & VISITED_BRANCH_MASK) { - if (!insn->hint && !insn_cfi_match(insn, &state.cfi)) - return 1; + free(dup); + return 0; +} - if (insn->visited & visited) - return 0; - } else { - nr_insns_visited++; - } +static void checksum_update_insn(struct objtool_file *file, struct symbol *func, + struct instruction *insn) +{ + struct reloc *reloc = insn_reloc(file, insn); + unsigned long offset; + struct symbol *sym; - if (state.noinstr) - state.instr += insn->instr; + if (insn->fake) + return; - if (insn->hint) { - if (insn->restore) { - struct 
instruction *save_insn, *i; + checksum_update(func, insn, insn->sec->data->d_buf + insn->offset, insn->len); - i = insn; - save_insn = NULL; + if (!reloc) { + struct symbol *call_dest = insn_call_dest(insn); - sym_for_each_insn_continue_reverse(file, func, i) { - if (i->save) { - save_insn = i; - break; - } - } + if (call_dest) + checksum_update(func, insn, call_dest->demangled_name, + strlen(call_dest->demangled_name)); + return; + } - if (!save_insn) { - WARN_INSN(insn, "no corresponding CFI save for CFI restore"); - return 1; + sym = reloc->sym; + offset = arch_insn_adjusted_addend(insn, reloc); + + if (is_string_sec(sym->sec)) { + char *str; + + str = sym->sec->data->d_buf + sym->offset + offset; + checksum_update(func, insn, str, strlen(str)); + return; + } + + if (is_sec_sym(sym)) { + sym = find_symbol_containing(reloc->sym->sec, offset); + if (!sym) + return; + + offset -= sym->offset; + } + + checksum_update(func, insn, sym->demangled_name, strlen(sym->demangled_name)); + checksum_update(func, insn, &offset, sizeof(offset)); +} + +static int validate_branch(struct objtool_file *file, struct symbol *func, + struct instruction *insn, struct insn_state state); +static int do_validate_branch(struct objtool_file *file, struct symbol *func, + struct instruction *insn, struct insn_state state); + +static int validate_insn(struct objtool_file *file, struct symbol *func, + struct instruction *insn, struct insn_state *statep, + struct instruction *prev_insn, struct instruction *next_insn, + bool *dead_end) +{ + char *alt_name __maybe_unused = NULL; + struct alternative *alt; + u8 visited; + int ret; + + /* + * Any returns before the end of this function are effectively dead + * ends, i.e. validate_branch() has reached the end of the branch. 
+ */ + *dead_end = true; + + visited = VISITED_BRANCH << statep->uaccess; + if (insn->visited & VISITED_BRANCH_MASK) { + if (!insn->hint && !insn_cfi_match(insn, &statep->cfi)) + return 1; + + if (insn->visited & visited) { + TRACE_INSN(insn, "already visited"); + return 0; + } + } else { + nr_insns_visited++; + } + + if (statep->noinstr) + statep->instr += insn->instr; + + if (insn->hint) { + if (insn->restore) { + struct instruction *save_insn, *i; + + i = insn; + save_insn = NULL; + + sym_for_each_insn_continue_reverse(file, func, i) { + if (i->save) { + save_insn = i; + break; } + } - if (!save_insn->visited) { - /* - * If the restore hint insn is at the - * beginning of a basic block and was - * branched to from elsewhere, and the - * save insn hasn't been visited yet, - * defer following this branch for now. - * It will be seen later via the - * straight-line path. - */ - if (!prev_insn) - return 0; + if (!save_insn) { + WARN_INSN(insn, "no corresponding CFI save for CFI restore"); + return 1; + } - WARN_INSN(insn, "objtool isn't smart enough to handle this CFI save/restore combo"); - return 1; + if (!save_insn->visited) { + /* + * If the restore hint insn is at the + * beginning of a basic block and was + * branched to from elsewhere, and the + * save insn hasn't been visited yet, + * defer following this branch for now. + * It will be seen later via the + * straight-line path. 
+ */ + if (!prev_insn) { + TRACE_INSN(insn, "defer restore"); + return 0; } - insn->cfi = save_insn->cfi; - nr_cfi_reused++; + WARN_INSN(insn, "objtool isn't smart enough to handle this CFI save/restore combo"); + return 1; } - state.cfi = *insn->cfi; + insn->cfi = save_insn->cfi; + nr_cfi_reused++; + } + + statep->cfi = *insn->cfi; + } else { + /* XXX track if we actually changed statep->cfi */ + + if (prev_insn && !cficmp(prev_insn->cfi, &statep->cfi)) { + insn->cfi = prev_insn->cfi; + nr_cfi_reused++; } else { - /* XXX track if we actually changed state.cfi */ + insn->cfi = cfi_hash_find_or_add(&statep->cfi); + } + } - if (prev_insn && !cficmp(prev_insn->cfi, &state.cfi)) { - insn->cfi = prev_insn->cfi; - nr_cfi_reused++; - } else { - insn->cfi = cfi_hash_find_or_add(&state.cfi); + insn->visited |= visited; + + if (propagate_alt_cfi(file, insn)) + return 1; + + if (insn->alts) { + for (alt = insn->alts; alt; alt = alt->next) { + TRACE_ALT_BEGIN(insn, alt, alt_name); + ret = validate_branch(file, func, alt->insn, *statep); + TRACE_ALT_END(insn, alt, alt_name); + if (ret) { + BT_INSN(insn, "(alt)"); + return ret; } } + TRACE_ALT_INFO_NOADDR(insn, "/ ", "DEFAULT"); + } + + if (skip_alt_group(insn)) + return 0; + + if (handle_insn_ops(insn, next_insn, statep)) + return 1; + + switch (insn->type) { - insn->visited |= visited; + case INSN_RETURN: + TRACE_INSN(insn, "return"); + return validate_return(func, insn, statep); + + case INSN_CALL: + case INSN_CALL_DYNAMIC: + if (insn->type == INSN_CALL) + TRACE_INSN(insn, "call"); + else + TRACE_INSN(insn, "indirect call"); - if (propagate_alt_cfi(file, insn)) + ret = validate_call(file, insn, statep); + if (ret) + return ret; + + if (opts.stackval && func && !is_special_call(insn) && + !has_valid_stack_frame(statep)) { + WARN_INSN(insn, "call without frame pointer save/setup"); return 1; + } - if (insn->alts) { - for (alt = insn->alts; alt; alt = alt->next) { - ret = validate_branch(file, func, alt->insn, state); - if (ret) 
{ - BT_INSN(insn, "(alt)"); - return ret; - } + break; + + case INSN_JUMP_CONDITIONAL: + case INSN_JUMP_UNCONDITIONAL: + if (is_sibling_call(insn)) { + TRACE_INSN(insn, "sibling call"); + ret = validate_sibling_call(file, insn, statep); + if (ret) + return ret; + + } else if (insn->jump_dest) { + if (insn->type == INSN_JUMP_UNCONDITIONAL) + TRACE_INSN(insn, "unconditional jump"); + else + TRACE_INSN(insn, "jump taken"); + + ret = validate_branch(file, func, insn->jump_dest, *statep); + if (ret) { + BT_INSN(insn, "(branch)"); + return ret; } } - if (skip_alt_group(insn)) + if (insn->type == INSN_JUMP_UNCONDITIONAL) return 0; - if (handle_insn_ops(insn, next_insn, &state)) - return 1; - - switch (insn->type) { - - case INSN_RETURN: - return validate_return(func, insn, &state); + TRACE_INSN(insn, "jump not taken"); + break; - case INSN_CALL: - case INSN_CALL_DYNAMIC: - ret = validate_call(file, insn, &state); + case INSN_JUMP_DYNAMIC: + case INSN_JUMP_DYNAMIC_CONDITIONAL: + TRACE_INSN(insn, "indirect jump"); + if (is_sibling_call(insn)) { + ret = validate_sibling_call(file, insn, statep); if (ret) return ret; + } - if (opts.stackval && func && !is_special_call(insn) && - !has_valid_stack_frame(&state)) { - WARN_INSN(insn, "call without frame pointer save/setup"); - return 1; - } + if (insn->type == INSN_JUMP_DYNAMIC) + return 0; - break; + break; - case INSN_JUMP_CONDITIONAL: - case INSN_JUMP_UNCONDITIONAL: - if (is_sibling_call(insn)) { - ret = validate_sibling_call(file, insn, &state); - if (ret) - return ret; + case INSN_SYSCALL: + TRACE_INSN(insn, "syscall"); + if (func && (!next_insn || !next_insn->hint)) { + WARN_INSN(insn, "unsupported instruction in callable function"); + return 1; + } - } else if (insn->jump_dest) { - ret = validate_branch(file, func, - insn->jump_dest, state); - if (ret) { - BT_INSN(insn, "(branch)"); - return ret; - } - } + break; - if (insn->type == INSN_JUMP_UNCONDITIONAL) - return 0; + case INSN_SYSRET: + TRACE_INSN(insn, "sysret"); + if 
(func && (!next_insn || !next_insn->hint)) { + WARN_INSN(insn, "unsupported instruction in callable function"); + return 1; + } + + return 0; + case INSN_STAC: + TRACE_INSN(insn, "stac"); + if (!opts.uaccess) break; - case INSN_JUMP_DYNAMIC: - case INSN_JUMP_DYNAMIC_CONDITIONAL: - if (is_sibling_call(insn)) { - ret = validate_sibling_call(file, insn, &state); - if (ret) - return ret; - } + if (statep->uaccess) { + WARN_INSN(insn, "recursive UACCESS enable"); + return 1; + } - if (insn->type == INSN_JUMP_DYNAMIC) - return 0; + statep->uaccess = true; + break; + case INSN_CLAC: + TRACE_INSN(insn, "clac"); + if (!opts.uaccess) break; - case INSN_SYSCALL: - if (func && (!next_insn || !next_insn->hint)) { - WARN_INSN(insn, "unsupported instruction in callable function"); - return 1; - } + if (!statep->uaccess && func) { + WARN_INSN(insn, "redundant UACCESS disable"); + return 1; + } - break; + if (func_uaccess_safe(func) && !statep->uaccess_stack) { + WARN_INSN(insn, "UACCESS-safe disables UACCESS"); + return 1; + } - case INSN_SYSRET: - if (func && (!next_insn || !next_insn->hint)) { - WARN_INSN(insn, "unsupported instruction in callable function"); - return 1; - } + statep->uaccess = false; + break; - return 0; + case INSN_STD: + TRACE_INSN(insn, "std"); + if (statep->df) { + WARN_INSN(insn, "recursive STD"); + return 1; + } - case INSN_STAC: - if (!opts.uaccess) - break; + statep->df = true; + break; - if (state.uaccess) { - WARN_INSN(insn, "recursive UACCESS enable"); - return 1; - } + case INSN_CLD: + TRACE_INSN(insn, "cld"); + if (!statep->df && func) { + WARN_INSN(insn, "redundant CLD"); + return 1; + } - state.uaccess = true; - break; + statep->df = false; + break; - case INSN_CLAC: - if (!opts.uaccess) - break; + default: + break; + } - if (!state.uaccess && func) { - WARN_INSN(insn, "redundant UACCESS disable"); - return 1; - } + if (insn->dead_end) + TRACE_INSN(insn, "dead end"); - if (func_uaccess_safe(func) && !state.uaccess_stack) { - WARN_INSN(insn, 
"UACCESS-safe disables UACCESS"); - return 1; - } + *dead_end = insn->dead_end; + return 0; +} - state.uaccess = false; - break; +/* + * Follow the branch starting at the given instruction, and recursively follow + * any other branches (jumps). Meanwhile, track the frame pointer state at + * each instruction and validate all the rules described in + * tools/objtool/Documentation/objtool.txt. + */ +static int do_validate_branch(struct objtool_file *file, struct symbol *func, + struct instruction *insn, struct insn_state state) +{ + struct instruction *next_insn, *prev_insn = NULL; + bool dead_end; + int ret; - case INSN_STD: - if (state.df) { - WARN_INSN(insn, "recursive STD"); - return 1; - } + if (func && func->ignore) + return 0; - state.df = true; - break; + do { + insn->trace = 0; + next_insn = next_insn_to_validate(file, insn); - case INSN_CLD: - if (!state.df && func) { - WARN_INSN(insn, "redundant CLD"); - return 1; - } + if (opts.checksum && func && insn->sec) + checksum_update_insn(file, func, insn); - state.df = false; - break; + if (func && insn_func(insn) && func != insn_func(insn)->pfunc) { + /* Ignore KCFI type preambles, which always fall through */ + if (is_prefix_func(func)) + return 0; - default: - break; + if (file->ignore_unreachables) + return 0; + + WARN("%s() falls through to next function %s()", + func->name, insn_func(insn)->name); + func->warned = 1; + + return 1; } - if (insn->dead_end) - return 0; + ret = validate_insn(file, func, insn, &state, prev_insn, next_insn, + &dead_end); + + if (!insn->trace) { + if (ret) + TRACE_INSN(insn, "warning (%d)", ret); + else + TRACE_INSN(insn, NULL); + } - if (!next_insn) { + if (!dead_end && !next_insn) { if (state.cfi.cfa.base == CFI_UNDEFINED) return 0; if (file->ignore_unreachables) @@ -3802,15 +4006,28 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, WARN("%s%sunexpected end of section %s", func ? func->name : "", func ? 
"(): " : "", - sec->name); + insn->sec->name); return 1; } prev_insn = insn; insn = next_insn; - } - return 0; + } while (!dead_end); + + return ret; +} + +static int validate_branch(struct objtool_file *file, struct symbol *func, + struct instruction *insn, struct insn_state state) +{ + int ret; + + trace_depth_inc(); + ret = do_validate_branch(file, func, insn, state); + trace_depth_dec(); + + return ret; } static int validate_unwind_hint(struct objtool_file *file, @@ -3818,7 +4035,13 @@ static int validate_unwind_hint(struct objtool_file *file, struct insn_state *state) { if (insn->hint && !insn->visited) { - int ret = validate_branch(file, insn_func(insn), insn, *state); + struct symbol *func = insn_func(insn); + int ret; + + if (opts.checksum) + checksum_init(func); + + ret = validate_branch(file, func, insn, *state); if (ret) BT_INSN(insn, "<=== (hint)"); return ret; @@ -4062,7 +4285,8 @@ static bool ignore_unreachable_insn(struct objtool_file *file, struct instructio struct instruction *prev_insn; int i; - if (insn->type == INSN_NOP || insn->type == INSN_TRAP || (func && func->ignore)) + if (insn->type == INSN_NOP || insn->type == INSN_TRAP || + insn->hole || (func && func->ignore)) return true; /* @@ -4073,47 +4297,6 @@ static bool ignore_unreachable_insn(struct objtool_file *file, struct instructio !strcmp(insn->sec->name, ".altinstr_aux")) return true; - /* - * Whole archive runs might encounter dead code from weak symbols. - * This is where the linker will have dropped the weak symbol in - * favour of a regular symbol, but leaves the code in place. - * - * In this case we'll find a piece of code (whole function) that is not - * covered by a !section symbol. Ignore them. 
- */ - if (opts.link && !func) { - int size = find_symbol_hole_containing(insn->sec, insn->offset); - unsigned long end = insn->offset + size; - - if (!size) /* not a hole */ - return false; - - if (size < 0) /* hole until the end */ - return true; - - sec_for_each_insn_continue(file, insn) { - /* - * If we reach a visited instruction at or before the - * end of the hole, ignore the unreachable. - */ - if (insn->visited) - return true; - - if (insn->offset >= end) - break; - - /* - * If this hole jumps to a .cold function, mark it ignore too. - */ - if (insn->jump_dest && insn_func(insn->jump_dest) && - strstr(insn_func(insn->jump_dest)->name, ".cold")) { - insn_func(insn->jump_dest)->ignore = true; - } - } - - return false; - } - if (!func) return false; @@ -4165,14 +4348,54 @@ static bool ignore_unreachable_insn(struct objtool_file *file, struct instructio return false; } -static int add_prefix_symbol(struct objtool_file *file, struct symbol *func) +/* + * For FineIBT or kCFI, a certain number of bytes preceding the function may be + * NOPs. Those NOPs may be rewritten at runtime and executed, so give them a + * proper function name: __pfx_<func>. + * + * The NOPs may not exist for the following cases: + * + * - compiler cloned functions (*.cold, *.part0, etc) + * - asm functions created with inline asm or without SYM_FUNC_START() + * + * Also, the function may already have a prefix from a previous objtool run + * (livepatch extracted functions, or manually running objtool multiple times). + * + * So return 0 if the NOPs are missing or the function already has a prefix + * symbol. 
+ */ +static int create_prefix_symbol(struct objtool_file *file, struct symbol *func) { struct instruction *insn, *prev; + char name[SYM_NAME_LEN]; struct cfi_state *cfi; + if (!is_func_sym(func) || is_prefix_func(func) || + func->cold || func->static_call_tramp) + return 0; + + if ((strlen(func->name) + sizeof("__pfx_") > SYM_NAME_LEN)) { + WARN("%s: symbol name too long, can't create __pfx_ symbol", + func->name); + return 0; + } + + if (snprintf_check(name, SYM_NAME_LEN, "__pfx_%s", func->name)) + return -1; + + if (file->klp) { + struct symbol *pfx; + + pfx = find_symbol_by_offset(func->sec, func->offset - opts.prefix); + if (pfx && is_prefix_func(pfx) && !strcmp(pfx->name, name)) + return 0; + } + insn = find_insn(file, func->sec, func->offset); - if (!insn) + if (!insn) { + WARN("%s: can't find starting instruction", func->name); return -1; + } for (prev = prev_insn_same_sec(file, insn); prev; @@ -4180,22 +4403,27 @@ static int add_prefix_symbol(struct objtool_file *file, struct symbol *func) u64 offset; if (prev->type != INSN_NOP) - return -1; + return 0; offset = func->offset - prev->offset; if (offset > opts.prefix) - return -1; + return 0; if (offset < opts.prefix) continue; - elf_create_prefix_symbol(file->elf, func, opts.prefix); + if (!elf_create_symbol(file->elf, name, func->sec, + GELF_ST_BIND(func->sym.st_info), + GELF_ST_TYPE(func->sym.st_info), + prev->offset, opts.prefix)) + return -1; + break; } if (!prev) - return -1; + return 0; if (!insn->cfi) { /* @@ -4213,20 +4441,18 @@ static int add_prefix_symbol(struct objtool_file *file, struct symbol *func) return 0; } -static int add_prefix_symbols(struct objtool_file *file) +static int create_prefix_symbols(struct objtool_file *file) { struct section *sec; struct symbol *func; - for_each_sec(file, sec) { - if (!(sec->sh.sh_flags & SHF_EXECINSTR)) + for_each_sec(file->elf, sec) { + if (!is_text_sec(sec)) continue; sec_for_each_sym(sec, func) { - if (func->type != STT_FUNC) - continue; - - 
add_prefix_symbol(file, func); + if (create_prefix_symbol(file, func)) + return -1; } } @@ -4237,6 +4463,7 @@ static int validate_symbol(struct objtool_file *file, struct section *sec, struct symbol *sym, struct insn_state *state) { struct instruction *insn; + struct symbol *func; int ret; if (!sym->len) { @@ -4254,9 +4481,26 @@ static int validate_symbol(struct objtool_file *file, struct section *sec, if (opts.uaccess) state->uaccess = sym->uaccess_safe; - ret = validate_branch(file, insn_func(insn), insn, *state); + func = insn_func(insn); + + if (opts.checksum) + checksum_init(func); + + if (opts.trace && !fnmatch(opts.trace, sym->name, 0)) { + trace_enable(); + TRACE("%s: validation begin\n", sym->name); + } + + ret = validate_branch(file, func, insn, *state); if (ret) BT_INSN(insn, "<=== (sym)"); + + TRACE("%s: validation %s\n\n", sym->name, ret ? "failed" : "end"); + trace_disable(); + + if (opts.checksum) + checksum_finish(func); + return ret; } @@ -4267,7 +4511,7 @@ static int validate_section(struct objtool_file *file, struct section *sec) int warnings = 0; sec_for_each_sym(sec, func) { - if (func->type != STT_FUNC) + if (!is_func_sym(func)) continue; init_insn_state(file, &state, sec); @@ -4310,8 +4554,8 @@ static int validate_functions(struct objtool_file *file) struct section *sec; int warnings = 0; - for_each_sec(file, sec) { - if (!(sec->sh.sh_flags & SHF_EXECINSTR)) + for_each_sec(file->elf, sec) { + if (!is_text_sec(sec)) continue; warnings += validate_section(file, sec); @@ -4438,12 +4682,7 @@ static int validate_ibt_insn(struct objtool_file *file, struct instruction *insn reloc_offset(reloc) + 1, (insn->offset + insn->len) - (reloc_offset(reloc) + 1))) { - off = reloc->sym->offset; - if (reloc_type(reloc) == R_X86_64_PC32 || - reloc_type(reloc) == R_X86_64_PLT32) - off += arch_dest_reloc_offset(reloc_addend(reloc)); - else - off += reloc_addend(reloc); + off = reloc->sym->offset + arch_insn_adjusted_addend(insn, reloc); dest = find_insn(file, 
reloc->sym->sec, off); if (!dest) @@ -4494,10 +4733,10 @@ static int validate_ibt(struct objtool_file *file) for_each_insn(file, insn) warnings += validate_ibt_insn(file, insn); - for_each_sec(file, sec) { + for_each_sec(file->elf, sec) { /* Already done by validate_ibt_insn() */ - if (sec->sh.sh_flags & SHF_EXECINSTR) + if (is_text_sec(sec)) continue; if (!sec->rsec) @@ -4512,8 +4751,8 @@ static int validate_ibt(struct objtool_file *file) !strncmp(sec->name, ".debug", 6) || !strcmp(sec->name, ".altinstructions") || !strcmp(sec->name, ".ibt_endbr_seal") || + !strcmp(sec->name, ".kcfi_traps") || !strcmp(sec->name, ".orc_unwind_ip") || - !strcmp(sec->name, ".parainstructions") || !strcmp(sec->name, ".retpoline_sites") || !strcmp(sec->name, ".smp_locks") || !strcmp(sec->name, ".static_call_sites") || @@ -4522,12 +4761,14 @@ static int validate_ibt(struct objtool_file *file) !strcmp(sec->name, "__bug_table") || !strcmp(sec->name, "__ex_table") || !strcmp(sec->name, "__jump_table") || + !strcmp(sec->name, "__klp_funcs") || !strcmp(sec->name, "__mcount_loc") || - !strcmp(sec->name, ".kcfi_traps") || !strcmp(sec->name, ".llvm.call-graph-profile") || !strcmp(sec->name, ".llvm_bb_addr_map") || !strcmp(sec->name, "__tracepoints") || - strstr(sec->name, "__patchable_function_entries")) + !strcmp(sec->name, ".return_sites") || + !strcmp(sec->name, ".call_sites") || + !strcmp(sec->name, "__patchable_function_entries")) continue; for_each_reloc(sec->rsec, reloc) @@ -4601,87 +4842,6 @@ static int validate_reachable_instructions(struct objtool_file *file) return warnings; } -/* 'funcs' is a space-separated list of function names */ -static void disas_funcs(const char *funcs) -{ - const char *objdump_str, *cross_compile; - int size, ret; - char *cmd; - - cross_compile = getenv("CROSS_COMPILE"); - if (!cross_compile) - cross_compile = ""; - - objdump_str = "%sobjdump -wdr %s | gawk -M -v _funcs='%s' '" - "BEGIN { split(_funcs, funcs); }" - "/^$/ { func_match = 0; }" - "/<.*>:/ { " - 
"f = gensub(/.*<(.*)>:/, \"\\\\1\", 1);" - "for (i in funcs) {" - "if (funcs[i] == f) {" - "func_match = 1;" - "base = strtonum(\"0x\" $1);" - "break;" - "}" - "}" - "}" - "{" - "if (func_match) {" - "addr = strtonum(\"0x\" $1);" - "printf(\"%%04x \", addr - base);" - "print;" - "}" - "}' 1>&2"; - - /* fake snprintf() to calculate the size */ - size = snprintf(NULL, 0, objdump_str, cross_compile, objname, funcs) + 1; - if (size <= 0) { - WARN("objdump string size calculation failed"); - return; - } - - cmd = malloc(size); - - /* real snprintf() */ - snprintf(cmd, size, objdump_str, cross_compile, objname, funcs); - ret = system(cmd); - if (ret) { - WARN("disassembly failed: %d", ret); - return; - } -} - -static void disas_warned_funcs(struct objtool_file *file) -{ - struct symbol *sym; - char *funcs = NULL, *tmp; - - for_each_sym(file, sym) { - if (sym->warned) { - if (!funcs) { - funcs = malloc(strlen(sym->name) + 1); - if (!funcs) { - ERROR_GLIBC("malloc"); - return; - } - strcpy(funcs, sym->name); - } else { - tmp = malloc(strlen(funcs) + strlen(sym->name) + 2); - if (!tmp) { - ERROR_GLIBC("malloc"); - return; - } - sprintf(tmp, "%s %s", funcs, sym->name); - free(funcs); - funcs = tmp; - } - } - } - - if (funcs) - disas_funcs(funcs); -} - __weak bool arch_absolute_reloc(struct elf *elf, struct reloc *reloc) { unsigned int type = reloc_type(reloc); @@ -4696,7 +4856,7 @@ static int check_abs_references(struct objtool_file *file) struct reloc *reloc; int ret = 0; - for_each_sec(file, sec) { + for_each_sec(file->elf, sec) { /* absolute references in non-loadable sections are fine */ if (!(sec->sh.sh_flags & SHF_ALLOC)) continue; @@ -4751,10 +4911,35 @@ static void free_insns(struct objtool_file *file) free(chunk->addr); } +const char *objtool_disas_insn(struct instruction *insn) +{ + struct disas_context *dctx = objtool_disas_ctx; + + if (!dctx) + return ""; + + disas_insn(dctx, insn); + return disas_result(dctx); +} + int check(struct objtool_file *file) { + struct 
disas_context *disas_ctx = NULL; int ret = 0, warnings = 0; + /* + * Create a disassembly context if we might disassemble any + * instruction or function. + */ + if (opts.verbose || opts.backtrace || opts.trace || opts.disas) { + disas_ctx = disas_context_create(file); + if (!disas_ctx) { + opts.disas = false; + opts.trace = false; + } + objtool_disas_ctx = disas_ctx; + } + arch_initial_func_cfi_state(&initial_func_cfi); init_cfi_state(&init_cfi); init_cfi_state(&func_cfi); @@ -4770,6 +4955,10 @@ int check(struct objtool_file *file) cfi_hash_add(&init_cfi); cfi_hash_add(&func_cfi); + ret = checksum_debug_init(file); + if (ret) + goto out; + ret = decode_sections(file); if (ret) goto out; @@ -4780,7 +4969,7 @@ int check(struct objtool_file *file) if (opts.retpoline) warnings += validate_retpoline(file); - if (opts.stackval || opts.orc || opts.uaccess) { + if (validate_branch_enabled()) { int w = 0; w += validate_functions(file); @@ -4845,7 +5034,7 @@ int check(struct objtool_file *file) } if (opts.prefix) { - ret = add_prefix_symbols(file); + ret = create_prefix_symbols(file); if (ret) goto out; } @@ -4859,14 +5048,18 @@ int check(struct objtool_file *file) if (opts.noabs) warnings += check_abs_references(file); + if (opts.checksum) { + ret = create_sym_checksum_section(file); + if (ret) + goto out; + } + if (opts.orc && nr_insns) { ret = orc_create(file); if (ret) goto out; } - free_insns(file); - if (opts.stats) { printf("nr_insns_visited: %ld\n", nr_insns_visited); printf("nr_cfi: %ld\n", nr_cfi); @@ -4875,18 +5068,32 @@ int check(struct objtool_file *file) } out: - if (!ret && !warnings) - return 0; + if (ret || warnings) { + if (opts.werror && warnings) + ret = 1; + + if (opts.verbose) { + if (opts.werror && warnings) + WARN("%d warning(s) upgraded to errors", warnings); + disas_warned_funcs(disas_ctx); + } + } - if (opts.werror && warnings) - ret = 1; + if (opts.disas) + disas_funcs(disas_ctx); - if (opts.verbose) { - if (opts.werror && warnings) - WARN("%d 
warning(s) upgraded to errors", warnings); - print_args(); - disas_warned_funcs(file); + if (disas_ctx) { + disas_context_destroy(disas_ctx); + objtool_disas_ctx = NULL; } + free_insns(file); + + if (!ret && !warnings) + return 0; + + if (opts.backup && make_backup()) + return 1; + return ret; } diff --git a/tools/objtool/disas.c b/tools/objtool/disas.c new file mode 100644 index 000000000000..2b5059f55e40 --- /dev/null +++ b/tools/objtool/disas.c @@ -0,0 +1,1248 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2015-2017 Josh Poimboeuf <jpoimboe@redhat.com> + */ + +#define _GNU_SOURCE +#include <fnmatch.h> + +#include <objtool/arch.h> +#include <objtool/check.h> +#include <objtool/disas.h> +#include <objtool/special.h> +#include <objtool/warn.h> + +#include <bfd.h> +#include <linux/string.h> +#include <tools/dis-asm-compat.h> + +/* + * Size of the buffer for storing the result of disassembling + * a single instruction. + */ +#define DISAS_RESULT_SIZE 1024 + +struct disas_context { + struct objtool_file *file; + struct instruction *insn; + bool alt_applied; + char result[DISAS_RESULT_SIZE]; + disassembler_ftype disassembler; + struct disassemble_info info; +}; + +/* + * Maximum number of alternatives + */ +#define DISAS_ALT_MAX 5 + +/* + * Maximum number of instructions per alternative + */ +#define DISAS_ALT_INSN_MAX 50 + +/* + * Information to disassemble an alternative + */ +struct disas_alt { + struct instruction *orig_insn; /* original instruction */ + struct alternative *alt; /* alternative or NULL if default code */ + char *name; /* name for this alternative */ + int width; /* formatting width */ + struct { + char *str; /* instruction string */ + int offset; /* instruction offset */ + int nops; /* number of nops */ + } insn[DISAS_ALT_INSN_MAX]; /* alternative instructions */ + int insn_idx; /* index of the next instruction to print */ +}; + +#define DALT_DEFAULT(dalt) (!(dalt)->alt) +#define DALT_INSN(dalt) (DALT_DEFAULT(dalt) ? 
(dalt)->orig_insn : (dalt)->alt->insn) +#define DALT_GROUP(dalt) (DALT_INSN(dalt)->alt_group) +#define DALT_ALTID(dalt) ((dalt)->orig_insn->offset) + +#define ALT_FLAGS_SHIFT 16 +#define ALT_FLAG_NOT (1 << 0) +#define ALT_FLAG_DIRECT_CALL (1 << 1) +#define ALT_FEATURE_MASK ((1 << ALT_FLAGS_SHIFT) - 1) + +static int alt_feature(unsigned int ft_flags) +{ + return (ft_flags & ALT_FEATURE_MASK); +} + +static int alt_flags(unsigned int ft_flags) +{ + return (ft_flags >> ALT_FLAGS_SHIFT); +} + +/* + * Wrapper around asprintf() to allocate and format a string. + * Return the allocated string or NULL on error. + */ +static char *strfmt(const char *fmt, ...) +{ + va_list ap; + char *str; + int rv; + + va_start(ap, fmt); + rv = vasprintf(&str, fmt, ap); + va_end(ap); + + return rv == -1 ? NULL : str; +} + +static int sprint_name(char *str, const char *name, unsigned long offset) +{ + int len; + + if (offset) + len = sprintf(str, "%s+0x%lx", name, offset); + else + len = sprintf(str, "%s", name); + + return len; +} + +#define DINFO_FPRINTF(dinfo, ...) \ + ((*(dinfo)->fprintf_func)((dinfo)->stream, __VA_ARGS__)) + +static int disas_result_fprintf(struct disas_context *dctx, + const char *fmt, va_list ap) +{ + char *buf = dctx->result; + int avail, len; + + len = strlen(buf); + if (len >= DISAS_RESULT_SIZE - 1) { + WARN_FUNC(dctx->insn->sec, dctx->insn->offset, + "disassembly buffer is full"); + return -1; + } + avail = DISAS_RESULT_SIZE - len; + + len = vsnprintf(buf + len, avail, fmt, ap); + if (len < 0 || len >= avail) { + WARN_FUNC(dctx->insn->sec, dctx->insn->offset, + "disassembly buffer is truncated"); + return -1; + } + + return 0; +} + +static int disas_fprintf(void *stream, const char *fmt, ...) +{ + va_list arg; + int rv; + + va_start(arg, fmt); + rv = disas_result_fprintf(stream, fmt, arg); + va_end(arg); + + return rv; +} + +/* + * For init_disassemble_info_compat(). 
+ */ +static int disas_fprintf_styled(void *stream, + enum disassembler_style style, + const char *fmt, ...) +{ + va_list arg; + int rv; + + va_start(arg, fmt); + rv = disas_result_fprintf(stream, fmt, arg); + va_end(arg); + + return rv; +} + +static void disas_print_addr_sym(struct section *sec, struct symbol *sym, + bfd_vma addr, struct disassemble_info *dinfo) +{ + char symstr[1024]; + char *str; + + if (sym) { + sprint_name(symstr, sym->name, addr - sym->offset); + DINFO_FPRINTF(dinfo, "0x%lx <%s>", addr, symstr); + } else { + str = offstr(sec, addr); + DINFO_FPRINTF(dinfo, "0x%lx <%s>", addr, str); + free(str); + } +} + +static bool disas_print_addr_alt(bfd_vma addr, struct disassemble_info *dinfo) +{ + struct disas_context *dctx = dinfo->application_data; + struct instruction *orig_first_insn; + struct alt_group *alt_group; + unsigned long offset; + struct symbol *sym; + + /* + * Check if we are processing an alternative at the original + * instruction address (i.e. if alt_applied is true) and if + * we are referencing an address inside the alternative. + * + * For example, this happens if there is a branch inside an + * alternative. In that case, the address should be updated + * to a reference inside the original instruction flow. 
+ */ + if (!dctx->alt_applied) + return false; + + alt_group = dctx->insn->alt_group; + if (!alt_group || !alt_group->orig_group || + addr < alt_group->first_insn->offset || + addr > alt_group->last_insn->offset) + return false; + + orig_first_insn = alt_group->orig_group->first_insn; + offset = addr - alt_group->first_insn->offset; + + addr = orig_first_insn->offset + offset; + sym = orig_first_insn->sym; + + disas_print_addr_sym(orig_first_insn->sec, sym, addr, dinfo); + + return true; +} + +static void disas_print_addr_noreloc(bfd_vma addr, + struct disassemble_info *dinfo) +{ + struct disas_context *dctx = dinfo->application_data; + struct instruction *insn = dctx->insn; + struct symbol *sym = NULL; + + if (disas_print_addr_alt(addr, dinfo)) + return; + + if (insn->sym && addr >= insn->sym->offset && + addr < insn->sym->offset + insn->sym->len) { + sym = insn->sym; + } + + disas_print_addr_sym(insn->sec, sym, addr, dinfo); +} + +static void disas_print_addr_reloc(bfd_vma addr, struct disassemble_info *dinfo) +{ + struct disas_context *dctx = dinfo->application_data; + struct instruction *insn = dctx->insn; + unsigned long offset; + struct reloc *reloc; + char symstr[1024]; + char *str; + + reloc = find_reloc_by_dest_range(dctx->file->elf, insn->sec, + insn->offset, insn->len); + if (!reloc) { + /* + * There is no relocation for this instruction although + * the address to resolve points to the next instruction. + * So this is an effective reference to the next IP, for + * example: "lea 0x0(%rip),%rdi". The kernel can reference + * the next IP with _THIS_IP_ macro. + */ + DINFO_FPRINTF(dinfo, "0x%lx <_THIS_IP_>", addr); + return; + } + + offset = arch_insn_adjusted_addend(insn, reloc); + + /* + * If the relocation symbol is a section name (for example ".bss") + * then we try to further resolve the name. 
+ */ + if (reloc->sym->type == STT_SECTION) { + str = offstr(reloc->sym->sec, reloc->sym->offset + offset); + DINFO_FPRINTF(dinfo, "0x%lx <%s>", addr, str); + free(str); + } else { + sprint_name(symstr, reloc->sym->name, offset); + DINFO_FPRINTF(dinfo, "0x%lx <%s>", addr, symstr); + } +} + +/* + * Resolve an address into a "<symbol>+<offset>" string. + */ +static void disas_print_address(bfd_vma addr, struct disassemble_info *dinfo) +{ + struct disas_context *dctx = dinfo->application_data; + struct instruction *insn = dctx->insn; + struct instruction *jump_dest; + struct symbol *sym; + bool is_reloc; + + /* + * If the instruction is a call/jump and it references a + * destination then this is likely the address we are looking + * up. So check it first. + */ + jump_dest = insn->jump_dest; + if (jump_dest && jump_dest->sym && jump_dest->offset == addr) { + if (!disas_print_addr_alt(addr, dinfo)) + disas_print_addr_sym(jump_dest->sec, jump_dest->sym, + addr, dinfo); + return; + } + + /* + * If the address points to the next instruction then there is + * probably a relocation. It can be a false positive when the + * current instruction is referencing the address of the next + * instruction. This particular case will be handled in + * disas_print_addr_reloc(). + */ + is_reloc = (addr == insn->offset + insn->len); + + /* + * The call destination offset can be the address we are looking + * up, or 0 if there is a relocation. + */ + sym = insn_call_dest(insn); + if (sym && (sym->offset == addr || (sym->offset == 0 && is_reloc))) { + DINFO_FPRINTF(dinfo, "0x%lx <%s>", addr, sym->name); + return; + } + + if (!is_reloc) + disas_print_addr_noreloc(addr, dinfo); + else + disas_print_addr_reloc(addr, dinfo); +} + +/* + * Initialize disassemble info arch, mach (32 or 64-bit) and options. 
+ */ +int disas_info_init(struct disassemble_info *dinfo, + int arch, int mach32, int mach64, + const char *options) +{ + struct disas_context *dctx = dinfo->application_data; + struct objtool_file *file = dctx->file; + + dinfo->arch = arch; + + switch (file->elf->ehdr.e_ident[EI_CLASS]) { + case ELFCLASS32: + dinfo->mach = mach32; + break; + case ELFCLASS64: + dinfo->mach = mach64; + break; + default: + return -1; + } + + dinfo->disassembler_options = options; + + return 0; +} + +struct disas_context *disas_context_create(struct objtool_file *file) +{ + struct disas_context *dctx; + struct disassemble_info *dinfo; + int err; + + dctx = malloc(sizeof(*dctx)); + if (!dctx) { + WARN("failed to allocate disassembly context"); + return NULL; + } + + dctx->file = file; + dinfo = &dctx->info; + + init_disassemble_info_compat(dinfo, dctx, + disas_fprintf, disas_fprintf_styled); + + dinfo->read_memory_func = buffer_read_memory; + dinfo->print_address_func = disas_print_address; + dinfo->application_data = dctx; + + /* + * bfd_openr() is not used to avoid doing ELF data processing + * and caching that has already being done. Here, we just need + * to identify the target file so we call an arch specific + * function to fill some disassemble info (arch, mach). + */ + + dinfo->arch = bfd_arch_unknown; + dinfo->mach = 0; + + err = arch_disas_info_init(dinfo); + if (err || dinfo->arch == bfd_arch_unknown || dinfo->mach == 0) { + WARN("failed to init disassembly arch"); + goto error; + } + + dinfo->endian = (file->elf->ehdr.e_ident[EI_DATA] == ELFDATA2MSB) ? 
+ BFD_ENDIAN_BIG : BFD_ENDIAN_LITTLE; + + disassemble_init_for_target(dinfo); + + dctx->disassembler = disassembler(dinfo->arch, + dinfo->endian == BFD_ENDIAN_BIG, + dinfo->mach, NULL); + if (!dctx->disassembler) { + WARN("failed to create disassembler function"); + goto error; + } + + return dctx; + +error: + free(dctx); + return NULL; +} + +void disas_context_destroy(struct disas_context *dctx) +{ + free(dctx); +} + +char *disas_result(struct disas_context *dctx) +{ + return dctx->result; +} + +#define DISAS_INSN_OFFSET_SPACE 10 +#define DISAS_INSN_SPACE 60 + +#define DISAS_PRINSN(dctx, insn, depth) \ + disas_print_insn(stdout, dctx, insn, depth, "\n") + +/* + * Print a message in the instruction flow. If sec is not NULL then the + * address at the section offset is printed in addition of the message, + * otherwise only the message is printed. + */ +static int disas_vprint(FILE *stream, struct section *sec, unsigned long offset, + int depth, const char *format, va_list ap) +{ + const char *addr_str; + int i, n; + int len; + + len = sym_name_max_len + DISAS_INSN_OFFSET_SPACE; + if (depth < 0) { + len += depth; + depth = 0; + } + + n = 0; + + if (sec) { + addr_str = offstr(sec, offset); + n += fprintf(stream, "%6lx: %-*s ", offset, len, addr_str); + free((char *)addr_str); + } else { + len += DISAS_INSN_OFFSET_SPACE + 1; + n += fprintf(stream, "%-*s", len, ""); + } + + /* print vertical bars to show the code flow */ + for (i = 0; i < depth; i++) + n += fprintf(stream, "| "); + + if (format) + n += vfprintf(stream, format, ap); + + return n; +} + +static int disas_print(FILE *stream, struct section *sec, unsigned long offset, + int depth, const char *format, ...) +{ + va_list args; + int len; + + va_start(args, format); + len = disas_vprint(stream, sec, offset, depth, format, args); + va_end(args); + + return len; +} + +/* + * Print a message in the instruction flow. 
If insn is not NULL then + * the instruction address is printed in addition of the message, + * otherwise only the message is printed. In all cases, the instruction + * itself is not printed. + */ +void disas_print_info(FILE *stream, struct instruction *insn, int depth, + const char *format, ...) +{ + struct section *sec; + unsigned long off; + va_list args; + + if (insn) { + sec = insn->sec; + off = insn->offset; + } else { + sec = NULL; + off = 0; + } + + va_start(args, format); + disas_vprint(stream, sec, off, depth, format, args); + va_end(args); +} + +/* + * Print an instruction address (offset and function), the instruction itself + * and an optional message. + */ +void disas_print_insn(FILE *stream, struct disas_context *dctx, + struct instruction *insn, int depth, + const char *format, ...) +{ + char fake_nop_insn[32]; + const char *insn_str; + bool fake_nop; + va_list args; + int len; + + /* + * Alternative can insert a fake nop, sometimes with no + * associated section so nothing to disassemble. + */ + fake_nop = (!insn->sec && insn->type == INSN_NOP); + if (fake_nop) { + snprintf(fake_nop_insn, 32, "<fake nop> (%d bytes)", insn->len); + insn_str = fake_nop_insn; + } else { + disas_insn(dctx, insn); + insn_str = disas_result(dctx); + } + + /* print the instruction */ + len = (depth + 1) * 2 < DISAS_INSN_SPACE ? DISAS_INSN_SPACE - (depth+1) * 2 : 1; + disas_print_info(stream, insn, depth, "%-*s", len, insn_str); + + /* print message if any */ + if (!format) + return; + + if (strcmp(format, "\n") == 0) { + fprintf(stream, "\n"); + return; + } + + fprintf(stream, " - "); + va_start(args, format); + vfprintf(stream, format, args); + va_end(args); +} + +/* + * Disassemble a single instruction. Return the size of the instruction. + * + * If alt_applied is true then insn should be an instruction from of an + * alternative (i.e. insn->alt_group != NULL), and it is disassembled + * at the location of the original code it is replacing. 
When the + * instruction references any address inside the alternative then + * these references will be re-adjusted to replace the original code. + */ +static size_t disas_insn_common(struct disas_context *dctx, + struct instruction *insn, + bool alt_applied) +{ + disassembler_ftype disasm = dctx->disassembler; + struct disassemble_info *dinfo = &dctx->info; + + dctx->insn = insn; + dctx->alt_applied = alt_applied; + dctx->result[0] = '\0'; + + if (insn->type == INSN_NOP) { + DINFO_FPRINTF(dinfo, "nop%d", insn->len); + return insn->len; + } + + /* + * Set the disassembler buffer to read data from the section + * containing the instruction to disassemble. + */ + dinfo->buffer = insn->sec->data->d_buf; + dinfo->buffer_vma = 0; + dinfo->buffer_length = insn->sec->sh.sh_size; + + return disasm(insn->offset, &dctx->info); +} + +size_t disas_insn(struct disas_context *dctx, struct instruction *insn) +{ + return disas_insn_common(dctx, insn, false); +} + +static size_t disas_insn_alt(struct disas_context *dctx, + struct instruction *insn) +{ + return disas_insn_common(dctx, insn, true); +} + +static struct instruction *next_insn_same_alt(struct objtool_file *file, + struct alt_group *alt_grp, + struct instruction *insn) +{ + if (alt_grp->last_insn == insn || alt_grp->nop == insn) + return NULL; + + return next_insn_same_sec(file, insn); +} + +#define alt_for_each_insn(file, alt_grp, insn) \ + for (insn = alt_grp->first_insn; \ + insn; \ + insn = next_insn_same_alt(file, alt_grp, insn)) + +/* + * Provide a name for the type of alternatives present at the + * specified instruction. + * + * An instruction can have alternatives with different types, for + * example alternative instructions and an exception table. In that + * case the name for the alternative instructions type is used. + * + * Return NULL if the instruction as no alternative. 
+ */ +const char *disas_alt_type_name(struct instruction *insn) +{ + struct alternative *alt; + const char *name; + + name = NULL; + for (alt = insn->alts; alt; alt = alt->next) { + if (alt->type == ALT_TYPE_INSTRUCTIONS) { + name = "alternative"; + break; + } + + switch (alt->type) { + case ALT_TYPE_EX_TABLE: + name = "ex_table"; + break; + case ALT_TYPE_JUMP_TABLE: + name = "jump_table"; + break; + default: + name = "unknown"; + break; + } + } + + return name; +} + +/* + * Provide a name for an alternative. + */ +char *disas_alt_name(struct alternative *alt) +{ + char pfx[4] = { 0 }; + char *str = NULL; + const char *name; + int feature; + int flags; + int num; + + switch (alt->type) { + + case ALT_TYPE_EX_TABLE: + str = strdup("EXCEPTION"); + break; + + case ALT_TYPE_JUMP_TABLE: + str = strdup("JUMP"); + break; + + case ALT_TYPE_INSTRUCTIONS: + /* + * This is a non-default group alternative. Create a name + * based on the feature and flags associated with this + * alternative. Use either the feature name (it is available) + * or the feature number. And add a prefix to show the flags + * used. + * + * Prefix flags characters: + * + * '!' alternative used when feature not enabled + * '+' direct call alternative + * '?' unknown flag + */ + + if (!alt->insn->alt_group) + return NULL; + + feature = alt->insn->alt_group->feature; + num = alt_feature(feature); + flags = alt_flags(feature); + str = pfx; + + if (flags & ~(ALT_FLAG_NOT | ALT_FLAG_DIRECT_CALL)) + *str++ = '?'; + if (flags & ALT_FLAG_DIRECT_CALL) + *str++ = '+'; + if (flags & ALT_FLAG_NOT) + *str++ = '!'; + + name = arch_cpu_feature_name(num); + if (!name) + str = strfmt("%sFEATURE 0x%X", pfx, num); + else + str = strfmt("%s%s", pfx, name); + + break; + } + + return str; +} + +/* + * Initialize an alternative. The default alternative should be initialized + * with alt=NULL. 
+ */ +static int disas_alt_init(struct disas_alt *dalt, + struct instruction *orig_insn, + struct alternative *alt) +{ + dalt->orig_insn = orig_insn; + dalt->alt = alt; + dalt->insn_idx = 0; + dalt->name = alt ? disas_alt_name(alt) : strdup("DEFAULT"); + if (!dalt->name) + return -1; + dalt->width = strlen(dalt->name); + + return 0; +} + +static int disas_alt_add_insn(struct disas_alt *dalt, int index, char *insn_str, + int offset, int nops) +{ + int len; + + if (index >= DISAS_ALT_INSN_MAX) { + WARN("Alternative %lx.%s has more instructions than supported", + DALT_ALTID(dalt), dalt->name); + return -1; + } + + len = strlen(insn_str); + dalt->insn[index].str = insn_str; + dalt->insn[index].offset = offset; + dalt->insn[index].nops = nops; + if (len > dalt->width) + dalt->width = len; + + return 0; +} + +static int disas_alt_jump(struct disas_alt *dalt) +{ + struct instruction *orig_insn; + struct instruction *dest_insn; + char suffix[2] = { 0 }; + char *str; + int nops; + + orig_insn = dalt->orig_insn; + dest_insn = dalt->alt->insn; + + if (orig_insn->type == INSN_NOP) { + if (orig_insn->len == 5) + suffix[0] = 'q'; + str = strfmt("jmp%-3s %lx <%s+0x%lx>", suffix, + dest_insn->offset, dest_insn->sym->name, + dest_insn->offset - dest_insn->sym->offset); + nops = 0; + } else { + str = strfmt("nop%d", orig_insn->len); + nops = orig_insn->len; + } + + if (!str) + return -1; + + disas_alt_add_insn(dalt, 0, str, 0, nops); + + return 1; +} + +/* + * Disassemble an exception table alternative. + */ +static int disas_alt_extable(struct disas_alt *dalt) +{ + struct instruction *alt_insn; + char *str; + + alt_insn = dalt->alt->insn; + str = strfmt("resume at 0x%lx <%s+0x%lx>", + alt_insn->offset, alt_insn->sym->name, + alt_insn->offset - alt_insn->sym->offset); + if (!str) + return -1; + + disas_alt_add_insn(dalt, 0, str, 0, 0); + + return 1; +} + +/* + * Disassemble an alternative and store instructions in the disas_alt + * structure. 
Return the number of instructions in the alternative. + */ +static int disas_alt_group(struct disas_context *dctx, struct disas_alt *dalt) +{ + struct objtool_file *file; + struct instruction *insn; + int offset; + char *str; + int count; + int nops; + int err; + + file = dctx->file; + count = 0; + offset = 0; + nops = 0; + + alt_for_each_insn(file, DALT_GROUP(dalt), insn) { + + disas_insn_alt(dctx, insn); + str = strdup(disas_result(dctx)); + if (!str) + return -1; + + nops = insn->type == INSN_NOP ? insn->len : 0; + err = disas_alt_add_insn(dalt, count, str, offset, nops); + if (err) + break; + offset += insn->len; + count++; + } + + return count; +} + +/* + * Disassemble the default alternative. + */ +static int disas_alt_default(struct disas_context *dctx, struct disas_alt *dalt) +{ + char *str; + int nops; + int err; + + if (DALT_GROUP(dalt)) + return disas_alt_group(dctx, dalt); + + /* + * Default alternative with no alt_group: this is the default + * code associated with either a jump table or an exception + * table and no other instruction alternatives. In that case + * the default alternative is made of a single instruction. + */ + disas_insn(dctx, dalt->orig_insn); + str = strdup(disas_result(dctx)); + if (!str) + return -1; + nops = dalt->orig_insn->type == INSN_NOP ? dalt->orig_insn->len : 0; + err = disas_alt_add_insn(dalt, 0, str, 0, nops); + if (err) + return -1; + + return 1; +} + +/* + * For each alternative, if there is an instruction at the specified + * offset then print this instruction, otherwise print a blank entry. + * The offset is an offset from the start of the alternative. + * + * Return the offset for the next instructions to print, or -1 if all + * instructions have been printed. 
+ */ +static int disas_alt_print_insn(struct disas_alt *dalts, int alt_count, + int insn_count, int offset) +{ + struct disas_alt *dalt; + int offset_next; + char *str; + int i, j; + + offset_next = -1; + + for (i = 0; i < alt_count; i++) { + dalt = &dalts[i]; + j = dalt->insn_idx; + if (j == -1) { + printf("| %-*s ", dalt->width, ""); + continue; + } + + if (dalt->insn[j].offset == offset) { + str = dalt->insn[j].str; + printf("| %-*s ", dalt->width, str ?: ""); + if (++j < insn_count) { + dalt->insn_idx = j; + } else { + dalt->insn_idx = -1; + continue; + } + } else { + printf("| %-*s ", dalt->width, ""); + } + + if (dalt->insn[j].offset > 0 && + (offset_next == -1 || + (dalt->insn[j].offset < offset_next))) + offset_next = dalt->insn[j].offset; + } + printf("\n"); + + return offset_next; +} + +/* + * Print all alternatives side-by-side. + */ +static void disas_alt_print_wide(char *alt_name, struct disas_alt *dalts, int alt_count, + int insn_count) +{ + struct instruction *orig_insn; + int offset_next; + int offset; + int i; + + orig_insn = dalts[0].orig_insn; + + /* + * Print an header with the name of each alternative. + */ + disas_print_info(stdout, orig_insn, -2, NULL); + + if (strlen(alt_name) > dalts[0].width) + dalts[0].width = strlen(alt_name); + printf("| %-*s ", dalts[0].width, alt_name); + + for (i = 1; i < alt_count; i++) + printf("| %-*s ", dalts[i].width, dalts[i].name); + + printf("\n"); + + /* + * Print instructions for each alternative. + */ + offset_next = 0; + do { + offset = offset_next; + disas_print(stdout, orig_insn->sec, orig_insn->offset + offset, + -2, NULL); + offset_next = disas_alt_print_insn(dalts, alt_count, insn_count, + offset); + } while (offset_next > offset); +} + +/* + * Print all alternatives one above the other. 
+ */ +static void disas_alt_print_compact(char *alt_name, struct disas_alt *dalts, + int alt_count, int insn_count) +{ + struct instruction *orig_insn; + int width; + int i, j; + int len; + + orig_insn = dalts[0].orig_insn; + + len = disas_print(stdout, orig_insn->sec, orig_insn->offset, 0, NULL); + printf("%s\n", alt_name); + + /* + * If all alternatives have a single instruction then print each + * alternative on a single line. Otherwise, print alternatives + * one above the other with a clear separation. + */ + + if (insn_count == 1) { + width = 0; + for (i = 0; i < alt_count; i++) { + if (dalts[i].width > width) + width = dalts[i].width; + } + + for (i = 0; i < alt_count; i++) { + printf("%*s= %-*s (if %s)\n", len, "", width, + dalts[i].insn[0].str, dalts[i].name); + } + + return; + } + + for (i = 0; i < alt_count; i++) { + printf("%*s= %s\n", len, "", dalts[i].name); + for (j = 0; j < insn_count; j++) { + if (!dalts[i].insn[j].str) + break; + disas_print(stdout, orig_insn->sec, + orig_insn->offset + dalts[i].insn[j].offset, 0, + "| %s\n", dalts[i].insn[j].str); + } + printf("%*s|\n", len, ""); + } +} + +/* + * Trim NOPs in alternatives. This replaces trailing NOPs in alternatives + * with a single indication of the number of bytes covered with NOPs. + * + * Return the maximum numbers of instructions in all alternatives after + * trailing NOPs have been trimmed. 
+ */ +static int disas_alt_trim_nops(struct disas_alt *dalts, int alt_count, + int insn_count) +{ + struct disas_alt *dalt; + int nops_count; + const char *s; + int offset; + int count; + int nops; + int i, j; + + count = 0; + for (i = 0; i < alt_count; i++) { + offset = 0; + nops = 0; + nops_count = 0; + dalt = &dalts[i]; + for (j = insn_count - 1; j >= 0; j--) { + if (!dalt->insn[j].str || !dalt->insn[j].nops) + break; + offset = dalt->insn[j].offset; + free(dalt->insn[j].str); + dalt->insn[j].offset = 0; + dalt->insn[j].str = NULL; + nops += dalt->insn[j].nops; + nops_count++; + } + + /* + * All trailing NOPs have been removed. If there was a single + * NOP instruction then re-add it. If there was a block of + * NOPs then indicate the number of bytes than the block + * covers (nop*<number-of-bytes>). + */ + if (nops_count) { + s = nops_count == 1 ? "" : "*"; + dalt->insn[j + 1].str = strfmt("nop%s%d", s, nops); + dalt->insn[j + 1].offset = offset; + dalt->insn[j + 1].nops = nops; + j++; + } + + if (j > count) + count = j; + } + + return count + 1; +} + +/* + * Disassemble an alternative. + * + * Return the last instruction in the default alternative so that + * disassembly can continue with the next instruction. Return NULL + * on error. + */ +static void *disas_alt(struct disas_context *dctx, + struct instruction *orig_insn) +{ + struct disas_alt dalts[DISAS_ALT_MAX] = { 0 }; + struct instruction *last_insn = NULL; + struct alternative *alt; + struct disas_alt *dalt; + int insn_count = 0; + int alt_count = 0; + char *alt_name; + int count; + int i, j; + int err; + + alt_name = strfmt("<%s.%lx>", disas_alt_type_name(orig_insn), + orig_insn->offset); + if (!alt_name) { + WARN("Failed to define name for alternative at instruction 0x%lx", + orig_insn->offset); + goto done; + } + + /* + * Initialize and disassemble the default alternative. 
+ */ + err = disas_alt_init(&dalts[0], orig_insn, NULL); + if (err) { + WARN("%s: failed to initialize default alternative", alt_name); + goto done; + } + + insn_count = disas_alt_default(dctx, &dalts[0]); + if (insn_count < 0) { + WARN("%s: failed to disassemble default alternative", alt_name); + goto done; + } + + /* + * Initialize and disassemble all other alternatives. + */ + i = 1; + for (alt = orig_insn->alts; alt; alt = alt->next) { + if (i >= DISAS_ALT_MAX) { + WARN("%s has more alternatives than supported", alt_name); + break; + } + + dalt = &dalts[i]; + err = disas_alt_init(dalt, orig_insn, alt); + if (err) { + WARN("%s: failed to disassemble alternative", alt_name); + goto done; + } + + count = -1; + switch (dalt->alt->type) { + case ALT_TYPE_INSTRUCTIONS: + count = disas_alt_group(dctx, dalt); + break; + case ALT_TYPE_EX_TABLE: + count = disas_alt_extable(dalt); + break; + case ALT_TYPE_JUMP_TABLE: + count = disas_alt_jump(dalt); + break; + } + if (count < 0) { + WARN("%s: failed to disassemble alternative %s", + alt_name, dalt->name); + goto done; + } + + insn_count = count > insn_count ? count : insn_count; + i++; + } + alt_count = i; + + /* + * Print default and non-default alternatives. + */ + + insn_count = disas_alt_trim_nops(dalts, alt_count, insn_count); + + if (opts.wide) + disas_alt_print_wide(alt_name, dalts, alt_count, insn_count); + else + disas_alt_print_compact(alt_name, dalts, alt_count, insn_count); + + last_insn = orig_insn->alt_group ? orig_insn->alt_group->last_insn : + orig_insn; + +done: + for (i = 0; i < alt_count; i++) { + free(dalts[i].name); + for (j = 0; j < insn_count; j++) + free(dalts[i].insn[j].str); + } + + free(alt_name); + + return last_insn; +} + +/* + * Disassemble a function. 
+ */ +static void disas_func(struct disas_context *dctx, struct symbol *func) +{ + struct instruction *insn_start; + struct instruction *insn; + + printf("%s:\n", func->name); + sym_for_each_insn(dctx->file, func, insn) { + if (insn->alts) { + insn_start = insn; + insn = disas_alt(dctx, insn); + if (insn) + continue; + /* + * There was an error with disassembling + * the alternative. Resume disassembling + * at the current instruction, this will + * disassemble the default alternative + * only and continue with the code after + * the alternative. + */ + insn = insn_start; + } + + DISAS_PRINSN(dctx, insn, 0); + } + printf("\n"); +} + +/* + * Disassemble all warned functions. + */ +void disas_warned_funcs(struct disas_context *dctx) +{ + struct symbol *sym; + + if (!dctx) + return; + + for_each_sym(dctx->file->elf, sym) { + if (sym->warned) + disas_func(dctx, sym); + } +} + +void disas_funcs(struct disas_context *dctx) +{ + bool disas_all = !strcmp(opts.disas, "*"); + struct section *sec; + struct symbol *sym; + + for_each_sec(dctx->file->elf, sec) { + + if (!(sec->sh.sh_flags & SHF_EXECINSTR)) + continue; + + sec_for_each_sym(sec, sym) { + /* + * If the function had a warning and the verbose + * option is used then the function was already + * disassemble. + */ + if (opts.verbose && sym->warned) + continue; + + if (disas_all || fnmatch(opts.disas, sym->name, 0) == 0) + disas_func(dctx, sym); + } + } +} diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index ca5d77db692a..6a8ed9c62323 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -16,12 +16,17 @@ #include <string.h> #include <unistd.h> #include <errno.h> +#include <libgen.h> +#include <ctype.h> #include <linux/interval_tree_generic.h> #include <objtool/builtin.h> - #include <objtool/elf.h> #include <objtool/warn.h> +#define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1)) +#define ALIGN_UP_POW2(x) (1U << ((8 * sizeof(x)) - __builtin_clz((x) - 1U))) +#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) + static inline u32 str_hash(const char *str) { return jhash(str, strlen(str), 0); @@ -92,11 +97,12 @@ static inline unsigned long __sym_start(struct symbol *s) static inline unsigned long __sym_last(struct symbol *s) { - return s->offset + s->len - 1; + return s->offset + (s->len ? s->len - 1 : 0); } INTERVAL_TREE_DEFINE(struct symbol, node, unsigned long, __subtree_last, - __sym_start, __sym_last, static, __sym) + __sym_start, __sym_last, static inline __maybe_unused, + __sym) #define __sym_for_each(_iter, _tree, _start, _end) \ for (_iter = __sym_iter_first((_tree), (_start), (_end)); \ @@ -108,7 +114,7 @@ struct symbol_hole { }; /* - * Find !section symbol where @offset is after it. + * Find the last symbol before @offset. */ static int symbol_hole_by_offset(const void *key, const struct rb_node *node) { @@ -119,8 +125,7 @@ static int symbol_hole_by_offset(const void *key, const struct rb_node *node) return -1; if (sh->key >= s->offset + s->len) { - if (s->type != STT_SECTION) - sh->sym = s; + sh->sym = s; return 1; } @@ -167,11 +172,11 @@ static struct symbol *find_symbol_by_index(struct elf *elf, unsigned int idx) struct symbol *find_symbol_by_offset(struct section *sec, unsigned long offset) { struct rb_root_cached *tree = (struct rb_root_cached *)&sec->symbol_tree; - struct symbol *iter; + struct symbol *sym; - __sym_for_each(iter, tree, offset, offset) { - if (iter->offset == offset && iter->type != STT_SECTION) - return iter; + __sym_for_each(sym, tree, offset, offset) { + if (sym->offset == offset && !is_sec_sym(sym)) + return sym->alias; } return NULL; @@ -180,11 +185,11 @@ struct symbol *find_symbol_by_offset(struct section *sec, unsigned long offset) struct symbol *find_func_by_offset(struct section *sec, unsigned long offset) { struct rb_root_cached *tree = (struct rb_root_cached *)&sec->symbol_tree; - struct symbol *iter; + struct symbol *func; - __sym_for_each(iter, tree, offset, offset) { - if (iter->offset == offset && iter->type == 
STT_FUNC) - return iter; + __sym_for_each(func, tree, offset, offset) { + if (func->offset == offset && is_func_sym(func)) + return func->alias; } return NULL; @@ -193,14 +198,29 @@ struct symbol *find_func_by_offset(struct section *sec, unsigned long offset) struct symbol *find_symbol_containing(const struct section *sec, unsigned long offset) { struct rb_root_cached *tree = (struct rb_root_cached *)&sec->symbol_tree; - struct symbol *iter; + struct symbol *sym = NULL, *tmp; - __sym_for_each(iter, tree, offset, offset) { - if (iter->type != STT_SECTION) - return iter; + __sym_for_each(tmp, tree, offset, offset) { + if (tmp->len) { + if (!sym) { + sym = tmp; + continue; + } + + if (sym->offset != tmp->offset || sym->len != tmp->len) { + /* + * In the rare case of overlapping symbols, + * pick the smaller one. + * + * TODO: outlaw overlapping symbols + */ + if (tmp->len < sym->len) + sym = tmp; + } + } } - return NULL; + return sym ? sym->alias : NULL; } /* @@ -246,11 +266,11 @@ int find_symbol_hole_containing(const struct section *sec, unsigned long offset) struct symbol *find_func_containing(struct section *sec, unsigned long offset) { struct rb_root_cached *tree = (struct rb_root_cached *)&sec->symbol_tree; - struct symbol *iter; + struct symbol *func; - __sym_for_each(iter, tree, offset, offset) { - if (iter->type == STT_FUNC) - return iter; + __sym_for_each(func, tree, offset, offset) { + if (is_func_sym(func)) + return func->alias; } return NULL; @@ -268,6 +288,35 @@ struct symbol *find_symbol_by_name(const struct elf *elf, const char *name) return NULL; } +/* Find local symbol with matching STT_FILE */ +static struct symbol *find_local_symbol_by_file_and_name(const struct elf *elf, + struct symbol *file, + const char *name) +{ + struct symbol *sym; + + elf_hash_for_each_possible(symbol_name, sym, name_hash, str_hash(name)) { + if (sym->bind == STB_LOCAL && sym->file == file && + !strcmp(sym->name, name)) { + return sym; + } + } + + return NULL; +} + +struct 
symbol *find_global_symbol_by_name(const struct elf *elf, const char *name) +{ + struct symbol *sym; + + elf_hash_for_each_possible(symbol_name, sym, name_hash, str_hash(name)) { + if (!strcmp(sym->name, name) && !is_local_sym(sym)) + return sym; + } + + return NULL; +} + struct reloc *find_reloc_by_dest_range(const struct elf *elf, struct section *sec, unsigned long offset, unsigned int len) { @@ -358,14 +407,14 @@ static int read_sections(struct elf *elf) return -1; } - if (sec->sh.sh_size != 0 && !is_dwarf_section(sec)) { + if (sec_size(sec) != 0 && !is_dwarf_section(sec)) { sec->data = elf_getdata(s, NULL); if (!sec->data) { ERROR_ELF("elf_getdata"); return -1; } if (sec->data->d_off != 0 || - sec->data->d_size != sec->sh.sh_size) { + sec->data->d_size != sec_size(sec)) { ERROR("unexpected data attributes for %s", sec->name); return -1; } @@ -393,7 +442,38 @@ static int read_sections(struct elf *elf) return 0; } -static void elf_add_symbol(struct elf *elf, struct symbol *sym) +static const char *demangle_name(struct symbol *sym) +{ + char *str; + + if (!is_local_sym(sym)) + return sym->name; + + if (!is_func_sym(sym) && !is_object_sym(sym)) + return sym->name; + + if (!strstarts(sym->name, "__UNIQUE_ID_") && !strchr(sym->name, '.')) + return sym->name; + + str = strdup(sym->name); + if (!str) { + ERROR_GLIBC("strdup"); + return NULL; + } + + for (int i = strlen(str) - 1; i >= 0; i--) { + char c = str[i]; + + if (!isdigit(c) && c != '.') { + str[i + 1] = '\0'; + break; + } + } + + return str; +} + +static int elf_add_symbol(struct elf *elf, struct symbol *sym) { struct list_head *entry; struct rb_node *pnode; @@ -405,14 +485,15 @@ static void elf_add_symbol(struct elf *elf, struct symbol *sym) sym->type = GELF_ST_TYPE(sym->sym.st_info); sym->bind = GELF_ST_BIND(sym->sym.st_info); - if (sym->type == STT_FILE) + if (is_file_sym(sym)) elf->num_files++; sym->offset = sym->sym.st_value; sym->len = sym->sym.st_size; __sym_for_each(iter, &sym->sec->symbol_tree, 
sym->offset, sym->offset) { - if (iter->offset == sym->offset && iter->type == sym->type) + if (!is_undef_sym(iter) && iter->offset == sym->offset && + iter->type == sym->type && iter->len == sym->len) iter->alias = sym; } @@ -423,21 +504,44 @@ static void elf_add_symbol(struct elf *elf, struct symbol *sym) else entry = &sym->sec->symbol_list; list_add(&sym->list, entry); + + list_add_tail(&sym->global_list, &elf->symbols); elf_hash_add(symbol, &sym->hash, sym->idx); elf_hash_add(symbol_name, &sym->name_hash, str_hash(sym->name)); - /* - * Don't store empty STT_NOTYPE symbols in the rbtree. They - * can exist within a function, confusing the sorting. - */ - if (!sym->len) - __sym_remove(sym, &sym->sec->symbol_tree); + if (is_func_sym(sym) && + (strstarts(sym->name, "__pfx_") || + strstarts(sym->name, "__cfi_") || + strstarts(sym->name, "__pi___pfx_") || + strstarts(sym->name, "__pi___cfi_"))) + sym->prefix = 1; + + if (strstarts(sym->name, ".klp.sym")) + sym->klp = 1; + + if (!sym->klp && !is_sec_sym(sym) && strstr(sym->name, ".cold")) { + sym->cold = 1; + + /* + * Clang doesn't mark cold subfunctions as STT_FUNC, which + * breaks several objtool assumptions. Fake it. 
+ */ + sym->type = STT_FUNC; + } + + sym->pfunc = sym->cfunc = sym; + + sym->demangled_name = demangle_name(sym); + if (!sym->demangled_name) + return -1; + + return 0; } static int read_symbols(struct elf *elf) { struct section *symtab, *symtab_shndx, *sec; - struct symbol *sym, *pfunc; + struct symbol *sym, *pfunc, *file = NULL; int symbols_nr, i; char *coldstr; Elf_Data *shndx_data = NULL; @@ -469,6 +573,9 @@ static int read_symbols(struct elf *elf) ERROR_GLIBC("calloc"); return -1; } + + INIT_LIST_HEAD(&elf->symbols); + for (i = 0; i < symbols_nr; i++) { sym = &elf->symbol_data[i]; @@ -477,14 +584,14 @@ static int read_symbols(struct elf *elf) if (!gelf_getsymshndx(symtab->data, shndx_data, i, &sym->sym, &shndx)) { ERROR_ELF("gelf_getsymshndx"); - goto err; + return -1; } sym->name = elf_strptr(elf->elf, symtab->sh.sh_link, sym->sym.st_name); if (!sym->name) { ERROR_ELF("elf_strptr"); - goto err; + return -1; } if ((sym->sym.st_shndx > SHN_UNDEF && @@ -496,7 +603,7 @@ static int read_symbols(struct elf *elf) sym->sec = find_section_by_index(elf, shndx); if (!sym->sec) { ERROR("couldn't find section for symbol %s", sym->name); - goto err; + return -1; } if (GELF_ST_TYPE(sym->sym.st_info) == STT_SECTION) { sym->name = sym->sec->name; @@ -505,7 +612,13 @@ static int read_symbols(struct elf *elf) } else sym->sec = find_section_by_index(elf, 0); - elf_add_symbol(elf, sym); + if (elf_add_symbol(elf, sym)) + return -1; + + if (sym->type == STT_FILE) + file = sym; + else if (sym->bind == STB_LOCAL) + sym->file = file; } if (opts.stats) { @@ -518,18 +631,15 @@ static int read_symbols(struct elf *elf) sec_for_each_sym(sec, sym) { char *pname; size_t pnamelen; - if (sym->type != STT_FUNC) - continue; - - if (sym->pfunc == NULL) - sym->pfunc = sym; - if (sym->cfunc == NULL) - sym->cfunc = sym; + if (!sym->cold) + continue; coldstr = strstr(sym->name, ".cold"); - if (!coldstr) - continue; + if (!coldstr) { + ERROR("%s(): cold subfunction without \".cold\"?", sym->name); + 
return -1; + } pnamelen = coldstr - sym->name; pname = strndup(sym->name, pnamelen); @@ -538,7 +648,9 @@ static int read_symbols(struct elf *elf) return -1; } - pfunc = find_symbol_by_name(elf, pname); + pfunc = find_local_symbol_by_file_and_name(elf, sym->file, pname); + if (!pfunc) + pfunc = find_global_symbol_by_name(elf, pname); free(pname); if (!pfunc) { @@ -546,8 +658,9 @@ static int read_symbols(struct elf *elf) return -1; } - sym->pfunc = pfunc; + sym->pfunc = pfunc->alias; pfunc->cfunc = sym; + pfunc->alias->cfunc = sym; /* * Unfortunately, -fnoreorder-functions puts the child @@ -566,10 +679,6 @@ static int read_symbols(struct elf *elf) } return 0; - -err: - free(sym); - return -1; } static int mark_group_syms(struct elf *elf) @@ -583,7 +692,7 @@ static int mark_group_syms(struct elf *elf) return -1; } - list_for_each_entry(sec, &elf->sections, list) { + for_each_sec(elf, sec) { if (sec->sh.sh_type == SHT_GROUP && sec->sh.sh_link == symtab->idx) { sym = find_symbol_by_index(elf, sec->sh.sh_info); @@ -624,7 +733,7 @@ static int elf_update_sym_relocs(struct elf *elf, struct symbol *sym) static int elf_update_symbol(struct elf *elf, struct section *symtab, struct section *symtab_shndx, struct symbol *sym) { - Elf32_Word shndx = sym->sec ? sym->sec->idx : SHN_UNDEF; + Elf32_Word shndx; Elf_Data *symtab_data = NULL, *shndx_data = NULL; Elf64_Xword entsize = symtab->sh.sh_entsize; int max_idx, idx = sym->idx; @@ -632,8 +741,7 @@ static int elf_update_symbol(struct elf *elf, struct section *symtab, bool is_special_shndx = sym->sym.st_shndx >= SHN_LORESERVE && sym->sym.st_shndx != SHN_XINDEX; - if (is_special_shndx) - shndx = sym->sym.st_shndx; + shndx = is_special_shndx ? 
sym->sym.st_shndx : sym->sec->idx; s = elf_getscn(elf->elf, symtab->idx); if (!s) { @@ -731,7 +839,7 @@ static int elf_update_symbol(struct elf *elf, struct section *symtab, } /* setup extended section index magic and write the symbol */ - if ((shndx >= SHN_UNDEF && shndx < SHN_LORESERVE) || is_special_shndx) { + if (shndx < SHN_LORESERVE || is_special_shndx) { sym->sym.st_shndx = shndx; if (!shndx_data) shndx = 0; @@ -751,24 +859,58 @@ static int elf_update_symbol(struct elf *elf, struct section *symtab, return 0; } -static struct symbol * -__elf_create_symbol(struct elf *elf, struct symbol *sym) +struct symbol *elf_create_symbol(struct elf *elf, const char *name, + struct section *sec, unsigned int bind, + unsigned int type, unsigned long offset, + size_t size) { struct section *symtab, *symtab_shndx; Elf32_Word first_non_local, new_idx; - struct symbol *old; + struct symbol *old, *sym; - symtab = find_section_by_name(elf, ".symtab"); - if (symtab) { - symtab_shndx = find_section_by_name(elf, ".symtab_shndx"); + sym = calloc(1, sizeof(*sym)); + if (!sym) { + ERROR_GLIBC("calloc"); + return NULL; + } + + sym->name = strdup(name); + if (!sym->name) { + ERROR_GLIBC("strdup"); + return NULL; + } + + if (type != STT_SECTION) { + sym->sym.st_name = elf_add_string(elf, NULL, sym->name); + if (sym->sym.st_name == -1) + return NULL; + } + + if (sec) { + sym->sec = sec; } else { + sym->sec = find_section_by_index(elf, 0); + if (!sym->sec) { + ERROR("no NULL section"); + return NULL; + } + } + + sym->sym.st_info = GELF_ST_INFO(bind, type); + sym->sym.st_value = offset; + sym->sym.st_size = size; + + symtab = find_section_by_name(elf, ".symtab"); + if (!symtab) { ERROR("no .symtab"); return NULL; } + symtab_shndx = find_section_by_name(elf, ".symtab_shndx"); + new_idx = sec_num_entries(symtab); - if (GELF_ST_BIND(sym->sym.st_info) != STB_LOCAL) + if (bind != STB_LOCAL) goto non_local; /* @@ -806,10 +948,8 @@ __elf_create_symbol(struct elf *elf, struct symbol *sym) non_local: 
sym->idx = new_idx; - if (elf_update_symbol(elf, symtab, symtab_shndx, sym)) { - ERROR("elf_update_symbol"); + if (sym->idx && elf_update_symbol(elf, symtab, symtab_shndx, sym)) return NULL; - } symtab->sh.sh_size += symtab->sh.sh_entsize; mark_sec_changed(elf, symtab, true); @@ -819,70 +959,28 @@ non_local: mark_sec_changed(elf, symtab_shndx, true); } - return sym; -} - -static struct symbol * -elf_create_section_symbol(struct elf *elf, struct section *sec) -{ - struct symbol *sym = calloc(1, sizeof(*sym)); - - if (!sym) { - ERROR_GLIBC("malloc"); + if (elf_add_symbol(elf, sym)) return NULL; - } - - sym->name = sec->name; - sym->sec = sec; - - // st_name 0 - sym->sym.st_info = GELF_ST_INFO(STB_LOCAL, STT_SECTION); - // st_other 0 - // st_value 0 - // st_size 0 - - sym = __elf_create_symbol(elf, sym); - if (sym) - elf_add_symbol(elf, sym); return sym; } -static int elf_add_string(struct elf *elf, struct section *strtab, char *str); - -struct symbol * -elf_create_prefix_symbol(struct elf *elf, struct symbol *orig, long size) +struct symbol *elf_create_section_symbol(struct elf *elf, struct section *sec) { struct symbol *sym = calloc(1, sizeof(*sym)); - size_t namelen = strlen(orig->name) + sizeof("__pfx_"); - char *name = malloc(namelen); - if (!sym || !name) { - ERROR_GLIBC("malloc"); + sym = elf_create_symbol(elf, sec->name, sec, STB_LOCAL, STT_SECTION, 0, 0); + if (!sym) return NULL; - } - snprintf(name, namelen, "__pfx_%s", orig->name); - - sym->name = name; - sym->sec = orig->sec; - - sym->sym.st_name = elf_add_string(elf, NULL, name); - sym->sym.st_info = orig->sym.st_info; - sym->sym.st_value = orig->sym.st_value - size; - sym->sym.st_size = size; - - sym = __elf_create_symbol(elf, sym); - if (sym) - elf_add_symbol(elf, sym); + sec->sym = sym; return sym; } -static struct reloc *elf_init_reloc(struct elf *elf, struct section *rsec, - unsigned int reloc_idx, - unsigned long offset, struct symbol *sym, - s64 addend, unsigned int type) +struct reloc 
*elf_init_reloc(struct elf *elf, struct section *rsec, + unsigned int reloc_idx, unsigned long offset, + struct symbol *sym, s64 addend, unsigned int type) { struct reloc *reloc, empty = { 0 }; @@ -922,9 +1020,9 @@ struct reloc *elf_init_reloc_text_sym(struct elf *elf, struct section *sec, unsigned long insn_off) { struct symbol *sym = insn_sec->sym; - int addend = insn_off; + s64 addend = insn_off; - if (!(insn_sec->sh.sh_flags & SHF_EXECINSTR)) { + if (!is_text_sec(insn_sec)) { ERROR("bad call to %s() for data symbol %s", __func__, sym->name); return NULL; } @@ -939,8 +1037,6 @@ struct reloc *elf_init_reloc_text_sym(struct elf *elf, struct section *sec, sym = elf_create_section_symbol(elf, insn_sec); if (!sym) return NULL; - - insn_sec->sym = sym; } return elf_init_reloc(elf, sec->rsec, reloc_idx, offset, sym, addend, @@ -953,7 +1049,7 @@ struct reloc *elf_init_reloc_data_sym(struct elf *elf, struct section *sec, struct symbol *sym, s64 addend) { - if (sym->sec && (sec->sh.sh_flags & SHF_EXECINSTR)) { + if (is_text_sec(sec)) { ERROR("bad call to %s() for text symbol %s", __func__, sym->name); return NULL; } @@ -986,12 +1082,16 @@ static int read_relocs(struct elf *elf) rsec->base->rsec = rsec; - nr_reloc = 0; + /* nr_alloc_relocs=0: libelf owns d_buf */ + rsec->nr_alloc_relocs = 0; + rsec->relocs = calloc(sec_num_entries(rsec), sizeof(*reloc)); if (!rsec->relocs) { ERROR_GLIBC("calloc"); return -1; } + + nr_reloc = 0; for (i = 0; i < sec_num_entries(rsec); i++) { reloc = &rsec->relocs[i]; @@ -1044,6 +1144,12 @@ struct elf *elf_open_read(const char *name, int flags) goto err; } + elf->name = strdup(name); + if (!elf->name) { + ERROR_GLIBC("strdup"); + return NULL; + } + if ((flags & O_ACCMODE) == O_RDONLY) cmd = ELF_C_READ_MMAP; else if ((flags & O_ACCMODE) == O_RDWR) @@ -1081,11 +1187,142 @@ err: return NULL; } -static int elf_add_string(struct elf *elf, struct section *strtab, char *str) +struct elf *elf_create_file(GElf_Ehdr *ehdr, const char *name) { - 
Elf_Data *data; - Elf_Scn *s; - int len; + struct section *null, *symtab, *strtab, *shstrtab; + char *dir, *base, *tmp_name; + struct symbol *sym; + struct elf *elf; + + elf_version(EV_CURRENT); + + elf = calloc(1, sizeof(*elf)); + if (!elf) { + ERROR_GLIBC("calloc"); + return NULL; + } + + INIT_LIST_HEAD(&elf->sections); + + dir = strdup(name); + if (!dir) { + ERROR_GLIBC("strdup"); + return NULL; + } + + dir = dirname(dir); + + base = strdup(name); + if (!base) { + ERROR_GLIBC("strdup"); + return NULL; + } + + base = basename(base); + + tmp_name = malloc(256); + if (!tmp_name) { + ERROR_GLIBC("malloc"); + return NULL; + } + + snprintf(tmp_name, 256, "%s/%s.XXXXXX", dir, base); + + elf->fd = mkstemp(tmp_name); + if (elf->fd == -1) { + ERROR_GLIBC("can't create tmp file"); + exit(1); + } + + elf->tmp_name = tmp_name; + + elf->name = strdup(name); + if (!elf->name) { + ERROR_GLIBC("strdup"); + return NULL; + } + + elf->elf = elf_begin(elf->fd, ELF_C_WRITE, NULL); + if (!elf->elf) { + ERROR_ELF("elf_begin"); + return NULL; + } + + if (!gelf_newehdr(elf->elf, ELFCLASS64)) { + ERROR_ELF("gelf_newehdr"); + return NULL; + } + + memcpy(&elf->ehdr, ehdr, sizeof(elf->ehdr)); + + if (!gelf_update_ehdr(elf->elf, &elf->ehdr)) { + ERROR_ELF("gelf_update_ehdr"); + return NULL; + } + + INIT_LIST_HEAD(&elf->symbols); + + if (!elf_alloc_hash(section, 1000) || + !elf_alloc_hash(section_name, 1000) || + !elf_alloc_hash(symbol, 10000) || + !elf_alloc_hash(symbol_name, 10000) || + !elf_alloc_hash(reloc, 100000)) + return NULL; + + null = elf_create_section(elf, NULL, 0, 0, SHT_NULL, 0, 0); + shstrtab = elf_create_section(elf, NULL, 0, 0, SHT_STRTAB, 1, 0); + strtab = elf_create_section(elf, NULL, 0, 0, SHT_STRTAB, 1, 0); + + if (!null || !shstrtab || !strtab) + return NULL; + + null->name = ""; + shstrtab->name = ".shstrtab"; + strtab->name = ".strtab"; + + null->sh.sh_name = elf_add_string(elf, shstrtab, null->name); + shstrtab->sh.sh_name = elf_add_string(elf, shstrtab, 
shstrtab->name); + strtab->sh.sh_name = elf_add_string(elf, shstrtab, strtab->name); + + if (null->sh.sh_name == -1 || shstrtab->sh.sh_name == -1 || strtab->sh.sh_name == -1) + return NULL; + + elf_hash_add(section_name, &null->name_hash, str_hash(null->name)); + elf_hash_add(section_name, &strtab->name_hash, str_hash(strtab->name)); + elf_hash_add(section_name, &shstrtab->name_hash, str_hash(shstrtab->name)); + + if (elf_add_string(elf, strtab, "") == -1) + return NULL; + + symtab = elf_create_section(elf, ".symtab", 0x18, 0x18, SHT_SYMTAB, 0x8, 0); + if (!symtab) + return NULL; + + symtab->sh.sh_link = strtab->idx; + symtab->sh.sh_info = 1; + + elf->ehdr.e_shstrndx = shstrtab->idx; + if (!gelf_update_ehdr(elf->elf, &elf->ehdr)) { + ERROR_ELF("gelf_update_ehdr"); + return NULL; + } + + sym = calloc(1, sizeof(*sym)); + if (!sym) { + ERROR_GLIBC("calloc"); + return NULL; + } + + sym->name = ""; + sym->sec = null; + elf_add_symbol(elf, sym); + + return elf; +} + +unsigned int elf_add_string(struct elf *elf, struct section *strtab, const char *str) +{ + unsigned int offset; if (!strtab) strtab = find_section_by_name(elf, ".strtab"); @@ -1094,76 +1331,109 @@ static int elf_add_string(struct elf *elf, struct section *strtab, char *str) return -1; } - s = elf_getscn(elf->elf, strtab->idx); + if (!strtab->sh.sh_addralign) { + ERROR("'%s': invalid sh_addralign", strtab->name); + return -1; + } + + offset = ALIGN_UP(strtab->sh.sh_size, strtab->sh.sh_addralign); + + if (!elf_add_data(elf, strtab, str, strlen(str) + 1)) + return -1; + + return offset; +} + +void *elf_add_data(struct elf *elf, struct section *sec, const void *data, size_t size) +{ + unsigned long offset; + Elf_Scn *s; + + if (!sec->sh.sh_addralign) { + ERROR("'%s': invalid sh_addralign", sec->name); + return NULL; + } + + s = elf_getscn(elf->elf, sec->idx); if (!s) { ERROR_ELF("elf_getscn"); - return -1; + return NULL; } - data = elf_newdata(s); - if (!data) { + sec->data = elf_newdata(s); + if (!sec->data) { 
ERROR_ELF("elf_newdata"); - return -1; + return NULL; } - data->d_buf = str; - data->d_size = strlen(str) + 1; - data->d_align = 1; + sec->data->d_buf = calloc(1, size); + if (!sec->data->d_buf) { + ERROR_GLIBC("calloc"); + return NULL; + } - len = strtab->sh.sh_size; - strtab->sh.sh_size += data->d_size; + if (data) + memcpy(sec->data->d_buf, data, size); - mark_sec_changed(elf, strtab, true); + sec->data->d_size = size; + sec->data->d_align = 1; - return len; + offset = ALIGN_UP(sec->sh.sh_size, sec->sh.sh_addralign); + sec->sh.sh_size = offset + size; + + mark_sec_changed(elf, sec, true); + + return sec->data->d_buf; } struct section *elf_create_section(struct elf *elf, const char *name, - size_t entsize, unsigned int nr) + size_t size, size_t entsize, + unsigned int type, unsigned int align, + unsigned int flags) { struct section *sec, *shstrtab; - size_t size = entsize * nr; Elf_Scn *s; - sec = malloc(sizeof(*sec)); + if (name && find_section_by_name(elf, name)) { + ERROR("section '%s' already exists", name); + return NULL; + } + + sec = calloc(1, sizeof(*sec)); if (!sec) { - ERROR_GLIBC("malloc"); + ERROR_GLIBC("calloc"); return NULL; } - memset(sec, 0, sizeof(*sec)); INIT_LIST_HEAD(&sec->symbol_list); + /* don't actually create the section, just the data structures */ + if (type == SHT_NULL) + goto add; + s = elf_newscn(elf->elf); if (!s) { ERROR_ELF("elf_newscn"); return NULL; } - sec->name = strdup(name); - if (!sec->name) { - ERROR_GLIBC("strdup"); - return NULL; - } - sec->idx = elf_ndxscn(s); - sec->data = elf_newdata(s); - if (!sec->data) { - ERROR_ELF("elf_newdata"); - return NULL; - } + if (size) { + sec->data = elf_newdata(s); + if (!sec->data) { + ERROR_ELF("elf_newdata"); + return NULL; + } - sec->data->d_size = size; - sec->data->d_align = 1; + sec->data->d_size = size; + sec->data->d_align = 1; - if (size) { - sec->data->d_buf = malloc(size); + sec->data->d_buf = calloc(1, size); if (!sec->data->d_buf) { - ERROR_GLIBC("malloc"); + 
ERROR_GLIBC("calloc"); return NULL; } - memset(sec->data->d_buf, 0, size); } if (!gelf_getshdr(s, &sec->sh)) { @@ -1173,34 +1443,152 @@ struct section *elf_create_section(struct elf *elf, const char *name, sec->sh.sh_size = size; sec->sh.sh_entsize = entsize; - sec->sh.sh_type = SHT_PROGBITS; - sec->sh.sh_addralign = 1; - sec->sh.sh_flags = SHF_ALLOC; - - /* Add section name to .shstrtab (or .strtab for Clang) */ - shstrtab = find_section_by_name(elf, ".shstrtab"); - if (!shstrtab) - shstrtab = find_section_by_name(elf, ".strtab"); - if (!shstrtab) { - ERROR("can't find .shstrtab or .strtab section"); - return NULL; + sec->sh.sh_type = type; + sec->sh.sh_addralign = align; + sec->sh.sh_flags = flags; + + if (name) { + sec->name = strdup(name); + if (!sec->name) { + ERROR("strdup"); + return NULL; + } + + /* Add section name to .shstrtab (or .strtab for Clang) */ + shstrtab = find_section_by_name(elf, ".shstrtab"); + if (!shstrtab) { + shstrtab = find_section_by_name(elf, ".strtab"); + if (!shstrtab) { + ERROR("can't find .shstrtab or .strtab"); + return NULL; + } + } + sec->sh.sh_name = elf_add_string(elf, shstrtab, sec->name); + if (sec->sh.sh_name == -1) + return NULL; + + elf_hash_add(section_name, &sec->name_hash, str_hash(sec->name)); } - sec->sh.sh_name = elf_add_string(elf, shstrtab, sec->name); - if (sec->sh.sh_name == -1) - return NULL; +add: list_add_tail(&sec->list, &elf->sections); elf_hash_add(section, &sec->hash, sec->idx); - elf_hash_add(section_name, &sec->name_hash, str_hash(sec->name)); mark_sec_changed(elf, sec, true); return sec; } -static struct section *elf_create_rela_section(struct elf *elf, - struct section *sec, - unsigned int reloc_nr) +static int elf_alloc_reloc(struct elf *elf, struct section *rsec) +{ + struct reloc *old_relocs, *old_relocs_end, *new_relocs; + unsigned int nr_relocs_old = sec_num_entries(rsec); + unsigned int nr_relocs_new = nr_relocs_old + 1; + unsigned long nr_alloc; + struct symbol *sym; + + if (!rsec->data) { + 
rsec->data = elf_newdata(elf_getscn(elf->elf, rsec->idx)); + if (!rsec->data) { + ERROR_ELF("elf_newdata"); + return -1; + } + + rsec->data->d_align = 1; + rsec->data->d_type = ELF_T_RELA; + rsec->data->d_buf = NULL; + } + + rsec->data->d_size = nr_relocs_new * elf_rela_size(elf); + rsec->sh.sh_size = rsec->data->d_size; + + nr_alloc = MAX(64, ALIGN_UP_POW2(nr_relocs_new)); + if (nr_alloc <= rsec->nr_alloc_relocs) + return 0; + + if (rsec->data->d_buf && !rsec->nr_alloc_relocs) { + void *orig_buf = rsec->data->d_buf; + + /* + * The original d_buf is owned by libelf so it can't be + * realloced. + */ + rsec->data->d_buf = malloc(nr_alloc * elf_rela_size(elf)); + if (!rsec->data->d_buf) { + ERROR_GLIBC("malloc"); + return -1; + } + memcpy(rsec->data->d_buf, orig_buf, + nr_relocs_old * elf_rela_size(elf)); + } else { + rsec->data->d_buf = realloc(rsec->data->d_buf, + nr_alloc * elf_rela_size(elf)); + if (!rsec->data->d_buf) { + ERROR_GLIBC("realloc"); + return -1; + } + } + + rsec->nr_alloc_relocs = nr_alloc; + + old_relocs = rsec->relocs; + new_relocs = calloc(nr_alloc, sizeof(struct reloc)); + if (!new_relocs) { + ERROR_GLIBC("calloc"); + return -1; + } + + if (!old_relocs) + goto done; + + /* + * The struct reloc's address has changed. Update all the symbols and + * relocs which reference it. 
+ */ + + old_relocs_end = &old_relocs[nr_relocs_old]; + for_each_sym(elf, sym) { + struct reloc *reloc; + + reloc = sym->relocs; + if (!reloc) + continue; + + if (reloc >= old_relocs && reloc < old_relocs_end) + sym->relocs = &new_relocs[reloc - old_relocs]; + + while (1) { + struct reloc *next_reloc = sym_next_reloc(reloc); + + if (!next_reloc) + break; + + if (next_reloc >= old_relocs && next_reloc < old_relocs_end) + set_sym_next_reloc(reloc, &new_relocs[next_reloc - old_relocs]); + + reloc = next_reloc; + } + } + + memcpy(new_relocs, old_relocs, nr_relocs_old * sizeof(struct reloc)); + + for (int i = 0; i < nr_relocs_old; i++) { + struct reloc *old = &old_relocs[i]; + struct reloc *new = &new_relocs[i]; + u32 key = reloc_hash(old); + + elf_hash_del(reloc, &old->hash, key); + elf_hash_add(reloc, &new->hash, key); + } + + free(old_relocs); +done: + rsec->relocs = new_relocs; + return 0; +} + +struct section *elf_create_rela_section(struct elf *elf, struct section *sec, + unsigned int nr_relocs) { struct section *rsec; char *rsec_name; @@ -1213,41 +1601,72 @@ static struct section *elf_create_rela_section(struct elf *elf, strcpy(rsec_name, ".rela"); strcat(rsec_name, sec->name); - rsec = elf_create_section(elf, rsec_name, elf_rela_size(elf), reloc_nr); + rsec = elf_create_section(elf, rsec_name, nr_relocs * elf_rela_size(elf), + elf_rela_size(elf), SHT_RELA, elf_addr_size(elf), + SHF_INFO_LINK); free(rsec_name); if (!rsec) return NULL; - rsec->data->d_type = ELF_T_RELA; - rsec->sh.sh_type = SHT_RELA; - rsec->sh.sh_addralign = elf_addr_size(elf); - rsec->sh.sh_link = find_section_by_name(elf, ".symtab")->idx; - rsec->sh.sh_info = sec->idx; - rsec->sh.sh_flags = SHF_INFO_LINK; + if (nr_relocs) { + rsec->data->d_type = ELF_T_RELA; - rsec->relocs = calloc(sec_num_entries(rsec), sizeof(struct reloc)); - if (!rsec->relocs) { - ERROR_GLIBC("calloc"); - return NULL; + rsec->nr_alloc_relocs = nr_relocs; + rsec->relocs = calloc(nr_relocs, sizeof(struct reloc)); + if 
(!rsec->relocs) { + ERROR_GLIBC("calloc"); + return NULL; + } } + rsec->sh.sh_link = find_section_by_name(elf, ".symtab")->idx; + rsec->sh.sh_info = sec->idx; + sec->rsec = rsec; rsec->base = sec; return rsec; } +struct reloc *elf_create_reloc(struct elf *elf, struct section *sec, + unsigned long offset, + struct symbol *sym, s64 addend, + unsigned int type) +{ + struct section *rsec = sec->rsec; + + if (!rsec) { + rsec = elf_create_rela_section(elf, sec, 0); + if (!rsec) + return NULL; + } + + if (find_reloc_by_dest(elf, sec, offset)) { + ERROR_FUNC(sec, offset, "duplicate reloc"); + return NULL; + } + + if (elf_alloc_reloc(elf, rsec)) + return NULL; + + mark_sec_changed(elf, rsec, true); + + return elf_init_reloc(elf, rsec, sec_num_entries(rsec) - 1, offset, sym, + addend, type); +} + struct section *elf_create_section_pair(struct elf *elf, const char *name, size_t entsize, unsigned int nr, - unsigned int reloc_nr) + unsigned int nr_relocs) { struct section *sec; - sec = elf_create_section(elf, name, entsize, nr); + sec = elf_create_section(elf, name, nr * entsize, entsize, + SHT_PROGBITS, 1, SHF_ALLOC); if (!sec) return NULL; - if (!elf_create_rela_section(elf, sec, reloc_nr)) + if (!elf_create_rela_section(elf, sec, nr_relocs)) return NULL; return sec; @@ -1282,7 +1701,7 @@ int elf_write_insn(struct elf *elf, struct section *sec, */ static int elf_truncate_section(struct elf *elf, struct section *sec) { - u64 size = sec->sh.sh_size; + u64 size = sec_size(sec); bool truncated = false; Elf_Data *data = NULL; Elf_Scn *s; @@ -1296,7 +1715,6 @@ static int elf_truncate_section(struct elf *elf, struct section *sec) for (;;) { /* get next data descriptor for the relevant section */ data = elf_getdata(s, data); - if (!data) { if (size) { ERROR("end of section data but non-zero size left\n"); @@ -1332,8 +1750,8 @@ int elf_write(struct elf *elf) /* Update changed relocation sections and section headers: */ list_for_each_entry(sec, &elf->sections, list) { - if 
(sec->truncate) - elf_truncate_section(elf, sec); + if (sec->truncate && elf_truncate_section(elf, sec)) + return -1; if (sec_changed(sec)) { s = elf_getscn(elf->elf, sec->idx); @@ -1366,7 +1784,7 @@ int elf_write(struct elf *elf) return 0; } -void elf_close(struct elf *elf) +int elf_close(struct elf *elf) { if (elf->elf) elf_end(elf->elf); @@ -1374,8 +1792,12 @@ void elf_close(struct elf *elf) if (elf->fd > 0) close(elf->fd); + if (elf->tmp_name && rename(elf->tmp_name, elf->name)) + return -1; + /* * NOTE: All remaining allocations are leaked on purpose. Objtool is * about to exit anyway. */ + return 0; } diff --git a/tools/objtool/include/objtool/arch.h b/tools/objtool/include/objtool/arch.h index be33c7b43180..8866158975fc 100644 --- a/tools/objtool/include/objtool/arch.h +++ b/tools/objtool/include/objtool/arch.h @@ -71,7 +71,7 @@ struct stack_op { struct instruction; -int arch_ftrace_match(char *name); +int arch_ftrace_match(const char *name); void arch_initial_func_cfi_state(struct cfi_init_state *state); @@ -83,7 +83,8 @@ bool arch_callee_saved_reg(unsigned char reg); unsigned long arch_jump_destination(struct instruction *insn); -unsigned long arch_dest_reloc_offset(int addend); +s64 arch_insn_adjusted_addend(struct instruction *insn, struct reloc *reloc); +u64 arch_adjusted_addend(struct reloc *reloc); const char *arch_nop_insn(int len); const char *arch_ret_insn(int len); @@ -102,4 +103,15 @@ bool arch_absolute_reloc(struct elf *elf, struct reloc *reloc); unsigned int arch_reloc_size(struct reloc *reloc); unsigned long arch_jump_table_sym_offset(struct reloc *reloc, struct reloc *table); +extern const char *arch_reg_name[CFI_NUM_REGS]; + +#ifdef DISAS + +#include <bfd.h> +#include <dis-asm.h> + +int arch_disas_info_init(struct disassemble_info *dinfo); + +#endif /* DISAS */ + #endif /* _ARCH_H */ diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h index ab22673862e1..b9e229ed4dc0 100644 --- 
a/tools/objtool/include/objtool/builtin.h +++ b/tools/objtool/include/objtool/builtin.h @@ -9,12 +9,15 @@ struct opts { /* actions: */ + bool cfi; + bool checksum; bool dump_orc; bool hack_jump_label; bool hack_noinstr; bool hack_skylake; bool ibt; bool mcount; + bool noabs; bool noinstr; bool orc; bool retpoline; @@ -25,11 +28,12 @@ struct opts { bool static_call; bool uaccess; int prefix; - bool cfi; - bool noabs; + const char *disas; /* options: */ bool backtrace; + bool backup; + const char *debug_checksum; bool dryrun; bool link; bool mnop; @@ -38,8 +42,10 @@ struct opts { const char *output; bool sec_address; bool stats; + const char *trace; bool verbose; bool werror; + bool wide; }; extern struct opts opts; @@ -48,6 +54,8 @@ int cmd_parse_options(int argc, const char **argv, const char * const usage[]); int objtool_run(int argc, const char **argv); -void print_args(void); +int make_backup(void); + +int cmd_klp(int argc, const char **argv); #endif /* _BUILTIN_H */ diff --git a/tools/objtool/include/objtool/check.h b/tools/objtool/include/objtool/check.h index 00fb745e7233..2e1346ad5e92 100644 --- a/tools/objtool/include/objtool/check.h +++ b/tools/objtool/include/objtool/check.h @@ -36,6 +36,19 @@ struct alt_group { struct cfi_state **cfi; bool ignore; + unsigned int feature; +}; + +enum alternative_type { + ALT_TYPE_INSTRUCTIONS, + ALT_TYPE_JUMP_TABLE, + ALT_TYPE_EX_TABLE, +}; + +struct alternative { + struct alternative *next; + struct instruction *insn; + enum alternative_type type; }; #define INSN_CHUNK_BITS 8 @@ -64,8 +77,11 @@ struct instruction { noendbr : 1, unret : 1, visited : 4, - no_reloc : 1; - /* 10 bit hole */ + no_reloc : 1, + hole : 1, + fake : 1, + trace : 1; + /* 9 bit hole */ struct alt_group *alt_group; struct instruction *jump_dest; @@ -115,6 +131,15 @@ static inline bool is_jump(struct instruction *insn) return is_static_jump(insn) || is_dynamic_jump(insn); } +static inline struct symbol *insn_call_dest(struct instruction *insn) +{ + if 
(insn->type == INSN_JUMP_DYNAMIC || + insn->type == INSN_CALL_DYNAMIC) + return NULL; + + return insn->_call_dest; +} + struct instruction *find_insn(struct objtool_file *file, struct section *sec, unsigned long offset); @@ -125,4 +150,14 @@ struct instruction *next_insn_same_sec(struct objtool_file *file, struct instruc insn && insn->sec == _sec; \ insn = next_insn_same_sec(file, insn)) +#define sym_for_each_insn(file, sym, insn) \ + for (insn = find_insn(file, sym->sec, sym->offset); \ + insn && insn->offset < sym->offset + sym->len; \ + insn = next_insn_same_sec(file, insn)) + +const char *objtool_disas_insn(struct instruction *insn); + +extern size_t sym_name_max_len; +extern struct disas_context *objtool_disas_ctx; + #endif /* _CHECK_H */ diff --git a/tools/objtool/include/objtool/checksum.h b/tools/objtool/include/objtool/checksum.h new file mode 100644 index 000000000000..7fe21608722a --- /dev/null +++ b/tools/objtool/include/objtool/checksum.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _OBJTOOL_CHECKSUM_H +#define _OBJTOOL_CHECKSUM_H + +#include <objtool/elf.h> + +#ifdef BUILD_KLP + +static inline void checksum_init(struct symbol *func) +{ + if (func && !func->csum.state) { + func->csum.state = XXH3_createState(); + XXH3_64bits_reset(func->csum.state); + } +} + +static inline void checksum_update(struct symbol *func, + struct instruction *insn, + const void *data, size_t size) +{ + XXH3_64bits_update(func->csum.state, data, size); + dbg_checksum(func, insn, XXH3_64bits_digest(func->csum.state)); +} + +static inline void checksum_finish(struct symbol *func) +{ + if (func && func->csum.state) { + func->csum.checksum = XXH3_64bits_digest(func->csum.state); + func->csum.state = NULL; + } +} + +#else /* !BUILD_KLP */ + +static inline void checksum_init(struct symbol *func) {} +static inline void checksum_update(struct symbol *func, + struct instruction *insn, + const void *data, size_t size) {} +static inline void 
checksum_finish(struct symbol *func) {} + +#endif /* !BUILD_KLP */ + +#endif /* _OBJTOOL_CHECKSUM_H */ diff --git a/tools/objtool/include/objtool/checksum_types.h b/tools/objtool/include/objtool/checksum_types.h new file mode 100644 index 000000000000..507efdd8ab5b --- /dev/null +++ b/tools/objtool/include/objtool/checksum_types.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _OBJTOOL_CHECKSUM_TYPES_H +#define _OBJTOOL_CHECKSUM_TYPES_H + +struct sym_checksum { + u64 addr; + u64 checksum; +}; + +#ifdef BUILD_KLP + +#include <xxhash.h> + +struct checksum { + XXH3_state_t *state; + XXH64_hash_t checksum; +}; + +#else /* !BUILD_KLP */ + +struct checksum {}; + +#endif /* !BUILD_KLP */ + +#endif /* _OBJTOOL_CHECKSUM_TYPES_H */ diff --git a/tools/objtool/include/objtool/disas.h b/tools/objtool/include/objtool/disas.h new file mode 100644 index 000000000000..e8f395eff159 --- /dev/null +++ b/tools/objtool/include/objtool/disas.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. 
+ */ + +#ifndef _DISAS_H +#define _DISAS_H + +struct alternative; +struct disas_context; +struct disassemble_info; + +#ifdef DISAS + +struct disas_context *disas_context_create(struct objtool_file *file); +void disas_context_destroy(struct disas_context *dctx); +void disas_warned_funcs(struct disas_context *dctx); +void disas_funcs(struct disas_context *dctx); +int disas_info_init(struct disassemble_info *dinfo, + int arch, int mach32, int mach64, + const char *options); +size_t disas_insn(struct disas_context *dctx, struct instruction *insn); +char *disas_result(struct disas_context *dctx); +void disas_print_info(FILE *stream, struct instruction *insn, int depth, + const char *format, ...); +void disas_print_insn(FILE *stream, struct disas_context *dctx, + struct instruction *insn, int depth, + const char *format, ...); +char *disas_alt_name(struct alternative *alt); +const char *disas_alt_type_name(struct instruction *insn); + +#else /* DISAS */ + +#include <objtool/warn.h> + +static inline struct disas_context *disas_context_create(struct objtool_file *file) +{ + WARN("Rebuild with libopcodes for disassembly support"); + return NULL; +} + +static inline void disas_context_destroy(struct disas_context *dctx) {} +static inline void disas_warned_funcs(struct disas_context *dctx) {} +static inline void disas_funcs(struct disas_context *dctx) {} + +static inline int disas_info_init(struct disassemble_info *dinfo, + int arch, int mach32, int mach64, + const char *options) +{ + return -1; +} + +static inline size_t disas_insn(struct disas_context *dctx, + struct instruction *insn) +{ + return -1; +} + +static inline char *disas_result(struct disas_context *dctx) +{ + return NULL; +} + +static inline void disas_print_info(FILE *stream, struct instruction *insn, + int depth, const char *format, ...) {} +static inline void disas_print_insn(FILE *stream, struct disas_context *dctx, + struct instruction *insn, int depth, + const char *format, ...) 
{} +static inline char *disas_alt_name(struct alternative *alt) +{ + return NULL; +} + +static inline const char *disas_alt_type_name(struct instruction *insn) +{ + return NULL; +} + +#endif /* DISAS */ + +#endif /* _DISAS_H */ diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h index df8434d3b744..e12c516bd320 100644 --- a/tools/objtool/include/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h @@ -8,12 +8,21 @@ #include <stdio.h> #include <gelf.h> +#include <linux/string.h> #include <linux/list.h> #include <linux/hashtable.h> #include <linux/rbtree.h> #include <linux/jhash.h> + +#include <objtool/endianness.h> +#include <objtool/checksum_types.h> #include <arch/elf.h> +#define SEC_NAME_LEN 1024 +#define SYM_NAME_LEN 512 + +#define bswap_if_needed(elf, val) __bswap_if_needed(&elf->ehdr, val) + #ifdef LIBELF_USE_DEPRECATED # define elf_getshdrnum elf_getshnum # define elf_getshdrstrndx elf_getshstrndx @@ -40,24 +49,27 @@ struct section { struct section *base, *rsec; struct symbol *sym; Elf_Data *data; - char *name; + const char *name; int idx; bool _changed, text, rodata, noinstr, init, truncate; struct reloc *relocs; + unsigned long nr_alloc_relocs; + struct section *twin; }; struct symbol { struct list_head list; + struct list_head global_list; struct rb_node node; struct elf_hash_node hash; struct elf_hash_node name_hash; GElf_Sym sym; struct section *sec; - char *name; + const char *name, *demangled_name; unsigned int idx, len; unsigned long offset; unsigned long __subtree_last; - struct symbol *pfunc, *cfunc, *alias; + struct symbol *pfunc, *cfunc, *alias, *file; unsigned char bind, type; u8 uaccess_safe : 1; u8 static_call_tramp : 1; @@ -71,9 +83,17 @@ struct symbol { u8 frame_pointer : 1; u8 ignore : 1; u8 nocfi : 1; + u8 cold : 1; + u8 prefix : 1; + u8 debug_checksum : 1; + u8 changed : 1; + u8 included : 1; + u8 klp : 1; struct list_head pv_target; struct reloc *relocs; struct section *group_sec; + struct checksum 
csum; + struct symbol *twin, *clone; }; struct reloc { @@ -88,9 +108,10 @@ struct elf { GElf_Ehdr ehdr; int fd; bool changed; - char *name; + const char *name, *tmp_name; unsigned int num_files; struct list_head sections; + struct list_head symbols; unsigned long num_relocs; int symbol_bits; @@ -110,14 +131,37 @@ struct elf { }; struct elf *elf_open_read(const char *name, int flags); +struct elf *elf_create_file(GElf_Ehdr *ehdr, const char *name); struct section *elf_create_section(struct elf *elf, const char *name, - size_t entsize, unsigned int nr); + size_t size, size_t entsize, + unsigned int type, unsigned int align, + unsigned int flags); struct section *elf_create_section_pair(struct elf *elf, const char *name, size_t entsize, unsigned int nr, unsigned int reloc_nr); -struct symbol *elf_create_prefix_symbol(struct elf *elf, struct symbol *orig, long size); +struct section *elf_create_rela_section(struct elf *elf, struct section *sec, + unsigned int reloc_nr); + +struct symbol *elf_create_symbol(struct elf *elf, const char *name, + struct section *sec, unsigned int bind, + unsigned int type, unsigned long offset, + size_t size); +struct symbol *elf_create_section_symbol(struct elf *elf, struct section *sec); + +void *elf_add_data(struct elf *elf, struct section *sec, const void *data, + size_t size); + +unsigned int elf_add_string(struct elf *elf, struct section *strtab, const char *str); + +struct reloc *elf_create_reloc(struct elf *elf, struct section *sec, + unsigned long offset, struct symbol *sym, + s64 addend, unsigned int type); + +struct reloc *elf_init_reloc(struct elf *elf, struct section *rsec, + unsigned int reloc_idx, unsigned long offset, + struct symbol *sym, s64 addend, unsigned int type); struct reloc *elf_init_reloc_text_sym(struct elf *elf, struct section *sec, unsigned long offset, @@ -131,16 +175,17 @@ struct reloc *elf_init_reloc_data_sym(struct elf *elf, struct section *sec, struct symbol *sym, s64 addend); -int elf_write_insn(struct 
elf *elf, struct section *sec, - unsigned long offset, unsigned int len, - const char *insn); +int elf_write_insn(struct elf *elf, struct section *sec, unsigned long offset, + unsigned int len, const char *insn); + int elf_write(struct elf *elf); -void elf_close(struct elf *elf); +int elf_close(struct elf *elf); struct section *find_section_by_name(const struct elf *elf, const char *name); struct symbol *find_func_by_offset(struct section *sec, unsigned long offset); struct symbol *find_symbol_by_offset(struct section *sec, unsigned long offset); struct symbol *find_symbol_by_name(const struct elf *elf, const char *name); +struct symbol *find_global_symbol_by_name(const struct elf *elf, const char *name); struct symbol *find_symbol_containing(const struct section *sec, unsigned long offset); int find_symbol_hole_containing(const struct section *sec, unsigned long offset); struct reloc *find_reloc_by_dest(const struct elf *elf, struct section *sec, unsigned long offset); @@ -178,11 +223,76 @@ static inline unsigned int elf_text_rela_type(struct elf *elf) return elf_addr_size(elf) == 4 ? 
R_TEXT32 : R_TEXT64; } +static inline bool is_undef_sym(struct symbol *sym) +{ + return !sym->sec->idx; +} + +static inline bool is_null_sym(struct symbol *sym) +{ + return !sym->idx; +} + +static inline bool is_sec_sym(struct symbol *sym) +{ + return sym->type == STT_SECTION; +} + +static inline bool is_object_sym(struct symbol *sym) +{ + return sym->type == STT_OBJECT; +} + +static inline bool is_func_sym(struct symbol *sym) +{ + return sym->type == STT_FUNC; +} + +static inline bool is_file_sym(struct symbol *sym) +{ + return sym->type == STT_FILE; +} + +static inline bool is_notype_sym(struct symbol *sym) +{ + return sym->type == STT_NOTYPE; +} + +static inline bool is_global_sym(struct symbol *sym) +{ + return sym->bind == STB_GLOBAL; +} + +static inline bool is_weak_sym(struct symbol *sym) +{ + return sym->bind == STB_WEAK; +} + +static inline bool is_local_sym(struct symbol *sym) +{ + return sym->bind == STB_LOCAL; +} + +static inline bool is_prefix_func(struct symbol *sym) +{ + return sym->prefix; +} + static inline bool is_reloc_sec(struct section *sec) { return sec->sh.sh_type == SHT_RELA || sec->sh.sh_type == SHT_REL; } +static inline bool is_string_sec(struct section *sec) +{ + return sec->sh.sh_flags & SHF_STRINGS; +} + +static inline bool is_text_sec(struct section *sec) +{ + return sec->sh.sh_flags & SHF_EXECINSTR; +} + static inline bool sec_changed(struct section *sec) { return sec->_changed; @@ -223,6 +333,11 @@ static inline bool is_32bit_reloc(struct reloc *reloc) return reloc->sec->sh.sh_entsize < 16; } +static inline unsigned long sec_size(struct section *sec) +{ + return sec->sh.sh_size; +} + #define __get_reloc_field(reloc, field) \ ({ \ is_32bit_reloc(reloc) ? 
\ @@ -300,6 +415,15 @@ static inline void set_reloc_type(struct elf *elf, struct reloc *reloc, unsigned mark_sec_changed(elf, reloc->sec, true); } +static inline unsigned int annotype(struct elf *elf, struct section *sec, + struct reloc *reloc) +{ + unsigned int type; + + type = *(u32 *)(sec->data->d_buf + (reloc_idx(reloc) * 8) + 4); + return bswap_if_needed(elf, type); +} + #define RELOC_JUMP_TABLE_BIT 1UL /* Does reloc mark the beginning of a jump table? */ @@ -325,28 +449,54 @@ static inline void set_sym_next_reloc(struct reloc *reloc, struct reloc *next) reloc->_sym_next_reloc = (unsigned long)next | bit; } -#define for_each_sec(file, sec) \ - list_for_each_entry(sec, &file->elf->sections, list) +#define for_each_sec(elf, sec) \ + list_for_each_entry(sec, &elf->sections, list) #define sec_for_each_sym(sec, sym) \ list_for_each_entry(sym, &sec->symbol_list, list) -#define for_each_sym(file, sym) \ - for (struct section *__sec, *__fake = (struct section *)1; \ - __fake; __fake = NULL) \ - for_each_sec(file, __sec) \ - sec_for_each_sym(__sec, sym) +#define sec_prev_sym(sym) \ + sym->sec && sym->list.prev != &sym->sec->symbol_list ? \ + list_prev_entry(sym, list) : NULL + +#define for_each_sym(elf, sym) \ + list_for_each_entry(sym, &elf->symbols, global_list) + +#define for_each_sym_continue(elf, sym) \ + list_for_each_entry_continue(sym, &elf->symbols, global_list) + +#define rsec_next_reloc(rsec, reloc) \ + reloc_idx(reloc) < sec_num_entries(rsec) - 1 ? 
reloc + 1 : NULL #define for_each_reloc(rsec, reloc) \ - for (int __i = 0, __fake = 1; __fake; __fake = 0) \ - for (reloc = rsec->relocs; \ - __i < sec_num_entries(rsec); \ - __i++, reloc++) + for (reloc = rsec->relocs; reloc; reloc = rsec_next_reloc(rsec, reloc)) #define for_each_reloc_from(rsec, reloc) \ - for (int __i = reloc_idx(reloc); \ - __i < sec_num_entries(rsec); \ - __i++, reloc++) + for (; reloc; reloc = rsec_next_reloc(rsec, reloc)) + +#define for_each_reloc_continue(rsec, reloc) \ + for (reloc = rsec_next_reloc(rsec, reloc); reloc; \ + reloc = rsec_next_reloc(rsec, reloc)) + +#define sym_for_each_reloc(elf, sym, reloc) \ + for (reloc = find_reloc_by_dest_range(elf, sym->sec, \ + sym->offset, sym->len); \ + reloc && reloc_offset(reloc) < sym->offset + sym->len; \ + reloc = rsec_next_reloc(sym->sec->rsec, reloc)) + +static inline struct symbol *get_func_prefix(struct symbol *func) +{ + struct symbol *prev; + + if (!is_func_sym(func)) + return NULL; + + prev = sec_prev_sym(func); + if (prev && is_prefix_func(prev)) + return prev; + + return NULL; +} #define OFFSET_STRIDE_BITS 4 #define OFFSET_STRIDE (1UL << OFFSET_STRIDE_BITS) diff --git a/tools/objtool/include/objtool/endianness.h b/tools/objtool/include/objtool/endianness.h index 4d2aa9b0fe2f..aebcd2338668 100644 --- a/tools/objtool/include/objtool/endianness.h +++ b/tools/objtool/include/objtool/endianness.h @@ -4,7 +4,6 @@ #include <linux/kernel.h> #include <endian.h> -#include <objtool/elf.h> /* * Does a byte swap if target file endianness doesn't match the host, i.e. cross @@ -12,16 +11,16 @@ * To be used for multi-byte values conversion, which are read from / about * to be written to a target native endianness ELF file. 
*/ -static inline bool need_bswap(struct elf *elf) +static inline bool need_bswap(GElf_Ehdr *ehdr) { return (__BYTE_ORDER == __LITTLE_ENDIAN) ^ - (elf->ehdr.e_ident[EI_DATA] == ELFDATA2LSB); + (ehdr->e_ident[EI_DATA] == ELFDATA2LSB); } -#define bswap_if_needed(elf, val) \ +#define __bswap_if_needed(ehdr, val) \ ({ \ __typeof__(val) __ret; \ - bool __need_bswap = need_bswap(elf); \ + bool __need_bswap = need_bswap(ehdr); \ switch (sizeof(val)) { \ case 8: \ __ret = __need_bswap ? bswap_64(val) : (val); break; \ diff --git a/tools/objtool/include/objtool/klp.h b/tools/objtool/include/objtool/klp.h new file mode 100644 index 000000000000..ad830a7ce55b --- /dev/null +++ b/tools/objtool/include/objtool/klp.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _OBJTOOL_KLP_H +#define _OBJTOOL_KLP_H + +#define SHF_RELA_LIVEPATCH 0x00100000 +#define SHN_LIVEPATCH 0xff20 + +/* + * __klp_objects and __klp_funcs are created by klp diff and used by the patch + * module init code to build the klp_patch, klp_object and klp_func structs + * needed by the livepatch API. + */ +#define KLP_OBJECTS_SEC "__klp_objects" +#define KLP_FUNCS_SEC "__klp_funcs" + +/* + * __klp_relocs is an intermediate section which are created by klp diff and + * converted into KLP symbols/relas by "objtool klp post-link". This is needed + * to work around the linker, which doesn't preserve SHN_LIVEPATCH or + * SHF_RELA_LIVEPATCH, nor does it support having two RELA sections for a + * single PROGBITS section. 
+ */ +#define KLP_RELOCS_SEC "__klp_relocs" +#define KLP_STRINGS_SEC ".rodata.klp.str1.1" + +struct klp_reloc { + void *offset; + void *sym; + u32 type; +}; + +int cmd_klp_diff(int argc, const char **argv); +int cmd_klp_post_link(int argc, const char **argv); + +#endif /* _OBJTOOL_KLP_H */ diff --git a/tools/objtool/include/objtool/objtool.h b/tools/objtool/include/objtool/objtool.h index c0dc86a78ff6..6dc12a59ad00 100644 --- a/tools/objtool/include/objtool/objtool.h +++ b/tools/objtool/include/objtool/objtool.h @@ -28,7 +28,7 @@ struct objtool_file { struct list_head mcount_loc_list; struct list_head endbr_list; struct list_head call_list; - bool ignore_unreachables, hints, rodata; + bool ignore_unreachables, hints, rodata, klp; unsigned int nr_endbr; unsigned int nr_endbr_int; @@ -39,6 +39,10 @@ struct objtool_file { struct pv_state *pv_ops; }; +char *top_level_dir(const char *file); + +int init_signal_handler(void); + struct objtool_file *objtool_open_read(const char *_objname); int objtool_pv_add(struct objtool_file *file, int idx, struct symbol *func); diff --git a/tools/objtool/include/objtool/special.h b/tools/objtool/include/objtool/special.h index 72d09c0adf1a..121c3761899c 100644 --- a/tools/objtool/include/objtool/special.h +++ b/tools/objtool/include/objtool/special.h @@ -25,7 +25,7 @@ struct special_alt { struct section *new_sec; unsigned long new_off; - unsigned int orig_len, new_len; /* group only */ + unsigned int orig_len, new_len, feature; /* group only */ }; int special_get_alts(struct elf *elf, struct list_head *alts); @@ -38,4 +38,6 @@ bool arch_support_alt_relocation(struct special_alt *special_alt, struct reloc *arch_find_switch_table(struct objtool_file *file, struct instruction *insn, unsigned long *table_size); +const char *arch_cpu_feature_name(int feature_number); + #endif /* _SPECIAL_H */ diff --git a/tools/objtool/include/objtool/trace.h b/tools/objtool/include/objtool/trace.h new file mode 100644 index 000000000000..70b574366797 --- 
/dev/null +++ b/tools/objtool/include/objtool/trace.h @@ -0,0 +1,141 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. + */ + +#ifndef _TRACE_H +#define _TRACE_H + +#include <objtool/check.h> +#include <objtool/disas.h> + +#ifdef DISAS + +extern bool trace; +extern int trace_depth; + +#define TRACE(fmt, ...) \ +({ if (trace) \ + fprintf(stderr, fmt, ##__VA_ARGS__); \ +}) + +/* + * Print the instruction address and a message. The instruction + * itself is not printed. + */ +#define TRACE_ADDR(insn, fmt, ...) \ +({ \ + if (trace) { \ + disas_print_info(stderr, insn, trace_depth - 1, \ + fmt "\n", ##__VA_ARGS__); \ + } \ +}) + +/* + * Print the instruction address, the instruction and a message. + */ +#define TRACE_INSN(insn, fmt, ...) \ +({ \ + if (trace) { \ + disas_print_insn(stderr, objtool_disas_ctx, \ + insn, trace_depth - 1, \ + fmt, ##__VA_ARGS__); \ + fprintf(stderr, "\n"); \ + insn->trace = 1; \ + } \ +}) + +#define TRACE_INSN_STATE(insn, sprev, snext) \ +({ \ + if (trace) \ + trace_insn_state(insn, sprev, snext); \ +}) + +#define TRACE_ALT_FMT(pfx, fmt) pfx "<%s.%lx> " fmt +#define TRACE_ALT_ARG(insn) disas_alt_type_name(insn), (insn)->offset + +#define TRACE_ALT(insn, fmt, ...) \ + TRACE_INSN(insn, TRACE_ALT_FMT("", fmt), \ + TRACE_ALT_ARG(insn), ##__VA_ARGS__) + +#define TRACE_ALT_INFO(insn, pfx, fmt, ...) \ + TRACE_ADDR(insn, TRACE_ALT_FMT(pfx, fmt), \ + TRACE_ALT_ARG(insn), ##__VA_ARGS__) + +#define TRACE_ALT_INFO_NOADDR(insn, pfx, fmt, ...) 
\ + TRACE_ADDR(NULL, TRACE_ALT_FMT(pfx, fmt), \ + TRACE_ALT_ARG(insn), ##__VA_ARGS__) + +#define TRACE_ALT_BEGIN(insn, alt, alt_name) \ +({ \ + if (trace) { \ + alt_name = disas_alt_name(alt); \ + trace_alt_begin(insn, alt, alt_name); \ + } \ +}) + +#define TRACE_ALT_END(insn, alt, alt_name) \ +({ \ + if (trace) { \ + trace_alt_end(insn, alt, alt_name); \ + free(alt_name); \ + } \ +}) + +static inline void trace_enable(void) +{ + trace = true; + trace_depth = 0; +} + +static inline void trace_disable(void) +{ + trace = false; +} + +static inline void trace_depth_inc(void) +{ + if (trace) + trace_depth++; +} + +static inline void trace_depth_dec(void) +{ + if (trace) + trace_depth--; +} + +void trace_insn_state(struct instruction *insn, struct insn_state *sprev, + struct insn_state *snext); +void trace_alt_begin(struct instruction *orig_insn, struct alternative *alt, + char *alt_name); +void trace_alt_end(struct instruction *orig_insn, struct alternative *alt, + char *alt_name); + +#else /* DISAS */ + +#define TRACE(fmt, ...) ({}) +#define TRACE_ADDR(insn, fmt, ...) ({}) +#define TRACE_INSN(insn, fmt, ...) ({}) +#define TRACE_INSN_STATE(insn, sprev, snext) ({}) +#define TRACE_ALT(insn, fmt, ...) ({}) +#define TRACE_ALT_INFO(insn, fmt, ...) ({}) +#define TRACE_ALT_INFO_NOADDR(insn, fmt, ...) 
({}) +#define TRACE_ALT_BEGIN(insn, alt, alt_name) ({}) +#define TRACE_ALT_END(insn, alt, alt_name) ({}) + + +static inline void trace_enable(void) {} +static inline void trace_disable(void) {} +static inline void trace_depth_inc(void) {} +static inline void trace_depth_dec(void) {} +static inline void trace_alt_begin(struct instruction *orig_insn, + struct alternative *alt, + char *alt_name) {}; +static inline void trace_alt_end(struct instruction *orig_insn, + struct alternative *alt, + char *alt_name) {}; + +#endif + +#endif /* _TRACE_H */ diff --git a/tools/objtool/include/objtool/util.h b/tools/objtool/include/objtool/util.h new file mode 100644 index 000000000000..a0180b312f73 --- /dev/null +++ b/tools/objtool/include/objtool/util.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _UTIL_H +#define _UTIL_H + +#include <objtool/warn.h> + +#define snprintf_check(str, size, format, args...) \ +({ \ + int __ret = snprintf(str, size, format, args); \ + if (__ret < 0) \ + ERROR_GLIBC("snprintf"); \ + else if (__ret >= size) \ + ERROR("snprintf() failed for '" format "'", args); \ + else \ + __ret = 0; \ + __ret; \ +}) + +#endif /* _UTIL_H */ diff --git a/tools/objtool/include/objtool/warn.h b/tools/objtool/include/objtool/warn.h index cb8fe846d9dd..25ff7942b4d5 100644 --- a/tools/objtool/include/objtool/warn.h +++ b/tools/objtool/include/objtool/warn.h @@ -77,9 +77,11 @@ static inline char *offstr(struct section *sec, unsigned long offset) #define WARN_INSN(insn, format, ...) \ ({ \ struct instruction *_insn = (insn); \ - if (!_insn->sym || !_insn->sym->warned) \ + if (!_insn->sym || !_insn->sym->warned) { \ WARN_FUNC(_insn->sec, _insn->offset, format, \ ##__VA_ARGS__); \ + BT_INSN(_insn, ""); \ + } \ if (_insn->sym) \ _insn->sym->warned = 1; \ }) @@ -87,10 +89,15 @@ static inline char *offstr(struct section *sec, unsigned long offset) #define BT_INSN(insn, format, ...) 
\ ({ \ if (opts.verbose || opts.backtrace) { \ - struct instruction *_insn = (insn); \ - char *_str = offstr(_insn->sec, _insn->offset); \ - WARN(" %s: " format, _str, ##__VA_ARGS__); \ - free(_str); \ + struct instruction *__insn = (insn); \ + char *_str = offstr(__insn->sec, __insn->offset); \ + const char *_istr = objtool_disas_insn(__insn); \ + int _len; \ + _len = snprintf(NULL, 0, " %s: " format, _str, ##__VA_ARGS__); \ + _len = (_len < 50) ? 50 - _len : 0; \ + WARN(" %s: " format " %*s%s", _str, ##__VA_ARGS__, _len, "", _istr); \ + free(_str); \ + __insn->trace = 1; \ } \ }) @@ -102,4 +109,53 @@ static inline char *offstr(struct section *sec, unsigned long offset) #define ERROR_FUNC(sec, offset, format, ...) __WARN_FUNC(ERROR_STR, sec, offset, format, ##__VA_ARGS__) #define ERROR_INSN(insn, format, ...) WARN_FUNC(insn->sec, insn->offset, format, ##__VA_ARGS__) +extern bool debug; +extern int indent; + +static inline void unindent(int *unused) { indent--; } + +/* + * Clang prior to 17 is being silly and considers many __cleanup() variables + * as unused (because they are, their sole purpose is to go out of scope). + * + * https://github.com/llvm/llvm-project/commit/877210faa447f4cc7db87812f8ed80e398fedd61 + */ +#undef __cleanup +#define __cleanup(func) __maybe_unused __attribute__((__cleanup__(func))) + +#define __dbg(format, ...) \ + fprintf(stderr, \ + "DEBUG: %s%s" format "\n", \ + objname ?: "", \ + objname ? ": " : "", \ + ##__VA_ARGS__) + +#define dbg(args...) \ +({ \ + if (unlikely(debug)) \ + __dbg(args); \ +}) + +#define __dbg_indent(format, ...) \ +({ \ + if (unlikely(debug)) \ + __dbg("%*s" format, indent * 8, "", ##__VA_ARGS__); \ +}) + +#define dbg_indent(args...) 
\ + int __cleanup(unindent) __dummy_##__COUNTER__; \ + __dbg_indent(args); \ + indent++ + +#define dbg_checksum(func, insn, checksum) \ +({ \ + if (unlikely(insn->sym && insn->sym->pfunc && \ + insn->sym->pfunc->debug_checksum)) { \ + char *insn_off = offstr(insn->sec, insn->offset); \ + __dbg("checksum: %s %s %016lx", \ + func->name, insn_off, checksum); \ + free(insn_off); \ + } \ +}) + #endif /* _WARN_H */ diff --git a/tools/objtool/klp-diff.c b/tools/objtool/klp-diff.c new file mode 100644 index 000000000000..4d1f9e9977eb --- /dev/null +++ b/tools/objtool/klp-diff.c @@ -0,0 +1,1723 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#define _GNU_SOURCE /* memmem() */ +#include <subcmd/parse-options.h> +#include <stdlib.h> +#include <string.h> +#include <libgen.h> +#include <stdio.h> +#include <ctype.h> + +#include <objtool/objtool.h> +#include <objtool/warn.h> +#include <objtool/arch.h> +#include <objtool/klp.h> +#include <objtool/util.h> +#include <arch/special.h> + +#include <linux/objtool_types.h> +#include <linux/livepatch_external.h> +#include <linux/stringify.h> +#include <linux/string.h> +#include <linux/jhash.h> + +#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER)) + +struct elfs { + struct elf *orig, *patched, *out; + const char *modname; +}; + +struct export { + struct hlist_node hash; + char *mod, *sym; +}; + +static const char * const klp_diff_usage[] = { + "objtool klp diff [<options>] <in1.o> <in2.o> <out.o>", + NULL, +}; + +static const struct option klp_diff_options[] = { + OPT_GROUP("Options:"), + OPT_BOOLEAN('d', "debug", &debug, "enable debug output"), + OPT_END(), +}; + +static DEFINE_HASHTABLE(exports, 15); + +static inline u32 str_hash(const char *str) +{ + return jhash(str, strlen(str), 0); +} + +static char *escape_str(const char *orig) +{ + size_t len = 0; + const char *a; + char *b, *new; + + for (a = orig; *a; a++) { + switch (*a) { + case '\001': len += 5; break; + case '\n': + case '\t': len += 2; break; + default: 
len++; + } + } + + new = malloc(len + 1); + if (!new) + return NULL; + + for (a = orig, b = new; *a; a++) { + switch (*a) { + case '\001': memcpy(b, "<SOH>", 5); b += 5; break; + case '\n': *b++ = '\\'; *b++ = 'n'; break; + case '\t': *b++ = '\\'; *b++ = 't'; break; + default: *b++ = *a; + } + } + + *b = '\0'; + return new; +} + +static int read_exports(void) +{ + const char *symvers = "Module.symvers"; + char line[1024], *path = NULL; + unsigned int line_num = 1; + FILE *file; + + file = fopen(symvers, "r"); + if (!file) { + path = top_level_dir(symvers); + if (!path) { + ERROR("can't open '%s', \"objtool diff\" should be run from the kernel tree", symvers); + return -1; + } + + file = fopen(path, "r"); + if (!file) { + ERROR_GLIBC("fopen"); + return -1; + } + } + + while (fgets(line, 1024, file)) { + char *sym, *mod, *type; + struct export *export; + + sym = strchr(line, '\t'); + if (!sym) { + ERROR("malformed Module.symvers (sym) at line %d", line_num); + return -1; + } + + *sym++ = '\0'; + + mod = strchr(sym, '\t'); + if (!mod) { + ERROR("malformed Module.symvers (mod) at line %d", line_num); + return -1; + } + + *mod++ = '\0'; + + type = strchr(mod, '\t'); + if (!type) { + ERROR("malformed Module.symvers (type) at line %d", line_num); + return -1; + } + + *type++ = '\0'; + + if (*sym == '\0' || *mod == '\0') { + ERROR("malformed Module.symvers at line %d", line_num); + return -1; + } + + export = calloc(1, sizeof(*export)); + if (!export) { + ERROR_GLIBC("calloc"); + return -1; + } + + export->mod = strdup(mod); + if (!export->mod) { + ERROR_GLIBC("strdup"); + return -1; + } + + export->sym = strdup(sym); + if (!export->sym) { + ERROR_GLIBC("strdup"); + return -1; + } + + hash_add(exports, &export->hash, str_hash(sym)); + } + + free(path); + fclose(file); + + return 0; +} + +static int read_sym_checksums(struct elf *elf) +{ + struct section *sec; + + sec = find_section_by_name(elf, ".discard.sym_checksum"); + if (!sec) { + ERROR("'%s' missing 
.discard.sym_checksum section, file not processed by 'objtool --checksum'?", + elf->name); + return -1; + } + + if (!sec->rsec) { + ERROR("missing reloc section for .discard.sym_checksum"); + return -1; + } + + if (sec_size(sec) % sizeof(struct sym_checksum)) { + ERROR("struct sym_checksum size mismatch"); + return -1; + } + + for (int i = 0; i < sec_size(sec) / sizeof(struct sym_checksum); i++) { + struct sym_checksum *sym_checksum; + struct reloc *reloc; + struct symbol *sym; + + sym_checksum = (struct sym_checksum *)sec->data->d_buf + i; + + reloc = find_reloc_by_dest(elf, sec, i * sizeof(*sym_checksum)); + if (!reloc) { + ERROR("can't find reloc for sym_checksum[%d]", i); + return -1; + } + + sym = reloc->sym; + + if (is_sec_sym(sym)) { + ERROR("not sure how to handle section %s", sym->name); + return -1; + } + + if (is_func_sym(sym)) + sym->csum.checksum = sym_checksum->checksum; + } + + return 0; +} + +static struct symbol *first_file_symbol(struct elf *elf) +{ + struct symbol *sym; + + for_each_sym(elf, sym) { + if (is_file_sym(sym)) + return sym; + } + + return NULL; +} + +static struct symbol *next_file_symbol(struct elf *elf, struct symbol *sym) +{ + for_each_sym_continue(elf, sym) { + if (is_file_sym(sym)) + return sym; + } + + return NULL; +} + +/* + * Certain static local variables should never be correlated. They will be + * used in place rather than referencing the originals. + */ +static bool is_uncorrelated_static_local(struct symbol *sym) +{ + static const char * const vars[] = { + "__already_done.", + "__func__.", + "__key.", + "__warned.", + "_entry.", + "_entry_ptr.", + "_rs.", + "descriptor.", + "CSWTCH.", + }; + + if (!is_object_sym(sym) || !is_local_sym(sym)) + return false; + + if (!strcmp(sym->sec->name, ".data.once")) + return true; + + for (int i = 0; i < ARRAY_SIZE(vars); i++) { + if (strstarts(sym->name, vars[i])) + return true; + } + + return false; +} + +/* + * Clang emits several useless .Ltmp_* code labels. 
+ */ +static bool is_clang_tmp_label(struct symbol *sym) +{ + return sym->type == STT_NOTYPE && + is_text_sec(sym->sec) && + strstarts(sym->name, ".Ltmp") && + isdigit(sym->name[5]); +} + +static bool is_special_section(struct section *sec) +{ + static const char * const specials[] = { + ".altinstructions", + ".smp_locks", + "__bug_table", + "__ex_table", + "__jump_table", + "__mcount_loc", + + /* + * Extract .static_call_sites here to inherit non-module + * preferential treatment. The later static call processing + * during klp module build will be skipped when it sees this + * section already exists. + */ + ".static_call_sites", + }; + + static const char * const non_special_discards[] = { + ".discard.addressable", + ".discard.sym_checksum", + }; + + if (is_text_sec(sec)) + return false; + + for (int i = 0; i < ARRAY_SIZE(specials); i++) { + if (!strcmp(sec->name, specials[i])) + return true; + } + + /* Most .discard data sections are special */ + for (int i = 0; i < ARRAY_SIZE(non_special_discards); i++) { + if (!strcmp(sec->name, non_special_discards[i])) + return false; + } + + return strstarts(sec->name, ".discard."); +} + +/* + * These sections are referenced by special sections but aren't considered + * special sections themselves. + */ +static bool is_special_section_aux(struct section *sec) +{ + static const char * const specials_aux[] = { + ".altinstr_replacement", + ".altinstr_aux", + }; + + for (int i = 0; i < ARRAY_SIZE(specials_aux); i++) { + if (!strcmp(sec->name, specials_aux[i])) + return true; + } + + return false; +} + +/* + * These symbols should never be correlated, so their local patched versions + * are used instead of linking to the originals. 
+ */ +static bool dont_correlate(struct symbol *sym) +{ + return is_file_sym(sym) || + is_null_sym(sym) || + is_sec_sym(sym) || + is_prefix_func(sym) || + is_uncorrelated_static_local(sym) || + is_clang_tmp_label(sym) || + is_string_sec(sym->sec) || + is_special_section(sym->sec) || + is_special_section_aux(sym->sec) || + strstarts(sym->name, "__initcall__"); +} + +/* + * For each symbol in the original kernel, find its corresponding "twin" in the + * patched kernel. + */ +static int correlate_symbols(struct elfs *e) +{ + struct symbol *file1_sym, *file2_sym; + struct symbol *sym1, *sym2; + + /* Correlate locals */ + for (file1_sym = first_file_symbol(e->orig), + file2_sym = first_file_symbol(e->patched); ; + file1_sym = next_file_symbol(e->orig, file1_sym), + file2_sym = next_file_symbol(e->patched, file2_sym)) { + + if (!file1_sym && file2_sym) { + ERROR("FILE symbol mismatch: NULL != %s", file2_sym->name); + return -1; + } + + if (file1_sym && !file2_sym) { + ERROR("FILE symbol mismatch: %s != NULL", file1_sym->name); + return -1; + } + + if (!file1_sym) + break; + + if (strcmp(file1_sym->name, file2_sym->name)) { + ERROR("FILE symbol mismatch: %s != %s", file1_sym->name, file2_sym->name); + return -1; + } + + file1_sym->twin = file2_sym; + file2_sym->twin = file1_sym; + + sym1 = file1_sym; + + for_each_sym_continue(e->orig, sym1) { + if (is_file_sym(sym1) || !is_local_sym(sym1)) + break; + + if (dont_correlate(sym1)) + continue; + + sym2 = file2_sym; + for_each_sym_continue(e->patched, sym2) { + if (is_file_sym(sym2) || !is_local_sym(sym2)) + break; + + if (sym2->twin || dont_correlate(sym2)) + continue; + + if (strcmp(sym1->demangled_name, sym2->demangled_name)) + continue; + + sym1->twin = sym2; + sym2->twin = sym1; + break; + } + } + } + + /* Correlate globals */ + for_each_sym(e->orig, sym1) { + if (sym1->bind == STB_LOCAL) + continue; + + sym2 = find_global_symbol_by_name(e->patched, sym1->name); + + if (sym2 && !sym2->twin && !strcmp(sym1->name, 
sym2->name)) { + sym1->twin = sym2; + sym2->twin = sym1; + } + } + + for_each_sym(e->orig, sym1) { + if (sym1->twin || dont_correlate(sym1)) + continue; + WARN("no correlation: %s", sym1->name); + } + + return 0; +} + +/* "sympos" is used by livepatch to disambiguate duplicate symbol names */ +static unsigned long find_sympos(struct elf *elf, struct symbol *sym) +{ + bool vmlinux = str_ends_with(objname, "vmlinux.o"); + unsigned long sympos = 0, nr_matches = 0; + bool has_dup = false; + struct symbol *s; + + if (sym->bind != STB_LOCAL) + return 0; + + if (vmlinux && sym->type == STT_FUNC) { + /* + * HACK: Unfortunately, symbol ordering can differ between + * vmlinux.o and vmlinux due to the linker script emitting + * .text.unlikely* before .text*. Count .text.unlikely* first. + * + * TODO: Disambiguate symbols more reliably (checksums?) + */ + for_each_sym(elf, s) { + if (strstarts(s->sec->name, ".text.unlikely") && + !strcmp(s->name, sym->name)) { + nr_matches++; + if (s == sym) + sympos = nr_matches; + else + has_dup = true; + } + } + for_each_sym(elf, s) { + if (!strstarts(s->sec->name, ".text.unlikely") && + !strcmp(s->name, sym->name)) { + nr_matches++; + if (s == sym) + sympos = nr_matches; + else + has_dup = true; + } + } + } else { + for_each_sym(elf, s) { + if (!strcmp(s->name, sym->name)) { + nr_matches++; + if (s == sym) + sympos = nr_matches; + else + has_dup = true; + } + } + } + + if (!sympos) { + ERROR("can't find sympos for %s", sym->name); + return ULONG_MAX; + } + + return has_dup ? 
sympos : 0; +} + +static int clone_sym_relocs(struct elfs *e, struct symbol *patched_sym); + +static struct symbol *__clone_symbol(struct elf *elf, struct symbol *patched_sym, + bool data_too) +{ + struct section *out_sec = NULL; + unsigned long offset = 0; + struct symbol *out_sym; + + if (data_too && !is_undef_sym(patched_sym)) { + struct section *patched_sec = patched_sym->sec; + + out_sec = find_section_by_name(elf, patched_sec->name); + if (!out_sec) { + out_sec = elf_create_section(elf, patched_sec->name, 0, + patched_sec->sh.sh_entsize, + patched_sec->sh.sh_type, + patched_sec->sh.sh_addralign, + patched_sec->sh.sh_flags); + if (!out_sec) + return NULL; + } + + if (is_string_sec(patched_sym->sec)) { + out_sym = elf_create_section_symbol(elf, out_sec); + if (!out_sym) + return NULL; + + goto sym_created; + } + + if (!is_sec_sym(patched_sym)) + offset = sec_size(out_sec); + + if (patched_sym->len || is_sec_sym(patched_sym)) { + void *data = NULL; + size_t size; + + /* bss doesn't have data */ + if (patched_sym->sec->data->d_buf) + data = patched_sym->sec->data->d_buf + patched_sym->offset; + + if (is_sec_sym(patched_sym)) + size = sec_size(patched_sym->sec); + else + size = patched_sym->len; + + if (!elf_add_data(elf, out_sec, data, size)) + return NULL; + } + } + + out_sym = elf_create_symbol(elf, patched_sym->name, out_sec, + patched_sym->bind, patched_sym->type, + offset, patched_sym->len); + if (!out_sym) + return NULL; + +sym_created: + patched_sym->clone = out_sym; + out_sym->clone = patched_sym; + + return out_sym; +} + +static const char *sym_type(struct symbol *sym) +{ + switch (sym->type) { + case STT_NOTYPE: return "NOTYPE"; + case STT_OBJECT: return "OBJECT"; + case STT_FUNC: return "FUNC"; + case STT_SECTION: return "SECTION"; + case STT_FILE: return "FILE"; + default: return "UNKNOWN"; + } +} + +static const char *sym_bind(struct symbol *sym) +{ + switch (sym->bind) { + case STB_LOCAL: return "LOCAL"; + case STB_GLOBAL: return "GLOBAL"; + case 
STB_WEAK: return "WEAK"; + default: return "UNKNOWN"; + } +} + +/* + * Copy a symbol to the output object, optionally including its data and + * relocations. + */ +static struct symbol *clone_symbol(struct elfs *e, struct symbol *patched_sym, + bool data_too) +{ + struct symbol *pfx; + + if (patched_sym->clone) + return patched_sym->clone; + + dbg_indent("%s%s", patched_sym->name, data_too ? " [+DATA]" : ""); + + /* Make sure the prefix gets cloned first */ + if (is_func_sym(patched_sym) && data_too) { + pfx = get_func_prefix(patched_sym); + if (pfx) + clone_symbol(e, pfx, true); + } + + if (!__clone_symbol(e->out, patched_sym, data_too)) + return NULL; + + if (data_too && clone_sym_relocs(e, patched_sym)) + return NULL; + + return patched_sym->clone; +} + +static void mark_included_function(struct symbol *func) +{ + struct symbol *pfx; + + func->included = 1; + + /* Include prefix function */ + pfx = get_func_prefix(func); + if (pfx) + pfx->included = 1; + + /* Make sure .cold parent+child always stay together */ + if (func->cfunc && func->cfunc != func) + func->cfunc->included = 1; + if (func->pfunc && func->pfunc != func) + func->pfunc->included = 1; +} + +/* + * Copy all changed functions (and their dependencies) from the patched object + * to the output object. 
+ */ +static int mark_changed_functions(struct elfs *e) +{ + struct symbol *sym_orig, *patched_sym; + bool changed = false; + + /* Find changed functions */ + for_each_sym(e->orig, sym_orig) { + if (!is_func_sym(sym_orig) || is_prefix_func(sym_orig)) + continue; + + patched_sym = sym_orig->twin; + if (!patched_sym) + continue; + + if (sym_orig->csum.checksum != patched_sym->csum.checksum) { + patched_sym->changed = 1; + mark_included_function(patched_sym); + changed = true; + } + } + + /* Find added functions and print them */ + for_each_sym(e->patched, patched_sym) { + if (!is_func_sym(patched_sym) || is_prefix_func(patched_sym)) + continue; + + if (!patched_sym->twin) { + printf("%s: new function: %s\n", objname, patched_sym->name); + mark_included_function(patched_sym); + changed = true; + } + } + + /* Print changed functions */ + for_each_sym(e->patched, patched_sym) { + if (patched_sym->changed) + printf("%s: changed function: %s\n", objname, patched_sym->name); + } + + return !changed ? -1 : 0; +} + +static int clone_included_functions(struct elfs *e) +{ + struct symbol *patched_sym; + + for_each_sym(e->patched, patched_sym) { + if (patched_sym->included) { + if (!clone_symbol(e, patched_sym, true)) + return -1; + } + } + + return 0; +} + +/* + * Determine whether a relocation should reference the section rather than the + * underlying symbol. + */ +static bool section_reference_needed(struct section *sec) +{ + /* + * String symbols are zero-length and uncorrelated. It's easier to + * deal with them as section symbols. + */ + if (is_string_sec(sec)) + return true; + + /* + * .rodata has mostly anonymous data so there's no way to determine the + * length of a needed reference. just copy the whole section if needed. 
+ */ + if (strstarts(sec->name, ".rodata")) + return true; + + /* UBSAN anonymous data */ + if (strstarts(sec->name, ".data..Lubsan") || /* GCC */ + strstarts(sec->name, ".data..L__unnamed_")) /* Clang */ + return true; + + return false; +} + +static bool is_reloc_allowed(struct reloc *reloc) +{ + return section_reference_needed(reloc->sym->sec) == is_sec_sym(reloc->sym); +} + +static struct export *find_export(struct symbol *sym) +{ + struct export *export; + + hash_for_each_possible(exports, export, hash, str_hash(sym->name)) { + if (!strcmp(export->sym, sym->name)) + return export; + } + + return NULL; +} + +static const char *__find_modname(struct elfs *e) +{ + struct section *sec; + char *name; + + sec = find_section_by_name(e->orig, ".modinfo"); + if (!sec) { + ERROR("missing .modinfo section"); + return NULL; + } + + name = memmem(sec->data->d_buf, sec_size(sec), "\0name=", 6); + if (name) + return name + 6; + + name = strdup(e->orig->name); + if (!name) { + ERROR_GLIBC("strdup"); + return NULL; + } + + for (char *c = name; *c; c++) { + if (*c == '/') + name = c + 1; + else if (*c == '-') + *c = '_'; + else if (*c == '.') { + *c = '\0'; + break; + } + } + + return name; +} + +/* Get the object's module name as defined by the kernel (and klp_object) */ +static const char *find_modname(struct elfs *e) +{ + const char *modname; + + if (e->modname) + return e->modname; + + modname = __find_modname(e); + e->modname = modname; + return modname; +} + +/* + * Copying a function from its native compiled environment to a kernel module + * removes its natural access to local functions/variables and unexported + * globals. References to such symbols need to be converted to KLP relocs so + * the kernel arch relocation code knows to apply them and where to find the + * symbols. Particularly, duplicate static symbols need to be disambiguated. 
+ */ +static bool klp_reloc_needed(struct reloc *patched_reloc) +{ + struct symbol *patched_sym = patched_reloc->sym; + struct export *export; + + /* no external symbol to reference */ + if (dont_correlate(patched_sym)) + return false; + + /* For included functions, a regular reloc will do. */ + if (patched_sym->included) + return false; + + /* + * If exported by a module, it has to be a klp reloc. Thanks to the + * clusterfunk that is late module patching, the patch module is + * allowed to be loaded before any modules it depends on. + * + * If exported by vmlinux, a normal reloc will do. + */ + export = find_export(patched_sym); + if (export) + return strcmp(export->mod, "vmlinux"); + + if (!patched_sym->twin) { + /* + * Presumably the symbol and its reference were added by the + * patch. The symbol could be defined in this .o or in another + * .o in the patch module. + * + * This check needs to be *after* the export check due to the + * possibility of the patch adding a new UNDEF reference to an + * exported symbol. + */ + return false; + } + + /* Unexported symbol which lives in the original vmlinux or module. 
*/ + return true; +} + +static int convert_reloc_sym_to_secsym(struct elf *elf, struct reloc *reloc) +{ + struct symbol *sym = reloc->sym; + struct section *sec = sym->sec; + + if (!sec->sym && !elf_create_section_symbol(elf, sec)) + return -1; + + reloc->sym = sec->sym; + set_reloc_sym(elf, reloc, sym->idx); + set_reloc_addend(elf, reloc, sym->offset + reloc_addend(reloc)); + return 0; +} + +static int convert_reloc_secsym_to_sym(struct elf *elf, struct reloc *reloc) +{ + struct symbol *sym = reloc->sym; + struct section *sec = sym->sec; + + /* If the symbol has a dedicated section, it's easy to find */ + sym = find_symbol_by_offset(sec, 0); + if (sym && sym->len == sec_size(sec)) + goto found_sym; + + /* No dedicated section; find the symbol manually */ + sym = find_symbol_containing(sec, arch_adjusted_addend(reloc)); + if (!sym) { + /* + * This can happen for special section references to weak code + * whose symbol has been stripped by the linker. + */ + return -1; + } + +found_sym: + reloc->sym = sym; + set_reloc_sym(elf, reloc, sym->idx); + set_reloc_addend(elf, reloc, reloc_addend(reloc) - sym->offset); + return 0; +} + +/* + * Convert a relocation symbol reference to the needed format: either a section + * symbol or the underlying symbol itself. + */ +static int convert_reloc_sym(struct elf *elf, struct reloc *reloc) +{ + if (is_reloc_allowed(reloc)) + return 0; + + if (section_reference_needed(reloc->sym->sec)) + return convert_reloc_sym_to_secsym(elf, reloc); + else + return convert_reloc_secsym_to_sym(elf, reloc); +} + +/* + * Convert a regular relocation to a klp relocation (sort of). 
+ */ +static int clone_reloc_klp(struct elfs *e, struct reloc *patched_reloc, + struct section *sec, unsigned long offset, + struct export *export) +{ + struct symbol *patched_sym = patched_reloc->sym; + s64 addend = reloc_addend(patched_reloc); + const char *sym_modname, *sym_orig_name; + static struct section *klp_relocs; + struct symbol *sym, *klp_sym; + unsigned long klp_reloc_off; + char sym_name[SYM_NAME_LEN]; + struct klp_reloc klp_reloc; + unsigned long sympos; + + if (!patched_sym->twin) { + ERROR("unexpected klp reloc for new symbol %s", patched_sym->name); + return -1; + } + + /* + * Keep the original reloc intact for now to avoid breaking objtool run + * which relies on proper relocations for many of its features. This + * will be disabled later by "objtool klp post-link". + * + * Convert it to UNDEF (and WEAK to avoid modpost warnings). + */ + + sym = patched_sym->clone; + if (!sym) { + /* STB_WEAK: avoid modpost undefined symbol warnings */ + sym = elf_create_symbol(e->out, patched_sym->name, NULL, + STB_WEAK, patched_sym->type, 0, 0); + if (!sym) + return -1; + + patched_sym->clone = sym; + sym->clone = patched_sym; + } + + if (!elf_create_reloc(e->out, sec, offset, sym, addend, reloc_type(patched_reloc))) + return -1; + + /* + * Create the KLP symbol. 
+ */ + + if (export) { + sym_modname = export->mod; + sym_orig_name = export->sym; + sympos = 0; + } else { + sym_modname = find_modname(e); + if (!sym_modname) + return -1; + + sym_orig_name = patched_sym->twin->name; + sympos = find_sympos(e->orig, patched_sym->twin); + if (sympos == ULONG_MAX) + return -1; + } + + /* symbol format: .klp.sym.modname.sym_name,sympos */ + if (snprintf_check(sym_name, SYM_NAME_LEN, KLP_SYM_PREFIX "%s.%s,%ld", + sym_modname, sym_orig_name, sympos)) + return -1; + + klp_sym = find_symbol_by_name(e->out, sym_name); + if (!klp_sym) { + __dbg_indent("%s", sym_name); + + /* STB_WEAK: avoid modpost undefined symbol warnings */ + klp_sym = elf_create_symbol(e->out, sym_name, NULL, + STB_WEAK, patched_sym->type, 0, 0); + if (!klp_sym) + return -1; + } + + /* + * Create the __klp_relocs entry. This will be converted to an actual + * KLP rela by "objtool klp post-link". + * + * This intermediate step is necessary to prevent corruption by the + * linker, which doesn't know how to properly handle two rela sections + * applying to the same base section. 
+ */ + + if (!klp_relocs) { + klp_relocs = elf_create_section(e->out, KLP_RELOCS_SEC, 0, + 0, SHT_PROGBITS, 8, SHF_ALLOC); + if (!klp_relocs) + return -1; + } + + klp_reloc_off = sec_size(klp_relocs); + memset(&klp_reloc, 0, sizeof(klp_reloc)); + + klp_reloc.type = reloc_type(patched_reloc); + if (!elf_add_data(e->out, klp_relocs, &klp_reloc, sizeof(klp_reloc))) + return -1; + + /* klp_reloc.offset */ + if (!sec->sym && !elf_create_section_symbol(e->out, sec)) + return -1; + + if (!elf_create_reloc(e->out, klp_relocs, + klp_reloc_off + offsetof(struct klp_reloc, offset), + sec->sym, offset, R_ABS64)) + return -1; + + /* klp_reloc.sym */ + if (!elf_create_reloc(e->out, klp_relocs, + klp_reloc_off + offsetof(struct klp_reloc, sym), + klp_sym, addend, R_ABS64)) + return -1; + + return 0; +} + +#define dbg_clone_reloc(sec, offset, patched_sym, addend, export, klp) \ + dbg_indent("%s+0x%lx: %s%s0x%lx [%s%s%s%s%s%s]", \ + sec->name, offset, patched_sym->name, \ + addend >= 0 ? "+" : "-", labs(addend), \ + sym_type(patched_sym), \ + patched_sym->type == STT_SECTION ? "" : " ", \ + patched_sym->type == STT_SECTION ? "" : sym_bind(patched_sym), \ + is_undef_sym(patched_sym) ? " UNDEF" : "", \ + export ? " EXPORTED" : "", \ + klp ? 
" KLP" : "") + +/* Copy a reloc and its symbol to the output object */ +static int clone_reloc(struct elfs *e, struct reloc *patched_reloc, + struct section *sec, unsigned long offset) +{ + struct symbol *patched_sym = patched_reloc->sym; + struct export *export = find_export(patched_sym); + long addend = reloc_addend(patched_reloc); + struct symbol *out_sym; + bool klp; + + if (!is_reloc_allowed(patched_reloc)) { + ERROR_FUNC(patched_reloc->sec->base, reloc_offset(patched_reloc), + "missing symbol for reference to %s+%ld", + patched_sym->name, addend); + return -1; + } + + klp = klp_reloc_needed(patched_reloc); + + dbg_clone_reloc(sec, offset, patched_sym, addend, export, klp); + + if (klp) { + if (clone_reloc_klp(e, patched_reloc, sec, offset, export)) + return -1; + + return 0; + } + + /* + * Why !export sets 'data_too': + * + * Unexported non-klp symbols need to live in the patch module, + * otherwise there will be unresolved symbols. Notably, this includes: + * + * - New functions/data + * - String sections + * - Special section entries + * - Uncorrelated static local variables + * - UBSAN sections + */ + out_sym = clone_symbol(e, patched_sym, patched_sym->included || !export); + if (!out_sym) + return -1; + + /* + * For strings, all references use section symbols, thanks to + * section_reference_needed(). clone_symbol() has cloned an empty + * version of the string section. Now copy the string itself. 
+ */ + if (is_string_sec(patched_sym->sec)) { + const char *str = patched_sym->sec->data->d_buf + addend; + + __dbg_indent("\"%s\"", escape_str(str)); + + addend = elf_add_string(e->out, out_sym->sec, str); + if (addend == -1) + return -1; + } + + if (!elf_create_reloc(e->out, sec, offset, out_sym, addend, + reloc_type(patched_reloc))) + return -1; + + return 0; +} + +/* Copy all relocs needed for a symbol's contents */ +static int clone_sym_relocs(struct elfs *e, struct symbol *patched_sym) +{ + struct section *patched_rsec = patched_sym->sec->rsec; + struct reloc *patched_reloc; + unsigned long start, end; + struct symbol *out_sym; + + out_sym = patched_sym->clone; + if (!out_sym) { + ERROR("no clone for %s", patched_sym->name); + return -1; + } + + if (!patched_rsec) + return 0; + + if (!is_sec_sym(patched_sym) && !patched_sym->len) + return 0; + + if (is_string_sec(patched_sym->sec)) + return 0; + + if (is_sec_sym(patched_sym)) { + start = 0; + end = sec_size(patched_sym->sec); + } else { + start = patched_sym->offset; + end = start + patched_sym->len; + } + + for_each_reloc(patched_rsec, patched_reloc) { + unsigned long offset; + + if (reloc_offset(patched_reloc) < start || + reloc_offset(patched_reloc) >= end) + continue; + + /* + * Skip any reloc referencing .altinstr_aux. Its code is + * always patched by alternatives. See ALTERNATIVE_TERNARY(). 
+ */ + if (patched_reloc->sym->sec && + !strcmp(patched_reloc->sym->sec->name, ".altinstr_aux")) + continue; + + if (convert_reloc_sym(e->patched, patched_reloc)) { + ERROR_FUNC(patched_rsec->base, reloc_offset(patched_reloc), + "failed to convert reloc sym '%s' to its proper format", + patched_reloc->sym->name); + return -1; + } + + offset = out_sym->offset + (reloc_offset(patched_reloc) - patched_sym->offset); + + if (clone_reloc(e, patched_reloc, out_sym->sec, offset)) + return -1; + } + return 0; + +} + +static int create_fake_symbol(struct elf *elf, struct section *sec, + unsigned long offset, size_t size) +{ + char name[SYM_NAME_LEN]; + unsigned int type; + static int ctr; + char *c; + + if (snprintf_check(name, SYM_NAME_LEN, "%s_%d", sec->name, ctr++)) + return -1; + + for (c = name; *c; c++) + if (*c == '.') + *c = '_'; + + /* + * STT_NOTYPE: Prevent objtool from validating .altinstr_replacement + * while still allowing objdump to disassemble it. + */ + type = is_text_sec(sec) ? STT_NOTYPE : STT_OBJECT; + return elf_create_symbol(elf, name, sec, STB_LOCAL, type, offset, size) ? 0 : -1; +} + +/* + * Special sections (alternatives, etc) are basically arrays of structs. + * For all the special sections, create a symbol for each struct entry. This + * is a bit cumbersome, but it makes the extracting of the individual entries + * much more straightforward. + * + * There are three ways to identify the entry sizes for a special section: + * + * 1) ELF section header sh_entsize: Ideally this would be used almost + * everywhere. But unfortunately the toolchains make it difficult. The + * assembler .[push]section directive syntax only takes entsize when + * combined with SHF_MERGE. But Clang disallows combining SHF_MERGE with + * SHF_WRITE. And some special sections do need to be writable. + * + * Another place this wouldn't work is .altinstr_replacement, whose entries + * don't have a fixed size. 
+ * + * 2) ANNOTATE_DATA_SPECIAL: This is a lightweight objtool annotation which + * points to the beginning of each entry. The size of the entry is then + * inferred by the location of the subsequent annotation (or end of + * section). + * + * 3) Simple array of pointers: If the special section is just a basic array of + * pointers, the entry size can be inferred by the number of relocations. + * No annotations needed. + * + * Note I also tried to create per-entry symbols at the time of creation, in + * the original [inline] asm. Unfortunately, creating uniquely named symbols + * is trickier than one might think, especially with Clang inline asm. I + * eventually just gave up trying to make that work, in favor of using + * ANNOTATE_DATA_SPECIAL and creating the symbols here after the fact. + */ +static int create_fake_symbols(struct elf *elf) +{ + struct section *sec; + struct reloc *reloc; + + /* + * 1) Make symbols for all the ANNOTATE_DATA_SPECIAL entries: + */ + + sec = find_section_by_name(elf, ".discard.annotate_data"); + if (!sec || !sec->rsec) + return 0; + + for_each_reloc(sec->rsec, reloc) { + unsigned long offset, size; + struct reloc *next_reloc; + + if (annotype(elf, sec, reloc) != ANNOTYPE_DATA_SPECIAL) + continue; + + offset = reloc_addend(reloc); + + size = 0; + next_reloc = reloc; + for_each_reloc_continue(sec->rsec, next_reloc) { + if (annotype(elf, sec, next_reloc) != ANNOTYPE_DATA_SPECIAL || + next_reloc->sym->sec != reloc->sym->sec) + continue; + + size = reloc_addend(next_reloc) - offset; + break; + } + + if (!size) + size = sec_size(reloc->sym->sec) - offset; + + if (create_fake_symbol(elf, reloc->sym->sec, offset, size)) + return -1; + } + + /* + * 2) Make symbols for sh_entsize, and simple arrays of pointers: + */ + + for_each_sec(elf, sec) { + unsigned int entry_size; + unsigned long offset; + + if (!is_special_section(sec) || find_symbol_by_offset(sec, 0)) + continue; + + if (!sec->rsec) { + ERROR("%s: missing special section 
relocations", sec->name); + return -1; + } + + entry_size = sec->sh.sh_entsize; + if (!entry_size) { + entry_size = arch_reloc_size(sec->rsec->relocs); + if (sec_size(sec) != entry_size * sec_num_entries(sec->rsec)) { + ERROR("%s: missing special section entsize or annotations", sec->name); + return -1; + } + } + + for (offset = 0; offset < sec_size(sec); offset += entry_size) { + if (create_fake_symbol(elf, sec, offset, entry_size)) + return -1; + } + } + + return 0; +} + +/* Keep a special section entry if it references an included function */ +static bool should_keep_special_sym(struct elf *elf, struct symbol *sym) +{ + struct reloc *reloc; + + if (is_sec_sym(sym) || !sym->sec->rsec) + return false; + + sym_for_each_reloc(elf, sym, reloc) { + if (convert_reloc_sym(elf, reloc)) + continue; + + if (is_func_sym(reloc->sym) && reloc->sym->included) + return true; + } + + return false; +} + +/* + * Klp relocations aren't allowed for __jump_table and .static_call_sites if + * the referenced symbol lives in a kernel module, because such klp relocs may + * be applied after static branch/call init, resulting in code corruption. + * + * Validate a special section entry to avoid that. Note that an inert + * tracepoint is harmless enough, in that case just skip the entry and print a + * warning. Otherwise, return an error. + * + * This is only a temporary limitation which will be fixed when livepatch adds + * support for submodules: fully self-contained modules which are embedded in + * the top-level livepatch module's data and which can be loaded on demand when + * their corresponding to-be-patched module gets loaded. Then klp relocs can + * be retired. 
+ * + * Return: + * -1: error: validation failed + * 1: warning: tracepoint skipped + * 0: success + */ +static int validate_special_section_klp_reloc(struct elfs *e, struct symbol *sym) +{ + bool static_branch = !strcmp(sym->sec->name, "__jump_table"); + bool static_call = !strcmp(sym->sec->name, ".static_call_sites"); + struct symbol *code_sym = NULL; + unsigned long code_offset = 0; + struct reloc *reloc; + int ret = 0; + + if (!static_branch && !static_call) + return 0; + + sym_for_each_reloc(e->patched, sym, reloc) { + const char *sym_modname; + struct export *export; + + /* Static branch/call keys are always STT_OBJECT */ + if (reloc->sym->type != STT_OBJECT) { + + /* Save code location which can be printed below */ + if (reloc->sym->type == STT_FUNC && !code_sym) { + code_sym = reloc->sym; + code_offset = reloc_addend(reloc); + } + + continue; + } + + if (!klp_reloc_needed(reloc)) + continue; + + export = find_export(reloc->sym); + if (export) { + sym_modname = export->mod; + } else { + sym_modname = find_modname(e); + if (!sym_modname) + return -1; + } + + /* vmlinux keys are ok */ + if (!strcmp(sym_modname, "vmlinux")) + continue; + + if (static_branch) { + if (strstarts(reloc->sym->name, "__tracepoint_")) { + WARN("%s: disabling unsupported tracepoint %s", + code_sym->name, reloc->sym->name + 13); + ret = 1; + continue; + } + + ERROR("%s+0x%lx: unsupported static branch key %s. Use static_key_enabled() instead", + code_sym->name, code_offset, reloc->sym->name); + return -1; + } + + /* static call */ + if (strstarts(reloc->sym->name, "__SCK__tp_func_")) { + ret = 1; + continue; + } + + ERROR("%s()+0x%lx: unsupported static call key %s. 
Use KLP_STATIC_CALL() instead", + code_sym->name, code_offset, reloc->sym->name); + return -1; + } + + return ret; +} + +static int clone_special_section(struct elfs *e, struct section *patched_sec) +{ + struct symbol *patched_sym; + + /* + * Extract all special section symbols (and their dependencies) which + * reference included functions. + */ + sec_for_each_sym(patched_sec, patched_sym) { + int ret; + + if (!is_object_sym(patched_sym)) + continue; + + if (!should_keep_special_sym(e->patched, patched_sym)) + continue; + + ret = validate_special_section_klp_reloc(e, patched_sym); + if (ret < 0) + return -1; + if (ret > 0) + continue; + + if (!clone_symbol(e, patched_sym, true)) + return -1; + } + + return 0; +} + +/* Extract only the needed bits from special sections */ +static int clone_special_sections(struct elfs *e) +{ + struct section *patched_sec; + + if (create_fake_symbols(e->patched)) + return -1; + + for_each_sec(e->patched, patched_sec) { + if (is_special_section(patched_sec)) { + if (clone_special_section(e, patched_sec)) + return -1; + } + } + + return 0; +} + +/* + * Create __klp_objects and __klp_funcs sections which are intermediate + * sections provided as input to the patch module's init code for building the + * klp_patch, klp_object and klp_func structs for the livepatch API. 
+ */ +static int create_klp_sections(struct elfs *e) +{ + size_t obj_size = sizeof(struct klp_object_ext); + size_t func_size = sizeof(struct klp_func_ext); + struct section *obj_sec, *funcs_sec, *str_sec; + struct symbol *funcs_sym, *str_sym, *sym; + char sym_name[SYM_NAME_LEN]; + unsigned int nr_funcs = 0; + const char *modname; + void *obj_data; + s64 addend; + + obj_sec = elf_create_section_pair(e->out, KLP_OBJECTS_SEC, obj_size, 0, 0); + if (!obj_sec) + return -1; + + funcs_sec = elf_create_section_pair(e->out, KLP_FUNCS_SEC, func_size, 0, 0); + if (!funcs_sec) + return -1; + + funcs_sym = elf_create_section_symbol(e->out, funcs_sec); + if (!funcs_sym) + return -1; + + str_sec = elf_create_section(e->out, KLP_STRINGS_SEC, 0, 0, + SHT_PROGBITS, 1, + SHF_ALLOC | SHF_STRINGS | SHF_MERGE); + if (!str_sec) + return -1; + + if (elf_add_string(e->out, str_sec, "") == -1) + return -1; + + str_sym = elf_create_section_symbol(e->out, str_sec); + if (!str_sym) + return -1; + + /* allocate klp_object_ext */ + obj_data = elf_add_data(e->out, obj_sec, NULL, obj_size); + if (!obj_data) + return -1; + + modname = find_modname(e); + if (!modname) + return -1; + + /* klp_object_ext.name */ + if (strcmp(modname, "vmlinux")) { + addend = elf_add_string(e->out, str_sec, modname); + if (addend == -1) + return -1; + + if (!elf_create_reloc(e->out, obj_sec, + offsetof(struct klp_object_ext, name), + str_sym, addend, R_ABS64)) + return -1; + } + + /* klp_object_ext.funcs */ + if (!elf_create_reloc(e->out, obj_sec, offsetof(struct klp_object_ext, funcs), + funcs_sym, 0, R_ABS64)) + return -1; + + for_each_sym(e->out, sym) { + unsigned long offset = nr_funcs * func_size; + unsigned long sympos; + void *func_data; + + if (!is_func_sym(sym) || sym->cold || !sym->clone || !sym->clone->changed) + continue; + + /* allocate klp_func_ext */ + func_data = elf_add_data(e->out, funcs_sec, NULL, func_size); + if (!func_data) + return -1; + + /* klp_func_ext.old_name */ + addend = 
elf_add_string(e->out, str_sec, sym->clone->twin->name); + if (addend == -1) + return -1; + + if (!elf_create_reloc(e->out, funcs_sec, + offset + offsetof(struct klp_func_ext, old_name), + str_sym, addend, R_ABS64)) + return -1; + + /* klp_func_ext.new_func */ + if (!elf_create_reloc(e->out, funcs_sec, + offset + offsetof(struct klp_func_ext, new_func), + sym, 0, R_ABS64)) + return -1; + + /* klp_func_ext.sympos */ + BUILD_BUG_ON(sizeof(sympos) != sizeof_field(struct klp_func_ext, sympos)); + sympos = find_sympos(e->orig, sym->clone->twin); + if (sympos == ULONG_MAX) + return -1; + memcpy(func_data + offsetof(struct klp_func_ext, sympos), &sympos, + sizeof_field(struct klp_func_ext, sympos)); + + nr_funcs++; + } + + /* klp_object_ext.nr_funcs */ + BUILD_BUG_ON(sizeof(nr_funcs) != sizeof_field(struct klp_object_ext, nr_funcs)); + memcpy(obj_data + offsetof(struct klp_object_ext, nr_funcs), &nr_funcs, + sizeof_field(struct klp_object_ext, nr_funcs)); + + /* + * Find callback pointers created by KLP_PRE_PATCH_CALLBACK() and + * friends, and add them to the klp object. 
+ */ + + if (snprintf_check(sym_name, SYM_NAME_LEN, KLP_PRE_PATCH_PREFIX "%s", modname)) + return -1; + + sym = find_symbol_by_name(e->out, sym_name); + if (sym) { + struct reloc *reloc; + + reloc = find_reloc_by_dest(e->out, sym->sec, sym->offset); + + if (!elf_create_reloc(e->out, obj_sec, + offsetof(struct klp_object_ext, callbacks) + + offsetof(struct klp_callbacks, pre_patch), + reloc->sym, reloc_addend(reloc), R_ABS64)) + return -1; + } + + if (snprintf_check(sym_name, SYM_NAME_LEN, KLP_POST_PATCH_PREFIX "%s", modname)) + return -1; + + sym = find_symbol_by_name(e->out, sym_name); + if (sym) { + struct reloc *reloc; + + reloc = find_reloc_by_dest(e->out, sym->sec, sym->offset); + + if (!elf_create_reloc(e->out, obj_sec, + offsetof(struct klp_object_ext, callbacks) + + offsetof(struct klp_callbacks, post_patch), + reloc->sym, reloc_addend(reloc), R_ABS64)) + return -1; + } + + if (snprintf_check(sym_name, SYM_NAME_LEN, KLP_PRE_UNPATCH_PREFIX "%s", modname)) + return -1; + + sym = find_symbol_by_name(e->out, sym_name); + if (sym) { + struct reloc *reloc; + + reloc = find_reloc_by_dest(e->out, sym->sec, sym->offset); + + if (!elf_create_reloc(e->out, obj_sec, + offsetof(struct klp_object_ext, callbacks) + + offsetof(struct klp_callbacks, pre_unpatch), + reloc->sym, reloc_addend(reloc), R_ABS64)) + return -1; + } + + if (snprintf_check(sym_name, SYM_NAME_LEN, KLP_POST_UNPATCH_PREFIX "%s", modname)) + return -1; + + sym = find_symbol_by_name(e->out, sym_name); + if (sym) { + struct reloc *reloc; + + reloc = find_reloc_by_dest(e->out, sym->sec, sym->offset); + + if (!elf_create_reloc(e->out, obj_sec, + offsetof(struct klp_object_ext, callbacks) + + offsetof(struct klp_callbacks, post_unpatch), + reloc->sym, reloc_addend(reloc), R_ABS64)) + return -1; + } + + return 0; +} + +/* + * Copy all .modinfo import_ns= tags to ensure all namespaced exported symbols + * can be accessed via normal relocs. 
+ */ +static int copy_import_ns(struct elfs *e) +{ + struct section *patched_sec, *out_sec = NULL; + char *import_ns, *data_end; + + patched_sec = find_section_by_name(e->patched, ".modinfo"); + if (!patched_sec) + return 0; + + import_ns = patched_sec->data->d_buf; + if (!import_ns) + return 0; + + for (data_end = import_ns + sec_size(patched_sec); + import_ns < data_end; + import_ns += strlen(import_ns) + 1) { + + import_ns = memmem(import_ns, data_end - import_ns, "import_ns=", 10); + if (!import_ns) + return 0; + + if (!out_sec) { + out_sec = find_section_by_name(e->out, ".modinfo"); + if (!out_sec) { + out_sec = elf_create_section(e->out, ".modinfo", 0, + patched_sec->sh.sh_entsize, + patched_sec->sh.sh_type, + patched_sec->sh.sh_addralign, + patched_sec->sh.sh_flags); + if (!out_sec) + return -1; + } + } + + if (!elf_add_data(e->out, out_sec, import_ns, strlen(import_ns) + 1)) + return -1; + } + + return 0; +} + +int cmd_klp_diff(int argc, const char **argv) +{ + struct elfs e = {0}; + + argc = parse_options(argc, argv, klp_diff_options, klp_diff_usage, 0); + if (argc != 3) + usage_with_options(klp_diff_usage, klp_diff_options); + + objname = argv[0]; + + e.orig = elf_open_read(argv[0], O_RDONLY); + e.patched = elf_open_read(argv[1], O_RDONLY); + e.out = NULL; + + if (!e.orig || !e.patched) + return -1; + + if (read_exports()) + return -1; + + if (read_sym_checksums(e.orig)) + return -1; + + if (read_sym_checksums(e.patched)) + return -1; + + if (correlate_symbols(&e)) + return -1; + + if (mark_changed_functions(&e)) + return 0; + + e.out = elf_create_file(&e.orig->ehdr, argv[2]); + if (!e.out) + return -1; + + if (clone_included_functions(&e)) + return -1; + + if (clone_special_sections(&e)) + return -1; + + if (create_klp_sections(&e)) + return -1; + + if (copy_import_ns(&e)) + return -1; + + if (elf_write(e.out)) + return -1; + + return elf_close(e.out); +} diff --git a/tools/objtool/klp-post-link.c b/tools/objtool/klp-post-link.c new file mode 100644 
index 000000000000..c013e39957b1 --- /dev/null +++ b/tools/objtool/klp-post-link.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Read the intermediate KLP reloc/symbol representations created by klp diff + * and convert them to the proper format required by livepatch. This needs to + * run last to avoid linker wreckage. Linkers don't tend to handle the "two + * rela sections for a single base section" case very well, nor do they like + * SHN_LIVEPATCH. + * + * This is the final tool in the livepatch module generation pipeline: + * + * kernel builds -> objtool klp diff -> module link -> objtool klp post-link + */ + +#include <fcntl.h> +#include <gelf.h> +#include <objtool/objtool.h> +#include <objtool/warn.h> +#include <objtool/klp.h> +#include <objtool/util.h> +#include <linux/livepatch_external.h> + +static int fix_klp_relocs(struct elf *elf) +{ + struct section *symtab, *klp_relocs; + + klp_relocs = find_section_by_name(elf, KLP_RELOCS_SEC); + if (!klp_relocs) + return 0; + + symtab = find_section_by_name(elf, ".symtab"); + if (!symtab) { + ERROR("missing .symtab"); + return -1; + } + + for (int i = 0; i < sec_size(klp_relocs) / sizeof(struct klp_reloc); i++) { + struct klp_reloc *klp_reloc; + unsigned long klp_reloc_off; + struct section *sec, *tmp, *klp_rsec; + unsigned long offset; + struct reloc *reloc; + char sym_modname[64]; + char rsec_name[SEC_NAME_LEN]; + u64 addend; + struct symbol *sym, *klp_sym; + + klp_reloc_off = i * sizeof(*klp_reloc); + klp_reloc = klp_relocs->data->d_buf + klp_reloc_off; + + /* + * Read __klp_relocs[i]: + */ + + /* klp_reloc.sec_offset */ + reloc = find_reloc_by_dest(elf, klp_relocs, + klp_reloc_off + offsetof(struct klp_reloc, offset)); + if (!reloc) { + ERROR("malformed " KLP_RELOCS_SEC " section"); + return -1; + } + + sec = reloc->sym->sec; + offset = reloc_addend(reloc); + + /* klp_reloc.sym */ + reloc = find_reloc_by_dest(elf, klp_relocs, + klp_reloc_off + offsetof(struct klp_reloc, sym)); + if 
(!reloc) { + ERROR("malformed " KLP_RELOCS_SEC " section"); + return -1; + } + + klp_sym = reloc->sym; + addend = reloc_addend(reloc); + + /* symbol format: .klp.sym.modname.sym_name,sympos */ + if (sscanf(klp_sym->name + strlen(KLP_SYM_PREFIX), "%55[^.]", sym_modname) != 1) + ERROR("can't find modname in klp symbol '%s'", klp_sym->name); + + /* + * Create the KLP rela: + */ + + /* section format: .klp.rela.sec_objname.section_name */ + if (snprintf_check(rsec_name, SEC_NAME_LEN, + KLP_RELOC_SEC_PREFIX "%s.%s", + sym_modname, sec->name)) + return -1; + + klp_rsec = find_section_by_name(elf, rsec_name); + if (!klp_rsec) { + klp_rsec = elf_create_section(elf, rsec_name, 0, + elf_rela_size(elf), + SHT_RELA, elf_addr_size(elf), + SHF_ALLOC | SHF_INFO_LINK | SHF_RELA_LIVEPATCH); + if (!klp_rsec) + return -1; + + klp_rsec->sh.sh_link = symtab->idx; + klp_rsec->sh.sh_info = sec->idx; + klp_rsec->base = sec; + } + + tmp = sec->rsec; + sec->rsec = klp_rsec; + if (!elf_create_reloc(elf, sec, offset, klp_sym, addend, klp_reloc->type)) + return -1; + sec->rsec = tmp; + + /* + * Fix up the corresponding KLP symbol: + */ + + klp_sym->sym.st_shndx = SHN_LIVEPATCH; + if (!gelf_update_sym(symtab->data, klp_sym->idx, &klp_sym->sym)) { + ERROR_ELF("gelf_update_sym"); + return -1; + } + + /* + * Disable the original non-KLP reloc by converting it to R_*_NONE: + */ + + reloc = find_reloc_by_dest(elf, sec, offset); + sym = reloc->sym; + sym->sym.st_shndx = SHN_LIVEPATCH; + set_reloc_type(elf, reloc, 0); + if (!gelf_update_sym(symtab->data, sym->idx, &sym->sym)) { + ERROR_ELF("gelf_update_sym"); + return -1; + } + } + + return 0; +} + +/* + * This runs on the livepatch module after all other linking has been done. It + * converts the intermediate __klp_relocs section into proper KLP relocs to be + * processed by livepatch. This needs to run last to avoid linker wreckage. 
+ * Linkers don't tend to handle the "two rela sections for a single base + * section" case very well, nor do they appreciate SHN_LIVEPATCH. + */ +int cmd_klp_post_link(int argc, const char **argv) +{ + struct elf *elf; + + argc--; + argv++; + + if (argc != 1) { + fprintf(stderr, "%d\n", argc); + fprintf(stderr, "usage: objtool link <file.ko>\n"); + return -1; + } + + elf = elf_open_read(argv[0], O_RDWR); + if (!elf) + return -1; + + if (fix_klp_relocs(elf)) + return -1; + + if (elf_write(elf)) + return -1; + + return elf_close(elf); +} diff --git a/tools/objtool/noreturns.h b/tools/objtool/noreturns.h index 802895fae3ca..14f8ab653449 100644 --- a/tools/objtool/noreturns.h +++ b/tools/objtool/noreturns.h @@ -36,6 +36,7 @@ NORETURN(machine_real_restart) NORETURN(make_task_dead) NORETURN(mpt_halt_firmware) NORETURN(mwait_play_dead) +NORETURN(native_play_dead) NORETURN(nmi_panic_self_stop) NORETURN(panic) NORETURN(vpanic) diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c index 5c8b974ad0f9..1c3622117c33 100644 --- a/tools/objtool/objtool.c +++ b/tools/objtool/objtool.c @@ -16,7 +16,8 @@ #include <objtool/objtool.h> #include <objtool/warn.h> -bool help; +bool debug; +int indent; static struct objtool_file file; @@ -71,13 +72,54 @@ int objtool_pv_add(struct objtool_file *f, int idx, struct symbol *func) return 0; } +char *top_level_dir(const char *file) +{ + ssize_t len, self_len, file_len; + char self[PATH_MAX], *str; + int i; + + len = readlink("/proc/self/exe", self, sizeof(self) - 1); + if (len <= 0) + return NULL; + self[len] = '\0'; + + for (i = 0; i < 3; i++) { + char *s = strrchr(self, '/'); + if (!s) + return NULL; + *s = '\0'; + } + + self_len = strlen(self); + file_len = strlen(file); + + str = malloc(self_len + file_len + 2); + if (!str) + return NULL; + + memcpy(str, self, self_len); + str[self_len] = '/'; + strcpy(str + self_len + 1, file); + + return str; +} + int main(int argc, const char **argv) { static const char *UNUSED = 
"OBJTOOL_NOT_IMPLEMENTED"; + if (init_signal_handler()) + return -1; + /* libsubcmd init */ exec_cmd_init("objtool", UNUSED, UNUSED, UNUSED); pager_init(UNUSED); + if (argc > 1 && !strcmp(argv[1], "klp")) { + argc--; + argv++; + return cmd_klp(argc, argv); + } + return objtool_run(argc, argv); } diff --git a/tools/objtool/orc_dump.c b/tools/objtool/orc_dump.c index 1dd9fc18fe62..5a979f52425a 100644 --- a/tools/objtool/orc_dump.c +++ b/tools/objtool/orc_dump.c @@ -8,7 +8,6 @@ #include <objtool/objtool.h> #include <objtool/orc.h> #include <objtool/warn.h> -#include <objtool/endianness.h> int orc_dump(const char *filename) { diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c index 922e6aac7cea..1045e1380ffd 100644 --- a/tools/objtool/orc_gen.c +++ b/tools/objtool/orc_gen.c @@ -12,7 +12,6 @@ #include <objtool/check.h> #include <objtool/orc.h> #include <objtool/warn.h> -#include <objtool/endianness.h> struct orc_list_entry { struct list_head list; @@ -57,7 +56,7 @@ int orc_create(struct objtool_file *file) /* Build a deduplicated list of ORC entries: */ INIT_LIST_HEAD(&orc_list); - for_each_sec(file, sec) { + for_each_sec(file->elf, sec) { struct orc_entry orc, prev_orc = {0}; struct instruction *insn; bool empty = true; @@ -127,7 +126,11 @@ int orc_create(struct objtool_file *file) return -1; } orc_sec = elf_create_section(file->elf, ".orc_unwind", - sizeof(struct orc_entry), nr); + nr * sizeof(struct orc_entry), + sizeof(struct orc_entry), + SHT_PROGBITS, + 1, + SHF_ALLOC); if (!orc_sec) return -1; diff --git a/tools/objtool/signal.c b/tools/objtool/signal.c new file mode 100644 index 000000000000..af5c65c0fb2d --- /dev/null +++ b/tools/objtool/signal.c @@ -0,0 +1,135 @@ +/* + * signal.c: Register a sigaltstack for objtool, to be able to + * run a signal handler on a separate stack even if + * the main process stack has overflown. Print out + * stack overflow errors when this happens. 
+ */ +#include <stdio.h> +#include <stdlib.h> +#include <signal.h> +#include <unistd.h> +#include <sys/resource.h> +#include <string.h> + +#include <objtool/objtool.h> +#include <objtool/warn.h> + +static unsigned long stack_limit; + +static bool is_stack_overflow(void *fault_addr) +{ + unsigned long fault = (unsigned long)fault_addr; + + /* Check if fault is in the guard page just below the limit. */ + return fault < stack_limit && fault >= stack_limit - 4096; +} + +static void signal_handler(int sig_num, siginfo_t *info, void *context) +{ + struct sigaction sa_dfl = {0}; + const char *sig_name; + char msg[256]; + int msg_len; + + switch (sig_num) { + case SIGSEGV: sig_name = "SIGSEGV"; break; + case SIGBUS: sig_name = "SIGBUS"; break; + case SIGILL: sig_name = "SIGILL"; break; + case SIGABRT: sig_name = "SIGABRT"; break; + default: sig_name = "Unknown signal"; break; + } + + if (is_stack_overflow(info->si_addr)) { + msg_len = snprintf(msg, sizeof(msg), + "%s: error: %s: objtool stack overflow!\n", + objname, sig_name); + } else { + msg_len = snprintf(msg, sizeof(msg), + "%s: error: %s: objtool crash!\n", + objname, sig_name); + } + + msg_len = write(STDERR_FILENO, msg, msg_len); + + /* Re-raise the signal to trigger the core dump */ + sa_dfl.sa_handler = SIG_DFL; + sigaction(sig_num, &sa_dfl, NULL); + raise(sig_num); +} + +static int read_stack_limit(void) +{ + unsigned long stack_start, stack_end; + struct rlimit rlim; + char line[256]; + int ret = 0; + FILE *fp; + + if (getrlimit(RLIMIT_STACK, &rlim)) { + ERROR_GLIBC("getrlimit"); + return -1; + } + + fp = fopen("/proc/self/maps", "r"); + if (!fp) { + ERROR_GLIBC("fopen"); + return -1; + } + + while (fgets(line, sizeof(line), fp)) { + if (strstr(line, "[stack]")) { + if (sscanf(line, "%lx-%lx", &stack_start, &stack_end) != 2) { + ERROR_GLIBC("sscanf"); + ret = -1; + goto done; + } + stack_limit = stack_end - rlim.rlim_cur; + goto done; + } + } + + ret = -1; + ERROR("/proc/self/maps: can't find [stack]"); + 
+done: + fclose(fp); + + return ret; +} + +int init_signal_handler(void) +{ + int signals[] = {SIGSEGV, SIGBUS, SIGILL, SIGABRT}; + struct sigaction sa; + stack_t ss; + + if (read_stack_limit()) + return -1; + + ss.ss_sp = malloc(SIGSTKSZ); + if (!ss.ss_sp) { + ERROR_GLIBC("malloc"); + return -1; + } + ss.ss_size = SIGSTKSZ; + ss.ss_flags = 0; + + if (sigaltstack(&ss, NULL) == -1) { + ERROR_GLIBC("sigaltstack"); + return -1; + } + + sa.sa_sigaction = signal_handler; + sigemptyset(&sa.sa_mask); + + sa.sa_flags = SA_ONSTACK | SA_SIGINFO; + + for (int i = 0; i < ARRAY_SIZE(signals); i++) { + if (sigaction(signals[i], &sa, NULL) == -1) { + ERROR_GLIBC("sigaction"); + return -1; + } + } + + return 0; +} diff --git a/tools/objtool/special.c b/tools/objtool/special.c index c80fed8a840e..2a533afbc69a 100644 --- a/tools/objtool/special.c +++ b/tools/objtool/special.c @@ -15,7 +15,6 @@ #include <objtool/builtin.h> #include <objtool/special.h> #include <objtool/warn.h> -#include <objtool/endianness.h> struct special_entry { const char *sec; @@ -82,6 +81,8 @@ static int get_alt_entry(struct elf *elf, const struct special_entry *entry, entry->orig_len); alt->new_len = *(unsigned char *)(sec->data->d_buf + offset + entry->new_len); + alt->feature = *(unsigned int *)(sec->data->d_buf + offset + + entry->feature); } orig_reloc = find_reloc_by_dest(elf, sec, offset + entry->orig); @@ -133,7 +134,7 @@ int special_get_alts(struct elf *elf, struct list_head *alts) struct section *sec; unsigned int nr_entries; struct special_alt *alt; - int idx, ret; + int idx; INIT_LIST_HEAD(alts); @@ -142,12 +143,12 @@ int special_get_alts(struct elf *elf, struct list_head *alts) if (!sec) continue; - if (sec->sh.sh_size % entry->size != 0) { + if (sec_size(sec) % entry->size != 0) { ERROR("%s size not a multiple of %d", sec->name, entry->size); return -1; } - nr_entries = sec->sh.sh_size / entry->size; + nr_entries = sec_size(sec) / entry->size; for (idx = 0; idx < nr_entries; idx++) { alt = 
malloc(sizeof(*alt)); @@ -157,11 +158,8 @@ int special_get_alts(struct elf *elf, struct list_head *alts) } memset(alt, 0, sizeof(*alt)); - ret = get_alt_entry(elf, entry, sec, idx, alt); - if (ret > 0) - continue; - if (ret < 0) - return ret; + if (get_alt_entry(elf, entry, sec, idx, alt)) + return -1; list_add_tail(&alt->list, alts); } diff --git a/tools/objtool/sync-check.sh b/tools/objtool/sync-check.sh index 81d120d05442..e38167ca56a9 100755 --- a/tools/objtool/sync-check.sh +++ b/tools/objtool/sync-check.sh @@ -16,6 +16,8 @@ arch/x86/include/asm/orc_types.h arch/x86/include/asm/emulate_prefix.h arch/x86/lib/x86-opcode-map.txt arch/x86/tools/gen-insn-attr-x86.awk +include/linux/interval_tree_generic.h +include/linux/livepatch_external.h include/linux/static_call_types.h " diff --git a/tools/objtool/trace.c b/tools/objtool/trace.c new file mode 100644 index 000000000000..5dec44dab781 --- /dev/null +++ b/tools/objtool/trace.c @@ -0,0 +1,203 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2025, Oracle and/or its affiliates. + */ + +#include <objtool/trace.h> + +bool trace; +int trace_depth; + +/* + * Macros to trace CFI state attributes changes. + */ + +#define TRACE_CFI_ATTR(attr, prev, next, fmt, ...) \ +({ \ + if ((prev)->attr != (next)->attr) \ + TRACE("%s=" fmt " ", #attr, __VA_ARGS__); \ +}) + +#define TRACE_CFI_ATTR_BOOL(attr, prev, next) \ + TRACE_CFI_ATTR(attr, prev, next, \ + "%s", (next)->attr ? "true" : "false") + +#define TRACE_CFI_ATTR_NUM(attr, prev, next, fmt) \ + TRACE_CFI_ATTR(attr, prev, next, fmt, (next)->attr) + +#define CFI_REG_NAME_MAXLEN 16 + +/* + * Return the name of a register. Note that the same static buffer + * is returned if the name is dynamically generated. 
+ */ +static const char *cfi_reg_name(unsigned int reg) +{ + static char rname_buffer[CFI_REG_NAME_MAXLEN]; + const char *rname; + + switch (reg) { + case CFI_UNDEFINED: + return "<undefined>"; + case CFI_CFA: + return "cfa"; + case CFI_SP_INDIRECT: + return "(sp)"; + case CFI_BP_INDIRECT: + return "(bp)"; + } + + if (reg < CFI_NUM_REGS) { + rname = arch_reg_name[reg]; + if (rname) + return rname; + } + + if (snprintf(rname_buffer, CFI_REG_NAME_MAXLEN, "r%d", reg) == -1) + return "<error>"; + + return (const char *)rname_buffer; +} + +/* + * Functions and macros to trace CFI registers changes. + */ + +static void trace_cfi_reg(const char *prefix, int reg, const char *fmt, + int base_prev, int offset_prev, + int base_next, int offset_next) +{ + char *rname; + + if (base_prev == base_next && offset_prev == offset_next) + return; + + if (prefix) + TRACE("%s:", prefix); + + if (base_next == CFI_UNDEFINED) { + TRACE("%1$s=<undef> ", cfi_reg_name(reg)); + } else { + rname = strdup(cfi_reg_name(reg)); + TRACE(fmt, rname, cfi_reg_name(base_next), offset_next); + free(rname); + } +} + +static void trace_cfi_reg_val(const char *prefix, int reg, + int base_prev, int offset_prev, + int base_next, int offset_next) +{ + trace_cfi_reg(prefix, reg, "%1$s=%2$s%3$+d ", + base_prev, offset_prev, base_next, offset_next); +} + +static void trace_cfi_reg_ref(const char *prefix, int reg, + int base_prev, int offset_prev, + int base_next, int offset_next) +{ + trace_cfi_reg(prefix, reg, "%1$s=(%2$s%3$+d) ", + base_prev, offset_prev, base_next, offset_next); +} + +#define TRACE_CFI_REG_VAL(reg, prev, next) \ + trace_cfi_reg_val(NULL, reg, prev.base, prev.offset, \ + next.base, next.offset) + +#define TRACE_CFI_REG_REF(reg, prev, next) \ + trace_cfi_reg_ref(NULL, reg, prev.base, prev.offset, \ + next.base, next.offset) + +void trace_insn_state(struct instruction *insn, struct insn_state *sprev, + struct insn_state *snext) +{ + struct cfi_state *cprev, *cnext; + int i; + + if (!memcmp(sprev, 
snext, sizeof(struct insn_state))) + return; + + cprev = &sprev->cfi; + cnext = &snext->cfi; + + disas_print_insn(stderr, objtool_disas_ctx, insn, + trace_depth - 1, "state: "); + + /* print registers changes */ + TRACE_CFI_REG_VAL(CFI_CFA, cprev->cfa, cnext->cfa); + for (i = 0; i < CFI_NUM_REGS; i++) { + TRACE_CFI_REG_VAL(i, cprev->vals[i], cnext->vals[i]); + TRACE_CFI_REG_REF(i, cprev->regs[i], cnext->regs[i]); + } + + /* print attributes changes */ + TRACE_CFI_ATTR_NUM(stack_size, cprev, cnext, "%d"); + TRACE_CFI_ATTR_BOOL(drap, cprev, cnext); + if (cnext->drap) { + trace_cfi_reg_val("drap", cnext->drap_reg, + cprev->drap_reg, cprev->drap_offset, + cnext->drap_reg, cnext->drap_offset); + } + TRACE_CFI_ATTR_BOOL(bp_scratch, cprev, cnext); + TRACE_CFI_ATTR_NUM(instr, sprev, snext, "%d"); + TRACE_CFI_ATTR_NUM(uaccess_stack, sprev, snext, "%u"); + + TRACE("\n"); + + insn->trace = 1; +} + +void trace_alt_begin(struct instruction *orig_insn, struct alternative *alt, + char *alt_name) +{ + struct instruction *alt_insn; + char suffix[2]; + + alt_insn = alt->insn; + + if (alt->type == ALT_TYPE_EX_TABLE) { + /* + * When there is an exception table then the instruction + * at the original location is executed but it can cause + * an exception. In that case, the execution will be + * redirected to the alternative instruction. + * + * The instruction at the original location can have + * instruction alternatives, so we just print the location + * of the instruction that can cause the exception and + * not the instruction itself. 
+ */ + TRACE_ALT_INFO_NOADDR(orig_insn, "/ ", "%s for instruction at 0x%lx <%s+0x%lx>", + alt_name, + orig_insn->offset, orig_insn->sym->name, + orig_insn->offset - orig_insn->sym->offset); + } else { + TRACE_ALT_INFO_NOADDR(orig_insn, "/ ", "%s", alt_name); + } + + if (alt->type == ALT_TYPE_JUMP_TABLE) { + /* + * For a jump alternative, if the default instruction is + * a NOP then it is replaced with the jmp instruction, + * otherwise it is replaced with a NOP instruction. + */ + trace_depth++; + if (orig_insn->type == INSN_NOP) { + suffix[0] = (orig_insn->len == 5) ? 'q' : '\0'; + TRACE_ADDR(orig_insn, "jmp%-3s %lx <%s+0x%lx>", suffix, + alt_insn->offset, alt_insn->sym->name, + alt_insn->offset - alt_insn->sym->offset); + } else { + TRACE_ADDR(orig_insn, "nop%d", orig_insn->len); + trace_depth--; + } + } +} + +void trace_alt_end(struct instruction *orig_insn, struct alternative *alt, + char *alt_name) +{ + if (alt->type == ALT_TYPE_JUMP_TABLE && orig_insn->type == INSN_NOP) + trace_depth--; + TRACE_ALT_INFO_NOADDR(orig_insn, "\\ ", "%s", alt_name); +} diff --git a/tools/objtool/weak.c b/tools/objtool/weak.c index d83f607733b0..d6562f292259 100644 --- a/tools/objtool/weak.c +++ b/tools/objtool/weak.c @@ -8,6 +8,8 @@ #include <stdbool.h> #include <errno.h> #include <objtool/objtool.h> +#include <objtool/arch.h> +#include <objtool/builtin.h> #define UNSUPPORTED(name) \ ({ \ @@ -24,3 +26,8 @@ int __weak orc_create(struct objtool_file *file) { UNSUPPORTED("ORC"); } + +int __weak cmd_klp(int argc, const char **argv) +{ + UNSUPPORTED("klp"); +} diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 47c906b807ef..02f87c49801f 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -234,12 +234,12 @@ endif # The fixdep build - we force fixdep tool to be built as # the first target in the separate make session not to be # disturbed by any parallel make jobs. Once fixdep is done -# we issue the requested build with FIXDEP=1 variable. 
+# we issue the requested build with FIXDEP_BUILT=1 variable. # # The fixdep build is disabled for $(NON_CONFIG_TARGETS) # targets, because it's not necessary. -ifdef FIXDEP +ifdef FIXDEP_BUILT force_fixdep := 0 else force_fixdep := $(config) @@ -286,7 +286,7 @@ $(goals) all: sub-make sub-make: fixdep @./check-headers.sh - $(Q)$(MAKE) FIXDEP=1 -f Makefile.perf $(goals) + $(Q)$(MAKE) FIXDEP_BUILT=1 -f Makefile.perf $(goals) else # force_fixdep diff --git a/tools/perf/trace/beauty/include/linux/socket.h b/tools/perf/trace/beauty/include/linux/socket.h index 3b262487ec06..77d7c59f5d8b 100644 --- a/tools/perf/trace/beauty/include/linux/socket.h +++ b/tools/perf/trace/beauty/include/linux/socket.h @@ -34,10 +34,7 @@ typedef __kernel_sa_family_t sa_family_t; struct sockaddr { sa_family_t sa_family; /* address family, AF_xxx */ - union { - char sa_data_min[14]; /* Minimum 14 bytes of protocol address */ - DECLARE_FLEX_ARRAY(char, sa_data); - }; + char sa_data[14]; /* 14 bytes of protocol address */ }; struct linger { diff --git a/tools/power/acpi/tools/pfrut/pfrut.c b/tools/power/acpi/tools/pfrut/pfrut.c index 44a9ecbd91e8..4d9b0177c312 100644 --- a/tools/power/acpi/tools/pfrut/pfrut.c +++ b/tools/power/acpi/tools/pfrut/pfrut.c @@ -222,6 +222,7 @@ int main(int argc, char *argv[]) fd_update_log = open("/dev/acpi_pfr_telemetry0", O_RDWR); if (fd_update_log < 0) { printf("PFRT device not supported - Quit...\n"); + close(fd_update); return 1; } @@ -265,7 +266,8 @@ int main(int argc, char *argv[]) printf("chunk2_size:%d\n", data_info.chunk2_size); printf("rollover_cnt:%d\n", data_info.rollover_cnt); printf("reset_cnt:%d\n", data_info.reset_cnt); - + close(fd_update); + close(fd_update_log); return 0; } @@ -358,6 +360,7 @@ int main(int argc, char *argv[]) if (ret == -1) { perror("Failed to load capsule file"); + munmap(addr_map_capsule, st.st_size); close(fd_capsule); close(fd_update); close(fd_update_log); @@ -420,7 +423,7 @@ int main(int argc, char *argv[]) if (p_mmap == 
MAP_FAILED) { perror("mmap error."); close(fd_update_log); - + free(log_buf); return 1; } diff --git a/tools/power/cpupower/Makefile b/tools/power/cpupower/Makefile index c43db1c41205..a1df9196dc45 100644 --- a/tools/power/cpupower/Makefile +++ b/tools/power/cpupower/Makefile @@ -37,9 +37,7 @@ NLS ?= true # cpufreq-bench benchmarking tool CPUFREQ_BENCH ?= true -# Do not build libraries, but build the code in statically -# Libraries are still built, otherwise the Makefile code would -# be rather ugly. +# Build the code, including libraries, statically. export STATIC ?= false # Prefix to the directories we're installing to @@ -207,14 +205,25 @@ $(OUTPUT)lib/%.o: $(LIB_SRC) $(LIB_HEADERS) $(ECHO) " CC " $@ $(QUIET) $(CC) $(CFLAGS) -fPIC -o $@ -c lib/$*.c -$(OUTPUT)libcpupower.so.$(LIB_VER): $(LIB_OBJS) +ifeq ($(strip $(STATIC)),true) +LIBCPUPOWER := libcpupower.a +else +LIBCPUPOWER := libcpupower.so.$(LIB_VER) +endif + +$(OUTPUT)$(LIBCPUPOWER): $(LIB_OBJS) +ifeq ($(strip $(STATIC)),true) + $(ECHO) " AR " $@ + $(QUIET) $(AR) rcs $@ $(LIB_OBJS) +else $(ECHO) " LD " $@ $(QUIET) $(CC) -shared $(CFLAGS) $(LDFLAGS) -o $@ \ -Wl,-soname,libcpupower.so.$(LIB_MAJ) $(LIB_OBJS) @ln -sf $(@F) $(OUTPUT)libcpupower.so @ln -sf $(@F) $(OUTPUT)libcpupower.so.$(LIB_MAJ) +endif -libcpupower: $(OUTPUT)libcpupower.so.$(LIB_VER) +libcpupower: $(OUTPUT)$(LIBCPUPOWER) # Let all .o files depend on its .c file and all headers # Might be worth to put this into utils/Makefile at some point of time @@ -224,7 +233,7 @@ $(OUTPUT)%.o: %.c $(ECHO) " CC " $@ $(QUIET) $(CC) $(CFLAGS) -I./lib -I ./utils -o $@ -c $*.c -$(OUTPUT)cpupower: $(UTIL_OBJS) $(OUTPUT)libcpupower.so.$(LIB_VER) +$(OUTPUT)cpupower: $(UTIL_OBJS) $(OUTPUT)$(LIBCPUPOWER) $(ECHO) " CC " $@ ifeq ($(strip $(STATIC)),true) $(QUIET) $(CC) $(CFLAGS) $(LDFLAGS) $(UTIL_OBJS) -lrt -lpci -L$(OUTPUT) -o $@ @@ -269,7 +278,7 @@ update-po: $(OUTPUT)po/$(PACKAGE).pot done; endif -compile-bench: $(OUTPUT)libcpupower.so.$(LIB_VER) +compile-bench: 
$(OUTPUT)$(LIBCPUPOWER) @V=$(V) confdir=$(confdir) $(MAKE) -C bench O=$(OUTPUT) # we compile into subdirectories. if the target directory is not the @@ -287,6 +296,7 @@ clean: -find $(OUTPUT) \( -not -type d \) -and \( -name '*~' -o -name '*.[oas]' \) -type f -print \ | xargs rm -f -rm -f $(OUTPUT)cpupower + -rm -f $(OUTPUT)libcpupower.a -rm -f $(OUTPUT)libcpupower.so* -rm -rf $(OUTPUT)po/*.gmo -rm -rf $(OUTPUT)po/*.pot @@ -295,7 +305,11 @@ clean: install-lib: libcpupower $(INSTALL) -d $(DESTDIR)${libdir} +ifeq ($(strip $(STATIC)),true) + $(CP) $(OUTPUT)libcpupower.a $(DESTDIR)${libdir}/ +else $(CP) $(OUTPUT)libcpupower.so* $(DESTDIR)${libdir}/ +endif $(INSTALL) -d $(DESTDIR)${includedir} $(INSTALL_DATA) lib/cpufreq.h $(DESTDIR)${includedir}/cpufreq.h $(INSTALL_DATA) lib/cpuidle.h $(DESTDIR)${includedir}/cpuidle.h @@ -336,11 +350,7 @@ install-bench: compile-bench @#DESTDIR must be set from outside to survive @sbindir=$(sbindir) bindir=$(bindir) docdir=$(docdir) confdir=$(confdir) $(MAKE) -C bench O=$(OUTPUT) install -ifeq ($(strip $(STATIC)),true) -install: all install-tools install-man $(INSTALL_NLS) $(INSTALL_BENCH) -else install: all install-lib install-tools install-man $(INSTALL_NLS) $(INSTALL_BENCH) -endif uninstall: - rm -f $(DESTDIR)${libdir}/libcpupower.* diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 3340def58d01..1551fcdbfd8a 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -101,7 +101,7 @@ The column name "all" can be used to enable all disabled-by-default built-in cou .PP \fB--show column\fP show only the specified built-in columns. May be invoked multiple times, or with a comma-separated list of column names. .PP -\fB--show CATEGORY --hide CATEGORY\fP Show and hide also accept a single CATEGORY of columns: "all", "topology", "idle", "frequency", "power", "cpuidle", "hwidle", "swidle", "other". 
"idle" (enabled by default), includes "hwidle" and "pct_idle". "cpuidle" (default disabled) includes cpuidle software invocation counters. "swidle" includes "cpuidle" plus "pct_idle". "hwidle" includes only hardware based idle residency counters. Older versions of turbostat used the term "sysfs" for what is now "swidle". +\fB--show CATEGORY --hide CATEGORY\fP Show and hide also accept a comma-separated-list of CATEGORIES of columns: "all", "topology", "idle", "frequency", "power", "cpuidle", "hwidle", "swidle", "cache", "llc", "other". "idle" (enabled by default), includes "hwidle" and "pct_idle". "cpuidle" (default disabled) includes cpuidle software invocation counters. "swidle" includes "cpuidle" plus "pct_idle". "hwidle" includes only hardware based idle residency counters. Older versions of turbostat used the term "sysfs" for what is now "swidle". .PP \fB--Dump\fP displays the raw counter values. .PP @@ -159,6 +159,10 @@ The system configuration dump (if --quiet is not used) is followed by statistics .PP \fBSMI\fP The number of System Management Interrupts serviced CPU during the measurement interval. While this counter is actually per-CPU, SMI are triggered on all processors, so the number should be the same for all CPUs. .PP +\fBLLCkRPS\fP Last Level Cache Thousands of References Per Second. For CPUs with an L3 LLC, this is the number of references that CPU made to the L3 (and the number of misses that CPU made to it's L2). For CPUs with an L2 LLC, this is the number of references to the L2 (and the number of misses to the CPU's L1). The system summary row shows the sum for all CPUs. In both cases, the value displayed is the actual value divided by 1000 in the interest of usually fitting into 8 columns. +.PP +\fBLLC%hit\fP Last Level Cache Hit Rate %. Hit Rate Percent = 100.0 * (References - Misses)/References. The system summary row shows the weighted average for all CPUs (100.0 * (Sum_References - Sum_Misses)/Sum_References). 
+.PP \fBC1, C2, C3...\fP The number times Linux requested the C1, C2, C3 idle state during the measurement interval. The system summary line shows the sum for all CPUs. These are C-state names as exported in /sys/devices/system/cpu/cpu*/cpuidle/state*/name. While their names are generic, their attributes are processor specific. They the system description section of output shows what MWAIT sub-states they are mapped to on each system. These counters are in the "cpuidle" group, which is disabled, by default. .PP \fBC1+, C2+, C3+...\fP The idle governor idle state misprediction statistics. Inidcates the number times Linux requested the C1, C2, C3 idle state during the measurement interval, but should have requested a deeper idle state (if it exists and enabled). These statistics come from the /sys/devices/system/cpu/cpu*/cpuidle/state*/below file. These counters are in the "cpuidle" group, which is disabled, by default. @@ -410,25 +414,24 @@ CPU pCPU%c1 CPU%c1 .fi .SH ADD PERF COUNTER EXAMPLE #2 (using virtual cpu device) -Here we run on hybrid, Raptor Lake platform. -We limit turbostat to show output for just cpu0 (pcore) and cpu12 (ecore). +Here we run on hybrid, Meteor Lake platform. +We limit turbostat to show output for just cpu0 (pcore) and cpu4 (ecore). We add a counter showing number of L3 cache misses, using virtual "cpu" device, labeling it with the column header, "VCMISS". We add a counter showing number of L3 cache misses, using virtual "cpu_core" device, -labeling it with the column header, "PCMISS". This will fail on ecore cpu12. +labeling it with the column header, "PCMISS". This will fail on ecore cpu4. We add a counter showing number of L3 cache misses, using virtual "cpu_atom" device, labeling it with the column header, "ECMISS". This will fail on pcore cpu0. We display it only once, after the conclusion of 0.1 second sleep. 
.nf -sudo ./turbostat --quiet --cpu 0,12 --show CPU --add perf/cpu/cache-misses,cpu,delta,raw,VCMISS --add perf/cpu_core/cache-misses,cpu,delta,raw,PCMISS --add perf/cpu_atom/cache-misses,cpu,delta,raw,ECMISS sleep .1 +sudo ./turbostat --quiet --cpu 0,4 --show CPU --add perf/cpu/cache-misses,cpu,delta,VCMISS --add perf/cpu_core/cache-misses,cpu,delta,PCMISS --add perf/cpu_atom/cache-misses,cpu,delta,ECMISS sleep 5 turbostat: added_perf_counters_init_: perf/cpu_atom/cache-misses: failed to open counter on cpu0 -turbostat: added_perf_counters_init_: perf/cpu_core/cache-misses: failed to open counter on cpu12 -0.104630 sec -CPU ECMISS PCMISS VCMISS -- 0x0000000000000000 0x0000000000000000 0x0000000000000000 -0 0x0000000000000000 0x0000000000007951 0x0000000000007796 -12 0x000000000001137a 0x0000000000000000 0x0000000000011392 - +turbostat: added_perf_counters_init_: perf/cpu_core/cache-misses: failed to open counter on cpu4 +5.001207 sec +CPU ECMISS PCMISS VCMISS +- 41586506 46291219 87877749 +4 83173012 0 83173040 +0 0 92582439 92582458 .fi .SH ADD PMT COUNTER EXAMPLE diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index f2512d78bcbd..5ad45c2ac5bd 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -142,6 +142,7 @@ struct msr_counter { #define FLAGS_SHOW (1 << 1) #define SYSFS_PERCPU (1 << 1) }; +static int use_android_msr_path; struct msr_counter bic[] = { { 0x0, "usec", NULL, 0, 0, 0, NULL, 0 }, @@ -209,6 +210,8 @@ struct msr_counter bic[] = { { 0x0, "NMI", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "CPU%c1e", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "pct_idle", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "LLCkRPS", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "LLC%hit", NULL, 0, 0, 0, NULL, 0 }, }; /* n.b. 
bic_names must match the order in bic[], above */ @@ -278,6 +281,8 @@ enum bic_names { BIC_NMI, BIC_CPU_c1e, BIC_pct_idle, + BIC_LLC_RPS, + BIC_LLC_HIT, MAX_BIC }; @@ -305,6 +310,7 @@ static cpu_set_t bic_group_frequency; static cpu_set_t bic_group_hw_idle; static cpu_set_t bic_group_sw_idle; static cpu_set_t bic_group_idle; +static cpu_set_t bic_group_cache; static cpu_set_t bic_group_other; static cpu_set_t bic_group_disabled_by_default; static cpu_set_t bic_enabled; @@ -413,9 +419,14 @@ static void bic_groups_init(void) SET_BIC(BIC_pct_idle, &bic_group_sw_idle); BIC_INIT(&bic_group_idle); + CPU_OR(&bic_group_idle, &bic_group_idle, &bic_group_hw_idle); SET_BIC(BIC_pct_idle, &bic_group_idle); + BIC_INIT(&bic_group_cache); + SET_BIC(BIC_LLC_RPS, &bic_group_cache); + SET_BIC(BIC_LLC_HIT, &bic_group_cache); + BIC_INIT(&bic_group_other); SET_BIC(BIC_IRQ, &bic_group_other); SET_BIC(BIC_NMI, &bic_group_other); @@ -466,12 +477,11 @@ static void bic_groups_init(void) #define PCL_10 14 /* PC10 */ #define PCLUNL 15 /* Unlimited */ -struct amperf_group_fd; - char *proc_stat = "/proc/stat"; FILE *outf; int *fd_percpu; int *fd_instr_count_percpu; +int *fd_llc_percpu; struct timeval interval_tv = { 5, 0 }; struct timespec interval_ts = { 5, 0 }; @@ -482,11 +492,12 @@ unsigned int quiet; unsigned int shown; unsigned int sums_need_wide_columns; unsigned int rapl_joules; +unsigned int valid_rapl_msrs; unsigned int summary_only; unsigned int list_header_only; unsigned int dump_only; unsigned int force_load; -unsigned int has_aperf; +unsigned int cpuid_has_aperf_mperf; unsigned int has_aperf_access; unsigned int has_epb; unsigned int has_turbo; @@ -552,8 +563,7 @@ static struct gfx_sysfs_info gfx_info[GFX_MAX]; int get_msr(int cpu, off_t offset, unsigned long long *msr); int add_counter(unsigned int msr_num, char *path, char *name, - unsigned int width, enum counter_scope scope, - enum counter_type type, enum counter_format format, int flags, int package_num); + unsigned int width, 
enum counter_scope scope, enum counter_type type, enum counter_format format, int flags, int package_num); /* Model specific support Start */ @@ -578,7 +588,7 @@ struct platform_features { bool has_cst_prewake_bit; /* Cstate prewake bit in MSR_IA32_POWER_CTL */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ - int rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ + int plat_rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ bool has_per_core_rapl; /* Indicates cores energy collection is per-core, not per-package. AMD specific for now */ bool has_rapl_divisor; /* Divisor for Energy unit raw value from MSR_RAPL_POWER_UNIT */ bool has_fixed_rapl_unit; /* Fixed Energy Unit used for DRAM RAPL Domain */ @@ -733,7 +743,7 @@ static const struct platform_features snb_features = { .cst_limit = CST_LIMIT_SNB, .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, - .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, + .plat_rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; static const struct platform_features snx_features = { @@ -745,7 +755,7 @@ static const struct platform_features snx_features = { .cst_limit = CST_LIMIT_SNB, .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, - .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, + .plat_rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, }; static const struct platform_features ivb_features = { @@ -758,7 +768,7 @@ static const struct platform_features ivb_features = { .cst_limit = CST_LIMIT_SNB, .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, - .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, + .plat_rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; static const struct platform_features ivx_features = { @@ -770,7 +780,7 @@ static const struct platform_features ivx_features = { .cst_limit = CST_LIMIT_SNB, .has_irtl_msrs = 1, .trl_msrs = 
TRL_BASE | TRL_LIMIT1, - .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, + .plat_rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, }; static const struct platform_features hsw_features = { @@ -784,7 +794,7 @@ static const struct platform_features hsw_features = { .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, - .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, + .plat_rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; static const struct platform_features hsx_features = { @@ -798,7 +808,7 @@ static const struct platform_features hsx_features = { .has_irtl_msrs = 1, .trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2, .plr_msrs = PLR_CORE | PLR_RING, - .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, + .plat_rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, .has_fixed_rapl_unit = 1, }; @@ -813,7 +823,7 @@ static const struct platform_features hswl_features = { .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, - .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, + .plat_rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; static const struct platform_features hswg_features = { @@ -827,7 +837,7 @@ static const struct platform_features hswg_features = { .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, - .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, + .plat_rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; static const struct platform_features bdw_features = { @@ -840,7 +850,7 @@ static const struct platform_features bdw_features = { .cst_limit = CST_LIMIT_HSW, .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, - .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, + .plat_rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; static const struct platform_features bdwg_features = { @@ -853,7 +863,7 
@@ static const struct platform_features bdwg_features = { .cst_limit = CST_LIMIT_HSW, .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, - .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, + .plat_rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; static const struct platform_features bdx_features = { @@ -867,7 +877,7 @@ static const struct platform_features bdx_features = { .has_irtl_msrs = 1, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE, - .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, + .plat_rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, .has_fixed_rapl_unit = 1, }; @@ -884,7 +894,7 @@ static const struct platform_features skl_features = { .has_ext_cst_msrs = 1, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, - .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX | RAPL_PSYS, + .plat_rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX | RAPL_PSYS, .enable_tsc_tweak = 1, }; @@ -901,7 +911,7 @@ static const struct platform_features cnl_features = { .has_ext_cst_msrs = 1, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, - .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX | RAPL_PSYS, + .plat_rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX | RAPL_PSYS, .enable_tsc_tweak = 1, }; @@ -919,7 +929,7 @@ static const struct platform_features adl_features = { .has_ext_cst_msrs = cnl_features.has_ext_cst_msrs, .trl_msrs = cnl_features.trl_msrs, .tcc_offset_bits = cnl_features.tcc_offset_bits, - .rapl_msrs = cnl_features.rapl_msrs, + .plat_rapl_msrs = cnl_features.plat_rapl_msrs, .enable_tsc_tweak = cnl_features.enable_tsc_tweak, }; @@ -937,7 +947,7 @@ static const struct platform_features lnl_features = { .has_ext_cst_msrs = adl_features.has_ext_cst_msrs, .trl_msrs = adl_features.trl_msrs, .tcc_offset_bits = adl_features.tcc_offset_bits, - .rapl_msrs = adl_features.rapl_msrs, + .plat_rapl_msrs = 
adl_features.plat_rapl_msrs, .enable_tsc_tweak = adl_features.enable_tsc_tweak, }; @@ -952,7 +962,7 @@ static const struct platform_features skx_features = { .has_irtl_msrs = 1, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, - .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, + .plat_rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, .has_fixed_rapl_unit = 1, }; @@ -968,7 +978,7 @@ static const struct platform_features icx_features = { .has_irtl_msrs = 1, .has_cst_prewake_bit = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, - .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_PSYS, + .plat_rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_PSYS, .has_fixed_rapl_unit = 1, }; @@ -985,7 +995,7 @@ static const struct platform_features spr_features = { .has_cst_prewake_bit = 1, .has_fixed_rapl_psys_unit = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, - .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_PSYS, + .plat_rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_PSYS, }; static const struct platform_features dmr_features = { @@ -1000,7 +1010,7 @@ static const struct platform_features dmr_features = { .has_fixed_rapl_psys_unit = spr_features.has_fixed_rapl_psys_unit, .trl_msrs = spr_features.trl_msrs, .has_msr_module_c6_res_ms = 1, /* DMR has Dual-Core-Module and MC6 MSR */ - .rapl_msrs = 0, /* DMR does not have RAPL MSRs */ + .plat_rapl_msrs = 0, /* DMR does not have RAPL MSRs */ .plr_msrs = 0, /* DMR does not have PLR MSRs */ .has_irtl_msrs = 0, /* DMR does not have IRTL MSRs */ .has_config_tdp = 0, /* DMR does not have CTDP MSRs */ @@ -1019,7 +1029,7 @@ static const struct platform_features srf_features = { .has_irtl_msrs = 1, .has_cst_prewake_bit = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, - .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_PSYS, + .plat_rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_PSYS, }; static const struct platform_features grr_features = { @@ -1035,7 +1045,7 @@ static const struct platform_features grr_features = { .has_irtl_msrs = 1, .has_cst_prewake_bit 
= 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, - .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_PSYS, + .plat_rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_PSYS, }; static const struct platform_features slv_features = { @@ -1048,7 +1058,7 @@ static const struct platform_features slv_features = { .has_msr_c6_demotion_policy_config = 1, .has_msr_atom_pkg_c6_residency = 1, .trl_msrs = TRL_ATOM, - .rapl_msrs = RAPL_PKG | RAPL_CORE, + .plat_rapl_msrs = RAPL_PKG | RAPL_CORE, .has_rapl_divisor = 1, .rapl_quirk_tdp = 30, }; @@ -1061,7 +1071,7 @@ static const struct platform_features slvd_features = { .cst_limit = CST_LIMIT_SLV, .has_msr_atom_pkg_c6_residency = 1, .trl_msrs = TRL_BASE, - .rapl_msrs = RAPL_PKG | RAPL_CORE, + .plat_rapl_msrs = RAPL_PKG | RAPL_CORE, .rapl_quirk_tdp = 30, }; @@ -1082,7 +1092,7 @@ static const struct platform_features gmt_features = { .cst_limit = CST_LIMIT_GMT, .has_irtl_msrs = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, - .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, + .plat_rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, }; static const struct platform_features gmtd_features = { @@ -1095,7 +1105,7 @@ static const struct platform_features gmtd_features = { .has_irtl_msrs = 1, .has_msr_core_c1_res = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, - .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS, + .plat_rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS, }; static const struct platform_features gmtp_features = { @@ -1107,7 +1117,7 @@ static const struct platform_features gmtp_features = { .cst_limit = CST_LIMIT_GMT, .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, - .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, + .plat_rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, }; static const struct platform_features tmt_features = { @@ -1118,7 +1128,7 @@ static const struct platform_features tmt_features = { .cst_limit = CST_LIMIT_GMT, .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, - .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | 
RAPL_DRAM_PERF_STATUS | RAPL_GFX, + .plat_rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, .enable_tsc_tweak = 1, }; @@ -1130,7 +1140,7 @@ static const struct platform_features tmtd_features = { .cst_limit = CST_LIMIT_GMT, .has_irtl_msrs = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, - .rapl_msrs = RAPL_PKG_ALL, + .plat_rapl_msrs = RAPL_PKG_ALL, }; static const struct platform_features knl_features = { @@ -1142,7 +1152,7 @@ static const struct platform_features knl_features = { .cst_limit = CST_LIMIT_KNL, .has_msr_knl_core_c6_residency = 1, .trl_msrs = TRL_KNL, - .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, + .plat_rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, .has_fixed_rapl_unit = 1, .need_perf_multiplier = 1, }; @@ -1151,7 +1161,7 @@ static const struct platform_features default_features = { }; static const struct platform_features amd_features_with_rapl = { - .rapl_msrs = RAPL_AMD_F17H, + .plat_rapl_msrs = RAPL_AMD_F17H, .has_per_core_rapl = 1, .rapl_quirk_tdp = 280, /* This is the max stock TDP of HEDT/Server Fam17h+ chips */ }; @@ -1210,6 +1220,9 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_ARROWLAKE, &adl_features }, { INTEL_LUNARLAKE_M, &lnl_features }, { INTEL_PANTHERLAKE_L, &lnl_features }, + { INTEL_NOVALAKE, &lnl_features }, + { INTEL_NOVALAKE_L, &lnl_features }, + { INTEL_WILDCATLAKE_L, &lnl_features }, { INTEL_ATOM_SILVERMONT, &slv_features }, { INTEL_ATOM_SILVERMONT_D, &slvd_features }, { INTEL_ATOM_AIRMONT, &amt_features }, @@ -1294,8 +1307,7 @@ char *progname; #define CPU_SUBSET_MAXCPUS 8192 /* need to use before probe... 
*/ cpu_set_t *cpu_present_set, *cpu_possible_set, *cpu_effective_set, *cpu_allowed_set, *cpu_affinity_set, *cpu_subset; -size_t cpu_present_setsize, cpu_possible_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affinity_setsize, - cpu_subset_size; +size_t cpu_present_setsize, cpu_possible_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affinity_setsize, cpu_subset_size; #define MAX_ADDED_THREAD_COUNTERS 24 #define MAX_ADDED_CORE_COUNTERS 8 #define MAX_ADDED_PACKAGE_COUNTERS 16 @@ -1991,6 +2003,10 @@ void pmt_counter_resize(struct pmt_counter *pcounter, unsigned int new_size) pmt_counter_resize_(pcounter, new_size); } +struct llc_stats { + unsigned long long references; + unsigned long long misses; +}; struct thread_data { struct timeval tv_begin; struct timeval tv_end; @@ -2003,6 +2019,7 @@ struct thread_data { unsigned long long irq_count; unsigned long long nmi_count; unsigned int smi_count; + struct llc_stats llc; unsigned int cpu_id; unsigned int apic_id; unsigned int x2apic_id; @@ -2118,7 +2135,7 @@ off_t idx_to_offset(int idx) switch (idx) { case IDX_PKG_ENERGY: - if (platform->rapl_msrs & RAPL_AMD_F17H) + if (valid_rapl_msrs & RAPL_AMD_F17H) offset = MSR_PKG_ENERGY_STAT; else offset = MSR_PKG_ENERGY_STATUS; @@ -2184,19 +2201,19 @@ int idx_valid(int idx) { switch (idx) { case IDX_PKG_ENERGY: - return platform->rapl_msrs & (RAPL_PKG | RAPL_AMD_F17H); + return valid_rapl_msrs & (RAPL_PKG | RAPL_AMD_F17H); case IDX_DRAM_ENERGY: - return platform->rapl_msrs & RAPL_DRAM; + return valid_rapl_msrs & RAPL_DRAM; case IDX_PP0_ENERGY: - return platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS; + return valid_rapl_msrs & RAPL_CORE_ENERGY_STATUS; case IDX_PP1_ENERGY: - return platform->rapl_msrs & RAPL_GFX; + return valid_rapl_msrs & RAPL_GFX; case IDX_PKG_PERF: - return platform->rapl_msrs & RAPL_PKG_PERF_STATUS; + return valid_rapl_msrs & RAPL_PKG_PERF_STATUS; case IDX_DRAM_PERF: - return platform->rapl_msrs & RAPL_DRAM_PERF_STATUS; + return valid_rapl_msrs 
& RAPL_DRAM_PERF_STATUS; case IDX_PSYS_ENERGY: - return platform->rapl_msrs & RAPL_PSYS; + return valid_rapl_msrs & RAPL_PSYS; default: return 0; } @@ -2362,23 +2379,19 @@ int for_all_cpus(int (func) (struct thread_data *, struct core_data *, struct pk return retval; } -int is_cpu_first_thread_in_core(PER_THREAD_PARAMS) +int is_cpu_first_thread_in_core(struct thread_data *t, struct core_data *c) { - UNUSED(p); - return ((int)t->cpu_id == c->base_cpu || c->base_cpu < 0); } -int is_cpu_first_core_in_package(PER_THREAD_PARAMS) +int is_cpu_first_core_in_package(struct thread_data *t, struct pkg_data *p) { - UNUSED(c); - return ((int)t->cpu_id == p->base_cpu || p->base_cpu < 0); } -int is_cpu_first_thread_in_package(PER_THREAD_PARAMS) +int is_cpu_first_thread_in_package(struct thread_data *t, struct core_data *c, struct pkg_data *p) { - return is_cpu_first_thread_in_core(t, c, p) && is_cpu_first_core_in_package(t, c, p); + return is_cpu_first_thread_in_core(t, c) && is_cpu_first_core_in_package(t, p); } int cpu_migrate(int cpu) @@ -2400,20 +2413,11 @@ int get_msr_fd(int cpu) if (fd) return fd; -#if defined(ANDROID) - sprintf(pathname, "/dev/msr%d", cpu); -#else - sprintf(pathname, "/dev/cpu/%d/msr", cpu); -#endif + sprintf(pathname, use_android_msr_path ? "/dev/msr%d" : "/dev/cpu/%d/msr", cpu); fd = open(pathname, O_RDONLY); if (fd < 0) -#if defined(ANDROID) - err(-1, "%s open failed, try chown or chmod +r /dev/msr*, " - "or run with --no-msr, or run as root", pathname); -#else - err(-1, "%s open failed, try chown or chmod +r /dev/cpu/*/msr, " - "or run with --no-msr, or run as root", pathname); -#endif + err(-1, "%s open failed, try chown or chmod +r %s, " + "or run with --no-msr, or run as root", pathname, use_android_msr_path ? 
"/dev/msr*" : "/dev/cpu/*/msr"); fd_percpu[cpu] = fd; return fd; @@ -2432,6 +2436,13 @@ static void bic_disable_msr_access(void) free_sys_msr_counters(); } +static void bic_disable_perf_access(void) +{ + CLR_BIC(BIC_IPC, &bic_enabled); + CLR_BIC(BIC_LLC_RPS, &bic_enabled); + CLR_BIC(BIC_LLC_HIT, &bic_enabled); +} + static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) { assert(!no_perf); @@ -2512,7 +2523,7 @@ int add_rapl_msr_counter(int cpu, const struct rapl_counter_arch_info *cai) { int ret; - if (!(platform->rapl_msrs & cai->feature_mask)) + if (!(valid_rapl_msrs & cai->feature_mask)) return -1; ret = add_msr_counter(cpu, cai->msr); @@ -2656,6 +2667,12 @@ void bic_lookup(cpu_set_t *ret_set, char *name_list, enum show_hide_mode mode) } else if (!strcmp(name_list, "idle")) { CPU_OR(ret_set, ret_set, &bic_group_idle); break; + } else if (!strcmp(name_list, "cache")) { + CPU_OR(ret_set, ret_set, &bic_group_cache); + break; + } else if (!strcmp(name_list, "llc")) { + CPU_OR(ret_set, ret_set, &bic_group_cache); + break; } else if (!strcmp(name_list, "swidle")) { CPU_OR(ret_set, ret_set, &bic_group_sw_idle); break; @@ -2677,8 +2694,7 @@ void bic_lookup(cpu_set_t *ret_set, char *name_list, enum show_hide_mode mode) if (mode == SHOW_LIST) { deferred_add_names[deferred_add_index++] = name_list; if (deferred_add_index >= MAX_DEFERRED) { - fprintf(stderr, "More than max %d un-recognized --add options '%s'\n", - MAX_DEFERRED, name_list); + fprintf(stderr, "More than max %d un-recognized --add options '%s'\n", MAX_DEFERRED, name_list); help(); exit(1); } @@ -2687,8 +2703,7 @@ void bic_lookup(cpu_set_t *ret_set, char *name_list, enum show_hide_mode mode) if (debug) fprintf(stderr, "deferred \"%s\"\n", name_list); if (deferred_skip_index >= MAX_DEFERRED) { - fprintf(stderr, "More than max %d un-recognized --skip options '%s'\n", - MAX_DEFERRED, name_list); + fprintf(stderr, "More than max %d un-recognized --skip 
options '%s'\n", MAX_DEFERRED, name_list); help(); exit(1); } @@ -2702,6 +2717,47 @@ void bic_lookup(cpu_set_t *ret_set, char *name_list, enum show_hide_mode mode) } } +/* + * print_name() + * Print column header name for raw 64-bit counter in 16 columns (at least 8-char plus a tab) + * Otherwise, allow the name + tab to fit within 8-column tab-stop. + * In both cases, left justified, just like other turbostat columns, + * to allow the column values to consume the tab. + * + * Yes, 32-bit counters can overflow 8-columns, and + * 64-bit counters can overflow 16-columns, but that is uncommon. + */ +static inline int print_name(int width, int *printed, char *delim, char *name, enum counter_type type, enum counter_format format) +{ + UNUSED(type); + + if (format == FORMAT_RAW && width >= 64) + return (sprintf(outp, "%s%-8s", (*printed++ ? delim : ""), name)); + else + return (sprintf(outp, "%s%s", (*printed++ ? delim : ""), name)); +} + +static inline int print_hex_value(int width, int *printed, char *delim, unsigned long long value) +{ + if (width <= 32) + return (sprintf(outp, "%s%08x", (*printed++ ? delim : ""), (unsigned int)value)); + else + return (sprintf(outp, "%s%016llx", (*printed++ ? delim : ""), value)); +} + +static inline int print_decimal_value(int width, int *printed, char *delim, unsigned long long value) +{ + if (width <= 32) + return (sprintf(outp, "%s%d", (*printed++ ? delim : ""), (unsigned int)value)); + else + return (sprintf(outp, "%s%-8lld", (*printed++ ? delim : ""), value)); +} + +static inline int print_float_value(int *printed, char *delim, double value) +{ + return (sprintf(outp, "%s%0.2f", (*printed++ ? delim : ""), value)); +} + void print_header(char *delim) { struct msr_counter *mp; @@ -2757,50 +2813,28 @@ void print_header(char *delim) if (DO_BIC(BIC_SMI)) outp += sprintf(outp, "%sSMI", (printed++ ? delim : "")); - for (mp = sys.tp; mp; mp = mp->next) { + if (DO_BIC(BIC_LLC_RPS)) + outp += sprintf(outp, "%sLLCkRPS", (printed++ ? 
delim : "")); - if (mp->format == FORMAT_RAW || mp->format == FORMAT_AVERAGE) { - if (mp->width == 64) - outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), mp->name); - else - outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), mp->name); - } else { - if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns) - outp += sprintf(outp, "%s%8s", (printed++ ? delim : ""), mp->name); - else - outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), mp->name); - } - } + if (DO_BIC(BIC_LLC_HIT)) + outp += sprintf(outp, "%sLLC%%hit", (printed++ ? delim : "")); - for (pp = sys.perf_tp; pp; pp = pp->next) { + for (mp = sys.tp; mp; mp = mp->next) + outp += print_name(mp->width, &printed, delim, mp->name, mp->type, mp->format); - if (pp->format == FORMAT_RAW) { - if (pp->width == 64) - outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), pp->name); - else - outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), pp->name); - } else { - if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns) - outp += sprintf(outp, "%s%8s", (printed++ ? delim : ""), pp->name); - else - outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), pp->name); - } - } + for (pp = sys.perf_tp; pp; pp = pp->next) + outp += print_name(pp->width, &printed, delim, pp->name, pp->type, pp->format); ppmt = sys.pmt_tp; while (ppmt) { switch (ppmt->type) { case PMT_TYPE_RAW: - if (pmt_counter_get_width(ppmt) <= 32) - outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), ppmt->name); - else - outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), ppmt->name); - + outp += print_name(pmt_counter_get_width(ppmt), &printed, delim, ppmt->name, COUNTER_ITEMS, ppmt->format); break; case PMT_TYPE_XTAL_TIME: case PMT_TYPE_TCORE_CLOCK: - outp += sprintf(outp, "%s%s", (printed++ ? 
delim : ""), ppmt->name); + outp += print_name(32, &printed, delim, ppmt->name, COUNTER_ITEMS, ppmt->format); break; } @@ -2825,63 +2859,36 @@ void print_header(char *delim) if (DO_BIC(BIC_CORE_THROT_CNT)) outp += sprintf(outp, "%sCoreThr", (printed++ ? delim : "")); - if (platform->rapl_msrs && !rapl_joules) { + if (valid_rapl_msrs && !rapl_joules) { if (DO_BIC(BIC_CorWatt) && platform->has_per_core_rapl) outp += sprintf(outp, "%sCorWatt", (printed++ ? delim : "")); - } else if (platform->rapl_msrs && rapl_joules) { + } else if (valid_rapl_msrs && rapl_joules) { if (DO_BIC(BIC_Cor_J) && platform->has_per_core_rapl) outp += sprintf(outp, "%sCor_J", (printed++ ? delim : "")); } - for (mp = sys.cp; mp; mp = mp->next) { - if (mp->format == FORMAT_RAW || mp->format == FORMAT_AVERAGE) { - if (mp->width == 64) - outp += sprintf(outp, "%s%18.18s", delim, mp->name); - else - outp += sprintf(outp, "%s%10.10s", delim, mp->name); - } else { - if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns) - outp += sprintf(outp, "%s%8s", delim, mp->name); - else - outp += sprintf(outp, "%s%s", delim, mp->name); - } - } + for (mp = sys.cp; mp; mp = mp->next) + outp += print_name(mp->width, &printed, delim, mp->name, mp->type, mp->format); - for (pp = sys.perf_cp; pp; pp = pp->next) { - - if (pp->format == FORMAT_RAW) { - if (pp->width == 64) - outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), pp->name); - else - outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), pp->name); - } else { - if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns) - outp += sprintf(outp, "%s%8s", (printed++ ? delim : ""), pp->name); - else - outp += sprintf(outp, "%s%s", (printed++ ? 
delim : ""), pp->name); - } - } + for (pp = sys.perf_cp; pp; pp = pp->next) + outp += print_name(pp->width, &printed, delim, pp->name, pp->type, pp->format); ppmt = sys.pmt_cp; while (ppmt) { switch (ppmt->type) { case PMT_TYPE_RAW: - if (pmt_counter_get_width(ppmt) <= 32) - outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), ppmt->name); - else - outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), ppmt->name); + outp += print_name(pmt_counter_get_width(ppmt), &printed, delim, ppmt->name, COUNTER_ITEMS, ppmt->format); break; case PMT_TYPE_XTAL_TIME: case PMT_TYPE_TCORE_CLOCK: - outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), ppmt->name); + outp += print_name(32, &printed, delim, ppmt->name, COUNTER_ITEMS, ppmt->format); break; } ppmt = ppmt->next; } - if (DO_BIC(BIC_PkgTmp)) outp += sprintf(outp, "%sPkgTmp", (printed++ ? delim : "")); @@ -2963,51 +2970,22 @@ void print_header(char *delim) if (DO_BIC(BIC_UNCORE_MHZ)) outp += sprintf(outp, "%sUncMHz", (printed++ ? delim : "")); - for (mp = sys.pp; mp; mp = mp->next) { - if (mp->format == FORMAT_RAW || mp->format == FORMAT_AVERAGE) { - if (mp->width == 64) - outp += sprintf(outp, "%s%18.18s", delim, mp->name); - else if (mp->width == 32) - outp += sprintf(outp, "%s%10.10s", delim, mp->name); - else - outp += sprintf(outp, "%s%7.7s", delim, mp->name); - } else { - if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns) - outp += sprintf(outp, "%s%8s", delim, mp->name); - else - outp += sprintf(outp, "%s%7.7s", delim, mp->name); - } - } - - for (pp = sys.perf_pp; pp; pp = pp->next) { + for (mp = sys.pp; mp; mp = mp->next) + outp += print_name(mp->width, &printed, delim, mp->name, mp->type, mp->format); - if (pp->format == FORMAT_RAW) { - if (pp->width == 64) - outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), pp->name); - else - outp += sprintf(outp, "%s%10.10s", (printed++ ? 
delim : ""), pp->name); - } else { - if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns) - outp += sprintf(outp, "%s%8s", (printed++ ? delim : ""), pp->name); - else - outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), pp->name); - } - } + for (pp = sys.perf_pp; pp; pp = pp->next) + outp += print_name(pp->width, &printed, delim, pp->name, pp->type, pp->format); ppmt = sys.pmt_pp; while (ppmt) { switch (ppmt->type) { case PMT_TYPE_RAW: - if (pmt_counter_get_width(ppmt) <= 32) - outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), ppmt->name); - else - outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), ppmt->name); - + outp += print_name(pmt_counter_get_width(ppmt), &printed, delim, ppmt->name, COUNTER_ITEMS, ppmt->format); break; case PMT_TYPE_XTAL_TIME: case PMT_TYPE_TCORE_CLOCK: - outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), ppmt->name); + outp += print_name(32, &printed, delim, ppmt->name, COUNTER_ITEMS, ppmt->format); break; } @@ -3022,6 +3000,25 @@ void print_header(char *delim) outp += sprintf(outp, "\n"); } +/* + * pct() + * + * If absolute value is < 1.1, return percentage + * otherwise, return nan + * + * return value is appropriate for printing percentages with %f + * while flagging some obvious erroneous values. 
+ */ +double pct(double d) +{ + + double abs = fabs(d); + + if (abs < 1.10) + return (100.0 * d); + return nan(""); +} + int dump_counters(PER_THREAD_PARAMS) { int i; @@ -3047,14 +3044,16 @@ int dump_counters(PER_THREAD_PARAMS) if (DO_BIC(BIC_SMI)) outp += sprintf(outp, "SMI: %d\n", t->smi_count); + outp += sprintf(outp, "LLC refs: %lld", t->llc.references); + outp += sprintf(outp, "LLC miss: %lld", t->llc.misses); + outp += sprintf(outp, "LLC Hit%%: %.2f", pct((t->llc.references - t->llc.misses) / t->llc.references)); + for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { - outp += - sprintf(outp, "tADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, - t->counter[i], mp->sp->path); + outp += sprintf(outp, "tADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, t->counter[i], mp->sp->path); } } - if (c && is_cpu_first_thread_in_core(t, c, p)) { + if (c && is_cpu_first_thread_in_core(t, c)) { outp += sprintf(outp, "core: %d\n", c->core_id); outp += sprintf(outp, "c3: %016llX\n", c->c3); outp += sprintf(outp, "c6: %016llX\n", c->c6); @@ -3069,14 +3068,12 @@ int dump_counters(PER_THREAD_PARAMS) outp += sprintf(outp, "Joules: %0llX (scale: %lf)\n", energy_value, energy_scale); for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { - outp += - sprintf(outp, "cADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, - c->counter[i], mp->sp->path); + outp += sprintf(outp, "cADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, c->counter[i], mp->sp->path); } outp += sprintf(outp, "mc6_us: %016llX\n", c->mc6_us); } - if (p && is_cpu_first_core_in_package(t, c, p)) { + if (p && is_cpu_first_core_in_package(t, p)) { outp += sprintf(outp, "package: %d\n", p->package_id); outp += sprintf(outp, "Weighted cores: %016llX\n", p->pkg_wtd_core_c0); @@ -3106,9 +3103,7 @@ int dump_counters(PER_THREAD_PARAMS) outp += sprintf(outp, "PTM: %dC\n", p->pkg_temp_c); for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { - outp += - sprintf(outp, "pADDED [%d] %8s 
msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, - p->counter[i], mp->sp->path); + outp += sprintf(outp, "pADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, p->counter[i], mp->sp->path); } } @@ -3134,6 +3129,26 @@ double rapl_counter_get_value(const struct rapl_counter *c, enum rapl_unit desir return scaled; } +void get_perf_llc_stats(int cpu, struct llc_stats *llc) +{ + struct read_format { + unsigned long long num_read; + struct llc_stats llc; + } r; + const ssize_t expected_read_size = sizeof(r); + ssize_t actual_read_size; + + actual_read_size = read(fd_llc_percpu[cpu], &r, expected_read_size); + + if (actual_read_size == -1) + err(-1, "%s(cpu%d,) %d,,%ld\n", __func__, cpu, fd_llc_percpu[cpu], expected_read_size); + + llc->references = r.llc.references; + llc->misses = r.llc.misses; + if (actual_read_size != expected_read_size) + warn("%s: failed to read perf_data (req %zu act %zu)", __func__, expected_read_size, actual_read_size); +} + /* * column formatting convention & formats */ @@ -3143,7 +3158,8 @@ int format_counters(PER_THREAD_PARAMS) struct platform_counters *pplat_cnt = NULL; double interval_float, tsc; - char *fmt8; + char *fmt8 = "%s%.2f"; + int i; struct msr_counter *mp; struct perf_counter_info *pp; @@ -3157,11 +3173,11 @@ int format_counters(PER_THREAD_PARAMS) } /* if showing only 1st thread in core and this isn't one, bail out */ - if (show_core_only && !is_cpu_first_thread_in_core(t, c, p)) + if (show_core_only && !is_cpu_first_thread_in_core(t, c)) return 0; /* if showing only 1st thread in pkg and this isn't one, bail out */ - if (show_pkg_only && !is_cpu_first_core_in_package(t, c, p)) + if (show_pkg_only && !is_cpu_first_core_in_package(t, p)) return 0; /*if not summary line and --cpu is used */ @@ -3223,8 +3239,7 @@ int format_counters(PER_THREAD_PARAMS) } if (DO_BIC(BIC_Node)) { if (t) - outp += sprintf(outp, "%s%d", - (printed++ ? delim : ""), cpus[t->cpu_id].physical_node_id); + outp += sprintf(outp, "%s%d", (printed++ ? 
delim : ""), cpus[t->cpu_id].physical_node_id); else outp += sprintf(outp, "%s-", (printed++ ? delim : "")); } @@ -3246,15 +3261,13 @@ int format_counters(PER_THREAD_PARAMS) outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), 1.0 / units * t->aperf / interval_float); if (DO_BIC(BIC_Busy)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->mperf / tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(t->mperf / tsc)); if (DO_BIC(BIC_Bzy_MHz)) { if (has_base_hz) - outp += - sprintf(outp, "%s%.0f", (printed++ ? delim : ""), base_hz / units * t->aperf / t->mperf); + outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), base_hz / units * t->aperf / t->mperf); else - outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), - tsc / units * t->aperf / t->mperf / interval_float); + outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), tsc / units * t->aperf / t->mperf / interval_float); } if (DO_BIC(BIC_TSC_MHz)) @@ -3283,96 +3296,81 @@ int format_counters(PER_THREAD_PARAMS) if (DO_BIC(BIC_SMI)) outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), t->smi_count); - /* Added counters */ + /* LLC Stats */ + if (DO_BIC(BIC_LLC_RPS) || DO_BIC(BIC_LLC_HIT)) { + if (DO_BIC(BIC_LLC_RPS)) + outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), t->llc.references / interval_float / 1000); + + if (DO_BIC(BIC_LLC_HIT)) + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), pct((t->llc.references - t->llc.misses) / t->llc.references)); + } + + /* Added Thread Counters */ for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { - if (mp->format == FORMAT_RAW || mp->format == FORMAT_AVERAGE) { - if (mp->width == 32) - outp += - sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int)t->counter[i]); - else - outp += sprintf(outp, "%s0x%016llx", (printed++ ? 
delim : ""), t->counter[i]); - } else if (mp->format == FORMAT_DELTA) { - if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns) - outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), t->counter[i]); - else - outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), t->counter[i]); - } else if (mp->format == FORMAT_PERCENT) { + if (mp->format == FORMAT_RAW) + outp += print_hex_value(mp->width, &printed, delim, t->counter[i]); + else if (mp->format == FORMAT_DELTA || mp->format == FORMAT_AVERAGE) + outp += print_decimal_value(mp->width, &printed, delim, t->counter[i]); + else if (mp->format == FORMAT_PERCENT) { if (mp->type == COUNTER_USEC) - outp += - sprintf(outp, "%s%.2f", (printed++ ? delim : ""), - t->counter[i] / interval_float / 10000); + outp += print_float_value(&printed, delim, t->counter[i] / interval_float / 10000); else - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->counter[i] / tsc); + outp += print_float_value(&printed, delim, pct(t->counter[i] / tsc)); } } - /* Added perf counters */ + /* Added perf Thread Counters */ for (i = 0, pp = sys.perf_tp; pp; ++i, pp = pp->next) { - if (pp->format == FORMAT_RAW) { - if (pp->width == 32) - outp += - sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), - (unsigned int)t->perf_counter[i]); - else - outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), t->perf_counter[i]); - } else if (pp->format == FORMAT_DELTA) { - if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns) - outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), t->perf_counter[i]); - else - outp += sprintf(outp, "%s%lld", (printed++ ? 
delim : ""), t->perf_counter[i]); - } else if (pp->format == FORMAT_PERCENT) { + if (pp->format == FORMAT_RAW) + outp += print_hex_value(pp->width, &printed, delim, t->perf_counter[i]); + else if (pp->format == FORMAT_DELTA || mp->format == FORMAT_AVERAGE) + outp += print_decimal_value(pp->width, &printed, delim, t->perf_counter[i]); + else if (pp->format == FORMAT_PERCENT) { if (pp->type == COUNTER_USEC) - outp += - sprintf(outp, "%s%.2f", (printed++ ? delim : ""), - t->perf_counter[i] / interval_float / 10000); + outp += print_float_value(&printed, delim, t->perf_counter[i] / interval_float / 10000); else - outp += - sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->perf_counter[i] / tsc); + outp += print_float_value(&printed, delim, pct(t->perf_counter[i] / tsc)); } } + /* Added PMT Thread Counters */ for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) { const unsigned long value_raw = t->pmt_counter[i]; double value_converted; switch (ppmt->type) { case PMT_TYPE_RAW: - if (pmt_counter_get_width(ppmt) <= 32) - outp += sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), - (unsigned int)t->pmt_counter[i]); - else - outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), t->pmt_counter[i]); - + outp += print_hex_value(pmt_counter_get_width(ppmt), &printed, delim, t->pmt_counter[i]); break; case PMT_TYPE_XTAL_TIME: - value_converted = 100.0 * value_raw / crystal_hz / interval_float; + value_converted = pct(value_raw / crystal_hz / interval_float); outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted); break; case PMT_TYPE_TCORE_CLOCK: - value_converted = 100.0 * value_raw / tcore_clock_freq_hz / interval_float; + value_converted = pct(value_raw / tcore_clock_freq_hz / interval_float); outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted); } } /* C1 */ if (DO_BIC(BIC_CPU_c1)) - outp += sprintf(outp, "%s%.2f", (printed++ ? 
delim : ""), 100.0 * t->c1 / tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(t->c1 / tsc)); /* print per-core data only for 1st thread in core */ - if (!is_cpu_first_thread_in_core(t, c, p)) + if (!is_cpu_first_thread_in_core(t, c)) goto done; if (DO_BIC(BIC_CPU_c3)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->c3 / tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(c->c3 / tsc)); if (DO_BIC(BIC_CPU_c6)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->c6 / tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(c->c6 / tsc)); if (DO_BIC(BIC_CPU_c7)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->c7 / tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(c->c7 / tsc)); /* Mod%c6 */ if (DO_BIC(BIC_Mod_c6)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->mc6_us / tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(c->mc6_us / tsc)); if (DO_BIC(BIC_CoreTmp)) outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), c->core_temp_c); @@ -3381,77 +3379,53 @@ int format_counters(PER_THREAD_PARAMS) if (DO_BIC(BIC_CORE_THROT_CNT)) outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), c->core_throt_cnt); + /* Added Core Counters */ for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { - if (mp->format == FORMAT_RAW || mp->format == FORMAT_AVERAGE) { - if (mp->width == 32) - outp += - sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int)c->counter[i]); - else - outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), c->counter[i]); - } else if (mp->format == FORMAT_DELTA) { - if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns) - outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), c->counter[i]); - else - outp += sprintf(outp, "%s%lld", (printed++ ? 
delim : ""), c->counter[i]); - } else if (mp->format == FORMAT_PERCENT) { - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->counter[i] / tsc); - } + if (mp->format == FORMAT_RAW) + outp += print_hex_value(mp->width, &printed, delim, c->counter[i]); + else if (mp->format == FORMAT_DELTA || mp->format == FORMAT_AVERAGE) + outp += print_decimal_value(mp->width, &printed, delim, c->counter[i]); + else if (mp->format == FORMAT_PERCENT) + outp += print_float_value(&printed, delim, pct(c->counter[i] / tsc)); } + /* Added perf Core counters */ for (i = 0, pp = sys.perf_cp; pp; i++, pp = pp->next) { - if (pp->format == FORMAT_RAW) { - if (pp->width == 32) - outp += - sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), - (unsigned int)c->perf_counter[i]); - else - outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), c->perf_counter[i]); - } else if (pp->format == FORMAT_DELTA) { - if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns) - outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), c->perf_counter[i]); - else - outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), c->perf_counter[i]); - } else if (pp->format == FORMAT_PERCENT) { - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->perf_counter[i] / tsc); - } + if (pp->format == FORMAT_RAW) + outp += print_hex_value(pp->width, &printed, delim, c->perf_counter[i]); + else if (pp->format == FORMAT_DELTA || mp->format == FORMAT_AVERAGE) + outp += print_decimal_value(pp->width, &printed, delim, c->perf_counter[i]); + else if (pp->format == FORMAT_PERCENT) + outp += print_float_value(&printed, delim, pct(c->perf_counter[i] / tsc)); } + /* Added PMT Core counters */ for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) { const unsigned long value_raw = c->pmt_counter[i]; double value_converted; switch (ppmt->type) { case PMT_TYPE_RAW: - if (pmt_counter_get_width(ppmt) <= 32) - outp += sprintf(outp, "%s0x%08x", (printed++ ? 
delim : ""), - (unsigned int)c->pmt_counter[i]); - else - outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), c->pmt_counter[i]); - + outp += print_hex_value(pmt_counter_get_width(ppmt), &printed, delim, c->pmt_counter[i]); break; case PMT_TYPE_XTAL_TIME: - value_converted = 100.0 * value_raw / crystal_hz / interval_float; - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted); + value_converted = pct(value_raw / crystal_hz / interval_float); + outp += print_float_value(&printed, delim, value_converted); break; case PMT_TYPE_TCORE_CLOCK: - value_converted = 100.0 * value_raw / tcore_clock_freq_hz / interval_float; - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted); + value_converted = pct(value_raw / tcore_clock_freq_hz / interval_float); + outp += print_float_value(&printed, delim, value_converted); } } - fmt8 = "%s%.2f"; - if (DO_BIC(BIC_CorWatt) && platform->has_per_core_rapl) - outp += - sprintf(outp, fmt8, (printed++ ? delim : ""), - rapl_counter_get_value(&c->core_energy, RAPL_UNIT_WATTS, interval_float)); + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), rapl_counter_get_value(&c->core_energy, RAPL_UNIT_WATTS, interval_float)); if (DO_BIC(BIC_Cor_J) && platform->has_per_core_rapl) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), - rapl_counter_get_value(&c->core_energy, RAPL_UNIT_JOULES, interval_float)); + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), rapl_counter_get_value(&c->core_energy, RAPL_UNIT_JOULES, interval_float)); /* print per-package data only for 1st core in package */ - if (!is_cpu_first_core_in_package(t, c, p)) + if (!is_cpu_first_core_in_package(t, p)) goto done; /* PkgTmp */ @@ -3463,8 +3437,7 @@ int format_counters(PER_THREAD_PARAMS) if (p->gfx_rc6_ms == -1) { /* detect GFX counter reset */ outp += sprintf(outp, "%s**.**", (printed++ ? delim : "")); } else { - outp += sprintf(outp, "%s%.2f", (printed++ ? 
delim : ""), - p->gfx_rc6_ms / 10.0 / interval_float); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), p->gfx_rc6_ms / 10.0 / interval_float); } } @@ -3481,8 +3454,7 @@ int format_counters(PER_THREAD_PARAMS) if (p->sam_mc6_ms == -1) { /* detect GFX counter reset */ outp += sprintf(outp, "%s**.**", (printed++ ? delim : "")); } else { - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), - p->sam_mc6_ms / 10.0 / interval_float); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), p->sam_mc6_ms / 10.0 / interval_float); } } @@ -3496,150 +3468,112 @@ int format_counters(PER_THREAD_PARAMS) /* Totl%C0, Any%C0 GFX%C0 CPUGFX% */ if (DO_BIC(BIC_Totl_c0)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_wtd_core_c0 / tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100 * p->pkg_wtd_core_c0 / tsc); /* can exceed 100% */ if (DO_BIC(BIC_Any_c0)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_any_core_c0 / tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pkg_any_core_c0 / tsc)); if (DO_BIC(BIC_GFX_c0)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_any_gfxe_c0 / tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pkg_any_gfxe_c0 / tsc)); if (DO_BIC(BIC_CPUGFX)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_both_core_gfxe_c0 / tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pkg_both_core_gfxe_c0 / tsc)); if (DO_BIC(BIC_Pkgpc2)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc2 / tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pc2 / tsc)); if (DO_BIC(BIC_Pkgpc3)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc3 / tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pc3 / tsc)); if (DO_BIC(BIC_Pkgpc6)) - outp += sprintf(outp, "%s%.2f", (printed++ ? 
delim : ""), 100.0 * p->pc6 / tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pc6 / tsc)); if (DO_BIC(BIC_Pkgpc7)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc7 / tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pc7 / tsc)); if (DO_BIC(BIC_Pkgpc8)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc8 / tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pc8 / tsc)); if (DO_BIC(BIC_Pkgpc9)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc9 / tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pc9 / tsc)); if (DO_BIC(BIC_Pkgpc10)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc10 / tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pc10 / tsc)); if (DO_BIC(BIC_Diec6)) - outp += - sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->die_c6 / crystal_hz / interval_float); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->die_c6 / crystal_hz / interval_float)); if (DO_BIC(BIC_CPU_LPI)) { if (p->cpu_lpi >= 0) - outp += - sprintf(outp, "%s%.2f", (printed++ ? delim : ""), - 100.0 * p->cpu_lpi / 1000000.0 / interval_float); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->cpu_lpi / 1000000.0 / interval_float)); else outp += sprintf(outp, "%s(neg)", (printed++ ? delim : "")); } if (DO_BIC(BIC_SYS_LPI)) { if (p->sys_lpi >= 0) - outp += - sprintf(outp, "%s%.2f", (printed++ ? delim : ""), - 100.0 * p->sys_lpi / 1000000.0 / interval_float); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->sys_lpi / 1000000.0 / interval_float)); else outp += sprintf(outp, "%s(neg)", (printed++ ? delim : "")); } if (DO_BIC(BIC_PkgWatt)) - outp += - sprintf(outp, fmt8, (printed++ ? delim : ""), - rapl_counter_get_value(&p->energy_pkg, RAPL_UNIT_WATTS, interval_float)); + outp += sprintf(outp, fmt8, (printed++ ? 
delim : ""), rapl_counter_get_value(&p->energy_pkg, RAPL_UNIT_WATTS, interval_float)); if (DO_BIC(BIC_CorWatt) && !platform->has_per_core_rapl) - outp += - sprintf(outp, fmt8, (printed++ ? delim : ""), - rapl_counter_get_value(&p->energy_cores, RAPL_UNIT_WATTS, interval_float)); + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), rapl_counter_get_value(&p->energy_cores, RAPL_UNIT_WATTS, interval_float)); if (DO_BIC(BIC_GFXWatt)) - outp += - sprintf(outp, fmt8, (printed++ ? delim : ""), - rapl_counter_get_value(&p->energy_gfx, RAPL_UNIT_WATTS, interval_float)); + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), rapl_counter_get_value(&p->energy_gfx, RAPL_UNIT_WATTS, interval_float)); if (DO_BIC(BIC_RAMWatt)) - outp += - sprintf(outp, fmt8, (printed++ ? delim : ""), - rapl_counter_get_value(&p->energy_dram, RAPL_UNIT_WATTS, interval_float)); + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), rapl_counter_get_value(&p->energy_dram, RAPL_UNIT_WATTS, interval_float)); if (DO_BIC(BIC_Pkg_J)) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), - rapl_counter_get_value(&p->energy_pkg, RAPL_UNIT_JOULES, interval_float)); + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), rapl_counter_get_value(&p->energy_pkg, RAPL_UNIT_JOULES, interval_float)); if (DO_BIC(BIC_Cor_J) && !platform->has_per_core_rapl) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), - rapl_counter_get_value(&p->energy_cores, RAPL_UNIT_JOULES, interval_float)); + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), rapl_counter_get_value(&p->energy_cores, RAPL_UNIT_JOULES, interval_float)); if (DO_BIC(BIC_GFX_J)) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), - rapl_counter_get_value(&p->energy_gfx, RAPL_UNIT_JOULES, interval_float)); + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), rapl_counter_get_value(&p->energy_gfx, RAPL_UNIT_JOULES, interval_float)); if (DO_BIC(BIC_RAM_J)) - outp += sprintf(outp, fmt8, (printed++ ? 
delim : ""), - rapl_counter_get_value(&p->energy_dram, RAPL_UNIT_JOULES, interval_float)); + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), rapl_counter_get_value(&p->energy_dram, RAPL_UNIT_JOULES, interval_float)); if (DO_BIC(BIC_PKG__)) outp += - sprintf(outp, fmt8, (printed++ ? delim : ""), - rapl_counter_get_value(&p->rapl_pkg_perf_status, RAPL_UNIT_WATTS, interval_float)); + sprintf(outp, fmt8, (printed++ ? delim : ""), rapl_counter_get_value(&p->rapl_pkg_perf_status, RAPL_UNIT_WATTS, interval_float)); if (DO_BIC(BIC_RAM__)) outp += - sprintf(outp, fmt8, (printed++ ? delim : ""), - rapl_counter_get_value(&p->rapl_dram_perf_status, RAPL_UNIT_WATTS, interval_float)); + sprintf(outp, fmt8, (printed++ ? delim : ""), rapl_counter_get_value(&p->rapl_dram_perf_status, RAPL_UNIT_WATTS, interval_float)); /* UncMHz */ if (DO_BIC(BIC_UNCORE_MHZ)) outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->uncore_mhz); + /* Added Package Counters */ for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { - if (mp->format == FORMAT_RAW || mp->format == FORMAT_AVERAGE) { - if (mp->width == 32) - outp += - sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int)p->counter[i]); - else - outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), p->counter[i]); - } else if (mp->format == FORMAT_DELTA) { - if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns) - outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), p->counter[i]); - else - outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), p->counter[i]); - } else if (mp->format == FORMAT_PERCENT) { - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->counter[i] / tsc); - } else if (mp->type == COUNTER_K2M) + if (mp->format == FORMAT_RAW) + outp += print_hex_value(mp->width, &printed, delim, p->counter[i]); + else if (mp->type == COUNTER_K2M) outp += sprintf(outp, "%s%d", (printed++ ? 
delim : ""), (unsigned int)p->counter[i] / 1000); + else if (mp->format == FORMAT_DELTA || mp->format == FORMAT_AVERAGE) + outp += print_decimal_value(mp->width, &printed, delim, p->counter[i]); + else if (mp->format == FORMAT_PERCENT) + outp += print_float_value(&printed, delim, pct(p->counter[i] / tsc)); } + /* Added perf Package Counters */ for (i = 0, pp = sys.perf_pp; pp; i++, pp = pp->next) { - if (pp->format == FORMAT_RAW) { - if (pp->width == 32) - outp += - sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), - (unsigned int)p->perf_counter[i]); - else - outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), p->perf_counter[i]); - } else if (pp->format == FORMAT_DELTA) { - if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns) - outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), p->perf_counter[i]); - else - outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), p->perf_counter[i]); - } else if (pp->format == FORMAT_PERCENT) { - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->perf_counter[i] / tsc); - } else if (pp->type == COUNTER_K2M) { - outp += - sprintf(outp, "%s%d", (printed++ ? delim : ""), (unsigned int)p->perf_counter[i] / 1000); - } + if (pp->format == FORMAT_RAW) + outp += print_hex_value(pp->width, &printed, delim, p->perf_counter[i]); + else if (pp->type == COUNTER_K2M) + outp += sprintf(outp, "%s%d", (printed++ ? 
delim : ""), (unsigned int)p->perf_counter[i] / 1000); + else if (pp->format == FORMAT_DELTA || mp->format == FORMAT_AVERAGE) + outp += print_decimal_value(pp->width, &printed, delim, p->perf_counter[i]); + else if (pp->format == FORMAT_PERCENT) + outp += print_float_value(&printed, delim, pct(p->perf_counter[i] / tsc)); } + /* Added PMT Package Counters */ for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) { const unsigned long value_raw = p->pmt_counter[i]; double value_converted; switch (ppmt->type) { case PMT_TYPE_RAW: - if (pmt_counter_get_width(ppmt) <= 32) - outp += sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), - (unsigned int)p->pmt_counter[i]); - else - outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), p->pmt_counter[i]); - + outp += print_hex_value(pmt_counter_get_width(ppmt), &printed, delim, p->pmt_counter[i]); break; case PMT_TYPE_XTAL_TIME: - value_converted = 100.0 * value_raw / crystal_hz / interval_float; - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted); + value_converted = pct(value_raw / crystal_hz / interval_float); + outp += print_float_value(&printed, delim, value_converted); break; case PMT_TYPE_TCORE_CLOCK: - value_converted = 100.0 * value_raw / tcore_clock_freq_hz / interval_float; - outp += sprintf(outp, "%s%.2f", (printed++ ? 
delim : ""), value_converted); + value_converted = pct(value_raw / tcore_clock_freq_hz / interval_float); + outp += print_float_value(&printed, delim, value_converted); } } @@ -3754,11 +3688,10 @@ int delta_package(struct pkg_data *new, struct pkg_data *old) old->energy_gfx.raw_value = new->energy_gfx.raw_value - old->energy_gfx.raw_value; old->energy_dram.raw_value = new->energy_dram.raw_value - old->energy_dram.raw_value; old->rapl_pkg_perf_status.raw_value = new->rapl_pkg_perf_status.raw_value - old->rapl_pkg_perf_status.raw_value; - old->rapl_dram_perf_status.raw_value = - new->rapl_dram_perf_status.raw_value - old->rapl_dram_perf_status.raw_value; + old->rapl_dram_perf_status.raw_value = new->rapl_dram_perf_status.raw_value - old->rapl_dram_perf_status.raw_value; for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { - if (mp->format == FORMAT_RAW || mp->format == FORMAT_AVERAGE) + if (mp->format == FORMAT_RAW) old->counter[i] = new->counter[i]; else if (mp->format == FORMAT_AVERAGE) old->counter[i] = new->counter[i]; @@ -3862,8 +3795,7 @@ int delta_thread(struct thread_data *new, struct thread_data *old, struct core_d /* check for TSC < 1 Mcycles over interval */ if (old->tsc < (1000 * 1000)) errx(-3, "Insanely slow TSC rate, TSC stops in idle?\n" - "You can disable all c-states by booting with \"idle=poll\"\n" - "or just the deep ones with \"processor.max_cstate=1\""); + "You can disable all c-states by booting with \"idle=poll\"\n" "or just the deep ones with \"processor.max_cstate=1\""); old->c1 = new->c1 - old->c1; @@ -3892,8 +3824,7 @@ int delta_thread(struct thread_data *new, struct thread_data *old, struct core_d old->c1 = 0; else { /* normal case, derive c1 */ - old->c1 = (old->tsc * tsc_tweak) - old->mperf - core_delta->c3 - - core_delta->c6 - core_delta->c7; + old->c1 = (old->tsc * tsc_tweak) - old->mperf - core_delta->c3 - core_delta->c6 - core_delta->c7; } } @@ -3915,6 +3846,12 @@ int delta_thread(struct thread_data *new, struct thread_data *old, 
struct core_d if (DO_BIC(BIC_SMI)) old->smi_count = new->smi_count - old->smi_count; + if (DO_BIC(BIC_LLC_RPS)) + old->llc.references = new->llc.references - old->llc.references; + + if (DO_BIC(BIC_LLC_HIT)) + old->llc.misses = new->llc.misses - old->llc.misses; + for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW || mp->format == FORMAT_AVERAGE) old->counter[i] = new->counter[i]; @@ -3939,20 +3876,19 @@ int delta_thread(struct thread_data *new, struct thread_data *old, struct core_d return 0; } -int delta_cpu(struct thread_data *t, struct core_data *c, - struct pkg_data *p, struct thread_data *t2, struct core_data *c2, struct pkg_data *p2) +int delta_cpu(struct thread_data *t, struct core_data *c, struct pkg_data *p, struct thread_data *t2, struct core_data *c2, struct pkg_data *p2) { int retval = 0; /* calculate core delta only for 1st thread in core */ - if (is_cpu_first_thread_in_core(t, c, p)) + if (is_cpu_first_thread_in_core(t, c)) delta_core(c, c2); /* always calculate thread delta */ retval = delta_thread(t, t2, c2); /* c2 is core delta */ /* calculate package delta only for 1st core in package */ - if (is_cpu_first_core_in_package(t, c, p)) + if (is_cpu_first_core_in_package(t, p)) retval |= delta_package(p, p2); return retval; @@ -3993,6 +3929,9 @@ void clear_counters(PER_THREAD_PARAMS) t->nmi_count = 0; t->smi_count = 0; + t->llc.references = 0; + t->llc.misses = 0; + c->c3 = 0; c->c6 = 0; c->c7 = 0; @@ -4001,6 +3940,9 @@ void clear_counters(PER_THREAD_PARAMS) rapl_counter_clear(&c->core_energy); c->core_throt_cnt = 0; + t->llc.references = 0; + t->llc.misses = 0; + p->pkg_wtd_core_c0 = 0; p->pkg_any_core_c0 = 0; p->pkg_any_gfxe_c0 = 0; @@ -4098,6 +4040,9 @@ int sum_counters(PER_THREAD_PARAMS) average.threads.nmi_count += t->nmi_count; average.threads.smi_count += t->smi_count; + average.threads.llc.references += t->llc.references; + average.threads.llc.misses += t->llc.misses; + for (i = 0, mp = sys.tp; mp; i++, mp = 
mp->next) { if (mp->format == FORMAT_RAW) continue; @@ -4115,7 +4060,7 @@ int sum_counters(PER_THREAD_PARAMS) } /* sum per-core values only for 1st thread in core */ - if (!is_cpu_first_thread_in_core(t, c, p)) + if (!is_cpu_first_thread_in_core(t, c)) return 0; average.cores.c3 += c->c3; @@ -4145,7 +4090,7 @@ int sum_counters(PER_THREAD_PARAMS) } /* sum per-pkg values only for 1st core in pkg */ - if (!is_cpu_first_core_in_package(t, c, p)) + if (!is_cpu_first_core_in_package(t, p)) return 0; if (DO_BIC(BIC_Totl_c0)) @@ -4411,8 +4356,7 @@ unsigned long long get_legacy_uncore_mhz(int package) */ for (die = 0; die <= topo.max_die_id; ++die) { - sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d/current_freq_khz", - package, die); + sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d/current_freq_khz", package, die); if (access(path, R_OK) == 0) return (snapshot_sysfs_counter(path) / 1000); @@ -4523,11 +4467,6 @@ int get_core_throt_cnt(int cpu, unsigned long long *cnt) return 0; } -struct amperf_group_fd { - int aperf; /* Also the group descriptor */ - int mperf; -}; - static int read_perf_counter_info(const char *const path, const char *const parse_format, void *value_ptr) { int fdmt; @@ -4727,8 +4666,7 @@ int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct const ssize_t actual_read_size = read(rci->fd_perf, &perf_data[0], sizeof(perf_data)); if (actual_read_size != expected_read_size) - err(-1, "%s: failed to read perf_data (%zu %zu)", __func__, expected_read_size, - actual_read_size); + err(-1, "%s: failed to read perf_data (%zu %zu)", __func__, expected_read_size, actual_read_size); } for (unsigned int i = 0, pi = 1; i < NUM_RAPL_COUNTERS; ++i) { @@ -4966,8 +4904,7 @@ int get_smi_aperf_mperf(unsigned int cpu, struct thread_data *t) const ssize_t actual_read_size = read(mci->fd_perf, &perf_data[0], sizeof(perf_data)); if (actual_read_size != expected_read_size) - err(-1, 
"%s: failed to read perf_data (%zu %zu)", __func__, expected_read_size, - actual_read_size); + err(-1, "%s: failed to read perf_data (%zu %zu)", __func__, expected_read_size, actual_read_size); } for (unsigned int i = 0, pi = 1; i < NUM_MSR_COUNTERS; ++i) { @@ -5121,6 +5058,9 @@ int get_counters(PER_THREAD_PARAMS) get_smi_aperf_mperf(cpu, t); + if (DO_BIC(BIC_LLC_RPS) || DO_BIC(BIC_LLC_HIT)) + get_perf_llc_stats(cpu, &t->llc); + if (DO_BIC(BIC_IPC)) if (read(get_instr_count_fd(cpu), &t->instr_count, sizeof(long long)) != sizeof(long long)) return -4; @@ -5144,7 +5084,7 @@ int get_counters(PER_THREAD_PARAMS) t->pmt_counter[i] = pmt_read_counter(pp, t->cpu_id); /* collect core counters only for 1st thread in core */ - if (!is_cpu_first_thread_in_core(t, c, p)) + if (!is_cpu_first_thread_in_core(t, c)) goto done; if (platform->has_per_core_rapl) { @@ -5188,7 +5128,7 @@ int get_counters(PER_THREAD_PARAMS) c->pmt_counter[i] = pmt_read_counter(pp, c->core_id); /* collect package counters only for 1st core in package */ - if (!is_cpu_first_core_in_package(t, c, p)) + if (!is_cpu_first_core_in_package(t, p)) goto done; if (DO_BIC(BIC_Totl_c0)) { @@ -5277,48 +5217,39 @@ char *pkg_cstate_limit_strings[] = { "unknown", "reserved", "pc0", "pc1", "pc2", "pc3", "pc4", "pc6", "pc6n", "pc6r", "pc7", "pc7s", "pc8", "pc9", "pc10", "unlimited" }; -int nhm_pkg_cstate_limits[16] = - { PCL__0, PCL__1, PCL__3, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, +int nhm_pkg_cstate_limits[16] = { PCL__0, PCL__1, PCL__3, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV }; -int snb_pkg_cstate_limits[16] = - { PCL__0, PCL__2, PCL_6N, PCL_6R, PCL__7, PCL_7S, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, +int snb_pkg_cstate_limits[16] = { PCL__0, PCL__2, PCL_6N, PCL_6R, PCL__7, PCL_7S, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV }; -int 
hsw_pkg_cstate_limits[16] = - { PCL__0, PCL__2, PCL__3, PCL__6, PCL__7, PCL_7S, PCL__8, PCL__9, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, +int hsw_pkg_cstate_limits[16] = { PCL__0, PCL__2, PCL__3, PCL__6, PCL__7, PCL_7S, PCL__8, PCL__9, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV }; -int slv_pkg_cstate_limits[16] = - { PCL__0, PCL__1, PCLRSV, PCLRSV, PCL__4, PCLRSV, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, +int slv_pkg_cstate_limits[16] = { PCL__0, PCL__1, PCLRSV, PCLRSV, PCL__4, PCLRSV, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCL__6, PCL__7 }; -int amt_pkg_cstate_limits[16] = - { PCLUNL, PCL__1, PCL__2, PCLRSV, PCLRSV, PCLRSV, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, +int amt_pkg_cstate_limits[16] = { PCLUNL, PCL__1, PCL__2, PCLRSV, PCLRSV, PCLRSV, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV }; -int phi_pkg_cstate_limits[16] = - { PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, +int phi_pkg_cstate_limits[16] = { PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV }; -int glm_pkg_cstate_limits[16] = - { PCLUNL, PCL__1, PCL__3, PCL__6, PCL__7, PCL_7S, PCL__8, PCL__9, PCL_10, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, +int glm_pkg_cstate_limits[16] = { PCLUNL, PCL__1, PCL__3, PCL__6, PCL__7, PCL_7S, PCL__8, PCL__9, PCL_10, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV }; -int skx_pkg_cstate_limits[16] = - { PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, +int skx_pkg_cstate_limits[16] = { PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV }; -int icx_pkg_cstate_limits[16] = - { PCL__0, PCL__2, PCL__6, PCL__6, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, 
PCLRSV, PCLRSV, PCLRSV, PCLRSV, +int icx_pkg_cstate_limits[16] = { PCL__0, PCL__2, PCL__6, PCL__6, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV }; @@ -5393,8 +5324,7 @@ static void dump_power_ctl(void) return; get_msr(base_cpu, MSR_IA32_POWER_CTL, &msr); - fprintf(outf, "cpu%d: MSR_IA32_POWER_CTL: 0x%08llx (C1E auto-promotion: %sabled)\n", - base_cpu, msr, msr & 0x2 ? "EN" : "DIS"); + fprintf(outf, "cpu%d: MSR_IA32_POWER_CTL: 0x%08llx (C1E auto-promotion: %sabled)\n", base_cpu, msr, msr & 0x2 ? "EN" : "DIS"); /* C-state Pre-wake Disable (CSTATE_PREWAKE_DISABLE) */ if (platform->has_cst_prewake_bit) @@ -5487,8 +5417,7 @@ static void dump_turbo_ratio_limits(int trl_msr_offset) ratio = (msr >> shift) & 0xFF; group_size = (core_counts >> shift) & 0xFF; if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz max turbo %d active cores\n", - ratio, bclk, ratio * bclk, group_size); + fprintf(outf, "%d * %.1f = %.1f MHz max turbo %d active cores\n", ratio, bclk, ratio * bclk, group_size); } return; @@ -5586,9 +5515,7 @@ static void dump_knl_turbo_ratio_limits(void) for (i = buckets_no - 1; i >= 0; i--) if (i > 0 ? ratio[i] != ratio[i - 1] : 1) - fprintf(outf, - "%d * %.1f = %.1f MHz max turbo %d active cores\n", - ratio[i], bclk, ratio[i] * bclk, cores[i]); + fprintf(outf, "%d * %.1f = %.1f MHz max turbo %d active cores\n", ratio[i], bclk, ratio[i] * bclk, cores[i]); } static void dump_cst_cfg(void) @@ -5673,43 +5600,37 @@ void print_irtl(void) if (platform->supported_cstates & PC3) { get_msr(base_cpu, MSR_PKGC3_IRTL, &msr); fprintf(outf, "cpu%d: MSR_PKGC3_IRTL: 0x%08llx (", base_cpu, msr); - fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", - (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? 
"" : "NOT", (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); } if (platform->supported_cstates & PC6) { get_msr(base_cpu, MSR_PKGC6_IRTL, &msr); fprintf(outf, "cpu%d: MSR_PKGC6_IRTL: 0x%08llx (", base_cpu, msr); - fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", - (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); } if (platform->supported_cstates & PC7) { get_msr(base_cpu, MSR_PKGC7_IRTL, &msr); fprintf(outf, "cpu%d: MSR_PKGC7_IRTL: 0x%08llx (", base_cpu, msr); - fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", - (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); } if (platform->supported_cstates & PC8) { get_msr(base_cpu, MSR_PKGC8_IRTL, &msr); fprintf(outf, "cpu%d: MSR_PKGC8_IRTL: 0x%08llx (", base_cpu, msr); - fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", - (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); } if (platform->supported_cstates & PC9) { get_msr(base_cpu, MSR_PKGC9_IRTL, &msr); fprintf(outf, "cpu%d: MSR_PKGC9_IRTL: 0x%08llx (", base_cpu, msr); - fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", - (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); } if (platform->supported_cstates & PC10) { get_msr(base_cpu, MSR_PKGC10_IRTL, &msr); fprintf(outf, "cpu%d: MSR_PKGC10_IRTL: 0x%08llx (", base_cpu, msr); - fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", - (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? 
"" : "NOT", (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); } } @@ -5743,6 +5664,20 @@ void free_fd_instr_count_percpu(void) fd_instr_count_percpu = NULL; } +void free_fd_llc_percpu(void) +{ + if (!fd_llc_percpu) + return; + + for (int i = 0; i < topo.max_cpu_num + 1; ++i) { + if (fd_llc_percpu[i] != 0) + close(fd_llc_percpu[i]); + } + + free(fd_llc_percpu); + fd_llc_percpu = NULL; +} + void free_fd_cstate(void) { if (!ccstate_counter_info) @@ -5867,6 +5802,7 @@ void free_all_buffers(void) free_fd_percpu(); free_fd_instr_count_percpu(); + free_fd_llc_percpu(); free_fd_msr(); free_fd_rapl_percpu(); free_fd_cstate(); @@ -6213,6 +6149,7 @@ void linux_perf_init(void); void msr_perf_init(void); void rapl_perf_init(void); void cstate_perf_init(void); +void perf_llc_init(void); void added_perf_counters_init(void); void pmt_init(void); @@ -6224,10 +6161,10 @@ void re_initialize(void) msr_perf_init(); rapl_perf_init(); cstate_perf_init(); + perf_llc_init(); added_perf_counters_init(); pmt_init(); - fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, - topo.allowed_cpus); + fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, topo.allowed_cpus); } void set_max_cpu_num(void) @@ -6673,6 +6610,7 @@ release_timer: timer_delete(timerid); release_msr: free(per_cpu_msr_sum); + per_cpu_msr_sum = NULL; } /* @@ -6797,21 +6735,43 @@ restart: } } -void check_dev_msr() +int probe_dev_msr(void) { struct stat sb; char pathname[32]; - if (no_msr) - return; -#if defined(ANDROID) sprintf(pathname, "/dev/msr%d", base_cpu); -#else + return !stat(pathname, &sb); +} + +int probe_dev_cpu_msr(void) +{ + struct stat sb; + char pathname[32]; + sprintf(pathname, "/dev/cpu/%d/msr", base_cpu); -#endif - if (stat(pathname, &sb)) - if (system("/sbin/modprobe msr > /dev/null 2>&1")) - no_msr = 1; + return !stat(pathname, &sb); +} + +int probe_msr_driver(void) +{ + if (probe_dev_msr()) { + use_android_msr_path = 1; + 
return 1; + } + return probe_dev_cpu_msr(); +} + +void check_msr_driver(void) +{ + if (probe_msr_driver()) + return; + + if (system("/sbin/modprobe msr > /dev/null 2>&1")) + no_msr = 1; + + if (!probe_msr_driver()) + no_msr = 1; } /* @@ -6866,11 +6826,7 @@ void check_msr_permission(void) failed += check_for_cap_sys_rawio(); /* test file permissions */ -#if defined(ANDROID) - sprintf(pathname, "/dev/msr%d", base_cpu); -#else - sprintf(pathname, "/dev/cpu/%d/msr", base_cpu); -#endif + sprintf(pathname, use_android_msr_path ? "/dev/msr%d" : "/dev/cpu/%d/msr", base_cpu); if (euidaccess(pathname, R_OK)) { failed++; } @@ -6999,8 +6955,7 @@ static void probe_intel_uncore_frequency_legacy(void) int k, l; char path_base[128]; - sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d", i, - j); + sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d", i, j); sprintf(path, "%s/current_freq_khz", path_base); if (access(path, R_OK)) @@ -7083,8 +7038,7 @@ static void probe_intel_uncore_frequency_cluster(void) */ if BIC_IS_ENABLED (BIC_UNCORE_MHZ) - add_counter(0, path, name_buf, 0, SCOPE_PACKAGE, COUNTER_K2M, FORMAT_AVERAGE, 0, - package_id); + add_counter(0, path, name_buf, 0, SCOPE_PACKAGE, COUNTER_K2M, FORMAT_AVERAGE, 0, package_id); if (quiet) continue; @@ -7093,8 +7047,7 @@ static void probe_intel_uncore_frequency_cluster(void) k = read_sysfs_int(path); sprintf(path, "%s/max_freq_khz", path_base); l = read_sysfs_int(path); - fprintf(outf, "Uncore Frequency package%d domain%d cluster%d: %d - %d MHz ", package_id, domain_id, - cluster_id, k / 1000, l / 1000); + fprintf(outf, "Uncore Frequency package%d domain%d cluster%d: %d - %d MHz ", package_id, domain_id, cluster_id, k / 1000, l / 1000); sprintf(path, "%s/initial_min_freq_khz", path_base); k = read_sysfs_int(path); @@ -7156,21 +7109,17 @@ static void probe_graphics(void) else goto next; - 
set_graphics_fp("/sys/class/drm/card0/device/tile0/gt0/gtidle/idle_residency_ms", - gt0_is_gt ? GFX_rc6 : SAM_mc6); + set_graphics_fp("/sys/class/drm/card0/device/tile0/gt0/gtidle/idle_residency_ms", gt0_is_gt ? GFX_rc6 : SAM_mc6); set_graphics_fp("/sys/class/drm/card0/device/tile0/gt0/freq0/cur_freq", gt0_is_gt ? GFX_MHz : SAM_MHz); - set_graphics_fp("/sys/class/drm/card0/device/tile0/gt0/freq0/act_freq", - gt0_is_gt ? GFX_ACTMHz : SAM_ACTMHz); + set_graphics_fp("/sys/class/drm/card0/device/tile0/gt0/freq0/act_freq", gt0_is_gt ? GFX_ACTMHz : SAM_ACTMHz); - set_graphics_fp("/sys/class/drm/card0/device/tile0/gt1/gtidle/idle_residency_ms", - gt0_is_gt ? SAM_mc6 : GFX_rc6); + set_graphics_fp("/sys/class/drm/card0/device/tile0/gt1/gtidle/idle_residency_ms", gt0_is_gt ? SAM_mc6 : GFX_rc6); set_graphics_fp("/sys/class/drm/card0/device/tile0/gt1/freq0/cur_freq", gt0_is_gt ? SAM_MHz : GFX_MHz); - set_graphics_fp("/sys/class/drm/card0/device/tile0/gt1/freq0/act_freq", - gt0_is_gt ? SAM_ACTMHz : GFX_ACTMHz); + set_graphics_fp("/sys/class/drm/card0/device/tile0/gt1/freq0/act_freq", gt0_is_gt ? 
SAM_ACTMHz : GFX_ACTMHz); goto end; } @@ -7425,8 +7374,7 @@ int print_hwp(PER_THREAD_PARAMS) "(high %d guar %d eff %d low %d)\n", cpu, msr, (unsigned int)HWP_HIGHEST_PERF(msr), - (unsigned int)HWP_GUARANTEED_PERF(msr), - (unsigned int)HWP_MOSTEFFICIENT_PERF(msr), (unsigned int)HWP_LOWEST_PERF(msr)); + (unsigned int)HWP_GUARANTEED_PERF(msr), (unsigned int)HWP_MOSTEFFICIENT_PERF(msr), (unsigned int)HWP_LOWEST_PERF(msr)); if (get_msr(cpu, MSR_HWP_REQUEST, &msr)) return 0; @@ -7437,8 +7385,7 @@ int print_hwp(PER_THREAD_PARAMS) (unsigned int)(((msr) >> 0) & 0xff), (unsigned int)(((msr) >> 8) & 0xff), (unsigned int)(((msr) >> 16) & 0xff), - (unsigned int)(((msr) >> 24) & 0xff), - (unsigned int)(((msr) >> 32) & 0xff3), (unsigned int)(((msr) >> 42) & 0x1)); + (unsigned int)(((msr) >> 24) & 0xff), (unsigned int)(((msr) >> 32) & 0xff3), (unsigned int)(((msr) >> 42) & 0x1)); if (has_hwp_pkg) { if (get_msr(cpu, MSR_HWP_REQUEST_PKG, &msr)) @@ -7449,23 +7396,20 @@ int print_hwp(PER_THREAD_PARAMS) cpu, msr, (unsigned int)(((msr) >> 0) & 0xff), (unsigned int)(((msr) >> 8) & 0xff), - (unsigned int)(((msr) >> 16) & 0xff), - (unsigned int)(((msr) >> 24) & 0xff), (unsigned int)(((msr) >> 32) & 0xff3)); + (unsigned int)(((msr) >> 16) & 0xff), (unsigned int)(((msr) >> 24) & 0xff), (unsigned int)(((msr) >> 32) & 0xff3)); } if (has_hwp_notify) { if (get_msr(cpu, MSR_HWP_INTERRUPT, &msr)) return 0; fprintf(outf, "cpu%d: MSR_HWP_INTERRUPT: 0x%08llx " - "(%s_Guaranteed_Perf_Change, %s_Excursion_Min)\n", - cpu, msr, ((msr) & 0x1) ? "EN" : "Dis", ((msr) & 0x2) ? "EN" : "Dis"); + "(%s_Guaranteed_Perf_Change, %s_Excursion_Min)\n", cpu, msr, ((msr) & 0x1) ? "EN" : "Dis", ((msr) & 0x2) ? "EN" : "Dis"); } if (get_msr(cpu, MSR_HWP_STATUS, &msr)) return 0; fprintf(outf, "cpu%d: MSR_HWP_STATUS: 0x%08llx " - "(%sGuaranteed_Perf_Change, %sExcursion_Min)\n", - cpu, msr, ((msr) & 0x1) ? "" : "No-", ((msr) & 0x4) ? "" : "No-"); + "(%sGuaranteed_Perf_Change, %sExcursion_Min)\n", cpu, msr, ((msr) & 0x1) ? 
"" : "No-", ((msr) & 0x4) ? "" : "No-"); return 0; } @@ -7510,8 +7454,7 @@ int print_perf_limit(PER_THREAD_PARAMS) (msr & 1 << 6) ? "VR-Therm, " : "", (msr & 1 << 5) ? "Auto-HWP, " : "", (msr & 1 << 4) ? "Graphics, " : "", - (msr & 1 << 2) ? "bit2, " : "", - (msr & 1 << 1) ? "ThermStatus, " : "", (msr & 1 << 0) ? "PROCHOT, " : ""); + (msr & 1 << 2) ? "bit2, " : "", (msr & 1 << 1) ? "ThermStatus, " : "", (msr & 1 << 0) ? "PROCHOT, " : ""); fprintf(outf, " (Logged: %s%s%s%s%s%s%s%s%s%s%s%s%s%s)\n", (msr & 1 << 31) ? "bit31, " : "", (msr & 1 << 30) ? "bit30, " : "", @@ -7524,8 +7467,7 @@ int print_perf_limit(PER_THREAD_PARAMS) (msr & 1 << 22) ? "VR-Therm, " : "", (msr & 1 << 21) ? "Auto-HWP, " : "", (msr & 1 << 20) ? "Graphics, " : "", - (msr & 1 << 18) ? "bit18, " : "", - (msr & 1 << 17) ? "ThermStatus, " : "", (msr & 1 << 16) ? "PROCHOT, " : ""); + (msr & 1 << 18) ? "bit18, " : "", (msr & 1 << 17) ? "ThermStatus, " : "", (msr & 1 << 16) ? "PROCHOT, " : ""); } if (platform->plr_msrs & PLR_GFX) { @@ -7537,16 +7479,14 @@ int print_perf_limit(PER_THREAD_PARAMS) (msr & 1 << 4) ? "Graphics, " : "", (msr & 1 << 6) ? "VR-Therm, " : "", (msr & 1 << 8) ? "Amps, " : "", - (msr & 1 << 9) ? "GFXPwr, " : "", - (msr & 1 << 10) ? "PkgPwrL1, " : "", (msr & 1 << 11) ? "PkgPwrL2, " : ""); + (msr & 1 << 9) ? "GFXPwr, " : "", (msr & 1 << 10) ? "PkgPwrL1, " : "", (msr & 1 << 11) ? "PkgPwrL2, " : ""); fprintf(outf, " (Logged: %s%s%s%s%s%s%s%s)\n", (msr & 1 << 16) ? "PROCHOT, " : "", (msr & 1 << 17) ? "ThermStatus, " : "", (msr & 1 << 20) ? "Graphics, " : "", (msr & 1 << 22) ? "VR-Therm, " : "", (msr & 1 << 24) ? "Amps, " : "", - (msr & 1 << 25) ? "GFXPwr, " : "", - (msr & 1 << 26) ? "PkgPwrL1, " : "", (msr & 1 << 27) ? "PkgPwrL2, " : ""); + (msr & 1 << 25) ? "GFXPwr, " : "", (msr & 1 << 26) ? "PkgPwrL1, " : "", (msr & 1 << 27) ? 
"PkgPwrL2, " : ""); } if (platform->plr_msrs & PLR_RING) { get_msr(cpu, MSR_RING_PERF_LIMIT_REASONS, &msr); @@ -7555,14 +7495,12 @@ int print_perf_limit(PER_THREAD_PARAMS) (msr & 1 << 0) ? "PROCHOT, " : "", (msr & 1 << 1) ? "ThermStatus, " : "", (msr & 1 << 6) ? "VR-Therm, " : "", - (msr & 1 << 8) ? "Amps, " : "", - (msr & 1 << 10) ? "PkgPwrL1, " : "", (msr & 1 << 11) ? "PkgPwrL2, " : ""); + (msr & 1 << 8) ? "Amps, " : "", (msr & 1 << 10) ? "PkgPwrL1, " : "", (msr & 1 << 11) ? "PkgPwrL2, " : ""); fprintf(outf, " (Logged: %s%s%s%s%s%s)\n", (msr & 1 << 16) ? "PROCHOT, " : "", (msr & 1 << 17) ? "ThermStatus, " : "", (msr & 1 << 22) ? "VR-Therm, " : "", - (msr & 1 << 24) ? "Amps, " : "", - (msr & 1 << 26) ? "PkgPwrL1, " : "", (msr & 1 << 27) ? "PkgPwrL2, " : ""); + (msr & 1 << 24) ? "Amps, " : "", (msr & 1 << 26) ? "PkgPwrL1, " : "", (msr & 1 << 27) ? "PkgPwrL2, " : ""); } return 0; } @@ -7582,7 +7520,7 @@ double get_tdp_intel(void) { unsigned long long msr; - if (platform->rapl_msrs & RAPL_PKG_POWER_INFO) + if (valid_rapl_msrs & RAPL_PKG_POWER_INFO) if (!get_msr(base_cpu, MSR_PKG_POWER_INFO, &msr)) return ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units; return get_quirk_tdp(); @@ -7613,12 +7551,12 @@ void rapl_probe_intel(void) CLR_BIC(BIC_GFX_J, &bic_enabled); } - if (!platform->rapl_msrs || no_msr) + if (!valid_rapl_msrs || no_msr) return; - if (!(platform->rapl_msrs & RAPL_PKG_PERF_STATUS)) + if (!(valid_rapl_msrs & RAPL_PKG_PERF_STATUS)) CLR_BIC(BIC_PKG__, &bic_enabled); - if (!(platform->rapl_msrs & RAPL_DRAM_PERF_STATUS)) + if (!(valid_rapl_msrs & RAPL_DRAM_PERF_STATUS)) CLR_BIC(BIC_RAM__, &bic_enabled); /* units on package 0, verify later other packages match */ @@ -7667,7 +7605,7 @@ void rapl_probe_amd(void) CLR_BIC(BIC_Cor_J, &bic_enabled); } - if (!platform->rapl_msrs || no_msr) + if (!valid_rapl_msrs || no_msr) return; if (get_msr(base_cpu, MSR_RAPL_PWR_UNIT, &msr)) @@ -7690,8 +7628,7 @@ void print_power_limit_msr(int cpu, unsigned long long msr, 
char *label) cpu, label, ((msr >> 15) & 1) ? "EN" : "DIS", ((msr >> 0) & 0x7FFF) * rapl_power_units, - (1.0 + (((msr >> 22) & 0x3) / 4.0)) * (1 << ((msr >> 17) & 0x1F)) * rapl_time_units, - (((msr >> 16) & 1) ? "EN" : "DIS")); + (1.0 + (((msr >> 22) & 0x3) / 4.0)) * (1 << ((msr >> 17) & 0x1F)) * rapl_time_units, (((msr >> 16) & 1) ? "EN" : "DIS")); return; } @@ -7857,7 +7794,7 @@ int print_rapl(PER_THREAD_PARAMS) UNUSED(c); UNUSED(p); - if (!platform->rapl_msrs) + if (!valid_rapl_msrs) return 0; /* RAPL counters are per package, so print only for 1st thread/package */ @@ -7870,7 +7807,7 @@ int print_rapl(PER_THREAD_PARAMS) return -1; } - if (platform->rapl_msrs & RAPL_AMD_F17H) { + if (valid_rapl_msrs & RAPL_AMD_F17H) { msr_name = "MSR_RAPL_PWR_UNIT"; if (get_msr(cpu, MSR_RAPL_PWR_UNIT, &msr)) return -1; @@ -7883,7 +7820,7 @@ int print_rapl(PER_THREAD_PARAMS) fprintf(outf, "cpu%d: %s: 0x%08llx (%f Watts, %f Joules, %f sec.)\n", cpu, msr_name, msr, rapl_power_units, rapl_energy_units, rapl_time_units); - if (platform->rapl_msrs & RAPL_PKG_POWER_INFO) { + if (valid_rapl_msrs & RAPL_PKG_POWER_INFO) { if (get_msr(cpu, MSR_PKG_POWER_INFO, &msr)) return -5; @@ -7892,25 +7829,22 @@ int print_rapl(PER_THREAD_PARAMS) cpu, msr, ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units, ((msr >> 16) & RAPL_POWER_GRANULARITY) * rapl_power_units, - ((msr >> 32) & RAPL_POWER_GRANULARITY) * rapl_power_units, - ((msr >> 48) & RAPL_TIME_GRANULARITY) * rapl_time_units); + ((msr >> 32) & RAPL_POWER_GRANULARITY) * rapl_power_units, ((msr >> 48) & RAPL_TIME_GRANULARITY) * rapl_time_units); } - if (platform->rapl_msrs & RAPL_PKG) { + if (valid_rapl_msrs & RAPL_PKG) { if (get_msr(cpu, MSR_PKG_POWER_LIMIT, &msr)) return -9; - fprintf(outf, "cpu%d: MSR_PKG_POWER_LIMIT: 0x%08llx (%slocked)\n", - cpu, msr, (msr >> 63) & 1 ? "" : "UN"); + fprintf(outf, "cpu%d: MSR_PKG_POWER_LIMIT: 0x%08llx (%slocked)\n", cpu, msr, (msr >> 63) & 1 ? 
"" : "UN"); print_power_limit_msr(cpu, msr, "PKG Limit #1"); fprintf(outf, "cpu%d: PKG Limit #2: %sabled (%0.3f Watts, %f* sec, clamp %sabled)\n", cpu, ((msr >> 47) & 1) ? "EN" : "DIS", ((msr >> 32) & 0x7FFF) * rapl_power_units, - (1.0 + (((msr >> 54) & 0x3) / 4.0)) * (1 << ((msr >> 49) & 0x1F)) * rapl_time_units, - ((msr >> 48) & 1) ? "EN" : "DIS"); + (1.0 + (((msr >> 54) & 0x3) / 4.0)) * (1 << ((msr >> 49) & 0x1F)) * rapl_time_units, ((msr >> 48) & 1) ? "EN" : "DIS"); if (get_msr(cpu, MSR_VR_CURRENT_CONFIG, &msr)) return -9; @@ -7920,7 +7854,7 @@ int print_rapl(PER_THREAD_PARAMS) cpu, ((msr >> 0) & 0x1FFF) * rapl_power_units, (msr >> 31) & 1 ? "" : "UN"); } - if (platform->rapl_msrs & RAPL_DRAM_POWER_INFO) { + if (valid_rapl_msrs & RAPL_DRAM_POWER_INFO) { if (get_msr(cpu, MSR_DRAM_POWER_INFO, &msr)) return -6; @@ -7928,31 +7862,28 @@ int print_rapl(PER_THREAD_PARAMS) cpu, msr, ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units, ((msr >> 16) & RAPL_POWER_GRANULARITY) * rapl_power_units, - ((msr >> 32) & RAPL_POWER_GRANULARITY) * rapl_power_units, - ((msr >> 48) & RAPL_TIME_GRANULARITY) * rapl_time_units); + ((msr >> 32) & RAPL_POWER_GRANULARITY) * rapl_power_units, ((msr >> 48) & RAPL_TIME_GRANULARITY) * rapl_time_units); } - if (platform->rapl_msrs & RAPL_DRAM) { + if (valid_rapl_msrs & RAPL_DRAM) { if (get_msr(cpu, MSR_DRAM_POWER_LIMIT, &msr)) return -9; - fprintf(outf, "cpu%d: MSR_DRAM_POWER_LIMIT: 0x%08llx (%slocked)\n", - cpu, msr, (msr >> 31) & 1 ? "" : "UN"); + fprintf(outf, "cpu%d: MSR_DRAM_POWER_LIMIT: 0x%08llx (%slocked)\n", cpu, msr, (msr >> 31) & 1 ? 
"" : "UN"); print_power_limit_msr(cpu, msr, "DRAM Limit"); } - if (platform->rapl_msrs & RAPL_CORE_POLICY) { + if (valid_rapl_msrs & RAPL_CORE_POLICY) { if (get_msr(cpu, MSR_PP0_POLICY, &msr)) return -7; fprintf(outf, "cpu%d: MSR_PP0_POLICY: %lld\n", cpu, msr & 0xF); } - if (platform->rapl_msrs & RAPL_CORE_POWER_LIMIT) { + if (valid_rapl_msrs & RAPL_CORE_POWER_LIMIT) { if (get_msr(cpu, MSR_PP0_POWER_LIMIT, &msr)) return -9; - fprintf(outf, "cpu%d: MSR_PP0_POWER_LIMIT: 0x%08llx (%slocked)\n", - cpu, msr, (msr >> 31) & 1 ? "" : "UN"); + fprintf(outf, "cpu%d: MSR_PP0_POWER_LIMIT: 0x%08llx (%slocked)\n", cpu, msr, (msr >> 31) & 1 ? "" : "UN"); print_power_limit_msr(cpu, msr, "Cores Limit"); } - if (platform->rapl_msrs & RAPL_GFX) { + if (valid_rapl_msrs & RAPL_GFX) { if (get_msr(cpu, MSR_PP1_POLICY, &msr)) return -8; @@ -7960,20 +7891,58 @@ int print_rapl(PER_THREAD_PARAMS) if (get_msr(cpu, MSR_PP1_POWER_LIMIT, &msr)) return -9; - fprintf(outf, "cpu%d: MSR_PP1_POWER_LIMIT: 0x%08llx (%slocked)\n", - cpu, msr, (msr >> 31) & 1 ? "" : "UN"); + fprintf(outf, "cpu%d: MSR_PP1_POWER_LIMIT: 0x%08llx (%slocked)\n", cpu, msr, (msr >> 31) & 1 ? 
"" : "UN"); print_power_limit_msr(cpu, msr, "GFX Limit"); } return 0; } /* + * probe_rapl_msrs + * + * initialize global valid_rapl_msrs to platform->plat_rapl_msrs + * only if PKG_ENERGY counter is enumerated and reads non-zero + */ +void probe_rapl_msrs(void) +{ + int ret; + off_t offset; + unsigned long long msr_value; + + if (no_msr) + return; + + if ((platform->plat_rapl_msrs & (RAPL_PKG | RAPL_AMD_F17H)) == 0) + return; + + offset = idx_to_offset(IDX_PKG_ENERGY); + if (offset < 0) + return; + + ret = get_msr(base_cpu, offset, &msr_value); + if (ret) { + if (debug) + fprintf(outf, "Can not read RAPL_PKG_ENERGY MSR(0x%llx)\n", (unsigned long long)offset); + return; + } + if (msr_value == 0) { + if (debug) + fprintf(outf, "RAPL_PKG_ENERGY MSR(0x%llx) == ZERO: disabling all RAPL MSRs\n", (unsigned long long)offset); + return; + } + + valid_rapl_msrs = platform->plat_rapl_msrs; /* success */ +} + +/* * probe_rapl() * * sets rapl_power_units, rapl_energy_units, rapl_time_units */ void probe_rapl(void) { + probe_rapl_msrs(); + if (genuine_intel) rapl_probe_intel(); if (authentic_amd || hygon_genuine) @@ -7984,7 +7953,7 @@ void probe_rapl(void) print_rapl_sysfs(); - if (!platform->rapl_msrs || no_msr) + if (!valid_rapl_msrs || no_msr) return; for_all_cpus(print_rapl, ODD_COUNTERS); @@ -8088,7 +8057,7 @@ int print_thermal(PER_THREAD_PARAMS) cpu = t->cpu_id; /* DTS is per-core, no need to print for each thread */ - if (!is_cpu_first_thread_in_core(t, c, p)) + if (!is_cpu_first_thread_in_core(t, c)) return 0; if (cpu_migrate(cpu)) { @@ -8096,7 +8065,7 @@ int print_thermal(PER_THREAD_PARAMS) return -1; } - if (do_ptm && is_cpu_first_core_in_package(t, c, p)) { + if (do_ptm && is_cpu_first_core_in_package(t, p)) { if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr)) return 0; @@ -8108,8 +8077,7 @@ int print_thermal(PER_THREAD_PARAMS) dts = (msr >> 16) & 0x7F; dts2 = (msr >> 8) & 0x7F; - fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n", - 
cpu, msr, tj_max - dts, tj_max - dts2); + fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n", cpu, msr, tj_max - dts, tj_max - dts2); } if (do_dts && debug) { @@ -8120,16 +8088,14 @@ int print_thermal(PER_THREAD_PARAMS) dts = (msr >> 16) & 0x7F; resolution = (msr >> 27) & 0xF; - fprintf(outf, "cpu%d: MSR_IA32_THERM_STATUS: 0x%08llx (%d C +/- %d)\n", - cpu, msr, tj_max - dts, resolution); + fprintf(outf, "cpu%d: MSR_IA32_THERM_STATUS: 0x%08llx (%d C +/- %d)\n", cpu, msr, tj_max - dts, resolution); if (get_msr(cpu, MSR_IA32_THERM_INTERRUPT, &msr)) return 0; dts = (msr >> 16) & 0x7F; dts2 = (msr >> 8) & 0x7F; - fprintf(outf, "cpu%d: MSR_IA32_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n", - cpu, msr, tj_max - dts, tj_max - dts2); + fprintf(outf, "cpu%d: MSR_IA32_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n", cpu, msr, tj_max - dts, tj_max - dts2); } return 0; @@ -8203,8 +8169,7 @@ void decode_misc_enable_msr(void) msr & MSR_IA32_MISC_ENABLE_TM1 ? "" : "No-", msr & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP ? "" : "No-", msr & MSR_IA32_MISC_ENABLE_MWAIT ? "" : "No-", - msr & MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE ? "No-" : "", - msr & MSR_IA32_MISC_ENABLE_TURBO_DISABLE ? "No-" : ""); + msr & MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE ? "No-" : "", msr & MSR_IA32_MISC_ENABLE_TURBO_DISABLE ? "No-" : ""); } void decode_misc_feature_control(void) @@ -8243,8 +8208,7 @@ void decode_misc_pwr_mgmt_msr(void) if (!get_msr(base_cpu, MSR_MISC_PWR_MGMT, &msr)) fprintf(outf, "cpu%d: MSR_MISC_PWR_MGMT: 0x%08llx (%sable-EIST_Coordination %sable-EPB %sable-OOB)\n", - base_cpu, msr, - msr & (1 << 0) ? "DIS" : "EN", msr & (1 << 1) ? "EN" : "DIS", msr & (1 << 8) ? "EN" : "DIS"); + base_cpu, msr, msr & (1 << 0) ? "DIS" : "EN", msr & (1 << 1) ? "EN" : "DIS", msr & (1 << 8) ? 
"EN" : "DIS"); } /* @@ -8297,30 +8261,26 @@ void print_dev_latency(void) close(fd); } -static int has_instr_count_access(void) +static int has_perf_instr_count_access(void) { int fd; - int has_access; if (no_perf) return 0; fd = open_perf_counter(base_cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, -1, 0); - has_access = fd != -1; - if (fd != -1) close(fd); - if (!has_access) + if (fd == -1) warnx("Failed to access %s. Some of the counters may not be available\n" - "\tRun as root to enable them or use %s to disable the access explicitly", - "instructions retired perf counter", "--no-perf"); + "\tRun as root to enable them or use %s to disable the access explicitly", "perf instructions retired counter", + "'--hide IPC' or '--no-perf'"); - return has_access; + return (fd != -1); } -int add_rapl_perf_counter(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai, - double *scale_, enum rapl_unit *unit_) +int add_rapl_perf_counter(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai, double *scale_, enum rapl_unit *unit_) { int ret = -1; @@ -8370,11 +8330,16 @@ void linux_perf_init(void) if (access("/proc/sys/kernel/perf_event_paranoid", F_OK)) return; - if (BIC_IS_ENABLED(BIC_IPC) && has_aperf) { + if (BIC_IS_ENABLED(BIC_IPC) && cpuid_has_aperf_mperf) { fd_instr_count_percpu = calloc(topo.max_cpu_num + 1, sizeof(int)); if (fd_instr_count_percpu == NULL) err(-1, "calloc fd_instr_count_percpu"); } + if (BIC_IS_ENABLED(BIC_LLC_RPS)) { + fd_llc_percpu = calloc(topo.max_cpu_num + 1, sizeof(int)); + if (fd_llc_percpu == NULL) + err(-1, "calloc fd_llc_percpu"); + } } void rapl_perf_init(void) @@ -8485,7 +8450,7 @@ void rapl_perf_init(void) /* Assumes msr_counter_info is populated */ static int has_amperf_access(void) { - return msr_counter_arch_infos[MSR_ARCH_INFO_APERF_INDEX].present && + return cpuid_has_aperf_mperf && msr_counter_arch_infos[MSR_ARCH_INFO_APERF_INDEX].present && 
msr_counter_arch_infos[MSR_ARCH_INFO_MPERF_INDEX].present; } @@ -8708,8 +8673,7 @@ void cstate_perf_init_(bool soft_c1) cci->source[cai->rci_index] = COUNTER_SOURCE_PERF; /* User MSR for this counter */ - } else if (pkg_cstate_limit >= cai->pkg_cstate_limit - && add_msr_counter(cpu, cai->msr) >= 0) { + } else if (pkg_cstate_limit >= cai->pkg_cstate_limit && add_msr_counter(cpu, cai->msr) >= 0) { cci->source[cai->rci_index] = COUNTER_SOURCE_MSR; cci->msr[cai->rci_index] = cai->msr; } @@ -8827,8 +8791,7 @@ void process_cpuid() hygon_genuine = 1; if (!quiet) - fprintf(outf, "CPUID(0): %.4s%.4s%.4s 0x%x CPUID levels\n", - (char *)&ebx, (char *)&edx, (char *)&ecx, max_level); + fprintf(outf, "CPUID(0): %.4s%.4s%.4s 0x%x CPUID levels\n", (char *)&ebx, (char *)&edx, (char *)&ecx, max_level); __cpuid(1, fms, ebx, ecx, edx); family = (fms >> 8) & 0xf; @@ -8857,8 +8820,7 @@ void process_cpuid() __cpuid(0x80000000, max_extended_level, ebx, ecx, edx); if (!quiet) { - fprintf(outf, "CPUID(1): family:model:stepping 0x%x:%x:%x (%d:%d:%d)", - family, model, stepping, family, model, stepping); + fprintf(outf, "CPUID(1): family:model:stepping 0x%x:%x:%x (%d:%d:%d)", family, model, stepping, family, model, stepping); if (ucode_patch_valid) fprintf(outf, " microcode 0x%x", (unsigned int)((ucode_patch >> 32) & 0xFFFFFFFF)); fputc('\n', outf); @@ -8872,8 +8834,7 @@ void process_cpuid() ecx_flags & (1 << 8) ? "TM2" : "-", edx_flags & (1 << 4) ? "TSC" : "-", edx_flags & (1 << 5) ? "MSR" : "-", - edx_flags & (1 << 22) ? "ACPI-TM" : "-", - edx_flags & (1 << 28) ? "HT" : "-", edx_flags & (1 << 29) ? "TM" : "-"); + edx_flags & (1 << 22) ? "ACPI-TM" : "-", edx_flags & (1 << 28) ? "HT" : "-", edx_flags & (1 << 29) ? 
"TM" : "-"); } probe_platform_features(family, model); @@ -8897,7 +8858,7 @@ void process_cpuid() */ __cpuid(0x6, eax, ebx, ecx, edx); - has_aperf = ecx & (1 << 0); + cpuid_has_aperf_mperf = ecx & (1 << 0); do_dts = eax & (1 << 0); if (do_dts) BIC_PRESENT(BIC_CoreTmp); @@ -8915,14 +8876,13 @@ void process_cpuid() if (!quiet) fprintf(outf, "CPUID(6): %sAPERF, %sTURBO, %sDTS, %sPTM, %sHWP, " "%sHWPnotify, %sHWPwindow, %sHWPepp, %sHWPpkg, %sEPB\n", - has_aperf ? "" : "No-", + cpuid_has_aperf_mperf ? "" : "No-", has_turbo ? "" : "No-", do_dts ? "" : "No-", do_ptm ? "" : "No-", has_hwp ? "" : "No-", has_hwp_notify ? "" : "No-", - has_hwp_activity_window ? "" : "No-", - has_hwp_epp ? "" : "No-", has_hwp_pkg ? "" : "No-", has_epb ? "" : "No-"); + has_hwp_activity_window ? "" : "No-", has_hwp_epp ? "" : "No-", has_hwp_pkg ? "" : "No-", has_epb ? "" : "No-"); if (!quiet) decode_misc_enable_msr(); @@ -8956,8 +8916,7 @@ void process_cpuid() if (ebx_tsc != 0) { if (!quiet && (ebx != 0)) - fprintf(outf, "CPUID(0x15): eax_crystal: %d ebx_tsc: %d ecx_crystal_hz: %d\n", - eax_crystal, ebx_tsc, crystal_hz); + fprintf(outf, "CPUID(0x15): eax_crystal: %d ebx_tsc: %d ecx_crystal_hz: %d\n", eax_crystal, ebx_tsc, crystal_hz); if (crystal_hz == 0) crystal_hz = platform->crystal_freq; @@ -8989,11 +8948,10 @@ void process_cpuid() tsc_tweak = base_hz / tsc_hz; if (!quiet) - fprintf(outf, "CPUID(0x16): base_mhz: %d max_mhz: %d bus_mhz: %d\n", - base_mhz, max_mhz, bus_mhz); + fprintf(outf, "CPUID(0x16): base_mhz: %d max_mhz: %d bus_mhz: %d\n", base_mhz, max_mhz, bus_mhz); } - if (has_aperf) + if (cpuid_has_aperf_mperf) aperf_mperf_multiplier = platform->need_perf_multiplier ? 
1024 : 1; BIC_PRESENT(BIC_IRQ); @@ -9045,6 +9003,62 @@ void probe_pm_features(void) decode_misc_feature_control(); } +/* perf_llc_probe + * + * return 1 on success, else 0 + */ +int has_perf_llc_access(void) +{ + int fd; + + if (no_perf) + return 0; + + fd = open_perf_counter(base_cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES, -1, PERF_FORMAT_GROUP); + if (fd != -1) + close(fd); + + if (fd == -1) + warnx("Failed to access %s. Some of the counters may not be available\n" + "\tRun as root to enable them or use %s to disable the access explicitly", "perf LLC counters", "'--hide LLC' or '--no-perf'"); + + return (fd != -1); +} + +void perf_llc_init(void) +{ + int cpu; + int retval; + + if (no_perf) + return; + if (!(BIC_IS_ENABLED(BIC_LLC_RPS) && BIC_IS_ENABLED(BIC_LLC_HIT))) + return; + + for (cpu = 0; cpu <= topo.max_cpu_num; ++cpu) { + + if (cpu_is_not_allowed(cpu)) + continue; + + assert(fd_llc_percpu != 0); + fd_llc_percpu[cpu] = open_perf_counter(cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES, -1, PERF_FORMAT_GROUP); + if (fd_llc_percpu[cpu] == -1) { + warnx("%s: perf REFS: failed to open counter on cpu%d", __func__, cpu); + free_fd_llc_percpu(); + return; + } + assert(fd_llc_percpu != 0); + retval = open_perf_counter(cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES, fd_llc_percpu[cpu], PERF_FORMAT_GROUP); + if (retval == -1) { + warnx("%s: perf MISS: failed to open counter on cpu%d", __func__, cpu); + free_fd_llc_percpu(); + return; + } + } + BIC_PRESENT(BIC_LLC_RPS); + BIC_PRESENT(BIC_LLC_HIT); +} + /* * in /dev/cpu/ return success for names that are numbers * ie. filter out ".", "..", "microcode". 
@@ -9351,6 +9365,7 @@ void init_counter(struct thread_data *thread_base, struct core_data *core_base, t->cpu_id = cpu_id; if (!cpu_is_not_allowed(cpu_id)) { + if (c->base_cpu < 0) c->base_cpu = t->cpu_id; if (pkg_base[pkg_id].base_cpu < 0) @@ -9456,7 +9471,7 @@ bool has_added_counters(void) void check_msr_access(void) { - check_dev_msr(); + check_msr_driver(); check_msr_permission(); if (no_msr) @@ -9465,8 +9480,16 @@ void check_msr_access(void) void check_perf_access(void) { - if (no_perf || !BIC_IS_ENABLED(BIC_IPC) || !has_instr_count_access()) - CLR_BIC(BIC_IPC, &bic_enabled); + if (BIC_IS_ENABLED(BIC_IPC)) + if (!has_perf_instr_count_access()) + no_perf = 1; + + if (BIC_IS_ENABLED(BIC_LLC_RPS) || BIC_IS_ENABLED(BIC_LLC_HIT)) + if (!has_perf_llc_access()) + no_perf = 1; + + if (no_perf) + bic_disable_perf_access(); } bool perf_has_hybrid_devices(void) @@ -9589,8 +9612,7 @@ int added_perf_counters_init_(struct perf_counter_info *pinfo) perf_config = read_perf_config(perf_device, pinfo->event); if (perf_config == (unsigned int)-1) { - warnx("%s: perf/%s/%s: failed to read %s", - __func__, perf_device, pinfo->event, "config"); + warnx("%s: perf/%s/%s: failed to read %s", __func__, perf_device, pinfo->event, "config"); continue; } @@ -9601,8 +9623,7 @@ int added_perf_counters_init_(struct perf_counter_info *pinfo) fd_perf = open_perf_counter(cpu, perf_type, perf_config, -1, 0); if (fd_perf == -1) { - warnx("%s: perf/%s/%s: failed to open counter on cpu%d", - __func__, perf_device, pinfo->event, cpu); + warnx("%s: perf/%s/%s: failed to open counter on cpu%d", __func__, perf_device, pinfo->event, cpu); continue; } @@ -9611,8 +9632,7 @@ int added_perf_counters_init_(struct perf_counter_info *pinfo) pinfo->scale = perf_scale; if (debug) - fprintf(stderr, "Add perf/%s/%s cpu%d: %d\n", - perf_device, pinfo->event, cpu, pinfo->fd_perf_per_domain[next_domain]); + fprintf(stderr, "Add perf/%s/%s cpu%d: %d\n", perf_device, pinfo->event, cpu, 
pinfo->fd_perf_per_domain[next_domain]); } pinfo = pinfo->next; @@ -9926,8 +9946,7 @@ int pmt_add_counter(unsigned int guid, unsigned int seq, const char *name, enum } if (conflict) { - fprintf(stderr, "%s: conflicting parameters for the PMT counter with the same name %s\n", - __func__, name); + fprintf(stderr, "%s: conflicting parameters for the PMT counter with the same name %s\n", __func__, name); exit(1); } @@ -9970,8 +9989,7 @@ void pmt_init(void) * CWF with newer firmware might require a PMT_TYPE_XTAL_TIME intead of PMT_TYPE_TCORE_CLOCK. */ pmt_add_counter(PMT_CWF_MC1E_GUID, seq, "CPU%c1e", PMT_TYPE_TCORE_CLOCK, - PMT_COUNTER_CWF_MC1E_LSB, PMT_COUNTER_CWF_MC1E_MSB, offset, SCOPE_CPU, - FORMAT_DELTA, cpu_num, PMT_OPEN_TRY); + PMT_COUNTER_CWF_MC1E_LSB, PMT_COUNTER_CWF_MC1E_MSB, offset, SCOPE_CPU, FORMAT_DELTA, cpu_num, PMT_OPEN_TRY); /* * Rather complex logic for each time we go to the next loop iteration, @@ -10021,6 +10039,7 @@ void turbostat_init() linux_perf_init(); rapl_perf_init(); cstate_perf_init(); + perf_llc_init(); added_perf_counters_init(); pmt_init(); @@ -10126,7 +10145,7 @@ int get_and_dump_counters(void) void print_version() { - fprintf(outf, "turbostat version 2025.09.09 - Len Brown <lenb@kernel.org>\n"); + fprintf(outf, "turbostat version 2025.12.02 - Len Brown <lenb@kernel.org>\n"); } #define COMMAND_LINE_SIZE 2048 @@ -10166,8 +10185,7 @@ struct msr_counter *find_msrp_by_name(struct msr_counter *head, char *name) } int add_counter(unsigned int msr_num, char *path, char *name, - unsigned int width, enum counter_scope scope, - enum counter_type type, enum counter_format format, int flags, int id) + unsigned int width, enum counter_scope scope, enum counter_type type, enum counter_format format, int flags, int id) { struct msr_counter *msrp; @@ -10276,9 +10294,7 @@ int add_counter(unsigned int msr_num, char *path, char *name, struct perf_counter_info *make_perf_counter_info(const char *perf_device, const char *perf_event, const char *name, - 
unsigned int width, - enum counter_scope scope, - enum counter_type type, enum counter_format format) + unsigned int width, enum counter_scope scope, enum counter_type type, enum counter_format format) { struct perf_counter_info *pinfo; @@ -10353,8 +10369,7 @@ int add_perf_counter(const char *perf_device, const char *perf_event, const char // FIXME: we might not have debug here yet if (debug) - fprintf(stderr, "%s: %s/%s, name: %s, scope%d\n", - __func__, pinfo->device, pinfo->event, pinfo->name, pinfo->scope); + fprintf(stderr, "%s: %s/%s, name: %s, scope%d\n", __func__, pinfo->device, pinfo->event, pinfo->name, pinfo->scope); return 0; } @@ -10523,8 +10538,7 @@ int pmt_parse_from_path(const char *target_path, unsigned int *out_guid, unsigne pmt_diriter_init(&pmt_iter); - for (dirname = pmt_diriter_begin(&pmt_iter, SYSFS_TELEM_PATH); dirname != NULL; - dirname = pmt_diriter_next(&pmt_iter)) { + for (dirname = pmt_diriter_begin(&pmt_iter, SYSFS_TELEM_PATH); dirname != NULL; dirname = pmt_diriter_next(&pmt_iter)) { fd_telem_dir = openat(dirfd(pmt_iter.dir), dirname->d_name, O_RDONLY | O_DIRECTORY); if (fd_telem_dir == -1) @@ -10536,8 +10550,7 @@ int pmt_parse_from_path(const char *target_path, unsigned int *out_guid, unsigne } if (fstat(fd_telem_dir, &stat) == -1) { - fprintf(stderr, "%s: Failed to stat %s directory: %s", __func__, - dirname->d_name, strerror(errno)); + fprintf(stderr, "%s: Failed to stat %s directory: %s", __func__, dirname->d_name, strerror(errno)); continue; } @@ -10633,8 +10646,7 @@ void parse_add_command_pmt(char *add_command) } if (!has_scope) { - printf("%s: invalid value for scope. Expected cpu%%u, core%%u or package%%u.\n", - __func__); + printf("%s: invalid value for scope. Expected cpu%%u, core%%u or package%%u.\n", __func__); exit(1); } @@ -10710,8 +10722,7 @@ next: } if (!has_format) { - fprintf(stderr, "%s: Invalid format %s. Expected raw, average or delta\n", - __func__, format_name); + fprintf(stderr, "%s: Invalid format %s. 
Expected raw, average or delta\n", __func__, format_name); exit(1); } } @@ -10878,7 +10889,7 @@ void probe_cpuidle_residency(void) if (is_deferred_skip(name_buf)) continue; - add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_USEC, FORMAT_PERCENT, SYSFS_PERCPU, 0); + add_counter(0, path, name_buf, 32, SCOPE_CPU, COUNTER_USEC, FORMAT_PERCENT, SYSFS_PERCPU, 0); if (state > max_state) max_state = state; diff --git a/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c b/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c index 884a4c746f32..ac37132207a4 100644 --- a/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c +++ b/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c @@ -95,6 +95,8 @@ unsigned int bdx_highest_ratio; #define PATH_TO_CPU "/sys/devices/system/cpu/" #define SYSFS_PATH_MAX 255 +static int use_android_msr_path; + /* * maintain compatibility with original implementation, but don't document it: */ @@ -370,7 +372,7 @@ void validate_cpu_selected_set(void) for (cpu = 0; cpu <= max_cpu_num; ++cpu) { if (CPU_ISSET_S(cpu, cpu_setsize, cpu_selected_set)) if (!CPU_ISSET_S(cpu, cpu_setsize, cpu_present_set)) - errx(1, "Requested cpu% is not present", cpu); + errx(1, "Requested cpu%d is not present", cpu); } } @@ -518,7 +520,7 @@ void for_packages(unsigned long long pkg_set, int (func)(int)) void print_version(void) { - printf("x86_energy_perf_policy 2025.9.19 Len Brown <lenb@kernel.org>\n"); + printf("x86_energy_perf_policy 2025.11.22 Len Brown <lenb@kernel.org>\n"); } void cmdline(int argc, char **argv) @@ -660,6 +662,11 @@ void err_on_hypervisor(void) } flags = strstr(buffer, "flags"); + if (!flags) { + fclose(cpuinfo); + free(buffer); + err(1, "Failed to find 'flags' in /proc/cpuinfo"); + } rewind(cpuinfo); fseek(cpuinfo, flags - buffer, SEEK_SET); if (!fgets(buffer, 4096, cpuinfo)) { @@ -684,10 +691,12 @@ int get_msr(int cpu, int offset, unsigned long long *msr) char pathname[32]; int fd; - sprintf(pathname, 
"/dev/cpu/%d/msr", cpu); + sprintf(pathname, use_android_msr_path ? "/dev/msr%d" : "/dev/cpu/%d/msr", cpu); fd = open(pathname, O_RDONLY); if (fd < 0) - err(-1, "%s open failed, try chown or chmod +r /dev/cpu/*/msr, or run as root", pathname); + err(-1, "%s open failed, try chown or chmod +r %s, or run as root", + pathname, use_android_msr_path ? "/dev/msr*" : "/dev/cpu/*/msr"); + retval = pread(fd, msr, sizeof(*msr), offset); if (retval != sizeof(*msr)) { @@ -708,10 +717,11 @@ int put_msr(int cpu, int offset, unsigned long long new_msr) int retval; int fd; - sprintf(pathname, "/dev/cpu/%d/msr", cpu); + sprintf(pathname, use_android_msr_path ? "/dev/msr%d" : "/dev/cpu/%d/msr", cpu); fd = open(pathname, O_RDWR); if (fd < 0) - err(-1, "%s open failed, try chown or chmod +r /dev/cpu/*/msr, or run as root", pathname); + err(-1, "%s open failed, try chown or chmod +r %s, or run as root", + pathname, use_android_msr_path ? "/dev/msr*" : "/dev/cpu/*/msr"); retval = pwrite(fd, &new_msr, sizeof(new_msr), offset); if (retval != sizeof(new_msr)) @@ -1421,16 +1431,32 @@ void set_base_cpu(void) err(-ENODEV, "No valid cpus found"); } +static void probe_android_msr_path(void) +{ + struct stat sb; + char test_path[32]; + + sprintf(test_path, "/dev/msr%d", base_cpu); + if (stat(test_path, &sb) == 0) + use_android_msr_path = 1; +} void probe_dev_msr(void) { struct stat sb; char pathname[32]; - sprintf(pathname, "/dev/cpu/%d/msr", base_cpu); - if (stat(pathname, &sb)) - if (system("/sbin/modprobe msr > /dev/null 2>&1")) - err(-5, "no /dev/cpu/0/msr, Try \"# modprobe msr\" "); + probe_android_msr_path(); + + sprintf(pathname, use_android_msr_path ? 
"/dev/msr%d" : "/dev/cpu/%d/msr", base_cpu); + if (stat(pathname, &sb)) { + if (system("/sbin/modprobe msr > /dev/null 2>&1")) { + if (use_android_msr_path) + err(-5, "no /dev/msr0, Try \"# modprobe msr\" "); + else + err(-5, "no /dev/cpu/0/msr, Try \"# modprobe msr\" "); + } + } } static void get_cpuid_or_exit(unsigned int leaf, @@ -1547,6 +1573,7 @@ void parse_cpuid(void) int main(int argc, char **argv) { set_base_cpu(); + probe_dev_msr(); init_data_structures(); diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index d68780e2e03d..e4bda2474060 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -133,6 +133,7 @@ $(MAKE_DIRS): $(call msg,MKDIR,,$@) $(Q)mkdir -p $@ +ifneq ($(CROSS_COMPILE),) $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ $(APIDIR)/linux/bpf.h \ | $(OBJ_DIR)/libbpf @@ -141,6 +142,7 @@ $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ EXTRA_CFLAGS='-g -O0 -fPIC' \ LDFLAGS="$(LDFLAGS)" \ DESTDIR=$(OUTPUT_DIR) prefix= all install_headers +endif $(HOST_BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ $(APIDIR)/linux/bpf.h \ @@ -187,7 +189,7 @@ $(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BP SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR) -c-sched-targets = scx_simple scx_qmap scx_central scx_flatcg +c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg $(addprefix $(BINDIR)/,$(c-sched-targets)): \ $(BINDIR)/%: \ diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 06e2551033cb..821d5791bd42 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -60,21 +60,15 @@ static inline void ___vmlinux_h_sanity_check___(void) s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym; s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym; -s32 scx_bpf_select_cpu_and(struct 
task_struct *p, s32 prev_cpu, u64 wake_flags, - const struct cpumask *cpus_allowed, u64 flags) __ksym __weak; -void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak; -void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak; +s32 __scx_bpf_select_cpu_and(struct task_struct *p, const struct cpumask *cpus_allowed, + struct scx_bpf_select_cpu_and_args *args) __ksym __weak; +bool __scx_bpf_dsq_insert_vtime(struct task_struct *p, struct scx_bpf_dsq_insert_vtime_args *args) __ksym __weak; u32 scx_bpf_dispatch_nr_slots(void) __ksym; void scx_bpf_dispatch_cancel(void) __ksym; -bool scx_bpf_dsq_move_to_local(u64 dsq_id) __ksym __weak; -void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak; -void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak; -bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; -bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; -u32 scx_bpf_reenqueue_local(void) __ksym; void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; +struct task_struct *scx_bpf_dsq_peek(u64 dsq_id) __ksym __weak; int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, u64 flags) __ksym __weak; struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak; void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak; @@ -105,7 +99,6 @@ s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; struct rq *scx_bpf_locked_rq(void) __ksym; struct task_struct *scx_bpf_cpu_curr(s32 cpu) __ksym __weak; -struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak; u64 scx_bpf_now(void) 
__ksym __weak; void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak; diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h index dd9144624dc9..f2969c3061a7 100644 --- a/tools/sched_ext/include/scx/compat.bpf.h +++ b/tools/sched_ext/include/scx/compat.bpf.h @@ -16,119 +16,92 @@ }) /* v6.12: 819513666966 ("sched_ext: Add cgroup support") */ -#define __COMPAT_scx_bpf_task_cgroup(p) \ - (bpf_ksym_exists(scx_bpf_task_cgroup) ? \ - scx_bpf_task_cgroup((p)) : NULL) +struct cgroup *scx_bpf_task_cgroup___new(struct task_struct *p) __ksym __weak; + +#define scx_bpf_task_cgroup(p) \ + (bpf_ksym_exists(scx_bpf_task_cgroup___new) ? \ + scx_bpf_task_cgroup___new((p)) : NULL) /* * v6.13: The verb `dispatch` was too overloaded and confusing. kfuncs are * renamed to unload the verb. * - * Build error is triggered if old names are used. New binaries work with both - * new and old names. The compat macros will be removed on v6.15 release. - * * scx_bpf_dispatch_from_dsq() and friends were added during v6.12 by * 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()"). - * Preserve __COMPAT macros until v6.15. 
*/ -void scx_bpf_dispatch___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak; -void scx_bpf_dispatch_vtime___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak; -bool scx_bpf_consume___compat(u64 dsq_id) __ksym __weak; -void scx_bpf_dispatch_from_dsq_set_slice___compat(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak; -void scx_bpf_dispatch_from_dsq_set_vtime___compat(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak; -bool scx_bpf_dispatch_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; -bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; -int bpf_cpumask_populate(struct cpumask *dst, void *src, size_t src__sz) __ksym __weak; - -#define scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags) \ - (bpf_ksym_exists(scx_bpf_dsq_insert) ? \ - scx_bpf_dsq_insert((p), (dsq_id), (slice), (enq_flags)) : \ - scx_bpf_dispatch___compat((p), (dsq_id), (slice), (enq_flags))) - -#define scx_bpf_dsq_insert_vtime(p, dsq_id, slice, vtime, enq_flags) \ - (bpf_ksym_exists(scx_bpf_dsq_insert_vtime) ? 
\ - scx_bpf_dsq_insert_vtime((p), (dsq_id), (slice), (vtime), (enq_flags)) : \ - scx_bpf_dispatch_vtime___compat((p), (dsq_id), (slice), (vtime), (enq_flags))) +bool scx_bpf_dsq_move_to_local___new(u64 dsq_id) __ksym __weak; +void scx_bpf_dsq_move_set_slice___new(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak; +void scx_bpf_dsq_move_set_vtime___new(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak; +bool scx_bpf_dsq_move___new(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; +bool scx_bpf_dsq_move_vtime___new(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; + +bool scx_bpf_consume___old(u64 dsq_id) __ksym __weak; +void scx_bpf_dispatch_from_dsq_set_slice___old(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak; +void scx_bpf_dispatch_from_dsq_set_vtime___old(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak; +bool scx_bpf_dispatch_from_dsq___old(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; +bool scx_bpf_dispatch_vtime_from_dsq___old(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; #define scx_bpf_dsq_move_to_local(dsq_id) \ - (bpf_ksym_exists(scx_bpf_dsq_move_to_local) ? \ - scx_bpf_dsq_move_to_local((dsq_id)) : \ - scx_bpf_consume___compat((dsq_id))) - -#define __COMPAT_scx_bpf_dsq_move_set_slice(it__iter, slice) \ - (bpf_ksym_exists(scx_bpf_dsq_move_set_slice) ? \ - scx_bpf_dsq_move_set_slice((it__iter), (slice)) : \ - (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_slice___compat) ? \ - scx_bpf_dispatch_from_dsq_set_slice___compat((it__iter), (slice)) : \ + (bpf_ksym_exists(scx_bpf_dsq_move_to_local___new) ? \ + scx_bpf_dsq_move_to_local___new((dsq_id)) : \ + scx_bpf_consume___old((dsq_id))) + +#define scx_bpf_dsq_move_set_slice(it__iter, slice) \ + (bpf_ksym_exists(scx_bpf_dsq_move_set_slice___new) ? 
\ + scx_bpf_dsq_move_set_slice___new((it__iter), (slice)) : \ + (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_slice___old) ? \ + scx_bpf_dispatch_from_dsq_set_slice___old((it__iter), (slice)) : \ + (void)0)) + +#define scx_bpf_dsq_move_set_vtime(it__iter, vtime) \ + (bpf_ksym_exists(scx_bpf_dsq_move_set_vtime___new) ? \ + scx_bpf_dsq_move_set_vtime___new((it__iter), (vtime)) : \ + (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_vtime___old) ? \ + scx_bpf_dispatch_from_dsq_set_vtime___old((it__iter), (vtime)) : \ (void)0)) -#define __COMPAT_scx_bpf_dsq_move_set_vtime(it__iter, vtime) \ - (bpf_ksym_exists(scx_bpf_dsq_move_set_vtime) ? \ - scx_bpf_dsq_move_set_vtime((it__iter), (vtime)) : \ - (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_vtime___compat) ? \ - scx_bpf_dispatch_from_dsq_set_vtime___compat((it__iter), (vtime)) : \ - (void) 0)) - -#define __COMPAT_scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags) \ - (bpf_ksym_exists(scx_bpf_dsq_move) ? \ - scx_bpf_dsq_move((it__iter), (p), (dsq_id), (enq_flags)) : \ - (bpf_ksym_exists(scx_bpf_dispatch_from_dsq___compat) ? \ - scx_bpf_dispatch_from_dsq___compat((it__iter), (p), (dsq_id), (enq_flags)) : \ +#define scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags) \ + (bpf_ksym_exists(scx_bpf_dsq_move___new) ? \ + scx_bpf_dsq_move___new((it__iter), (p), (dsq_id), (enq_flags)) : \ + (bpf_ksym_exists(scx_bpf_dispatch_from_dsq___old) ? \ + scx_bpf_dispatch_from_dsq___old((it__iter), (p), (dsq_id), (enq_flags)) : \ false)) -#define __COMPAT_scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags) \ - (bpf_ksym_exists(scx_bpf_dsq_move_vtime) ? \ - scx_bpf_dsq_move_vtime((it__iter), (p), (dsq_id), (enq_flags)) : \ - (bpf_ksym_exists(scx_bpf_dispatch_vtime_from_dsq___compat) ? \ - scx_bpf_dispatch_vtime_from_dsq___compat((it__iter), (p), (dsq_id), (enq_flags)) : \ +#define scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags) \ + (bpf_ksym_exists(scx_bpf_dsq_move_vtime___new) ? 
\ + scx_bpf_dsq_move_vtime___new((it__iter), (p), (dsq_id), (enq_flags)) : \ + (bpf_ksym_exists(scx_bpf_dispatch_vtime_from_dsq___old) ? \ + scx_bpf_dispatch_vtime_from_dsq___old((it__iter), (p), (dsq_id), (enq_flags)) : \ false)) +/* + * v6.15: 950ad93df2fc ("bpf: add kfunc for populating cpumask bits") + * + * Compat macro will be dropped on v6.19 release. + */ +int bpf_cpumask_populate(struct cpumask *dst, void *src, size_t src__sz) __ksym __weak; + #define __COMPAT_bpf_cpumask_populate(cpumask, src, size__sz) \ (bpf_ksym_exists(bpf_cpumask_populate) ? \ (bpf_cpumask_populate(cpumask, src, size__sz)) : -EOPNOTSUPP) -#define scx_bpf_dispatch(p, dsq_id, slice, enq_flags) \ - _Static_assert(false, "scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()") - -#define scx_bpf_dispatch_vtime(p, dsq_id, slice, vtime, enq_flags) \ - _Static_assert(false, "scx_bpf_dispatch_vtime() renamed to scx_bpf_dsq_insert_vtime()") - -#define scx_bpf_consume(dsq_id) ({ \ - _Static_assert(false, "scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()"); \ - false; \ -}) - -#define scx_bpf_dispatch_from_dsq_set_slice(it__iter, slice) \ - _Static_assert(false, "scx_bpf_dispatch_from_dsq_set_slice() renamed to scx_bpf_dsq_move_set_slice()") - -#define scx_bpf_dispatch_from_dsq_set_vtime(it__iter, vtime) \ - _Static_assert(false, "scx_bpf_dispatch_from_dsq_set_vtime() renamed to scx_bpf_dsq_move_set_vtime()") - -#define scx_bpf_dispatch_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \ - _Static_assert(false, "scx_bpf_dispatch_from_dsq() renamed to scx_bpf_dsq_move()"); \ - false; \ -}) - -#define scx_bpf_dispatch_vtime_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \ - _Static_assert(false, "scx_bpf_dispatch_vtime_from_dsq() renamed to scx_bpf_dsq_move_vtime()"); \ - false; \ -}) - -#define __COMPAT_scx_bpf_dispatch_from_dsq_set_slice(it__iter, slice) \ - _Static_assert(false, "__COMPAT_scx_bpf_dispatch_from_dsq_set_slice() renamed to __COMPAT_scx_bpf_dsq_move_set_slice()") - -#define 
__COMPAT_scx_bpf_dispatch_from_dsq_set_vtime(it__iter, vtime) \ - _Static_assert(false, "__COMPAT_scx_bpf_dispatch_from_dsq_set_vtime() renamed to __COMPAT_scx_bpf_dsq_move_set_vtime()") - -#define __COMPAT_scx_bpf_dispatch_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \ - _Static_assert(false, "__COMPAT_scx_bpf_dispatch_from_dsq() renamed to __COMPAT_scx_bpf_dsq_move()"); \ - false; \ -}) - -#define __COMPAT_scx_bpf_dispatch_vtime_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \ - _Static_assert(false, "__COMPAT_scx_bpf_dispatch_vtime_from_dsq() renamed to __COMPAT_scx_bpf_dsq_move_vtime()"); \ - false; \ -}) +/* + * v6.19: Introduce lockless peek API for user DSQs. + * + * Preserve the following macro until v6.21. + */ +static inline struct task_struct *__COMPAT_scx_bpf_dsq_peek(u64 dsq_id) +{ + struct task_struct *p = NULL; + struct bpf_iter_scx_dsq it; + + if (bpf_ksym_exists(scx_bpf_dsq_peek)) + return scx_bpf_dsq_peek(dsq_id); + if (!bpf_iter_scx_dsq_new(&it, dsq_id, 0)) + p = bpf_iter_scx_dsq_next(&it); + bpf_iter_scx_dsq_destroy(&it); + return p; +} /** * __COMPAT_is_enq_cpu_selected - Test if SCX_ENQ_CPU_SELECTED is on @@ -248,6 +221,161 @@ static inline struct task_struct *__COMPAT_scx_bpf_cpu_curr(int cpu) } /* + * v6.19: To work around BPF maximum parameter limit, the following kfuncs are + * replaced with variants that pack scalar arguments in a struct. Wrappers are + * provided to maintain source compatibility. + * + * v6.13: scx_bpf_dsq_insert_vtime() renaming is also handled here. See the + * block on dispatch renaming above for more details. + * + * The kernel will carry the compat variants until v6.23 to maintain binary + * compatibility. After v6.23 release, remove the compat handling and move the + * wrappers to common.bpf.h. 
+ */ +s32 scx_bpf_select_cpu_and___compat(struct task_struct *p, s32 prev_cpu, u64 wake_flags, + const struct cpumask *cpus_allowed, u64 flags) __ksym __weak; +void scx_bpf_dispatch_vtime___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak; +void scx_bpf_dsq_insert_vtime___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak; + +/** + * scx_bpf_select_cpu_and - Pick an idle CPU usable by task @p + * @p: task_struct to select a CPU for + * @prev_cpu: CPU @p was on previously + * @wake_flags: %SCX_WAKE_* flags + * @cpus_allowed: cpumask of allowed CPUs + * @flags: %SCX_PICK_IDLE* flags + * + * Inline wrapper that packs scalar arguments into a struct and calls + * __scx_bpf_select_cpu_and(). See __scx_bpf_select_cpu_and() for details. + */ +static inline s32 +scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags, + const struct cpumask *cpus_allowed, u64 flags) +{ + if (bpf_core_type_exists(struct scx_bpf_select_cpu_and_args)) { + struct scx_bpf_select_cpu_and_args args = { + .prev_cpu = prev_cpu, + .wake_flags = wake_flags, + .flags = flags, + }; + + return __scx_bpf_select_cpu_and(p, cpus_allowed, &args); + } else { + return scx_bpf_select_cpu_and___compat(p, prev_cpu, wake_flags, + cpus_allowed, flags); + } +} + +/** + * scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ + * @p: task_struct to insert + * @dsq_id: DSQ to insert into + * @slice: duration @p can run for in nsecs, 0 to keep the current value + * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ + * @enq_flags: SCX_ENQ_* + * + * Inline wrapper that packs scalar arguments into a struct and calls + * __scx_bpf_dsq_insert_vtime(). See __scx_bpf_dsq_insert_vtime() for details. 
+ */ +static inline bool +scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, + u64 enq_flags) +{ + if (bpf_core_type_exists(struct scx_bpf_dsq_insert_vtime_args)) { + struct scx_bpf_dsq_insert_vtime_args args = { + .dsq_id = dsq_id, + .slice = slice, + .vtime = vtime, + .enq_flags = enq_flags, + }; + + return __scx_bpf_dsq_insert_vtime(p, &args); + } else if (bpf_ksym_exists(scx_bpf_dsq_insert_vtime___compat)) { + scx_bpf_dsq_insert_vtime___compat(p, dsq_id, slice, vtime, + enq_flags); + return true; + } else { + scx_bpf_dispatch_vtime___compat(p, dsq_id, slice, vtime, + enq_flags); + return true; + } +} + +/* + * v6.19: scx_bpf_dsq_insert() now returns bool instead of void. Move + * scx_bpf_dsq_insert() decl to common.bpf.h and drop compat helper after v6.22. + * The extra ___compat suffix is to work around libbpf not ignoring __SUFFIX on + * kernel side. The entire suffix can be dropped later. + * + * v6.13: scx_bpf_dsq_insert() renaming is also handled here. See the block on + * dispatch renaming above for more details. + */ +bool scx_bpf_dsq_insert___v2___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak; +void scx_bpf_dsq_insert___v1(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak; +void scx_bpf_dispatch___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak; + +static inline bool +scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) +{ + if (bpf_ksym_exists(scx_bpf_dsq_insert___v2___compat)) { + return scx_bpf_dsq_insert___v2___compat(p, dsq_id, slice, enq_flags); + } else if (bpf_ksym_exists(scx_bpf_dsq_insert___v1)) { + scx_bpf_dsq_insert___v1(p, dsq_id, slice, enq_flags); + return true; + } else { + scx_bpf_dispatch___compat(p, dsq_id, slice, enq_flags); + return true; + } +} + +/* + * v6.19: scx_bpf_task_set_slice() and scx_bpf_task_set_dsq_vtime() added to for + * sub-sched authority checks. 
Drop the wrappers and move the decls to + * common.bpf.h after v6.22. + */ +bool scx_bpf_task_set_slice___new(struct task_struct *p, u64 slice) __ksym __weak; +bool scx_bpf_task_set_dsq_vtime___new(struct task_struct *p, u64 vtime) __ksym __weak; + +static inline void scx_bpf_task_set_slice(struct task_struct *p, u64 slice) +{ + if (bpf_ksym_exists(scx_bpf_task_set_slice___new)) + scx_bpf_task_set_slice___new(p, slice); + else + p->scx.slice = slice; +} + +static inline void scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime) +{ + if (bpf_ksym_exists(scx_bpf_task_set_dsq_vtime___new)) + scx_bpf_task_set_dsq_vtime___new(p, vtime); + else + p->scx.dsq_vtime = vtime; +} + +/* + * v6.19: The new void variant can be called from anywhere while the older v1 + * variant can only be called from ops.cpu_release(). The double ___ prefixes on + * the v2 variant need to be removed once libbpf is updated to ignore ___ prefix + * on kernel side. Drop the wrapper and move the decl to common.bpf.h after + * v6.22. + */ +u32 scx_bpf_reenqueue_local___v1(void) __ksym __weak; +void scx_bpf_reenqueue_local___v2___compat(void) __ksym __weak; + +static inline bool __COMPAT_scx_bpf_reenqueue_local_from_anywhere(void) +{ + return bpf_ksym_exists(scx_bpf_reenqueue_local___v2___compat); +} + +static inline void scx_bpf_reenqueue_local(void) +{ + if (__COMPAT_scx_bpf_reenqueue_local_from_anywhere()) + scx_bpf_reenqueue_local___v2___compat(); + else + scx_bpf_reenqueue_local___v1(); +} + +/* * Define sched_ext_ops. This may be expanded to define multiple variants for * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH(). 
*/ diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h index 35c67c5174ac..8b4897fc8b99 100644 --- a/tools/sched_ext/include/scx/compat.h +++ b/tools/sched_ext/include/scx/compat.h @@ -151,6 +151,10 @@ static inline long scx_hotplug_seq(void) * * ec7e3b0463e1 ("implement-ops") in https://github.com/sched-ext/sched_ext is * the current minimum required kernel version. + * + * COMPAT: + * - v6.17: ops.cgroup_set_bandwidth() + * - v6.19: ops.cgroup_set_idle() */ #define SCX_OPS_OPEN(__ops_name, __scx_name) ({ \ struct __scx_name *__skel; \ @@ -162,6 +166,16 @@ static inline long scx_hotplug_seq(void) SCX_BUG_ON(!__skel, "Could not open " #__scx_name); \ __skel->struct_ops.__ops_name->hotplug_seq = scx_hotplug_seq(); \ SCX_ENUM_INIT(__skel); \ + if (__skel->struct_ops.__ops_name->cgroup_set_bandwidth && \ + !__COMPAT_struct_has_field("sched_ext_ops", "cgroup_set_bandwidth")) { \ + fprintf(stderr, "WARNING: kernel doesn't support ops.cgroup_set_bandwidth()\n"); \ + __skel->struct_ops.__ops_name->cgroup_set_bandwidth = NULL; \ + } \ + if (__skel->struct_ops.__ops_name->cgroup_set_idle && \ + !__COMPAT_struct_has_field("sched_ext_ops", "cgroup_set_idle")) { \ + fprintf(stderr, "WARNING: kernel doesn't support ops.cgroup_set_idle()\n"); \ + __skel->struct_ops.__ops_name->cgroup_set_idle = NULL; \ + } \ __skel; \ }) diff --git a/tools/sched_ext/scx_cpu0.bpf.c b/tools/sched_ext/scx_cpu0.bpf.c new file mode 100644 index 000000000000..6326ce598c8e --- /dev/null +++ b/tools/sched_ext/scx_cpu0.bpf.c @@ -0,0 +1,88 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A CPU0 scheduler. + * + * This scheduler queues all tasks to a shared DSQ and only dispatches them on + * CPU0 in FIFO order. This is useful for testing bypass behavior when many + * tasks are concentrated on a single CPU. If the load balancer doesn't work, + * bypass mode can trigger task hangs or RCU stalls as the queue is long and + * there's only one CPU working on it. 
+ * + * - Statistics tracking how many tasks are queued to local and CPU0 DSQs. + * - Termination notification for userspace. + * + * Copyright (c) 2025 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2025 Tejun Heo <tj@kernel.org> + */ +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +const volatile u32 nr_cpus = 32; /* !0 for veristat, set during init */ + +UEI_DEFINE(uei); + +/* + * We create a custom DSQ with ID 0 that we dispatch to and consume from on + * CPU0. + */ +#define DSQ_CPU0 0 + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u64)); + __uint(max_entries, 2); /* [local, cpu0] */ +} stats SEC(".maps"); + +static void stat_inc(u32 idx) +{ + u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx); + if (cnt_p) + (*cnt_p)++; +} + +s32 BPF_STRUCT_OPS(cpu0_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) +{ + return 0; +} + +void BPF_STRUCT_OPS(cpu0_enqueue, struct task_struct *p, u64 enq_flags) +{ + /* + * select_cpu() always picks CPU0. If @p is not on CPU0, it can't run on + * CPU 0. Queue on whichever CPU it's currently only. 
+ */ + if (scx_bpf_task_cpu(p) != 0) { + stat_inc(0); /* count local queueing */ + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); + return; + } + + stat_inc(1); /* count cpu0 queueing */ + scx_bpf_dsq_insert(p, DSQ_CPU0, SCX_SLICE_DFL, enq_flags); +} + +void BPF_STRUCT_OPS(cpu0_dispatch, s32 cpu, struct task_struct *prev) +{ + if (cpu == 0) + scx_bpf_dsq_move_to_local(DSQ_CPU0); +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(cpu0_init) +{ + return scx_bpf_create_dsq(DSQ_CPU0, -1); +} + +void BPF_STRUCT_OPS(cpu0_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SCX_OPS_DEFINE(cpu0_ops, + .select_cpu = (void *)cpu0_select_cpu, + .enqueue = (void *)cpu0_enqueue, + .dispatch = (void *)cpu0_dispatch, + .init = (void *)cpu0_init, + .exit = (void *)cpu0_exit, + .name = "cpu0"); diff --git a/tools/sched_ext/scx_cpu0.c b/tools/sched_ext/scx_cpu0.c new file mode 100644 index 000000000000..1e4fa4ab8da9 --- /dev/null +++ b/tools/sched_ext/scx_cpu0.c @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2025 Tejun Heo <tj@kernel.org> + */ +#include <stdio.h> +#include <unistd.h> +#include <signal.h> +#include <assert.h> +#include <libgen.h> +#include <bpf/bpf.h> +#include <scx/common.h> +#include "scx_cpu0.bpf.skel.h" + +const char help_fmt[] = +"A cpu0 sched_ext scheduler.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Usage: %s [-v]\n" +"\n" +" -v Print libbpf debug messages\n" +" -h Display this help and exit\n"; + +static bool verbose; +static volatile int exit_req; + +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + if (level == LIBBPF_DEBUG && !verbose) + return 0; + return vfprintf(stderr, format, args); +} + +static void sigint_handler(int sig) +{ + exit_req = 1; +} + +static void read_stats(struct scx_cpu0 *skel, __u64 *stats) +{ + int nr_cpus = libbpf_num_possible_cpus(); + assert(nr_cpus > 0); + __u64 cnts[2][nr_cpus]; + __u32 idx; + + memset(stats, 0, sizeof(stats[0]) * 2); + + for (idx = 0; idx < 2; idx++) { + int ret, cpu; + + ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), + &idx, cnts[idx]); + if (ret < 0) + continue; + for (cpu = 0; cpu < nr_cpus; cpu++) + stats[idx] += cnts[idx][cpu]; + } +} + +int main(int argc, char **argv) +{ + struct scx_cpu0 *skel; + struct bpf_link *link; + __u32 opt; + __u64 ecode; + + libbpf_set_print(libbpf_print_fn); + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); +restart: + skel = SCX_OPS_OPEN(cpu0_ops, scx_cpu0); + + skel->rodata->nr_cpus = libbpf_num_possible_cpus(); + + while ((opt = getopt(argc, argv, "vh")) != -1) { + switch (opt) { + case 'v': + verbose = true; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + SCX_OPS_LOAD(skel, cpu0_ops, scx_cpu0, uei); + link = SCX_OPS_ATTACH(skel, cpu0_ops, scx_cpu0); + + while (!exit_req && !UEI_EXITED(skel, uei)) { + __u64 stats[2]; + + read_stats(skel, stats); + printf("local=%llu cpu0=%llu\n", 
stats[0], stats[1]); + fflush(stdout); + sleep(1); + } + + bpf_link__destroy(link); + ecode = UEI_REPORT(skel, uei); + scx_cpu0__destroy(skel); + + if (UEI_ECODE_RESTART(ecode)) + goto restart; + return 0; +} diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c index 2c720e3ecad5..43126858b8e4 100644 --- a/tools/sched_ext/scx_flatcg.bpf.c +++ b/tools/sched_ext/scx_flatcg.bpf.c @@ -382,7 +382,7 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags) return; } - cgrp = __COMPAT_scx_bpf_task_cgroup(p); + cgrp = scx_bpf_task_cgroup(p); cgc = find_cgrp_ctx(cgrp); if (!cgc) goto out_release; @@ -508,7 +508,7 @@ void BPF_STRUCT_OPS(fcg_runnable, struct task_struct *p, u64 enq_flags) { struct cgroup *cgrp; - cgrp = __COMPAT_scx_bpf_task_cgroup(p); + cgrp = scx_bpf_task_cgroup(p); update_active_weight_sums(cgrp, true); bpf_cgroup_release(cgrp); } @@ -521,7 +521,7 @@ void BPF_STRUCT_OPS(fcg_running, struct task_struct *p) if (fifo_sched) return; - cgrp = __COMPAT_scx_bpf_task_cgroup(p); + cgrp = scx_bpf_task_cgroup(p); cgc = find_cgrp_ctx(cgrp); if (cgc) { /* @@ -564,7 +564,7 @@ void BPF_STRUCT_OPS(fcg_stopping, struct task_struct *p, bool runnable) if (!taskc->bypassed_at) return; - cgrp = __COMPAT_scx_bpf_task_cgroup(p); + cgrp = scx_bpf_task_cgroup(p); cgc = find_cgrp_ctx(cgrp); if (cgc) { __sync_fetch_and_add(&cgc->cvtime_delta, @@ -578,7 +578,7 @@ void BPF_STRUCT_OPS(fcg_quiescent, struct task_struct *p, u64 deq_flags) { struct cgroup *cgrp; - cgrp = __COMPAT_scx_bpf_task_cgroup(p); + cgrp = scx_bpf_task_cgroup(p); update_active_weight_sums(cgrp, false); bpf_cgroup_release(cgrp); } diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 3072b593f898..df21fad0c438 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -202,6 +202,9 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) void *ring; s32 cpu; + if (enq_flags & SCX_ENQ_REENQ) + 
__sync_fetch_and_add(&nr_reenqueued, 1); + if (p->flags & PF_KTHREAD) { if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth)) return; @@ -320,12 +323,9 @@ static bool dispatch_highpri(bool from_timer) if (tctx->highpri) { /* exercise the set_*() and vtime interface too */ - __COMPAT_scx_bpf_dsq_move_set_slice( - BPF_FOR_EACH_ITER, slice_ns * 2); - __COMPAT_scx_bpf_dsq_move_set_vtime( - BPF_FOR_EACH_ITER, highpri_seq++); - __COMPAT_scx_bpf_dsq_move_vtime( - BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0); + scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, slice_ns * 2); + scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, highpri_seq++); + scx_bpf_dsq_move_vtime(BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0); } } @@ -342,9 +342,8 @@ static bool dispatch_highpri(bool from_timer) else cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0); - if (__COMPAT_scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, - SCX_DSQ_LOCAL_ON | cpu, - SCX_ENQ_PREEMPT)) { + if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL_ON | cpu, + SCX_ENQ_PREEMPT)) { if (cpu == this_cpu) { dispatched = true; __sync_fetch_and_add(&nr_expedited_local, 1); @@ -533,20 +532,35 @@ bool BPF_STRUCT_OPS(qmap_core_sched_before, return task_qdist(a) > task_qdist(b); } -void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args) +SEC("tp_btf/sched_switch") +int BPF_PROG(qmap_sched_switch, bool preempt, struct task_struct *prev, + struct task_struct *next, unsigned long prev_state) { - u32 cnt; + if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere()) + return 0; /* - * Called when @cpu is taken by a higher priority scheduling class. This - * makes @cpu no longer available for executing sched_ext tasks. As we - * don't want the tasks in @cpu's local dsq to sit there until @cpu - * becomes available again, re-enqueue them into the global dsq. See - * %SCX_ENQ_REENQ handling in qmap_enqueue(). + * If @cpu is taken by a higher priority scheduling class, it is no + * longer available for executing sched_ext tasks. 
As we don't want the + * tasks in @cpu's local dsq to sit there until @cpu becomes available + * again, re-enqueue them into the global dsq. See %SCX_ENQ_REENQ + * handling in qmap_enqueue(). */ - cnt = scx_bpf_reenqueue_local(); - if (cnt) - __sync_fetch_and_add(&nr_reenqueued, cnt); + switch (next->policy) { + case 1: /* SCHED_FIFO */ + case 2: /* SCHED_RR */ + case 6: /* SCHED_DEADLINE */ + scx_bpf_reenqueue_local(); + } + + return 0; +} + +void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args) +{ + /* see qmap_sched_switch() to learn how to do this on newer kernels */ + if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere()) + scx_bpf_reenqueue_local(); } s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p, diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index 0d5ce4b74b9f..0e151d0572d1 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -4,13 +4,12 @@ ldflags-y += --wrap=is_acpi_device_node ldflags-y += --wrap=acpi_evaluate_integer ldflags-y += --wrap=acpi_pci_find_root ldflags-y += --wrap=nvdimm_bus_register -ldflags-y += --wrap=devm_cxl_port_enumerate_dports ldflags-y += --wrap=cxl_await_media_ready ldflags-y += --wrap=devm_cxl_add_rch_dport -ldflags-y += --wrap=cxl_rcd_component_reg_phys ldflags-y += --wrap=cxl_endpoint_parse_cdat ldflags-y += --wrap=cxl_dport_init_ras_reporting ldflags-y += --wrap=devm_cxl_endpoint_decoders_setup +ldflags-y += --wrap=hmat_get_extended_linear_cache_size DRIVERS := ../../../drivers CXL_SRC := $(DRIVERS)/cxl diff --git a/tools/testing/cxl/test/Kbuild b/tools/testing/cxl/test/Kbuild index 6b1927897856..af50972c8b6d 100644 --- a/tools/testing/cxl/test/Kbuild +++ b/tools/testing/cxl/test/Kbuild @@ -4,6 +4,7 @@ ccflags-y := -I$(srctree)/drivers/cxl/ -I$(srctree)/drivers/cxl/core obj-m += cxl_test.o obj-m += cxl_mock.o obj-m += cxl_mock_mem.o +obj-m += cxl_translate.o cxl_test-y := cxl.o cxl_mock-y := mock.o diff --git a/tools/testing/cxl/test/cxl.c 
b/tools/testing/cxl/test/cxl.c index 2d135ca533d0..81e2aef3627a 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -15,6 +15,7 @@ #include "mock.h" static int interleave_arithmetic; +static bool extended_linear_cache; #define FAKE_QTG_ID 42 @@ -26,6 +27,9 @@ static int interleave_arithmetic; #define NR_CXL_PORT_DECODERS 8 #define NR_BRIDGES (NR_CXL_HOST_BRIDGES + NR_CXL_SINGLE_HOST + NR_CXL_RCH) +#define MOCK_AUTO_REGION_SIZE_DEFAULT SZ_512M +static int mock_auto_region_size = MOCK_AUTO_REGION_SIZE_DEFAULT; + static struct platform_device *cxl_acpi; static struct platform_device *cxl_host_bridge[NR_CXL_HOST_BRIDGES]; #define NR_MULTI_ROOT (NR_CXL_HOST_BRIDGES * NR_CXL_ROOT_PORTS) @@ -426,6 +430,22 @@ static struct cxl_mock_res *alloc_mock_res(resource_size_t size, int align) return res; } +/* Only update CFMWS0 as this is used by the auto region. */ +static void cfmws_elc_update(struct acpi_cedt_cfmws *window, int index) +{ + if (!extended_linear_cache) + return; + + if (index != 0) + return; + + /* + * The window size should be 2x of the CXL region size where half is + * DRAM and half is CXL + */ + window->window_size = mock_auto_region_size * 2; +} + static int populate_cedt(void) { struct cxl_mock_res *res; @@ -450,6 +470,7 @@ static int populate_cedt(void) for (i = cfmws_start; i <= cfmws_end; i++) { struct acpi_cedt_cfmws *window = mock_cfmws[i]; + cfmws_elc_update(window, i); res = alloc_mock_res(window->window_size, SZ_256M); if (!res) return -ENOMEM; @@ -591,6 +612,25 @@ mock_acpi_evaluate_integer(acpi_handle handle, acpi_string pathname, return AE_OK; } +static int +mock_hmat_get_extended_linear_cache_size(struct resource *backing_res, + int nid, resource_size_t *cache_size) +{ + struct acpi_cedt_cfmws *window = mock_cfmws[0]; + struct resource cfmws0_res = + DEFINE_RES_MEM(window->base_hpa, window->window_size); + + if (!extended_linear_cache || + !resource_contains(&cfmws0_res, backing_res)) { + return 
hmat_get_extended_linear_cache_size(backing_res, + nid, cache_size); + } + + *cache_size = mock_auto_region_size; + + return 0; +} + static struct pci_bus mock_pci_bus[NR_BRIDGES]; static struct acpi_pci_root mock_pci_root[ARRAY_SIZE(mock_pci_bus)] = { [0] = { @@ -738,7 +778,6 @@ static void mock_init_hdm_decoder(struct cxl_decoder *cxld) struct cxl_endpoint_decoder *cxled; struct cxl_switch_decoder *cxlsd; struct cxl_port *port, *iter; - const int size = SZ_512M; struct cxl_memdev *cxlmd; struct cxl_dport *dport; struct device *dev; @@ -781,9 +820,11 @@ static void mock_init_hdm_decoder(struct cxl_decoder *cxld) } base = window->base_hpa; + if (extended_linear_cache) + base += mock_auto_region_size; cxld->hpa_range = (struct range) { .start = base, - .end = base + size - 1, + .end = base + mock_auto_region_size - 1, }; cxld->interleave_ways = 2; @@ -792,7 +833,8 @@ static void mock_init_hdm_decoder(struct cxl_decoder *cxld) cxld->flags = CXL_DECODER_F_ENABLE; cxled->state = CXL_DECODER_STATE_AUTO; port->commit_end = cxld->id; - devm_cxl_dpa_reserve(cxled, 0, size / cxld->interleave_ways, 0); + devm_cxl_dpa_reserve(cxled, 0, + mock_auto_region_size / cxld->interleave_ways, 0); cxld->commit = mock_decoder_commit; cxld->reset = mock_decoder_reset; @@ -841,7 +883,7 @@ static void mock_init_hdm_decoder(struct cxl_decoder *cxld) cxld->interleave_granularity = 4096; cxld->hpa_range = (struct range) { .start = base, - .end = base + size - 1, + .end = base + mock_auto_region_size - 1, }; put_device(dev); } @@ -995,37 +1037,6 @@ static int get_port_array(struct cxl_port *port, return 0; } -static int mock_cxl_port_enumerate_dports(struct cxl_port *port) -{ - struct platform_device **array; - int i, array_size; - int rc; - - rc = get_port_array(port, &array, &array_size); - if (rc) - return rc; - - for (i = 0; i < array_size; i++) { - struct platform_device *pdev = array[i]; - struct cxl_dport *dport; - - if (pdev->dev.parent != port->uport_dev) { - dev_dbg(&port->dev, "%s: 
mismatch parent %s\n", - dev_name(port->uport_dev), - dev_name(pdev->dev.parent)); - continue; - } - - dport = devm_cxl_add_dport(port, &pdev->dev, pdev->id, - CXL_RESOURCE_NONE); - - if (IS_ERR(dport)) - return PTR_ERR(dport); - } - - return 0; -} - static struct cxl_dport *mock_cxl_add_dport_by_dev(struct cxl_port *port, struct device *dport_dev) { @@ -1114,9 +1125,10 @@ static struct cxl_mock_ops cxl_mock_ops = { .acpi_pci_find_root = mock_acpi_pci_find_root, .devm_cxl_switch_port_decoders_setup = mock_cxl_switch_port_decoders_setup, .devm_cxl_endpoint_decoders_setup = mock_cxl_endpoint_decoders_setup, - .devm_cxl_port_enumerate_dports = mock_cxl_port_enumerate_dports, .cxl_endpoint_parse_cdat = mock_cxl_endpoint_parse_cdat, .devm_cxl_add_dport_by_dev = mock_cxl_add_dport_by_dev, + .hmat_get_extended_linear_cache_size = + mock_hmat_get_extended_linear_cache_size, .list = LIST_HEAD_INIT(cxl_mock_ops.list), }; @@ -1606,6 +1618,8 @@ static __exit void cxl_test_exit(void) module_param(interleave_arithmetic, int, 0444); MODULE_PARM_DESC(interleave_arithmetic, "Modulo:0, XOR:1"); +module_param(extended_linear_cache, bool, 0444); +MODULE_PARM_DESC(extended_linear_cache, "Enable extended linear cache support"); module_init(cxl_test_init); module_exit(cxl_test_exit); MODULE_LICENSE("GPL v2"); diff --git a/tools/testing/cxl/test/cxl_translate.c b/tools/testing/cxl/test/cxl_translate.c new file mode 100644 index 000000000000..2200ae21795c --- /dev/null +++ b/tools/testing/cxl/test/cxl_translate.c @@ -0,0 +1,445 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright(c) 2025 Intel Corporation. All rights reserved. 
+ +/* Preface all log entries with "cxl_translate" */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/moduleparam.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/acpi.h> +#include <cxlmem.h> +#include <cxl.h> + +/* Maximum number of test vectors and entry length */ +#define MAX_TABLE_ENTRIES 128 +#define MAX_ENTRY_LEN 128 + +/* Expected number of parameters in each test vector */ +#define EXPECTED_PARAMS 7 + +/* Module parameters for test vectors */ +static char *table[MAX_TABLE_ENTRIES]; +static int table_num; + +/* Interleave Arithmetic */ +#define MODULO_MATH 0 +#define XOR_MATH 1 + +/* + * XOR mapping configuration + * The test data sets all use the same set of xormaps. When additional + * data sets arrive for validation, this static setup will need to + * be changed to accept xormaps as additional parameters. + */ +struct cxl_cxims_data *cximsd; +static u64 xormaps[] = { + 0x2020900, + 0x4041200, + 0x1010400, + 0x800, +}; + +static int nr_maps = ARRAY_SIZE(xormaps); + +#define HBIW_TO_NR_MAPS_SIZE (CXL_DECODER_MAX_INTERLEAVE + 1) +static const int hbiw_to_nr_maps[HBIW_TO_NR_MAPS_SIZE] = { + [1] = 0, [2] = 1, [3] = 0, [4] = 2, [6] = 1, [8] = 3, [12] = 2, [16] = 4 +}; + +/** + * to_hpa - calculate an HPA offset from a DPA offset and position + * + * dpa_offset: device physical address offset + * pos: devices position in interleave + * r_eiw: region encoded interleave ways + * r_eig: region encoded interleave granularity + * hb_ways: host bridge interleave ways + * math: interleave arithmetic (MODULO_MATH or XOR_MATH) + * + * Returns: host physical address offset + */ +static u64 to_hpa(u64 dpa_offset, int pos, u8 r_eiw, u16 r_eig, u8 hb_ways, + u8 math) +{ + u64 hpa_offset; + + /* Calculate base HPA offset from DPA and position */ + hpa_offset = cxl_calculate_hpa_offset(dpa_offset, pos, r_eiw, r_eig); + + if (math == XOR_MATH) { + cximsd->nr_maps = 
hbiw_to_nr_maps[hb_ways]; + if (cximsd->nr_maps) + return cxl_do_xormap_calc(cximsd, hpa_offset, hb_ways); + } + return hpa_offset; +} + +/** + * to_dpa - translate an HPA offset to DPA offset + * + * hpa_offset: host physical address offset + * r_eiw: region encoded interleave ways + * r_eig: region encoded interleave granularity + * hb_ways: host bridge interleave ways + * math: interleave arithmetic (MODULO_MATH or XOR_MATH) + * + * Returns: device physical address offset + */ +static u64 to_dpa(u64 hpa_offset, u8 r_eiw, u16 r_eig, u8 hb_ways, u8 math) +{ + u64 offset = hpa_offset; + + if (math == XOR_MATH) { + cximsd->nr_maps = hbiw_to_nr_maps[hb_ways]; + if (cximsd->nr_maps) + offset = + cxl_do_xormap_calc(cximsd, hpa_offset, hb_ways); + } + return cxl_calculate_dpa_offset(offset, r_eiw, r_eig); +} + +/** + * to_pos - extract an interleave position from an HPA offset + * + * hpa_offset: host physical address offset + * r_eiw: region encoded interleave ways + * r_eig: region encoded interleave granularity + * hb_ways: host bridge interleave ways + * math: interleave arithmetic (MODULO_MATH or XOR_MATH) + * + * Returns: devices position in region interleave + */ +static u64 to_pos(u64 hpa_offset, u8 r_eiw, u16 r_eig, u8 hb_ways, u8 math) +{ + u64 offset = hpa_offset; + + /* Reverse XOR mapping if specified */ + if (math == XOR_MATH) + offset = cxl_do_xormap_calc(cximsd, hpa_offset, hb_ways); + + return cxl_calculate_position(offset, r_eiw, r_eig); +} + +/** + * run_translation_test - execute forward and reverse translations + * + * @dpa: device physical address + * @pos: expected position in region interleave + * @r_eiw: region encoded interleave ways + * @r_eig: region encoded interleave granularity + * @hb_ways: host bridge interleave ways + * @math: interleave arithmetic (MODULO_MATH or XOR_MATH) + * @expect_spa: expected system physical address + * + * Returns: 0 on success, -1 on failure + */ +static int run_translation_test(u64 dpa, int pos, u8 r_eiw, u16 
r_eig, + u8 hb_ways, int math, u64 expect_hpa) +{ + u64 translated_spa, reverse_dpa; + int reverse_pos; + + /* Test Device to Host translation: DPA + POS -> SPA */ + translated_spa = to_hpa(dpa, pos, r_eiw, r_eig, hb_ways, math); + if (translated_spa != expect_hpa) { + pr_err("Device to host failed: expected HPA %llu, got %llu\n", + expect_hpa, translated_spa); + return -1; + } + + /* Test Host to Device DPA translation: SPA -> DPA */ + reverse_dpa = to_dpa(translated_spa, r_eiw, r_eig, hb_ways, math); + if (reverse_dpa != dpa) { + pr_err("Host to Device DPA failed: expected %llu, got %llu\n", + dpa, reverse_dpa); + return -1; + } + + /* Test Host to Device Position translation: SPA -> POS */ + reverse_pos = to_pos(translated_spa, r_eiw, r_eig, hb_ways, math); + if (reverse_pos != pos) { + pr_err("Position lookup failed: expected %d, got %d\n", pos, + reverse_pos); + return -1; + } + + return 0; +} + +/** + * parse_test_vector - parse a single test vector string + * + * entry: test vector string to parse + * dpa: device physical address + * pos: expected position in region interleave + * r_eiw: region encoded interleave ways + * r_eig: region encoded interleave granularity + * hb_ways: host bridge interleave ways + * math: interleave arithmetic (MODULO_MATH or XOR_MATH) + * expect_spa: expected system physical address + * + * Returns: 0 on success, negative error code on failure + */ +static int parse_test_vector(const char *entry, u64 *dpa, int *pos, u8 *r_eiw, + u16 *r_eig, u8 *hb_ways, int *math, + u64 *expect_hpa) +{ + unsigned int tmp_r_eiw, tmp_r_eig, tmp_hb_ways; + int parsed; + + parsed = sscanf(entry, "%llu %d %u %u %u %d %llu", dpa, pos, &tmp_r_eiw, + &tmp_r_eig, &tmp_hb_ways, math, expect_hpa); + + if (parsed != EXPECTED_PARAMS) { + pr_err("Parse error: expected %d parameters, got %d in '%s'\n", + EXPECTED_PARAMS, parsed, entry); + return -EINVAL; + } + if (tmp_r_eiw > U8_MAX || tmp_r_eig > U16_MAX || tmp_hb_ways > U8_MAX) { + pr_err("Parameter overflow 
in entry: '%s'\n", entry); + return -ERANGE; + } + if (*math != MODULO_MATH && *math != XOR_MATH) { + pr_err("Invalid math type %d in entry: '%s'\n", *math, entry); + return -EINVAL; + } + *r_eiw = tmp_r_eiw; + *r_eig = tmp_r_eig; + *hb_ways = tmp_hb_ways; + + return 0; +} + +/* + * setup_xor_mapping - Initialize XOR mapping data structure + * + * The test data sets all use the same HBIG so we can use one set + * of xormaps, and set the number to apply based on HBIW before + * calling cxl_do_xormap_calc(). + * + * When additional data sets arrive for validation with different + * HBIG's this static setup will need to be updated. + * + * Returns: 0 on success, negative error code on failure + */ +static int setup_xor_mapping(void) +{ + if (nr_maps <= 0) + return -EINVAL; + + cximsd = kzalloc(struct_size(cximsd, xormaps, nr_maps), GFP_KERNEL); + if (!cximsd) + return -ENOMEM; + + memcpy(cximsd->xormaps, xormaps, nr_maps * sizeof(*cximsd->xormaps)); + cximsd->nr_maps = nr_maps; + + return 0; +} + +static int test_random_params(void) +{ + u8 valid_eiws[] = { 0, 1, 2, 3, 4, 8, 9, 10 }; + u16 valid_eigs[] = { 0, 1, 2, 3, 4, 5, 6 }; + int i, ways, pos, reverse_pos; + u64 dpa, hpa, reverse_dpa; + int iterations = 10000; + int failures = 0; + + for (i = 0; i < iterations; i++) { + /* Generate valid random parameters for eiw, eig, pos, dpa */ + u8 eiw = valid_eiws[get_random_u32() % ARRAY_SIZE(valid_eiws)]; + u16 eig = valid_eigs[get_random_u32() % ARRAY_SIZE(valid_eigs)]; + + eiw_to_ways(eiw, &ways); + pos = get_random_u32() % ways; + dpa = get_random_u64() >> 12; + + hpa = cxl_calculate_hpa_offset(dpa, pos, eiw, eig); + reverse_dpa = cxl_calculate_dpa_offset(hpa, eiw, eig); + reverse_pos = cxl_calculate_position(hpa, eiw, eig); + + if (reverse_dpa != dpa || reverse_pos != pos) { + pr_err("test random iter %d FAIL hpa=%llu, dpa=%llu reverse_dpa=%llu, pos=%d reverse_pos=%d eiw=%u eig=%u\n", + i, hpa, dpa, reverse_dpa, pos, reverse_pos, eiw, + eig); + + if (failures++ > 10) { 
+ pr_err("test random too many failures, stop\n"); + break; + } + } + } + pr_info("..... test random: PASS %d FAIL %d\n", i - failures, failures); + + if (failures) + return -EINVAL; + + return 0; +} + +struct param_test { + u8 eiw; + u16 eig; + int pos; + bool expect; /* true: expect pass, false: expect fail */ + const char *desc; +}; + +static struct param_test param_tests[] = { + { 0x0, 0, 0, true, "1-way, min eig=0, pos=0" }, + { 0x0, 3, 0, true, "1-way, mid eig=3, pos=0" }, + { 0x0, 6, 0, true, "1-way, max eig=6, pos=0" }, + { 0x1, 0, 0, true, "2-way, eig=0, pos=0" }, + { 0x1, 3, 1, true, "2-way, eig=3, max pos=1" }, + { 0x1, 6, 1, true, "2-way, eig=6, max pos=1" }, + { 0x2, 0, 0, true, "4-way, eig=0, pos=0" }, + { 0x2, 3, 3, true, "4-way, eig=3, max pos=3" }, + { 0x2, 6, 3, true, "4-way, eig=6, max pos=3" }, + { 0x3, 0, 0, true, "8-way, eig=0, pos=0" }, + { 0x3, 3, 7, true, "8-way, eig=3, max pos=7" }, + { 0x3, 6, 7, true, "8-way, eig=6, max pos=7" }, + { 0x4, 0, 0, true, "16-way, eig=0, pos=0" }, + { 0x4, 3, 15, true, "16-way, eig=3, max pos=15" }, + { 0x4, 6, 15, true, "16-way, eig=6, max pos=15" }, + { 0x8, 0, 0, true, "3-way, eig=0, pos=0" }, + { 0x8, 3, 2, true, "3-way, eig=3, max pos=2" }, + { 0x8, 6, 2, true, "3-way, eig=6, max pos=2" }, + { 0x9, 0, 0, true, "6-way, eig=0, pos=0" }, + { 0x9, 3, 5, true, "6-way, eig=3, max pos=5" }, + { 0x9, 6, 5, true, "6-way, eig=6, max pos=5" }, + { 0xA, 0, 0, true, "12-way, eig=0, pos=0" }, + { 0xA, 3, 11, true, "12-way, eig=3, max pos=11" }, + { 0xA, 6, 11, true, "12-way, eig=6, max pos=11" }, + { 0x5, 0, 0, false, "invalid eiw=5" }, + { 0x7, 0, 0, false, "invalid eiw=7" }, + { 0xB, 0, 0, false, "invalid eiw=0xB" }, + { 0xFF, 0, 0, false, "invalid eiw=0xFF" }, + { 0x1, 7, 0, false, "invalid eig=7 (out of range)" }, + { 0x2, 0x10, 0, false, "invalid eig=0x10" }, + { 0x3, 0xFFFF, 0, false, "invalid eig=0xFFFF" }, + { 0x1, 0, -1, false, "pos < 0" }, + { 0x1, 0, 2, false, "2-way, pos=2 (>= ways)" }, + { 0x2, 0, 4, 
false, "4-way, pos=4 (>= ways)" }, + { 0x3, 0, 8, false, "8-way, pos=8 (>= ways)" }, + { 0x4, 0, 16, false, "16-way, pos=16 (>= ways)" }, + { 0x8, 0, 3, false, "3-way, pos=3 (>= ways)" }, + { 0x9, 0, 6, false, "6-way, pos=6 (>= ways)" }, + { 0xA, 0, 12, false, "12-way, pos=12 (>= ways)" }, +}; + +static int test_cxl_validate_translation_params(void) +{ + int i, rc, failures = 0; + bool valid; + + for (i = 0; i < ARRAY_SIZE(param_tests); i++) { + struct param_test *t = ¶m_tests[i]; + + rc = cxl_validate_translation_params(t->eiw, t->eig, t->pos); + valid = (rc == 0); + + if (valid != t->expect) { + pr_err("test params failed: %s\n", t->desc); + failures++; + } + } + pr_info("..... test params: PASS %d FAIL %d\n", i - failures, failures); + + if (failures) + return -EINVAL; + + return 0; +} + +/* + * cxl_translate_init + * + * Run the internal validation tests when no params are passed. + * Otherwise, parse the parameters (test vectors), and kick off + * the translation test. + * + * Returns: 0 on success, negative error code on failure + */ +static int __init cxl_translate_init(void) +{ + int rc, i; + + /* If no tables are passed, validate module params only */ + if (table_num == 0) { + pr_info("Internal validation test start...\n"); + rc = test_cxl_validate_translation_params(); + if (rc) + return rc; + + rc = test_random_params(); + if (rc) + return rc; + + pr_info("Internal validation test completed successfully\n"); + + return 0; + } + + pr_info("CXL translate test module loaded with %d test vectors\n", + table_num); + + rc = setup_xor_mapping(); + if (rc) + return rc; + + /* Process each test vector */ + for (i = 0; i < table_num; i++) { + u64 dpa, expect_spa; + int pos, math; + u8 r_eiw, hb_ways; + u16 r_eig; + + pr_debug("Processing test vector %d: '%s'\n", i, table[i]); + + /* Parse the test vector */ + rc = parse_test_vector(table[i], &dpa, &pos, &r_eiw, &r_eig, + &hb_ways, &math, &expect_spa); + if (rc) { + pr_err("CXL Translate Test %d: FAIL\n" + " Failed 
to parse test vector '%s'\n", + i, table[i]); + continue; + } + /* Run the translation test */ + rc = run_translation_test(dpa, pos, r_eiw, r_eig, hb_ways, math, + expect_spa); + if (rc) { + pr_err("CXL Translate Test %d: FAIL\n" + " dpa=%llu pos=%d r_eiw=%u r_eig=%u hb_ways=%u math=%s expect_spa=%llu\n", + i, dpa, pos, r_eiw, r_eig, hb_ways, + (math == XOR_MATH) ? "XOR" : "MODULO", + expect_spa); + } else { + pr_info("CXL Translate Test %d: PASS\n", i); + } + } + + kfree(cximsd); + pr_info("CXL translate test completed\n"); + + return 0; +} + +static void __exit cxl_translate_exit(void) +{ + pr_info("CXL translate test module unloaded\n"); +} + +module_param_array(table, charp, &table_num, 0444); +MODULE_PARM_DESC(table, "Test vectors as space-separated decimal strings"); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("cxl_test: cxl address translation test module"); +MODULE_IMPORT_NS("CXL"); + +module_init(cxl_translate_init); +module_exit(cxl_translate_exit); diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index d533481672b7..176dcde570cd 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -250,22 +250,21 @@ static void mes_add_event(struct mock_event_store *mes, * Vary the number of events returned to simulate events occuring while the * logs are being read. 
*/ -static int ret_limit = 0; +static atomic_t event_counter = ATOMIC_INIT(0); static int mock_get_event(struct device *dev, struct cxl_mbox_cmd *cmd) { struct cxl_get_event_payload *pl; struct mock_event_log *log; - u16 nr_overflow; + int ret_limit; u8 log_type; int i; if (cmd->size_in != sizeof(log_type)) return -EINVAL; - ret_limit = (ret_limit + 1) % CXL_TEST_EVENT_RET_MAX; - if (!ret_limit) - ret_limit = 1; + /* Vary return limit from 1 to CXL_TEST_EVENT_RET_MAX */ + ret_limit = (atomic_inc_return(&event_counter) % CXL_TEST_EVENT_RET_MAX) + 1; if (cmd->size_out < struct_size(pl, records, ret_limit)) return -EINVAL; @@ -299,7 +298,7 @@ static int mock_get_event(struct device *dev, struct cxl_mbox_cmd *cmd) u64 ns; pl->flags |= CXL_GET_EVENT_FLAG_OVERFLOW; - pl->overflow_err_count = cpu_to_le16(nr_overflow); + pl->overflow_err_count = cpu_to_le16(log->nr_overflow); ns = ktime_get_real_ns(); ns -= 5000000000; /* 5s ago */ pl->first_overflow_timestamp = cpu_to_le64(ns); diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c index 995269a75cbd..44bce80ef3ff 100644 --- a/tools/testing/cxl/test/mock.c +++ b/tools/testing/cxl/test/mock.c @@ -111,6 +111,26 @@ acpi_status __wrap_acpi_evaluate_integer(acpi_handle handle, } EXPORT_SYMBOL(__wrap_acpi_evaluate_integer); +int __wrap_hmat_get_extended_linear_cache_size(struct resource *backing_res, + int nid, + resource_size_t *cache_size) +{ + int index, rc; + struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); + + if (ops) + rc = ops->hmat_get_extended_linear_cache_size(backing_res, nid, + cache_size); + else + rc = hmat_get_extended_linear_cache_size(backing_res, nid, + cache_size); + + put_cxl_mock_ops(index); + + return rc; +} +EXPORT_SYMBOL_GPL(__wrap_hmat_get_extended_linear_cache_size); + struct acpi_pci_root *__wrap_acpi_pci_find_root(acpi_handle handle) { int index; @@ -172,21 +192,6 @@ int __wrap_devm_cxl_endpoint_decoders_setup(struct cxl_port *port) } 
EXPORT_SYMBOL_NS_GPL(__wrap_devm_cxl_endpoint_decoders_setup, "CXL"); -int __wrap_devm_cxl_port_enumerate_dports(struct cxl_port *port) -{ - int rc, index; - struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); - - if (ops && ops->is_mock_port(port->uport_dev)) - rc = ops->devm_cxl_port_enumerate_dports(port); - else - rc = devm_cxl_port_enumerate_dports(port); - put_cxl_mock_ops(index); - - return rc; -} -EXPORT_SYMBOL_NS_GPL(__wrap_devm_cxl_port_enumerate_dports, "CXL"); - int __wrap_cxl_await_media_ready(struct cxl_dev_state *cxlds) { int rc, index; @@ -226,23 +231,6 @@ struct cxl_dport *__wrap_devm_cxl_add_rch_dport(struct cxl_port *port, } EXPORT_SYMBOL_NS_GPL(__wrap_devm_cxl_add_rch_dport, "CXL"); -resource_size_t __wrap_cxl_rcd_component_reg_phys(struct device *dev, - struct cxl_dport *dport) -{ - int index; - resource_size_t component_reg_phys; - struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); - - if (ops && ops->is_mock_port(dev)) - component_reg_phys = CXL_RESOURCE_NONE; - else - component_reg_phys = cxl_rcd_component_reg_phys(dev, dport); - put_cxl_mock_ops(index); - - return component_reg_phys; -} -EXPORT_SYMBOL_NS_GPL(__wrap_cxl_rcd_component_reg_phys, "CXL"); - void __wrap_cxl_endpoint_parse_cdat(struct cxl_port *port) { int index; diff --git a/tools/testing/cxl/test/mock.h b/tools/testing/cxl/test/mock.h index 4ed932e76aae..2684b89c8aa2 100644 --- a/tools/testing/cxl/test/mock.h +++ b/tools/testing/cxl/test/mock.h @@ -19,12 +19,14 @@ struct cxl_mock_ops { bool (*is_mock_bus)(struct pci_bus *bus); bool (*is_mock_port)(struct device *dev); bool (*is_mock_dev)(struct device *dev); - int (*devm_cxl_port_enumerate_dports)(struct cxl_port *port); int (*devm_cxl_switch_port_decoders_setup)(struct cxl_port *port); int (*devm_cxl_endpoint_decoders_setup)(struct cxl_port *port); void (*cxl_endpoint_parse_cdat)(struct cxl_port *port); struct cxl_dport *(*devm_cxl_add_dport_by_dev)(struct cxl_port *port, struct device *dport_dev); + int 
(*hmat_get_extended_linear_cache_size)(struct resource *backing_res, + int nid, + resource_size_t *cache_size); }; void register_cxl_mock_ops(struct cxl_mock_ops *ops); diff --git a/tools/testing/ktest/config-bisect.pl b/tools/testing/ktest/config-bisect.pl index 6fd864935319..bee7cb34a289 100755 --- a/tools/testing/ktest/config-bisect.pl +++ b/tools/testing/ktest/config-bisect.pl @@ -741,9 +741,9 @@ if ($start) { die "Can not find file $bad\n"; } if ($val eq "good") { - run_command "cp $output_config $good" or die "failed to copy $config to $good\n"; + run_command "cp $output_config $good" or die "failed to copy $output_config to $good\n"; } elsif ($val eq "bad") { - run_command "cp $output_config $bad" or die "failed to copy $config to $bad\n"; + run_command "cp $output_config $bad" or die "failed to copy $output_config to $bad\n"; } } diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c index cfd4378e2129..f87e9f251d13 100644 --- a/tools/testing/nvdimm/test/nfit.c +++ b/tools/testing/nvdimm/test/nfit.c @@ -670,6 +670,7 @@ static int nfit_test_search_spa(struct nvdimm_bus *bus, .addr = spa->spa, .region = NULL, }; + struct nfit_mem *nfit_mem; u64 dpa; ret = device_for_each_child(&bus->dev, &ctx, @@ -687,8 +688,12 @@ static int nfit_test_search_spa(struct nvdimm_bus *bus, */ nd_mapping = &nd_region->mapping[nd_region->ndr_mappings - 1]; nvdimm = nd_mapping->nvdimm; + nfit_mem = nvdimm_provider_data(nvdimm); + if (!nfit_mem) + return -EINVAL; - spa->devices[0].nfit_device_handle = handle[nvdimm->id]; + spa->devices[0].nfit_device_handle = + __to_nfit_memdev(nfit_mem)->device_handle; spa->num_nvdimms = 1; spa->devices[0].dpa = dpa; diff --git a/tools/testing/selftests/alsa/conf.c b/tools/testing/selftests/alsa/conf.c index e2b3a5810f47..dc7f40e68dee 100644 --- a/tools/testing/selftests/alsa/conf.c +++ b/tools/testing/selftests/alsa/conf.c @@ -448,7 +448,7 @@ int conf_get_bool(snd_config_t *root, const char *key1, const char *key2, int de 
ksft_exit_fail_msg("key '%s'.'%s' search error: %s\n", key1, key2, snd_strerror(ret)); ret = snd_config_get_bool(cfg); if (ret < 0) - ksft_exit_fail_msg("key '%s'.'%s' is not an bool\n", key1, key2); + ksft_exit_fail_msg("key '%s'.'%s' is not a bool\n", key1, key2); return !!ret; } diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace.c b/tools/testing/selftests/arm64/fp/fp-ptrace.c index a85c19e9524e..0114108ab25f 100644 --- a/tools/testing/selftests/arm64/fp/fp-ptrace.c +++ b/tools/testing/selftests/arm64/fp/fp-ptrace.c @@ -1071,7 +1071,7 @@ static bool sve_write_supported(struct test_config *config) static bool sve_write_fpsimd_supported(struct test_config *config) { - if (!sve_supported()) + if (!sve_supported() && !sme_supported()) return false; if ((config->svcr_in & SVCR_ZA) != (config->svcr_expected & SVCR_ZA)) @@ -1231,9 +1231,6 @@ static void sve_write_fpsimd(pid_t child, struct test_config *config) vl = vl_expected(config); vq = __sve_vq_from_vl(vl); - if (!vl) - return; - iov.iov_len = SVE_PT_SIZE(vq, SVE_PT_REGS_FPSIMD); iov.iov_base = malloc(iov.iov_len); if (!iov.iov_base) { diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace.c b/tools/testing/selftests/arm64/fp/sve-ptrace.c index e0fc3a001e28..f44d44618575 100644 --- a/tools/testing/selftests/arm64/fp/sve-ptrace.c +++ b/tools/testing/selftests/arm64/fp/sve-ptrace.c @@ -394,6 +394,58 @@ out: free(svebuf); } +/* Write the FPSIMD registers via the SVE regset when SVE is not supported */ +static void ptrace_sve_fpsimd_no_sve(pid_t child) +{ + void *svebuf; + struct user_sve_header *sve; + struct user_fpsimd_state *fpsimd, new_fpsimd; + unsigned int i, j; + unsigned char *p; + int ret; + + svebuf = malloc(SVE_PT_SIZE(0, SVE_PT_REGS_FPSIMD)); + if (!svebuf) { + ksft_test_result_fail("Failed to allocate FPSIMD buffer\n"); + return; + } + + /* On a system without SVE the VL should be set to 0 */ + memset(svebuf, 0, SVE_PT_SIZE(0, SVE_PT_REGS_FPSIMD)); + sve = svebuf; + sve->flags = 
SVE_PT_REGS_FPSIMD; + sve->size = SVE_PT_SIZE(0, SVE_PT_REGS_FPSIMD); + sve->vl = 0; + + /* Try to set a known FPSIMD state via PT_REGS_SVE */ + fpsimd = (struct user_fpsimd_state *)((char *)sve + + SVE_PT_FPSIMD_OFFSET); + for (i = 0; i < 32; ++i) { + p = (unsigned char *)&fpsimd->vregs[i]; + + for (j = 0; j < sizeof(fpsimd->vregs[i]); ++j) + p[j] = j; + } + + ret = set_sve(child, &vec_types[0], sve); + ksft_test_result(ret == 0, "FPSIMD write via SVE\n"); + if (ret) { + ksft_test_result_skip("Verify FPSIMD write via SVE\n"); + goto out; + } + + /* Verify via the FPSIMD regset */ + if (get_fpsimd(child, &new_fpsimd)) { + ksft_test_result_skip("Verify FPSIMD write via SVE\n"); + goto out; + } + ksft_test_result(memcmp(fpsimd, &new_fpsimd, sizeof(*fpsimd)) == 0, + "Verify FPSIMD write via SVE\n"); + +out: + free(svebuf); +} + /* Validate attempting to set SVE data and read SVE data */ static void ptrace_set_sve_get_sve_data(pid_t child, const struct vec_type *type, @@ -826,6 +878,15 @@ static int do_parent(pid_t child) } } + /* We support SVE writes of FPSMID format on SME only systems */ + if (!(getauxval(AT_HWCAP) & HWCAP_SVE) && + (getauxval(AT_HWCAP2) & HWCAP2_SME)) { + ptrace_sve_fpsimd_no_sve(child); + } else { + ksft_test_result_skip("FPSIMD write via SVE\n"); + ksft_test_result_skip("Verify FPSIMD write via SVE\n"); + } + ret = EXIT_SUCCESS; error: diff --git a/tools/testing/selftests/arm64/fp/zt-test.S b/tools/testing/selftests/arm64/fp/zt-test.S index 38080f3c3280..a8df05771670 100644 --- a/tools/testing/selftests/arm64/fp/zt-test.S +++ b/tools/testing/selftests/arm64/fp/zt-test.S @@ -276,7 +276,7 @@ function barf bl putdec puts ", iteration=" mov x0, x22 - bl putdec + bl putdecn puts "\tExpected [" mov x0, x10 mov x1, x12 diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index be1ee7ba7ce0..19c1638e312a 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -23,6 +23,7 
@@ test_tcpnotify_user test_libbpf xdping test_cpp +test_progs_verification_cert *.d *.subskel.h *.skel.h @@ -32,7 +33,6 @@ test_cpp /cpuv4 /host-tools /tools -/runqslower /bench /veristat /sign-file diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index f00587d4ede6..b7030a6e2e76 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -46,6 +46,7 @@ endif CFLAGS += -g $(OPT_FLAGS) -rdynamic -std=gnu11 \ -Wall -Werror -fno-omit-frame-pointer \ + -Wno-unused-but-set-variable \ $(GENFLAGS) $(SAN_CFLAGS) $(LIBELF_CFLAGS) \ -I$(CURDIR) -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ -I$(TOOLSINCDIR) -I$(TOOLSARCHINCDIR) -I$(APIDIR) -I$(OUTPUT) @@ -98,14 +99,11 @@ TEST_GEN_PROGS += test_progs-cpuv4 TEST_INST_SUBDIRS += cpuv4 endif -TEST_GEN_FILES = test_tc_edt.bpf.o TEST_FILES = xsk_prereqs.sh $(wildcard progs/btf_dump_test_case_*.c) # Order correspond to 'make run_tests' order TEST_PROGS := test_kmod.sh \ test_lirc_mode2.sh \ - test_tc_tunnel.sh \ - test_tc_edt.sh \ test_xdping.sh \ test_bpftool_build.sh \ test_bpftool.sh \ @@ -127,7 +125,6 @@ TEST_KMOD_TARGETS = $(addprefix $(OUTPUT)/,$(TEST_KMODS)) TEST_GEN_PROGS_EXTENDED = \ bench \ flow_dissector_load \ - runqslower \ test_cpp \ test_lirc_mode2_user \ veristat \ @@ -209,8 +206,6 @@ HOST_INCLUDE_DIR := $(INCLUDE_DIR) endif HOST_BPFOBJ := $(HOST_BUILD_DIR)/libbpf/libbpf.a RESOLVE_BTFIDS := $(HOST_BUILD_DIR)/resolve_btfids/resolve_btfids -RUNQSLOWER_OUTPUT := $(BUILD_DIR)/runqslower/ - VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \ $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \ ../../../../vmlinux \ @@ -232,7 +227,7 @@ $(notdir $(TEST_GEN_PROGS) $(TEST_KMODS) \ MAKE_DIRS := $(sort $(BUILD_DIR)/libbpf $(HOST_BUILD_DIR)/libbpf \ $(BUILD_DIR)/bpftool $(HOST_BUILD_DIR)/bpftool \ $(HOST_BUILD_DIR)/resolve_btfids \ - $(RUNQSLOWER_OUTPUT) $(INCLUDE_DIR)) + $(INCLUDE_DIR)) $(MAKE_DIRS): $(call msg,MKDIR,,$@) $(Q)mkdir -p $@ @@ -304,17 +299,6 @@ 
TRUNNER_BPFTOOL := $(DEFAULT_BPFTOOL) USE_BOOTSTRAP := "bootstrap/" endif -$(OUTPUT)/runqslower: $(BPFOBJ) | $(DEFAULT_BPFTOOL) $(RUNQSLOWER_OUTPUT) - $(Q)$(MAKE) $(submake_extras) -C $(TOOLSDIR)/bpf/runqslower \ - OUTPUT=$(RUNQSLOWER_OUTPUT) VMLINUX_BTF=$(VMLINUX_BTF) \ - BPFTOOL_OUTPUT=$(HOST_BUILD_DIR)/bpftool/ \ - BPFOBJ_OUTPUT=$(BUILD_DIR)/libbpf/ \ - BPFOBJ=$(BPFOBJ) BPF_INCLUDE=$(INCLUDE_DIR) \ - BPF_TARGET_ENDIAN=$(BPF_TARGET_ENDIAN) \ - EXTRA_CFLAGS='-g $(OPT_FLAGS) $(SAN_CFLAGS) $(EXTRA_CFLAGS)' \ - EXTRA_LDFLAGS='$(SAN_LDFLAGS) $(EXTRA_LDFLAGS)' && \ - cp $(RUNQSLOWER_OUTPUT)runqslower $@ - TEST_GEN_PROGS_EXTENDED += $(TRUNNER_BPFTOOL) $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED): $(BPFOBJ) @@ -453,7 +437,9 @@ BPF_CFLAGS = -g -Wall -Werror -D__TARGET_ARCH_$(SRCARCH) $(MENDIAN) \ -I$(abspath $(OUTPUT)/../usr/include) \ -std=gnu11 \ -fno-strict-aliasing \ - -Wno-compare-distinct-pointer-types + -Wno-compare-distinct-pointer-types \ + -Wno-initializer-overrides \ + # # TODO: enable me -Wsign-compare CLANG_CFLAGS = $(CLANG_SYS_INCLUDES) @@ -498,7 +484,8 @@ LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h \ LSKELS := fexit_sleep.c trace_printk.c trace_vprintk.c map_ptr_kern.c \ core_kern.c core_kern_overflow.c test_ringbuf.c \ - test_ringbuf_n.c test_ringbuf_map_key.c test_ringbuf_write.c + test_ringbuf_n.c test_ringbuf_map_key.c test_ringbuf_write.c \ + test_ringbuf_overwrite.c LSKELS_SIGNED := fentry_test.c fexit_test.c atomics.c @@ -543,6 +530,8 @@ TRUNNER_TEST_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.test.o, \ $$(notdir $$(wildcard $(TRUNNER_TESTS_DIR)/*.c))) TRUNNER_EXTRA_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.o, \ $$(filter %.c,$(TRUNNER_EXTRA_SOURCES))) +TRUNNER_LIB_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.o, \ + $$(filter %.c,$(TRUNNER_LIB_SOURCES))) TRUNNER_EXTRA_HDRS := $$(filter %.h,$(TRUNNER_EXTRA_SOURCES)) TRUNNER_TESTS_HDR := $(TRUNNER_TESTS_DIR)/tests.h TRUNNER_BPF_SRCS := $$(notdir $$(wildcard 
$(TRUNNER_BPF_PROGS_DIR)/*.c)) @@ -686,6 +675,10 @@ $(TRUNNER_EXTRA_OBJS): $(TRUNNER_OUTPUT)/%.o: \ $$(call msg,EXT-OBJ,$(TRUNNER_BINARY),$$@) $(Q)$$(CC) $$(CFLAGS) -c $$< $$(LDLIBS) -o $$@ +$(TRUNNER_LIB_OBJS): $(TRUNNER_OUTPUT)/%.o:$(TOOLSDIR)/lib/%.c + $$(call msg,LIB-OBJ,$(TRUNNER_BINARY),$$@) + $(Q)$$(CC) $$(CFLAGS) -c $$< $$(LDLIBS) -o $$@ + # non-flavored in-srctree builds receive special treatment, in particular, we # do not need to copy extra resources (see e.g. test_btf_dump_case()) $(TRUNNER_BINARY)-extras: $(TRUNNER_EXTRA_FILES) | $(TRUNNER_OUTPUT) @@ -699,6 +692,7 @@ $(OUTPUT)/$(TRUNNER_BINARY): | $(TRUNNER_BPF_OBJS) $(OUTPUT)/$(TRUNNER_BINARY): $(TRUNNER_TEST_OBJS) \ $(TRUNNER_EXTRA_OBJS) $$(BPFOBJ) \ + $(TRUNNER_LIB_OBJS) \ $(RESOLVE_BTFIDS) \ $(TRUNNER_BPFTOOL) \ $(OUTPUT)/veristat \ @@ -721,7 +715,8 @@ $(VERIFICATION_CERT) $(PRIVATE_KEY): $(VERIFY_SIG_SETUP) $(Q)$(VERIFY_SIG_SETUP) genkey $(BUILD_DIR) $(VERIFY_SIG_HDR): $(VERIFICATION_CERT) - $(Q)xxd -i -n test_progs_verification_cert $< > $@ + $(Q)ln -fs $< test_progs_verification_cert && \ + xxd -i test_progs_verification_cert > $@ # Define test_progs test runner. TRUNNER_TESTS_DIR := prog_tests @@ -745,6 +740,7 @@ TRUNNER_EXTRA_SOURCES := test_progs.c \ $(VERIFY_SIG_HDR) \ flow_dissector_load.h \ ip_check_defrag_frags.h +TRUNNER_LIB_SOURCES := find_bit.c TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read \ $(OUTPUT)/liburandom_read.so \ $(OUTPUT)/xdp_synproxy \ @@ -782,6 +778,7 @@ endif TRUNNER_TESTS_DIR := map_tests TRUNNER_BPF_PROGS_DIR := progs TRUNNER_EXTRA_SOURCES := test_maps.c +TRUNNER_LIB_SOURCES := TRUNNER_EXTRA_FILES := TRUNNER_BPF_BUILD_RULE := $$(error no BPF objects should be built) TRUNNER_BPF_CFLAGS := @@ -803,7 +800,7 @@ $(OUTPUT)/test_verifier: test_verifier.c verifier/tests.h $(BPFOBJ) | $(OUTPUT) $(Q)$(CC) $(CFLAGS) $(filter %.a %.o %.c,$^) $(LDLIBS) -o $@ # Include find_bit.c to compile xskxceiver. 
-EXTRA_SRC := $(TOOLSDIR)/lib/find_bit.c +EXTRA_SRC := $(TOOLSDIR)/lib/find_bit.c prog_tests/test_xsk.c prog_tests/test_xsk.h $(OUTPUT)/xskxceiver: $(EXTRA_SRC) xskxceiver.c xskxceiver.h $(OUTPUT)/network_helpers.o $(OUTPUT)/xsk.o $(OUTPUT)/xsk_xdp_progs.skel.h $(BPFOBJ) | $(OUTPUT) $(call msg,BINARY,,$@) $(Q)$(CC) $(CFLAGS) $(filter %.a %.o %.c,$^) $(LDLIBS) -o $@ @@ -893,7 +890,8 @@ EXTRA_CLEAN := $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \ $(addprefix $(OUTPUT)/,*.o *.d *.skel.h *.lskel.h *.subskel.h \ no_alu32 cpuv4 bpf_gcc \ liburandom_read.so) \ - $(OUTPUT)/FEATURE-DUMP.selftests + $(OUTPUT)/FEATURE-DUMP.selftests \ + test_progs_verification_cert .PHONY: docs docs-clean diff --git a/tools/testing/selftests/bpf/benchs/bench_ringbufs.c b/tools/testing/selftests/bpf/benchs/bench_ringbufs.c index e1ee979e6acc..01bdce692799 100644 --- a/tools/testing/selftests/bpf/benchs/bench_ringbufs.c +++ b/tools/testing/selftests/bpf/benchs/bench_ringbufs.c @@ -19,6 +19,8 @@ static struct { int ringbuf_sz; /* per-ringbuf, in bytes */ bool ringbuf_use_output; /* use slower output API */ int perfbuf_sz; /* per-CPU size, in pages */ + bool overwrite; + bool bench_producer; } args = { .back2back = false, .batch_cnt = 500, @@ -27,6 +29,8 @@ static struct { .ringbuf_sz = 512 * 1024, .ringbuf_use_output = false, .perfbuf_sz = 128, + .overwrite = false, + .bench_producer = false, }; enum { @@ -35,6 +39,8 @@ enum { ARG_RB_BATCH_CNT = 2002, ARG_RB_SAMPLED = 2003, ARG_RB_SAMPLE_RATE = 2004, + ARG_RB_OVERWRITE = 2005, + ARG_RB_BENCH_PRODUCER = 2006, }; static const struct argp_option opts[] = { @@ -43,6 +49,8 @@ static const struct argp_option opts[] = { { "rb-batch-cnt", ARG_RB_BATCH_CNT, "CNT", 0, "Set BPF-side record batch count"}, { "rb-sampled", ARG_RB_SAMPLED, NULL, 0, "Notification sampling"}, { "rb-sample-rate", ARG_RB_SAMPLE_RATE, "RATE", 0, "Notification sample rate"}, + { "rb-overwrite", ARG_RB_OVERWRITE, NULL, 0, "Overwrite mode"}, + { "rb-bench-producer", ARG_RB_BENCH_PRODUCER, 
NULL, 0, "Benchmark producer"}, {}, }; @@ -72,6 +80,12 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) argp_usage(state); } break; + case ARG_RB_OVERWRITE: + args.overwrite = true; + break; + case ARG_RB_BENCH_PRODUCER: + args.bench_producer = true; + break; default: return ARGP_ERR_UNKNOWN; } @@ -95,8 +109,33 @@ static inline void bufs_trigger_batch(void) static void bufs_validate(void) { - if (env.consumer_cnt != 1) { - fprintf(stderr, "rb-libbpf benchmark needs one consumer!\n"); + if (args.bench_producer && strcmp(env.bench_name, "rb-libbpf")) { + fprintf(stderr, "--rb-bench-producer only works with rb-libbpf!\n"); + exit(1); + } + + if (args.overwrite && !args.bench_producer) { + fprintf(stderr, "overwrite mode only works with --rb-bench-producer for now!\n"); + exit(1); + } + + if (args.bench_producer && env.consumer_cnt != 0) { + fprintf(stderr, "no consumer is needed for --rb-bench-producer!\n"); + exit(1); + } + + if (args.bench_producer && args.back2back) { + fprintf(stderr, "back-to-back mode makes no sense for --rb-bench-producer!\n"); + exit(1); + } + + if (args.bench_producer && args.sampled) { + fprintf(stderr, "sampling mode makes no sense for --rb-bench-producer!\n"); + exit(1); + } + + if (!args.bench_producer && env.consumer_cnt != 1) { + fprintf(stderr, "benchmarks without --rb-bench-producer require exactly one consumer!\n"); exit(1); } @@ -128,12 +167,17 @@ static void ringbuf_libbpf_measure(struct bench_res *res) { struct ringbuf_libbpf_ctx *ctx = &ringbuf_libbpf_ctx; - res->hits = atomic_swap(&buf_hits.value, 0); + if (args.bench_producer) + res->hits = atomic_swap(&ctx->skel->bss->hits, 0); + else + res->hits = atomic_swap(&buf_hits.value, 0); res->drops = atomic_swap(&ctx->skel->bss->dropped, 0); } static struct ringbuf_bench *ringbuf_setup_skeleton(void) { + __u32 flags; + struct bpf_map *ringbuf; struct ringbuf_bench *skel; setup_libbpf(); @@ -146,12 +190,19 @@ static struct ringbuf_bench 
*ringbuf_setup_skeleton(void) skel->rodata->batch_cnt = args.batch_cnt; skel->rodata->use_output = args.ringbuf_use_output ? 1 : 0; + skel->rodata->bench_producer = args.bench_producer; if (args.sampled) /* record data + header take 16 bytes */ skel->rodata->wakeup_data_size = args.sample_rate * 16; - bpf_map__set_max_entries(skel->maps.ringbuf, args.ringbuf_sz); + ringbuf = skel->maps.ringbuf; + if (args.overwrite) { + flags = bpf_map__map_flags(ringbuf) | BPF_F_RB_OVERWRITE; + bpf_map__set_map_flags(ringbuf, flags); + } + + bpf_map__set_max_entries(ringbuf, args.ringbuf_sz); if (ringbuf_bench__load(skel)) { fprintf(stderr, "failed to load skeleton\n"); @@ -171,10 +222,12 @@ static void ringbuf_libbpf_setup(void) { struct ringbuf_libbpf_ctx *ctx = &ringbuf_libbpf_ctx; struct bpf_link *link; + int map_fd; ctx->skel = ringbuf_setup_skeleton(); - ctx->ringbuf = ring_buffer__new(bpf_map__fd(ctx->skel->maps.ringbuf), - buf_process_sample, NULL, NULL); + + map_fd = bpf_map__fd(ctx->skel->maps.ringbuf); + ctx->ringbuf = ring_buffer__new(map_fd, buf_process_sample, NULL, NULL); if (!ctx->ringbuf) { fprintf(stderr, "failed to create ringbuf\n"); exit(1); diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c index 1e2aff007c2a..34018fc3927f 100644 --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -180,10 +180,10 @@ static void trigger_kernel_count_setup(void) { setup_ctx(); bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false); - bpf_program__set_autoload(ctx.skel->progs.trigger_count, true); + bpf_program__set_autoload(ctx.skel->progs.trigger_kernel_count, true); load_ctx(); /* override driver program */ - ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_count); + ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_kernel_count); } static void trigger_kprobe_setup(void) diff --git 
a/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh b/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh index 91e3567962ff..83e05e837871 100755 --- a/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh +++ b/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh @@ -49,3 +49,7 @@ for b in 1 2 3 4 8 12 16 20 24 28 32 36 40 44 48 52; do summarize "rb-libbpf nr_prod $b" "$($RUN_RB_BENCH -p$b --rb-batch-cnt 50 rb-libbpf)" done +header "Ringbuf, multi-producer contention in overwrite mode, no consumer" +for b in 1 2 3 4 8 12 16 20 24 28 32 36 40 44 48 52; do + summarize "rb-prod nr_prod $b" "$($RUN_BENCH -p$b --rb-batch-cnt 50 --rb-overwrite --rb-bench-producer rb-libbpf)" +done diff --git a/tools/testing/selftests/bpf/bpf_arena_list.h b/tools/testing/selftests/bpf/bpf_arena_list.h index 85dbc3ea4da5..e16fa7d95fcf 100644 --- a/tools/testing/selftests/bpf/bpf_arena_list.h +++ b/tools/testing/selftests/bpf/bpf_arena_list.h @@ -64,14 +64,12 @@ static inline void list_add_head(arena_list_node_t *n, arena_list_head_t *h) static inline void __list_del(arena_list_node_t *n) { - arena_list_node_t *next = n->next, *tmp; + arena_list_node_t *next = n->next; arena_list_node_t * __arena *pprev = n->pprev; cast_user(next); cast_kern(pprev); - tmp = *pprev; - cast_kern(tmp); - WRITE_ONCE(tmp, next); + WRITE_ONCE(*pprev, next); if (next) { cast_user(pprev); cast_kern(next); diff --git a/tools/testing/selftests/bpf/bpf_arena_strsearch.h b/tools/testing/selftests/bpf/bpf_arena_strsearch.h new file mode 100644 index 000000000000..c1b6eaa905bb --- /dev/null +++ b/tools/testing/selftests/bpf/bpf_arena_strsearch.h @@ -0,0 +1,128 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
*/ +#pragma once +#include "bpf_arena_common.h" + +__noinline int bpf_arena_strlen(const char __arena *s __arg_arena) +{ + const char __arena *sc; + + for (sc = s; *sc != '\0'; ++sc) + cond_break; + return sc - s; +} + +/** + * glob_match - Shell-style pattern matching, like !fnmatch(pat, str, 0) + * @pat: Shell-style pattern to match, e.g. "*.[ch]". + * @str: String to match. The pattern must match the entire string. + * + * Perform shell-style glob matching, returning true (1) if the match + * succeeds, or false (0) if it fails. Equivalent to !fnmatch(@pat, @str, 0). + * + * Pattern metacharacters are ?, *, [ and \. + * (And, inside character classes, !, - and ].) + * + * This is small and simple implementation intended for device blacklists + * where a string is matched against a number of patterns. Thus, it + * does not preprocess the patterns. It is non-recursive, and run-time + * is at most quadratic: strlen(@str)*strlen(@pat). + * + * An example of the worst case is glob_match("*aaaaa", "aaaaaaaaaa"); + * it takes 6 passes over the pattern before matching the string. + * + * Like !fnmatch(@pat, @str, 0) and unlike the shell, this does NOT + * treat / or leading . specially; it isn't actually used for pathnames. + * + * Note that according to glob(7) (and unlike bash), character classes + * are complemented by a leading !; this does not support the regex-style + * [^a-z] syntax. + * + * An opening bracket without a matching close is matched literally. + */ +__noinline bool glob_match(char const __arena *pat __arg_arena, char const __arena *str __arg_arena) +{ + /* + * Backtrack to previous * on mismatch and retry starting one + * character later in the string. Because * matches all characters + * (no exception for /), it can be easily proved that there's + * never a need to backtrack multiple levels. 
+ */ + char const __arena *back_pat = NULL, *back_str; + + /* + * Loop over each token (character or class) in pat, matching + * it against the remaining unmatched tail of str. Return false + * on mismatch, or true after matching the trailing nul bytes. + */ + for (;;) { + unsigned char c = *str++; + unsigned char d = *pat++; + + switch (d) { + case '?': /* Wildcard: anything but nul */ + if (c == '\0') + return false; + break; + case '*': /* Any-length wildcard */ + if (*pat == '\0') /* Optimize trailing * case */ + return true; + back_pat = pat; + back_str = --str; /* Allow zero-length match */ + break; + case '[': { /* Character class */ + bool match = false, inverted = (*pat == '!'); + char const __arena *class = pat + inverted; + unsigned char a = *class++; + + /* + * Iterate over each span in the character class. + * A span is either a single character a, or a + * range a-b. The first span may begin with ']'. + */ + do { + unsigned char b = a; + + if (a == '\0') /* Malformed */ + goto literal; + + if (class[0] == '-' && class[1] != ']') { + b = class[1]; + + if (b == '\0') + goto literal; + + class += 2; + /* Any special action if a > b? */ + } + match |= (a <= c && c <= b); + cond_break; + } while ((a = *class++) != ']'); + + if (match == inverted) + goto backtrack; + pat = class; + } + break; + case '\\': + d = *pat++; + __attribute__((__fallthrough__)); + default: /* Literal character */ +literal: + if (c == d) { + if (d == '\0') + return true; + break; + } +backtrack: + if (c == '\0' || !back_pat) + return false; /* No point continuing */ + /* Try again from last *, one character later in str. 
*/ + pat = back_pat; + str = ++back_str; + break; + } + cond_break; + } + return false; +} diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h index 794d44d19c88..e0189254bb6e 100644 --- a/tools/testing/selftests/bpf/bpf_kfuncs.h +++ b/tools/testing/selftests/bpf/bpf_kfuncs.h @@ -28,8 +28,8 @@ extern int bpf_dynptr_from_skb_meta(struct __sk_buff *skb, __u64 flags, * Either a direct pointer to the dynptr data or a pointer to the user-provided * buffer if unable to obtain a direct pointer */ -extern void *bpf_dynptr_slice(const struct bpf_dynptr *ptr, __u32 offset, - void *buffer, __u32 buffer__szk) __ksym __weak; +extern void *bpf_dynptr_slice(const struct bpf_dynptr *ptr, __u64 offset, + void *buffer, __u64 buffer__szk) __ksym __weak; /* Description * Obtain a read-write pointer to the dynptr's data @@ -37,13 +37,13 @@ extern void *bpf_dynptr_slice(const struct bpf_dynptr *ptr, __u32 offset, * Either a direct pointer to the dynptr data or a pointer to the user-provided * buffer if unable to obtain a direct pointer */ -extern void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *ptr, __u32 offset, - void *buffer, __u32 buffer__szk) __ksym __weak; +extern void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *ptr, __u64 offset, void *buffer, + __u64 buffer__szk) __ksym __weak; -extern int bpf_dynptr_adjust(const struct bpf_dynptr *ptr, __u32 start, __u32 end) __ksym __weak; +extern int bpf_dynptr_adjust(const struct bpf_dynptr *ptr, __u64 start, __u64 end) __ksym __weak; extern bool bpf_dynptr_is_null(const struct bpf_dynptr *ptr) __ksym __weak; extern bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *ptr) __ksym __weak; -extern __u32 bpf_dynptr_size(const struct bpf_dynptr *ptr) __ksym __weak; +extern __u64 bpf_dynptr_size(const struct bpf_dynptr *ptr) __ksym __weak; extern int bpf_dynptr_clone(const struct bpf_dynptr *ptr, struct bpf_dynptr *clone__init) __ksym __weak; /* Description diff --git 
a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index f2a2fd236ca8..558839e3c185 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -126,3 +126,8 @@ CONFIG_XDP_SOCKETS=y CONFIG_XFRM_INTERFACE=y CONFIG_TCP_CONG_DCTCP=y CONFIG_TCP_CONG_BBR=y +CONFIG_INFINIBAND=y +CONFIG_SMC=y +CONFIG_SMC_HS_CTRL_BPF=y +CONFIG_DIBS=y +CONFIG_DIBS_LO=y
\ No newline at end of file diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index cdf7b6641444..0a6a5561bed3 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -97,7 +97,7 @@ int settimeo(int fd, int timeout_ms) int start_server_addr(int type, const struct sockaddr_storage *addr, socklen_t addrlen, const struct network_helper_opts *opts) { - int fd; + int on = 1, fd; if (!opts) opts = &default_opts; @@ -111,6 +111,12 @@ int start_server_addr(int type, const struct sockaddr_storage *addr, socklen_t a if (settimeo(fd, opts->timeout_ms)) goto error_close; + if (type == SOCK_STREAM && + setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on))) { + log_err("Failed to enable SO_REUSEADDR"); + goto error_close; + } + if (opts->post_socket_cb && opts->post_socket_cb(fd, opts->cb_opts)) { log_err("Failed to call post_socket_cb"); @@ -766,6 +772,50 @@ int send_recv_data(int lfd, int fd, uint32_t total_bytes) return err; } +int tc_prog_attach(const char *dev, int ingress_fd, int egress_fd) +{ + int ifindex, ret; + + if (!ASSERT_TRUE(ingress_fd >= 0 || egress_fd >= 0, + "at least one program fd is valid")) + return -1; + + ifindex = if_nametoindex(dev); + if (!ASSERT_NEQ(ifindex, 0, "get ifindex")) + return -1; + + DECLARE_LIBBPF_OPTS(bpf_tc_hook, hook, .ifindex = ifindex, + .attach_point = BPF_TC_INGRESS | BPF_TC_EGRESS); + DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts1, .handle = 1, + .priority = 1, .prog_fd = ingress_fd); + DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts2, .handle = 1, + .priority = 1, .prog_fd = egress_fd); + + ret = bpf_tc_hook_create(&hook); + if (!ASSERT_OK(ret, "create tc hook")) + return ret; + + if (ingress_fd >= 0) { + hook.attach_point = BPF_TC_INGRESS; + ret = bpf_tc_attach(&hook, &opts1); + if (!ASSERT_OK(ret, "bpf_tc_attach")) { + bpf_tc_hook_destroy(&hook); + return ret; + } + } + + if (egress_fd >= 0) { + hook.attach_point = BPF_TC_EGRESS; 
+ ret = bpf_tc_attach(&hook, &opts2); + if (!ASSERT_OK(ret, "bpf_tc_attach")) { + bpf_tc_hook_destroy(&hook); + return ret; + } + } + + return 0; +} + #ifdef TRAFFIC_MONITOR struct tmonitor_ctx { pcap_t *pcap; diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index ef208eefd571..79a010c88e11 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -255,6 +255,22 @@ struct tmonitor_ctx; typedef int (*tm_print_fn_t)(const char *format, va_list args); +/** + * tc_prog_attach - attach BPF program(s) to an interface + * + * Takes file descriptors pointing to at least one, at most two BPF + * programs, and attach those programs to an interface ingress, egress or + * both. + * + * @dev: string containing the interface name + * @ingress_fd: file descriptor of the program to attach to interface ingress + * @egress_fd: file descriptor of the program to attach to interface egress + * + * Returns 0 on success, -1 if no valid file descriptor has been found, if + * the interface name is invalid or if an error ocurred during attach. + */ +int tc_prog_attach(const char *dev, int ingress_fd, int egress_fd); + #ifdef TRAFFIC_MONITOR struct tmonitor_ctx *traffic_monitor_start(const char *netns, const char *test_name, const char *subtest_name); diff --git a/tools/testing/selftests/bpf/prog_tests/arena_strsearch.c b/tools/testing/selftests/bpf/prog_tests/arena_strsearch.c new file mode 100644 index 000000000000..f81a0c066505 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/arena_strsearch.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
*/ +#include <test_progs.h> +#include "arena_strsearch.skel.h" + +static void test_arena_str(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + struct arena_strsearch *skel; + int ret; + + skel = arena_strsearch__open_and_load(); + if (!ASSERT_OK_PTR(skel, "arena_strsearch__open_and_load")) + return; + + ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_strsearch), &opts); + ASSERT_OK(ret, "ret_add"); + ASSERT_OK(opts.retval, "retval"); + if (skel->bss->skip) { + printf("%s:SKIP:compiler doesn't support arena_cast\n", __func__); + test__skip(); + } + arena_strsearch__destroy(skel); +} + +void test_arena_strsearch(void) +{ + if (test__start_subtest("arena_strsearch")) + test_arena_str(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_gotox.c b/tools/testing/selftests/bpf/prog_tests/bpf_gotox.c new file mode 100644 index 000000000000..d138cc7b1bda --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/bpf_gotox.c @@ -0,0 +1,292 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <test_progs.h> + +#include <linux/if_ether.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/in6.h> +#include <linux/udp.h> +#include <linux/tcp.h> + +#include <sys/syscall.h> +#include <bpf/bpf.h> + +#include "bpf_gotox.skel.h" + +static void __test_run(struct bpf_program *prog, void *ctx_in, size_t ctx_size_in) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts, + .ctx_in = ctx_in, + .ctx_size_in = ctx_size_in, + ); + int err, prog_fd; + + prog_fd = bpf_program__fd(prog); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "test_run_opts err"); +} + +static void __subtest(struct bpf_gotox *skel, void (*check)(struct bpf_gotox *)) +{ + if (skel->data->skip) + test__skip(); + else + check(skel); +} + +static void check_simple(struct bpf_gotox *skel, + struct bpf_program *prog, + __u64 ctx_in, + __u64 expected) +{ + skel->bss->ret_user = 0; + + __test_run(prog, &ctx_in, sizeof(ctx_in)); + + if 
(!ASSERT_EQ(skel->bss->ret_user, expected, "skel->bss->ret_user")) + return; +} + +static void check_simple_fentry(struct bpf_gotox *skel, + struct bpf_program *prog, + __u64 ctx_in, + __u64 expected) +{ + skel->bss->in_user = ctx_in; + skel->bss->ret_user = 0; + + /* trigger */ + usleep(1); + + if (!ASSERT_EQ(skel->bss->ret_user, expected, "skel->bss->ret_user")) + return; +} + +/* validate that for two loads of the same jump table libbpf generates only one map */ +static void check_one_map_two_jumps(struct bpf_gotox *skel) +{ + struct bpf_prog_info prog_info; + struct bpf_map_info map_info; + __u32 len; + __u32 map_ids[16]; + int prog_fd, map_fd; + int ret; + int i; + bool seen = false; + + memset(&prog_info, 0, sizeof(prog_info)); + prog_info.map_ids = (long)map_ids; + prog_info.nr_map_ids = ARRAY_SIZE(map_ids); + prog_fd = bpf_program__fd(skel->progs.one_map_two_jumps); + if (!ASSERT_GE(prog_fd, 0, "bpf_program__fd(one_map_two_jumps)")) + return; + + len = sizeof(prog_info); + ret = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &len); + if (!ASSERT_OK(ret, "bpf_obj_get_info_by_fd(prog_fd)")) + return; + + for (i = 0; i < prog_info.nr_map_ids; i++) { + map_fd = bpf_map_get_fd_by_id(map_ids[i]); + if (!ASSERT_GE(map_fd, 0, "bpf_map_get_fd_by_id")) + return; + + len = sizeof(map_info); + memset(&map_info, 0, len); + ret = bpf_obj_get_info_by_fd(map_fd, &map_info, &len); + if (!ASSERT_OK(ret, "bpf_obj_get_info_by_fd(map_fd)")) { + close(map_fd); + return; + } + + if (map_info.type == BPF_MAP_TYPE_INSN_ARRAY) { + if (!ASSERT_EQ(seen, false, "more than one INSN_ARRAY map")) { + close(map_fd); + return; + } + seen = true; + } + close(map_fd); + } + + ASSERT_EQ(seen, true, "no INSN_ARRAY map"); +} + +static void check_one_switch(struct bpf_gotox *skel) +{ + __u64 in[] = {0, 1, 2, 3, 4, 5, 77}; + __u64 out[] = {2, 3, 4, 5, 7, 19, 19}; + int i; + + for (i = 0; i < ARRAY_SIZE(in); i++) + check_simple(skel, skel->progs.one_switch, in[i], out[i]); +} + +static void 
check_one_switch_non_zero_sec_off(struct bpf_gotox *skel) +{ + __u64 in[] = {0, 1, 2, 3, 4, 5, 77}; + __u64 out[] = {2, 3, 4, 5, 7, 19, 19}; + int i; + + for (i = 0; i < ARRAY_SIZE(in); i++) + check_simple(skel, skel->progs.one_switch_non_zero_sec_off, in[i], out[i]); +} + +static void check_two_switches(struct bpf_gotox *skel) +{ + __u64 in[] = {0, 1, 2, 3, 4, 5, 77}; + __u64 out[] = {103, 104, 107, 205, 115, 1019, 1019}; + int i; + + for (i = 0; i < ARRAY_SIZE(in); i++) + check_simple(skel, skel->progs.two_switches, in[i], out[i]); +} + +static void check_big_jump_table(struct bpf_gotox *skel) +{ + __u64 in[] = {0, 11, 27, 31, 22, 45, 99}; + __u64 out[] = {2, 3, 4, 5, 19, 19, 19}; + int i; + + for (i = 0; i < ARRAY_SIZE(in); i++) + check_simple(skel, skel->progs.big_jump_table, in[i], out[i]); +} + +static void check_one_jump_two_maps(struct bpf_gotox *skel) +{ + __u64 in[] = {0, 1, 2, 3, 4, 5, 77}; + __u64 out[] = {12, 15, 7 , 15, 12, 15, 15}; + int i; + + for (i = 0; i < ARRAY_SIZE(in); i++) + check_simple(skel, skel->progs.one_jump_two_maps, in[i], out[i]); +} + +static void check_static_global(struct bpf_gotox *skel) +{ + __u64 in[] = {0, 1, 2, 3, 4, 5, 77}; + __u64 out[] = {2, 3, 4, 5, 7, 19, 19}; + int i; + + for (i = 0; i < ARRAY_SIZE(in); i++) + check_simple(skel, skel->progs.use_static_global1, in[i], out[i]); + for (i = 0; i < ARRAY_SIZE(in); i++) + check_simple(skel, skel->progs.use_static_global2, in[i], out[i]); +} + +static void check_nonstatic_global(struct bpf_gotox *skel) +{ + __u64 in[] = {0, 1, 2, 3, 4, 5, 77}; + __u64 out[] = {2, 3, 4, 5, 7, 19, 19}; + int i; + + for (i = 0; i < ARRAY_SIZE(in); i++) + check_simple(skel, skel->progs.use_nonstatic_global1, in[i], out[i]); + + for (i = 0; i < ARRAY_SIZE(in); i++) + check_simple(skel, skel->progs.use_nonstatic_global2, in[i], out[i]); +} + +static void check_other_sec(struct bpf_gotox *skel) +{ + struct bpf_link *link; + __u64 in[] = {0, 1, 2, 3, 4, 5, 77}; + __u64 out[] = {2, 3, 4, 5, 7, 19, 19}; 
+ int i; + + link = bpf_program__attach(skel->progs.simple_test_other_sec); + if (!ASSERT_OK_PTR(link, "link")) + return; + + for (i = 0; i < ARRAY_SIZE(in); i++) + check_simple_fentry(skel, skel->progs.simple_test_other_sec, in[i], out[i]); + + bpf_link__destroy(link); +} + +static void check_static_global_other_sec(struct bpf_gotox *skel) +{ + struct bpf_link *link; + __u64 in[] = {0, 1, 2, 3, 4, 5, 77}; + __u64 out[] = {2, 3, 4, 5, 7, 19, 19}; + int i; + + link = bpf_program__attach(skel->progs.use_static_global_other_sec); + if (!ASSERT_OK_PTR(link, "link")) + return; + + for (i = 0; i < ARRAY_SIZE(in); i++) + check_simple_fentry(skel, skel->progs.use_static_global_other_sec, in[i], out[i]); + + bpf_link__destroy(link); +} + +static void check_nonstatic_global_other_sec(struct bpf_gotox *skel) +{ + struct bpf_link *link; + __u64 in[] = {0, 1, 2, 3, 4, 5, 77}; + __u64 out[] = {2, 3, 4, 5, 7, 19, 19}; + int i; + + link = bpf_program__attach(skel->progs.use_nonstatic_global_other_sec); + if (!ASSERT_OK_PTR(link, "link")) + return; + + for (i = 0; i < ARRAY_SIZE(in); i++) + check_simple_fentry(skel, skel->progs.use_nonstatic_global_other_sec, in[i], out[i]); + + bpf_link__destroy(link); +} + +void test_bpf_gotox(void) +{ + struct bpf_gotox *skel; + int ret; + + skel = bpf_gotox__open(); + if (!ASSERT_NEQ(skel, NULL, "bpf_gotox__open")) + return; + + ret = bpf_gotox__load(skel); + if (!ASSERT_OK(ret, "bpf_gotox__load")) + return; + + skel->bss->pid = getpid(); + + if (test__start_subtest("one-switch")) + __subtest(skel, check_one_switch); + + if (test__start_subtest("one-switch-non-zero-sec-offset")) + __subtest(skel, check_one_switch_non_zero_sec_off); + + if (test__start_subtest("two-switches")) + __subtest(skel, check_two_switches); + + if (test__start_subtest("big-jump-table")) + __subtest(skel, check_big_jump_table); + + if (test__start_subtest("static-global")) + __subtest(skel, check_static_global); + + if (test__start_subtest("nonstatic-global")) + 
__subtest(skel, check_nonstatic_global); + + if (test__start_subtest("other-sec")) + __subtest(skel, check_other_sec); + + if (test__start_subtest("static-global-other-sec")) + __subtest(skel, check_static_global_other_sec); + + if (test__start_subtest("nonstatic-global-other-sec")) + __subtest(skel, check_nonstatic_global_other_sec); + + if (test__start_subtest("one-jump-two-maps")) + __subtest(skel, check_one_jump_two_maps); + + if (test__start_subtest("one-map-two-jumps")) + __subtest(skel, check_one_map_two_jumps); + + bpf_gotox__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_insn_array.c b/tools/testing/selftests/bpf/prog_tests/bpf_insn_array.c new file mode 100644 index 000000000000..269870bec941 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/bpf_insn_array.c @@ -0,0 +1,504 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <bpf/bpf.h> +#include <test_progs.h> + +#ifdef __x86_64__ +static int map_create(__u32 map_type, __u32 max_entries) +{ + const char *map_name = "insn_array"; + __u32 key_size = 4; + __u32 value_size = sizeof(struct bpf_insn_array_value); + + return bpf_map_create(map_type, map_name, key_size, value_size, max_entries, NULL); +} + +static int prog_load(struct bpf_insn *insns, __u32 insn_cnt, int *fd_array, __u32 fd_array_cnt) +{ + LIBBPF_OPTS(bpf_prog_load_opts, opts); + + opts.fd_array = fd_array; + opts.fd_array_cnt = fd_array_cnt; + + return bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "GPL", insns, insn_cnt, &opts); +} + +static void __check_success(struct bpf_insn *insns, __u32 insn_cnt, __u32 *map_in, __u32 *map_out) +{ + struct bpf_insn_array_value val = {}; + int prog_fd = -1, map_fd, i; + + map_fd = map_create(BPF_MAP_TYPE_INSN_ARRAY, insn_cnt); + if (!ASSERT_GE(map_fd, 0, "map_create")) + return; + + for (i = 0; i < insn_cnt; i++) { + val.orig_off = map_in[i]; + if (!ASSERT_EQ(bpf_map_update_elem(map_fd, &i, &val, 0), 0, "bpf_map_update_elem")) + goto cleanup; + } + + if 
(!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze")) + goto cleanup; + + prog_fd = prog_load(insns, insn_cnt, &map_fd, 1); + if (!ASSERT_GE(prog_fd, 0, "bpf(BPF_PROG_LOAD)")) + goto cleanup; + + for (i = 0; i < insn_cnt; i++) { + char buf[64]; + + if (!ASSERT_EQ(bpf_map_lookup_elem(map_fd, &i, &val), 0, "bpf_map_lookup_elem")) + goto cleanup; + + snprintf(buf, sizeof(buf), "val.xlated_off should be equal map_out[%d]", i); + ASSERT_EQ(val.xlated_off, map_out[i], buf); + } + +cleanup: + close(prog_fd); + close(map_fd); +} + +/* + * Load a program, which will not be anyhow mangled by the verifier. Add an + * insn_array map pointing to every instruction. Check that it hasn't changed + * after the program load. + */ +static void check_one_to_one_mapping(void) +{ + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 4), + BPF_MOV64_IMM(BPF_REG_0, 3), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + __u32 map_in[] = {0, 1, 2, 3, 4, 5}; + __u32 map_out[] = {0, 1, 2, 3, 4, 5}; + + __check_success(insns, ARRAY_SIZE(insns), map_in, map_out); +} + +/* + * Load a program with two patches (get jiffies, for simplicity). Add an + * insn_array map pointing to every instruction. Check how it was changed + * after the program load. + */ +static void check_simple(void) +{ + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_jiffies64), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_jiffies64), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + __u32 map_in[] = {0, 1, 2, 3, 4, 5}; + __u32 map_out[] = {0, 1, 4, 5, 8, 9}; + + __check_success(insns, ARRAY_SIZE(insns), map_in, map_out); +} + +/* + * Verifier can delete code in two cases: nops & dead code. 
From insn + * array's point of view, the two cases are the same, so test using + * the simplest method: by loading some nops + */ +static void check_deletions(void) +{ + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_JMP_IMM(BPF_JA, 0, 0, 0), /* nop */ + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_JMP_IMM(BPF_JA, 0, 0, 0), /* nop */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + __u32 map_in[] = {0, 1, 2, 3, 4, 5}; + __u32 map_out[] = {0, -1, 1, -1, 2, 3}; + + __check_success(insns, ARRAY_SIZE(insns), map_in, map_out); +} + +/* + * Same test as check_deletions, but also add code which adds instructions + */ +static void check_deletions_with_functions(void) +{ + struct bpf_insn insns[] = { + BPF_JMP_IMM(BPF_JA, 0, 0, 0), /* nop */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_jiffies64), + BPF_JMP_IMM(BPF_JA, 0, 0, 0), /* nop */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_JMP_IMM(BPF_JA, 0, 0, 0), /* nop */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_jiffies64), + BPF_JMP_IMM(BPF_JA, 0, 0, 0), /* nop */ + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + }; + __u32 map_in[] = { 0, 1, 2, 3, 4, 5, /* func */ 6, 7, 8, 9, 10}; + __u32 map_out[] = {-1, 0, -1, 3, 4, 5, /* func */ -1, 6, -1, 9, 10}; + + __check_success(insns, ARRAY_SIZE(insns), map_in, map_out); +} + +/* + * Try to load a program with a map which points to outside of the program + */ +static void check_out_of_bounds_index(void) +{ + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 4), + BPF_MOV64_IMM(BPF_REG_0, 3), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + int prog_fd, map_fd; + struct bpf_insn_array_value val = {}; + int key; + + map_fd = map_create(BPF_MAP_TYPE_INSN_ARRAY, 1); + if (!ASSERT_GE(map_fd, 0, "map_create")) + return; + + key = 0; + val.orig_off = ARRAY_SIZE(insns); /* too big */ + if 
(!ASSERT_EQ(bpf_map_update_elem(map_fd, &key, &val, 0), 0, "bpf_map_update_elem")) + goto cleanup; + + if (!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze")) + goto cleanup; + + prog_fd = prog_load(insns, ARRAY_SIZE(insns), &map_fd, 1); + if (!ASSERT_EQ(prog_fd, -EINVAL, "program should have been rejected (prog_fd != -EINVAL)")) { + close(prog_fd); + goto cleanup; + } + +cleanup: + close(map_fd); +} + +/* + * Try to load a program with a map which points to the middle of 16-byte insn + */ +static void check_mid_insn_index(void) +{ + struct bpf_insn insns[] = { + BPF_LD_IMM64(BPF_REG_0, 0), /* 2 x 8 */ + BPF_EXIT_INSN(), + }; + int prog_fd, map_fd; + struct bpf_insn_array_value val = {}; + int key; + + map_fd = map_create(BPF_MAP_TYPE_INSN_ARRAY, 1); + if (!ASSERT_GE(map_fd, 0, "map_create")) + return; + + key = 0; + val.orig_off = 1; /* middle of 16-byte instruction */ + if (!ASSERT_EQ(bpf_map_update_elem(map_fd, &key, &val, 0), 0, "bpf_map_update_elem")) + goto cleanup; + + if (!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze")) + goto cleanup; + + prog_fd = prog_load(insns, ARRAY_SIZE(insns), &map_fd, 1); + if (!ASSERT_EQ(prog_fd, -EINVAL, "program should have been rejected (prog_fd != -EINVAL)")) { + close(prog_fd); + goto cleanup; + } + +cleanup: + close(map_fd); +} + +static void check_incorrect_index(void) +{ + check_out_of_bounds_index(); + check_mid_insn_index(); +} + +static int set_bpf_jit_harden(char *level) +{ + char old_level; + int err = -1; + int fd = -1; + + fd = open("/proc/sys/net/core/bpf_jit_harden", O_RDWR | O_NONBLOCK); + if (fd < 0) { + ASSERT_FAIL("open .../bpf_jit_harden returned %d (errno=%d)", fd, errno); + return -1; + } + + err = read(fd, &old_level, 1); + if (err != 1) { + ASSERT_FAIL("read from .../bpf_jit_harden returned %d (errno=%d)", err, errno); + err = -1; + goto end; + } + + lseek(fd, 0, SEEK_SET); + + err = write(fd, level, 1); + if (err != 1) { + ASSERT_FAIL("write to .../bpf_jit_harden returned %d (errno=%d)", err, 
errno); + err = -1; + goto end; + } + + err = 0; + *level = old_level; +end: + if (fd >= 0) + close(fd); + return err; +} + +static void check_blindness(void) +{ + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 4), + BPF_MOV64_IMM(BPF_REG_0, 3), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }; + int prog_fd = -1, map_fd; + struct bpf_insn_array_value val = {}; + char bpf_jit_harden = '@'; /* non-existing value */ + int i; + + map_fd = map_create(BPF_MAP_TYPE_INSN_ARRAY, ARRAY_SIZE(insns)); + if (!ASSERT_GE(map_fd, 0, "map_create")) + return; + + for (i = 0; i < ARRAY_SIZE(insns); i++) { + val.orig_off = i; + if (!ASSERT_EQ(bpf_map_update_elem(map_fd, &i, &val, 0), 0, "bpf_map_update_elem")) + goto cleanup; + } + + if (!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze")) + goto cleanup; + + bpf_jit_harden = '2'; + if (set_bpf_jit_harden(&bpf_jit_harden)) { + bpf_jit_harden = '@'; /* open, read or write failed => no write was done */ + goto cleanup; + } + + prog_fd = prog_load(insns, ARRAY_SIZE(insns), &map_fd, 1); + if (!ASSERT_GE(prog_fd, 0, "bpf(BPF_PROG_LOAD)")) + goto cleanup; + + for (i = 0; i < ARRAY_SIZE(insns); i++) { + char fmt[32]; + + if (!ASSERT_EQ(bpf_map_lookup_elem(map_fd, &i, &val), 0, "bpf_map_lookup_elem")) + goto cleanup; + + snprintf(fmt, sizeof(fmt), "val should be equal 3*%d", i); + ASSERT_EQ(val.xlated_off, i * 3, fmt); + } + +cleanup: + /* restore the old one */ + if (bpf_jit_harden != '@') + set_bpf_jit_harden(&bpf_jit_harden); + + close(prog_fd); + close(map_fd); +} + +/* Once map was initialized, it should be frozen */ +static void check_load_unfrozen_map(void) +{ + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + int prog_fd = -1, map_fd; + struct bpf_insn_array_value val = {}; + int i; + + map_fd = map_create(BPF_MAP_TYPE_INSN_ARRAY, ARRAY_SIZE(insns)); + if (!ASSERT_GE(map_fd, 0, "map_create")) + return; + + for (i = 0; i < ARRAY_SIZE(insns); i++) { + 
val.orig_off = i; + if (!ASSERT_EQ(bpf_map_update_elem(map_fd, &i, &val, 0), 0, "bpf_map_update_elem")) + goto cleanup; + } + + prog_fd = prog_load(insns, ARRAY_SIZE(insns), &map_fd, 1); + if (!ASSERT_EQ(prog_fd, -EINVAL, "program should have been rejected (prog_fd != -EINVAL)")) + goto cleanup; + + /* correctness: now freeze the map, the program should load fine */ + + if (!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze")) + goto cleanup; + + prog_fd = prog_load(insns, ARRAY_SIZE(insns), &map_fd, 1); + if (!ASSERT_GE(prog_fd, 0, "bpf(BPF_PROG_LOAD)")) + goto cleanup; + + for (i = 0; i < ARRAY_SIZE(insns); i++) { + if (!ASSERT_EQ(bpf_map_lookup_elem(map_fd, &i, &val), 0, "bpf_map_lookup_elem")) + goto cleanup; + + ASSERT_EQ(val.xlated_off, i, "val should be equal i"); + } + +cleanup: + close(prog_fd); + close(map_fd); +} + +/* Map can be used only by one BPF program */ +static void check_no_map_reuse(void) +{ + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + int prog_fd = -1, map_fd, extra_fd = -1; + struct bpf_insn_array_value val = {}; + int i; + + map_fd = map_create(BPF_MAP_TYPE_INSN_ARRAY, ARRAY_SIZE(insns)); + if (!ASSERT_GE(map_fd, 0, "map_create")) + return; + + for (i = 0; i < ARRAY_SIZE(insns); i++) { + val.orig_off = i; + if (!ASSERT_EQ(bpf_map_update_elem(map_fd, &i, &val, 0), 0, "bpf_map_update_elem")) + goto cleanup; + } + + if (!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze")) + goto cleanup; + + prog_fd = prog_load(insns, ARRAY_SIZE(insns), &map_fd, 1); + if (!ASSERT_GE(prog_fd, 0, "bpf(BPF_PROG_LOAD)")) + goto cleanup; + + for (i = 0; i < ARRAY_SIZE(insns); i++) { + if (!ASSERT_EQ(bpf_map_lookup_elem(map_fd, &i, &val), 0, "bpf_map_lookup_elem")) + goto cleanup; + + ASSERT_EQ(val.xlated_off, i, "val should be equal i"); + } + + extra_fd = prog_load(insns, ARRAY_SIZE(insns), &map_fd, 1); + if (!ASSERT_EQ(extra_fd, -EBUSY, "program should have been rejected (extra_fd != -EBUSY)")) + goto cleanup; + + 
/* correctness: check that prog is still loadable without fd_array */ + extra_fd = prog_load(insns, ARRAY_SIZE(insns), NULL, 0); + if (!ASSERT_GE(extra_fd, 0, "bpf(BPF_PROG_LOAD): expected no error")) + goto cleanup; + +cleanup: + close(extra_fd); + close(prog_fd); + close(map_fd); +} + +static void check_bpf_no_lookup(void) +{ + struct bpf_insn insns[] = { + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_EXIT_INSN(), + }; + int prog_fd = -1, map_fd; + + map_fd = map_create(BPF_MAP_TYPE_INSN_ARRAY, 1); + if (!ASSERT_GE(map_fd, 0, "map_create")) + return; + + insns[0].imm = map_fd; + + if (!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze")) + goto cleanup; + + prog_fd = prog_load(insns, ARRAY_SIZE(insns), NULL, 0); + if (!ASSERT_EQ(prog_fd, -EINVAL, "program should have been rejected (prog_fd != -EINVAL)")) + goto cleanup; + + /* correctness: check that prog is still loadable with normal map */ + close(map_fd); + map_fd = map_create(BPF_MAP_TYPE_ARRAY, 1); + insns[0].imm = map_fd; + prog_fd = prog_load(insns, ARRAY_SIZE(insns), NULL, 0); + if (!ASSERT_GE(prog_fd, 0, "bpf(BPF_PROG_LOAD)")) + goto cleanup; + +cleanup: + close(prog_fd); + close(map_fd); +} + +static void check_bpf_side(void) +{ + check_bpf_no_lookup(); +} + +static void __test_bpf_insn_array(void) +{ + /* Test if offsets are adjusted properly */ + + if (test__start_subtest("one2one")) + check_one_to_one_mapping(); + + if (test__start_subtest("simple")) + check_simple(); + + if (test__start_subtest("deletions")) + check_deletions(); + + if (test__start_subtest("deletions-with-functions")) + check_deletions_with_functions(); + + if (test__start_subtest("blindness")) + check_blindness(); + + /* Check all kinds of operations and related restrictions */ + + if (test__start_subtest("incorrect-index")) + 
check_incorrect_index(); + + if (test__start_subtest("load-unfrozen-map")) + check_load_unfrozen_map(); + + if (test__start_subtest("no-map-reuse")) + check_no_map_reuse(); + + if (test__start_subtest("bpf-side-ops")) + check_bpf_side(); +} +#else +static void __test_bpf_insn_array(void) +{ + test__skip(); +} +#endif + +void test_bpf_insn_array(void) +{ + __test_bpf_insn_array(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index 8a9ba4292109..054ecb6b1e9f 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -7496,6 +7496,71 @@ static struct btf_dedup_test dedup_tests[] = { }, }, { + .descr = "dedup: recursive typedef", + /* + * This test simulates a recursive typedef, which in GO is defined as such: + * + * type Foo func() Foo + * + * In BTF terms, this is represented as a TYPEDEF referencing + * a FUNC_PROTO that returns the same TYPEDEF. + */ + .input = { + .raw_types = { + /* + * [1] typedef Foo -> func() Foo + * [2] func_proto() -> Foo + * [3] typedef Foo -> func() Foo + * [4] func_proto() -> Foo + */ + BTF_TYPEDEF_ENC(NAME_NTH(1), 2), /* [1] */ + BTF_FUNC_PROTO_ENC(1, 0), /* [2] */ + BTF_TYPEDEF_ENC(NAME_NTH(1), 4), /* [3] */ + BTF_FUNC_PROTO_ENC(3, 0), /* [4] */ + BTF_END_RAW, + }, + BTF_STR_SEC("\0Foo"), + }, + .expect = { + .raw_types = { + BTF_TYPEDEF_ENC(NAME_NTH(1), 2), /* [1] */ + BTF_FUNC_PROTO_ENC(1, 0), /* [2] */ + BTF_END_RAW, + }, + BTF_STR_SEC("\0Foo"), + }, +}, +{ + .descr = "dedup: typedef", + /* + * // CU 1: + * typedef int foo; + * + * // CU 2: + * typedef int foo; + */ + .input = { + .raw_types = { + /* CU 1 */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPEDEF_ENC(NAME_NTH(1), 1), /* [2] */ + /* CU 2 */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [3] */ + BTF_TYPEDEF_ENC(NAME_NTH(1), 3), /* [4] */ + BTF_END_RAW, + }, + BTF_STR_SEC("\0foo"), + }, + .expect = { + .raw_types = { + 
BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPEDEF_ENC(NAME_NTH(1), 1), /* [2] */ + BTF_END_RAW, + }, + BTF_STR_SEC("\0foo"), + }, +}, +{ .descr = "dedup: typedef tags", .input = { .raw_types = { diff --git a/tools/testing/selftests/bpf/prog_tests/btf_split.c b/tools/testing/selftests/bpf/prog_tests/btf_split.c index 3696fb9a05ed..2d47cad50a51 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_split.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_split.c @@ -12,11 +12,45 @@ static void btf_dump_printf(void *ctx, const char *fmt, va_list args) vfprintf(ctx, fmt, args); } +/* Write raw BTF to file, return number of bytes written or negative errno */ +static ssize_t btf_raw_write(struct btf *btf, char *file) +{ + ssize_t written = 0; + const void *data; + __u32 size = 0; + int fd, ret; + + fd = mkstemp(file); + if (!ASSERT_GE(fd, 0, "create_file")) + return -errno; + + data = btf__raw_data(btf, &size); + if (!ASSERT_OK_PTR(data, "btf__raw_data")) { + close(fd); + return -EINVAL; + } + while (written < size) { + ret = write(fd, data + written, size - written); + if (!ASSERT_GE(ret, 0, "write succeeded")) { + close(fd); + return -errno; + } + written += ret; + } + close(fd); + return written; +} + static void __test_btf_split(bool multi) { + char multisplit_btf_file[] = "/tmp/test_btf_multisplit.XXXXXX"; + char split_btf_file[] = "/tmp/test_btf_split.XXXXXX"; + char base_btf_file[] = "/tmp/test_btf_base.XXXXXX"; + ssize_t multisplit_btf_sz = 0, split_btf_sz = 0, base_btf_sz = 0; struct btf_dump *d = NULL; - const struct btf_type *t; - struct btf *btf1, *btf2, *btf3 = NULL; + const struct btf_type *t, *ot; + struct btf *btf1 = NULL, *btf2 = NULL, *btf3 = NULL; + struct btf *btf4 = NULL, *btf5 = NULL, *btf6 = NULL; int str_off, i, err; btf1 = btf__new_empty(); @@ -123,6 +157,45 @@ static void __test_btf_split(bool multi) " int uf2;\n" "};\n\n", "c_dump"); + /* write base, split BTFs to files and ensure parsing succeeds */ + base_btf_sz = 
btf_raw_write(btf1, base_btf_file); + if (base_btf_sz < 0) + goto cleanup; + split_btf_sz = btf_raw_write(btf2, split_btf_file); + if (split_btf_sz < 0) + goto cleanup; + btf4 = btf__parse(base_btf_file, NULL); + if (!ASSERT_OK_PTR(btf4, "parse_base")) + goto cleanup; + btf5 = btf__parse_split(split_btf_file, btf4); + if (!ASSERT_OK_PTR(btf5, "parse_split")) + goto cleanup; + if (multi) { + multisplit_btf_sz = btf_raw_write(btf3, multisplit_btf_file); + if (multisplit_btf_sz < 0) + goto cleanup; + btf6 = btf__parse_split(multisplit_btf_file, btf5); + if (!ASSERT_OK_PTR(btf6, "parse_multisplit")) + goto cleanup; + } else { + btf6 = btf5; + } + + if (!ASSERT_EQ(btf__type_cnt(btf3), btf__type_cnt(btf6), "cmp_type_cnt")) + goto cleanup; + + /* compare parsed to original BTF */ + for (i = 1; i < btf__type_cnt(btf6); i++) { + t = btf__type_by_id(btf6, i); + if (!ASSERT_OK_PTR(t, "type_in_parsed_btf")) + goto cleanup; + ot = btf__type_by_id(btf3, i); + if (!ASSERT_OK_PTR(ot, "type_in_orig_btf")) + goto cleanup; + if (!ASSERT_EQ(memcmp(t, ot, sizeof(*ot)), 0, "cmp_parsed_orig_btf")) + goto cleanup; + } + cleanup: if (dump_buf_file) fclose(dump_buf_file); @@ -132,6 +205,16 @@ cleanup: btf__free(btf2); if (btf2 != btf3) btf__free(btf3); + btf__free(btf4); + btf__free(btf5); + if (btf5 != btf6) + btf__free(btf6); + if (base_btf_sz > 0) + unlink(base_btf_file); + if (split_btf_sz > 0) + unlink(split_btf_file); + if (multisplit_btf_sz > 0) + unlink(multisplit_btf_file); } void test_btf_split(void) diff --git a/tools/testing/selftests/bpf/prog_tests/check_mtu.c b/tools/testing/selftests/bpf/prog_tests/check_mtu.c index 2a9a30650350..65b4512967e7 100644 --- a/tools/testing/selftests/bpf/prog_tests/check_mtu.c +++ b/tools/testing/selftests/bpf/prog_tests/check_mtu.c @@ -153,6 +153,26 @@ static void test_check_mtu_run_tc(struct test_check_mtu *skel, ASSERT_EQ(mtu_result, mtu_expect, "MTU-compare-user"); } +static void test_chk_segs_flag(struct test_check_mtu *skel, __u32 mtu) +{ + 
int err, prog_fd = bpf_program__fd(skel->progs.tc_chk_segs_flag); + struct __sk_buff skb = { + .gso_size = 10, + }; + LIBBPF_OPTS(bpf_test_run_opts, topts, + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + .ctx_in = &skb, + .ctx_size_in = sizeof(skb), + ); + + /* Lower the mtu to test the BPF_MTU_CHK_SEGS */ + SYS_NOFAIL("ip link set dev lo mtu 10"); + err = bpf_prog_test_run_opts(prog_fd, &topts); + SYS_NOFAIL("ip link set dev lo mtu %u", mtu); + ASSERT_OK(err, "test_run"); + ASSERT_EQ(topts.retval, BPF_OK, "retval"); +} static void test_check_mtu_tc(__u32 mtu, __u32 ifindex) { @@ -177,11 +197,12 @@ static void test_check_mtu_tc(__u32 mtu, __u32 ifindex) test_check_mtu_run_tc(skel, skel->progs.tc_minus_delta, mtu); test_check_mtu_run_tc(skel, skel->progs.tc_input_len, mtu); test_check_mtu_run_tc(skel, skel->progs.tc_input_len_exceed, mtu); + test_chk_segs_flag(skel, mtu); cleanup: test_check_mtu__destroy(skel); } -void serial_test_check_mtu(void) +void test_ns_check_mtu(void) { int mtu_lo; diff --git a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c index 34b59f6baca1..7488a7606e6a 100644 --- a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c @@ -22,79 +22,37 @@ static int duration = 0; -struct addr_port { - in_port_t port; - union { - struct in_addr in_addr; - struct in6_addr in6_addr; - }; -}; - -struct tuple { - int family; - struct addr_port src; - struct addr_port dst; -}; - -static bool fill_addr_port(const struct sockaddr *sa, struct addr_port *ap) -{ - const struct sockaddr_in6 *in6; - const struct sockaddr_in *in; - - switch (sa->sa_family) { - case AF_INET: - in = (const struct sockaddr_in *)sa; - ap->in_addr = in->sin_addr; - ap->port = in->sin_port; - return true; - - case AF_INET6: - in6 = (const struct sockaddr_in6 *)sa; - ap->in6_addr = in6->sin6_addr; - ap->port = in6->sin6_port; - return true; - - default: - return 
false; - } -} -static bool set_up_conn(const struct sockaddr *addr, socklen_t len, int type, - int *server, int *conn, struct tuple *tuple) +static bool set_up_conn(const struct sockaddr_storage *addr, socklen_t len, int type, + int *server, int *conn, + struct sockaddr_storage *src, + struct sockaddr_storage *dst) { struct sockaddr_storage ss; socklen_t slen = sizeof(ss); - struct sockaddr *sa = (struct sockaddr *)&ss; - *server = start_server_addr(type, (struct sockaddr_storage *)addr, len, NULL); + *server = start_server_addr(type, addr, len, NULL); if (*server < 0) return false; - if (CHECK_FAIL(getsockname(*server, sa, &slen))) + if (CHECK_FAIL(getsockname(*server, (struct sockaddr *)&ss, &slen))) goto close_server; - *conn = connect_to_addr(type, (struct sockaddr_storage *)sa, slen, NULL); + *conn = connect_to_addr(type, &ss, slen, NULL); if (*conn < 0) goto close_server; /* We want to simulate packets arriving at conn, so we have to * swap src and dst. */ - slen = sizeof(ss); - if (CHECK_FAIL(getsockname(*conn, sa, &slen))) - goto close_conn; - - if (CHECK_FAIL(!fill_addr_port(sa, &tuple->dst))) + slen = sizeof(*dst); + if (CHECK_FAIL(getsockname(*conn, (struct sockaddr *)dst, &slen))) goto close_conn; - slen = sizeof(ss); - if (CHECK_FAIL(getpeername(*conn, sa, &slen))) + slen = sizeof(*src); + if (CHECK_FAIL(getpeername(*conn, (struct sockaddr *)src, &slen))) goto close_conn; - if (CHECK_FAIL(!fill_addr_port(sa, &tuple->src))) - goto close_conn; - - tuple->family = ss.ss_family; return true; close_conn: @@ -110,17 +68,16 @@ static socklen_t prepare_addr(struct sockaddr_storage *addr, int family) { struct sockaddr_in *addr4; struct sockaddr_in6 *addr6; + memset(addr, 0, sizeof(*addr)); switch (family) { case AF_INET: addr4 = (struct sockaddr_in *)addr; - memset(addr4, 0, sizeof(*addr4)); addr4->sin_family = family; addr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK); return sizeof(*addr4); case AF_INET6: addr6 = (struct sockaddr_in6 *)addr; - memset(addr6, 0, 
sizeof(*addr6)); addr6->sin6_family = family; addr6->sin6_addr = in6addr_loopback; return sizeof(*addr6); @@ -242,9 +199,15 @@ static void encap_init(encap_headers_t *encap, uint8_t hop_count, uint8_t proto) } static size_t build_input(const struct test_cfg *test, void *const buf, - const struct tuple *tuple) + const struct sockaddr_storage *src, + const struct sockaddr_storage *dst) { - in_port_t sport = tuple->src.port; + struct sockaddr_in6 *src_in6 = (struct sockaddr_in6 *)src; + struct sockaddr_in6 *dst_in6 = (struct sockaddr_in6 *)dst; + struct sockaddr_in *src_in = (struct sockaddr_in *)src; + struct sockaddr_in *dst_in = (struct sockaddr_in *)dst; + sa_family_t family = src->ss_family; + in_port_t sport, dport; encap_headers_t encap; struct iphdr ip; struct ipv6hdr ipv6; @@ -254,8 +217,11 @@ static size_t build_input(const struct test_cfg *test, void *const buf, uint8_t *p = buf; int proto; + sport = (family == AF_INET) ? src_in->sin_port : src_in6->sin6_port; + dport = (family == AF_INET) ? dst_in->sin_port : dst_in6->sin6_port; + proto = IPPROTO_IPIP; - if (tuple->family == AF_INET6) + if (family == AF_INET6) proto = IPPROTO_IPV6; encap_init(&encap, test->hops == ONE_HOP ? 
1 : 0, proto); @@ -270,15 +236,15 @@ static size_t build_input(const struct test_cfg *test, void *const buf, if (test->type == UDP) proto = IPPROTO_UDP; - switch (tuple->family) { + switch (family) { case AF_INET: ip = (struct iphdr){ .ihl = 5, .version = 4, .ttl = IPDEFTTL, .protocol = proto, - .saddr = tuple->src.in_addr.s_addr, - .daddr = tuple->dst.in_addr.s_addr, + .saddr = src_in->sin_addr.s_addr, + .daddr = dst_in->sin_addr.s_addr, }; p = mempcpy(p, &ip, sizeof(ip)); break; @@ -287,8 +253,8 @@ static size_t build_input(const struct test_cfg *test, void *const buf, .version = 6, .hop_limit = IPDEFTTL, .nexthdr = proto, - .saddr = tuple->src.in6_addr, - .daddr = tuple->dst.in6_addr, + .saddr = src_in6->sin6_addr, + .daddr = dst_in6->sin6_addr, }; p = mempcpy(p, &ipv6, sizeof(ipv6)); break; @@ -303,18 +269,16 @@ static size_t build_input(const struct test_cfg *test, void *const buf, case TCP: tcp = (struct tcphdr){ .source = sport, - .dest = tuple->dst.port, + .dest = dport, + .syn = (test->flags == SYN), + .ack = (test->flags == ACK), }; - if (test->flags == SYN) - tcp.syn = true; - if (test->flags == ACK) - tcp.ack = true; p = mempcpy(p, &tcp, sizeof(tcp)); break; case UDP: udp = (struct udphdr){ .source = sport, - .dest = tuple->dst.port, + .dest = dport, }; p = mempcpy(p, &udp, sizeof(udp)); break; @@ -339,27 +303,26 @@ static void test_cls_redirect_common(struct bpf_program *prog) LIBBPF_OPTS(bpf_test_run_opts, tattr); int families[] = { AF_INET, AF_INET6 }; struct sockaddr_storage ss; - struct sockaddr *addr; socklen_t slen; int i, j, err, prog_fd; int servers[__NR_KIND][ARRAY_SIZE(families)] = {}; int conns[__NR_KIND][ARRAY_SIZE(families)] = {}; - struct tuple tuples[__NR_KIND][ARRAY_SIZE(families)]; + struct sockaddr_storage srcs[__NR_KIND][ARRAY_SIZE(families)]; + struct sockaddr_storage dsts[__NR_KIND][ARRAY_SIZE(families)]; - addr = (struct sockaddr *)&ss; for (i = 0; i < ARRAY_SIZE(families); i++) { slen = prepare_addr(&ss, families[i]); if 
(CHECK_FAIL(!slen)) goto cleanup; - if (CHECK_FAIL(!set_up_conn(addr, slen, SOCK_DGRAM, + if (CHECK_FAIL(!set_up_conn(&ss, slen, SOCK_DGRAM, &servers[UDP][i], &conns[UDP][i], - &tuples[UDP][i]))) + &srcs[UDP][i], &dsts[UDP][i]))) goto cleanup; - if (CHECK_FAIL(!set_up_conn(addr, slen, SOCK_STREAM, + if (CHECK_FAIL(!set_up_conn(&ss, slen, SOCK_STREAM, &servers[TCP][i], &conns[TCP][i], - &tuples[TCP][i]))) + &srcs[TCP][i], &dsts[TCP][i]))) goto cleanup; } @@ -368,11 +331,12 @@ static void test_cls_redirect_common(struct bpf_program *prog) struct test_cfg *test = &tests[i]; for (j = 0; j < ARRAY_SIZE(families); j++) { - struct tuple *tuple = &tuples[test->type][j]; + struct sockaddr_storage *src = &srcs[test->type][j]; + struct sockaddr_storage *dst = &dsts[test->type][j]; char input[256]; char tmp[256]; - test_str(tmp, sizeof(tmp), test, tuple->family); + test_str(tmp, sizeof(tmp), test, families[j]); if (!test__start_subtest(tmp)) continue; @@ -380,7 +344,7 @@ static void test_cls_redirect_common(struct bpf_program *prog) tattr.data_size_out = sizeof(tmp); tattr.data_in = input; - tattr.data_size_in = build_input(test, input, tuple); + tattr.data_size_in = build_input(test, input, src, dst); if (CHECK_FAIL(!tattr.data_size_in)) continue; diff --git a/tools/testing/selftests/bpf/prog_tests/file_reader.c b/tools/testing/selftests/bpf/prog_tests/file_reader.c new file mode 100644 index 000000000000..5cde32b35da4 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/file_reader.c @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
*/ + +#include <test_progs.h> +#include <network_helpers.h> +#include "file_reader.skel.h" +#include "file_reader_fail.skel.h" +#include <dlfcn.h> +#include <sys/mman.h> + +const char *user_ptr = "hello world"; +char file_contents[256000]; + +void *get_executable_base_addr(void) +{ + Dl_info info; + + if (!dladdr((void *)&get_executable_base_addr, &info)) { + fprintf(stderr, "dladdr failed\n"); + return NULL; + } + + return info.dli_fbase; +} + +static int initialize_file_contents(void) +{ + int fd, page_sz = sysconf(_SC_PAGESIZE); + ssize_t n = 0, cur, off; + void *addr; + + fd = open("/proc/self/exe", O_RDONLY); + if (!ASSERT_OK_FD(fd, "Open /proc/self/exe\n")) + return 1; + + do { + cur = read(fd, file_contents + n, sizeof(file_contents) - n); + if (!ASSERT_GT(cur, 0, "read success")) + break; + n += cur; + } while (n < sizeof(file_contents)); + + close(fd); + + if (!ASSERT_EQ(n, sizeof(file_contents), "Read /proc/self/exe\n")) + return 1; + + addr = get_executable_base_addr(); + if (!ASSERT_NEQ(addr, NULL, "get executable address")) + return 1; + + /* page-align base file address */ + addr = (void *)((unsigned long)addr & ~(page_sz - 1)); + + /* + * Page out range 0..512K, use 0..256K for positive tests and + * 256K..512K for negative tests expecting page faults + */ + for (off = 0; off < sizeof(file_contents) * 2; off += page_sz) { + if (!ASSERT_OK(madvise(addr + off, page_sz, MADV_PAGEOUT), + "madvise pageout")) + return errno; + } + + return 0; +} + +static void run_test(const char *prog_name) +{ + struct file_reader *skel; + struct bpf_program *prog; + int err, fd; + + err = initialize_file_contents(); + if (!ASSERT_OK(err, "initialize file contents")) + return; + + skel = file_reader__open(); + if (!ASSERT_OK_PTR(skel, "file_reader__open")) + return; + + bpf_object__for_each_program(prog, skel->obj) { + bpf_program__set_autoload(prog, strcmp(bpf_program__name(prog), prog_name) == 0); + } + + memcpy(skel->bss->user_buf, file_contents, 
sizeof(file_contents)); + skel->bss->pid = getpid(); + + err = file_reader__load(skel); + if (!ASSERT_OK(err, "file_reader__load")) + goto cleanup; + + err = file_reader__attach(skel); + if (!ASSERT_OK(err, "file_reader__attach")) + goto cleanup; + + fd = open("/proc/self/exe", O_RDONLY); + if (fd >= 0) + close(fd); + + ASSERT_EQ(skel->bss->err, 0, "err"); + ASSERT_EQ(skel->bss->run_success, 1, "run_success"); +cleanup: + file_reader__destroy(skel); +} + +void test_file_reader(void) +{ + if (test__start_subtest("on_open_expect_fault")) + run_test("on_open_expect_fault"); + + if (test__start_subtest("on_open_validate_file_read")) + run_test("on_open_validate_file_read"); + + if (test__start_subtest("negative")) + RUN_TESTS(file_reader_fail); +} diff --git a/tools/testing/selftests/bpf/prog_tests/htab_update.c b/tools/testing/selftests/bpf/prog_tests/htab_update.c index 2bc85f4814f4..d0b405eb2966 100644 --- a/tools/testing/selftests/bpf/prog_tests/htab_update.c +++ b/tools/testing/selftests/bpf/prog_tests/htab_update.c @@ -15,17 +15,17 @@ struct htab_update_ctx { static void test_reenter_update(void) { struct htab_update *skel; - unsigned int key, value; + void *value = NULL; + unsigned int key, value_size; int err; skel = htab_update__open(); if (!ASSERT_OK_PTR(skel, "htab_update__open")) return; - /* lookup_elem_raw() may be inlined and find_kernel_btf_id() will return -ESRCH */ - bpf_program__set_autoload(skel->progs.lookup_elem_raw, true); + bpf_program__set_autoload(skel->progs.bpf_obj_free_fields, true); err = htab_update__load(skel); - if (!ASSERT_TRUE(!err || err == -ESRCH, "htab_update__load") || err) + if (!ASSERT_TRUE(!err, "htab_update__load") || err) goto out; skel->bss->pid = getpid(); @@ -33,14 +33,33 @@ static void test_reenter_update(void) if (!ASSERT_OK(err, "htab_update__attach")) goto out; - /* Will trigger the reentrancy of bpf_map_update_elem() */ + value_size = bpf_map__value_size(skel->maps.htab); + + value = calloc(1, value_size); + if 
(!ASSERT_OK_PTR(value, "calloc value")) + goto out; + /* + * First update: plain insert. This should NOT trigger the re-entrancy + * path, because there is no old element to free yet. + */ key = 0; - value = 0; - err = bpf_map_update_elem(bpf_map__fd(skel->maps.htab), &key, &value, 0); - if (!ASSERT_OK(err, "add element")) + err = bpf_map_update_elem(bpf_map__fd(skel->maps.htab), &key, value, BPF_ANY); + if (!ASSERT_OK(err, "first update (insert)")) + goto out; + + /* + * Second update: replace existing element with same key and trigger + * the reentrancy of bpf_map_update_elem(). + * check_and_free_fields() calls bpf_obj_free_fields() on the old + * value, which is where fentry program runs and performs a nested + * bpf_map_update_elem(), triggering -EDEADLK. + */ + memset(value, 0, value_size); + err = bpf_map_update_elem(bpf_map__fd(skel->maps.htab), &key, value, BPF_ANY); + if (!ASSERT_OK(err, "second update (replace)")) goto out; - ASSERT_EQ(skel->bss->update_err, -EBUSY, "no reentrancy"); + ASSERT_EQ(skel->bss->update_err, -EDEADLK, "no reentrancy"); out: htab_update__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/kmem_cache_iter.c b/tools/testing/selftests/bpf/prog_tests/kmem_cache_iter.c index 1de14b111931..6e35e13c2022 100644 --- a/tools/testing/selftests/bpf/prog_tests/kmem_cache_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/kmem_cache_iter.c @@ -57,7 +57,8 @@ static void subtest_kmem_cache_iter_check_slabinfo(struct kmem_cache_iter *skel) if (!ASSERT_OK(ret, "kmem_cache_lookup")) break; - ASSERT_STREQ(r.name, name, "kmem_cache_name"); + ASSERT_STRNEQ(r.name, name, sizeof(r.name) - 1, + "kmem_cache_name"); ASSERT_EQ(r.obj_size, objsize, "kmem_cache_objsize"); seen++; diff --git a/tools/testing/selftests/bpf/prog_tests/perf_branches.c b/tools/testing/selftests/bpf/prog_tests/perf_branches.c index bc24f83339d6..0a7ef770c487 100644 --- a/tools/testing/selftests/bpf/prog_tests/perf_branches.c +++ 
b/tools/testing/selftests/bpf/prog_tests/perf_branches.c @@ -15,6 +15,10 @@ static void check_good_sample(struct test_perf_branches *skel) int pbe_size = sizeof(struct perf_branch_entry); int duration = 0; + if (CHECK(!skel->bss->run_cnt, "invalid run_cnt", + "checked sample validity before prog run")) + return; + if (CHECK(!skel->bss->valid, "output not valid", "no valid sample from prog")) return; @@ -45,6 +49,10 @@ static void check_bad_sample(struct test_perf_branches *skel) int written_stack = skel->bss->written_stack_out; int duration = 0; + if (CHECK(!skel->bss->run_cnt, "invalid run_cnt", + "checked sample validity before prog run")) + return; + if (CHECK(!skel->bss->valid, "output not valid", "no valid sample from prog")) return; @@ -83,8 +91,12 @@ static void test_perf_branches_common(int perf_fd, err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set), &cpu_set); if (CHECK(err, "set_affinity", "cpu #0, err %d\n", err)) goto out_destroy; - /* spin the loop for a while (random high number) */ - for (i = 0; i < 1000000; ++i) + + /* Spin the loop for a while by using a high iteration count, and by + * checking whether the specific run count marker has been explicitly + * incremented at least once by the backing perf_event BPF program. + */ + for (i = 0; i < 100000000 && !*(volatile int *)&skel->bss->run_cnt; ++i) ++j; test_perf_branches__detach(skel); @@ -116,11 +128,11 @@ static void test_perf_branches_hw(void) pfd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC); /* - * Some setups don't support branch records (virtual machines, !x86), - * so skip test in this case. + * Some setups don't support LBR (virtual machines, !x86, AMD Milan Zen + * 3 which only supports BRS), so skip test in this case. 
*/ if (pfd < 0) { - if (errno == ENOENT || errno == EOPNOTSUPP) { + if (errno == ENOENT || errno == EOPNOTSUPP || errno == EINVAL) { printf("%s:SKIP:no PERF_SAMPLE_BRANCH_STACK\n", __func__); test__skip(); diff --git a/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c b/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c index c9f855e5da24..246eb259c08a 100644 --- a/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c +++ b/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c @@ -28,6 +28,7 @@ static void test_success(void) bpf_program__set_autoload(skel->progs.two_regions, true); bpf_program__set_autoload(skel->progs.non_sleepable_1, true); bpf_program__set_autoload(skel->progs.non_sleepable_2, true); + bpf_program__set_autoload(skel->progs.nested_rcu_region, true); bpf_program__set_autoload(skel->progs.task_trusted_non_rcuptr, true); bpf_program__set_autoload(skel->progs.rcu_read_lock_subprog, true); bpf_program__set_autoload(skel->progs.rcu_read_lock_global_subprog, true); @@ -78,7 +79,8 @@ static const char * const inproper_region_tests[] = { "non_sleepable_rcu_mismatch", "inproper_sleepable_helper", "inproper_sleepable_kfunc", - "nested_rcu_region", + "nested_rcu_region_unbalanced_1", + "nested_rcu_region_unbalanced_2", "rcu_read_lock_global_subprog_lock", "rcu_read_lock_global_subprog_unlock", "rcu_read_lock_sleepable_helper_global_subprog", diff --git a/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c b/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c index d6bd5e16e637..d2c0542716a8 100644 --- a/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c +++ b/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c @@ -44,3 +44,59 @@ void test_refcounted_kptr_wrong_owner(void) ASSERT_OK(opts.retval, "rbtree_wrong_owner_remove_fail_a2 retval"); refcounted_kptr__destroy(skel); } + +void test_percpu_hash_refcounted_kptr_refcount_leak(void) +{ + struct refcounted_kptr *skel; + int cpu_nr, fd, err, key = 0; + struct bpf_map *map; + 
size_t values_sz; + u64 *values; + LIBBPF_OPTS(bpf_test_run_opts, opts, + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + .repeat = 1, + ); + + cpu_nr = libbpf_num_possible_cpus(); + if (!ASSERT_GT(cpu_nr, 0, "libbpf_num_possible_cpus")) + return; + + values = calloc(cpu_nr, sizeof(u64)); + if (!ASSERT_OK_PTR(values, "calloc values")) + return; + + skel = refcounted_kptr__open_and_load(); + if (!ASSERT_OK_PTR(skel, "refcounted_kptr__open_and_load")) { + free(values); + return; + } + + values_sz = cpu_nr * sizeof(u64); + memset(values, 0, values_sz); + + map = skel->maps.percpu_hash; + err = bpf_map__update_elem(map, &key, sizeof(key), values, values_sz, 0); + if (!ASSERT_OK(err, "bpf_map__update_elem")) + goto out; + + fd = bpf_program__fd(skel->progs.percpu_hash_refcount_leak); + err = bpf_prog_test_run_opts(fd, &opts); + if (!ASSERT_OK(err, "bpf_prog_test_run_opts")) + goto out; + if (!ASSERT_EQ(opts.retval, 2, "opts.retval")) + goto out; + + err = bpf_map__update_elem(map, &key, sizeof(key), values, values_sz, 0); + if (!ASSERT_OK(err, "bpf_map__update_elem")) + goto out; + + fd = bpf_program__fd(skel->progs.check_percpu_hash_refcount); + err = bpf_prog_test_run_opts(fd, &opts); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + ASSERT_EQ(opts.retval, 1, "opts.retval"); + +out: + refcounted_kptr__destroy(skel); + free(values); +} diff --git a/tools/testing/selftests/bpf/prog_tests/res_spin_lock.c b/tools/testing/selftests/bpf/prog_tests/res_spin_lock.c index 8c6c2043a432..f0a8c828f8f1 100644 --- a/tools/testing/selftests/bpf/prog_tests/res_spin_lock.c +++ b/tools/testing/selftests/bpf/prog_tests/res_spin_lock.c @@ -110,8 +110,8 @@ void serial_test_res_spin_lock_stress(void) ASSERT_OK(load_module("bpf_test_rqspinlock.ko", false), "load module AA"); sleep(5); unload_module("bpf_test_rqspinlock", false); - - ASSERT_OK(load_module_params("bpf_test_rqspinlock.ko", "test_ab=1", false), "load module ABBA"); - sleep(5); - unload_module("bpf_test_rqspinlock", false); 
+ /* + * Insert bpf_test_rqspinlock.ko manually with test_mode=[1|2] to test + * other cases (ABBA, ABBCCA). + */ } diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c b/tools/testing/selftests/bpf/prog_tests/ringbuf.c index d1e4cb28a72c..64520684d2cb 100644 --- a/tools/testing/selftests/bpf/prog_tests/ringbuf.c +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c @@ -17,6 +17,7 @@ #include "test_ringbuf_n.lskel.h" #include "test_ringbuf_map_key.lskel.h" #include "test_ringbuf_write.lskel.h" +#include "test_ringbuf_overwrite.lskel.h" #define EDONE 7777 @@ -497,6 +498,68 @@ cleanup: test_ringbuf_map_key_lskel__destroy(skel_map_key); } +static void ringbuf_overwrite_mode_subtest(void) +{ + unsigned long size, len1, len2, len3, len4, len5; + unsigned long expect_avail_data, expect_prod_pos, expect_over_pos; + struct test_ringbuf_overwrite_lskel *skel; + int page_size = getpagesize(); + int err; + + skel = test_ringbuf_overwrite_lskel__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + size = page_size; + len1 = page_size / 2; + len2 = page_size / 4; + len3 = size - len1 - len2 - BPF_RINGBUF_HDR_SZ * 3; + len4 = len3 - 8; + len5 = len3; /* retry with len3 */ + + skel->maps.ringbuf.max_entries = size; + skel->rodata->LEN1 = len1; + skel->rodata->LEN2 = len2; + skel->rodata->LEN3 = len3; + skel->rodata->LEN4 = len4; + skel->rodata->LEN5 = len5; + + skel->bss->pid = getpid(); + + err = test_ringbuf_overwrite_lskel__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + err = test_ringbuf_overwrite_lskel__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto cleanup; + + syscall(__NR_getpgid); + + ASSERT_EQ(skel->bss->reserve1_fail, 0, "reserve 1"); + ASSERT_EQ(skel->bss->reserve2_fail, 0, "reserve 2"); + ASSERT_EQ(skel->bss->reserve3_fail, 1, "reserve 3"); + ASSERT_EQ(skel->bss->reserve4_fail, 0, "reserve 4"); + ASSERT_EQ(skel->bss->reserve5_fail, 0, "reserve 5"); + + ASSERT_EQ(skel->bss->ring_size, size, "check_ring_size"); + + 
expect_avail_data = len2 + len4 + len5 + 3 * BPF_RINGBUF_HDR_SZ; + ASSERT_EQ(skel->bss->avail_data, expect_avail_data, "check_avail_size"); + + ASSERT_EQ(skel->bss->cons_pos, 0, "check_cons_pos"); + + expect_prod_pos = len1 + len2 + len4 + len5 + 4 * BPF_RINGBUF_HDR_SZ; + ASSERT_EQ(skel->bss->prod_pos, expect_prod_pos, "check_prod_pos"); + + expect_over_pos = len1 + BPF_RINGBUF_HDR_SZ; + ASSERT_EQ(skel->bss->over_pos, expect_over_pos, "check_over_pos"); + + test_ringbuf_overwrite_lskel__detach(skel); +cleanup: + test_ringbuf_overwrite_lskel__destroy(skel); +} + void test_ringbuf(void) { if (test__start_subtest("ringbuf")) @@ -507,4 +570,6 @@ void test_ringbuf(void) ringbuf_map_key_subtest(); if (test__start_subtest("ringbuf_write")) ringbuf_write_subtest(); + if (test__start_subtest("ringbuf_overwrite_mode")) + ringbuf_overwrite_mode_subtest(); } diff --git a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c index 036d4760d2c1..3dbcc091f16c 100644 --- a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c +++ b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c @@ -41,11 +41,7 @@ static struct bpf_object *obj; static __u32 index_zero; static int epfd; -static union sa46 { - struct sockaddr_in6 v6; - struct sockaddr_in v4; - sa_family_t family; -} srv_sa; +static struct sockaddr_storage srv_sa; #define RET_IF(condition, tag, format...) 
({ \ if (CHECK_FAIL(condition)) { \ @@ -135,24 +131,24 @@ static int prepare_bpf_obj(void) return 0; } -static void sa46_init_loopback(union sa46 *sa, sa_family_t family) +static void ss_init_loopback(struct sockaddr_storage *sa, sa_family_t family) { memset(sa, 0, sizeof(*sa)); - sa->family = family; - if (sa->family == AF_INET6) - sa->v6.sin6_addr = in6addr_loopback; + sa->ss_family = family; + if (sa->ss_family == AF_INET6) + ((struct sockaddr_in6 *)sa)->sin6_addr = in6addr_loopback; else - sa->v4.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + ((struct sockaddr_in *)sa)->sin_addr.s_addr = htonl(INADDR_LOOPBACK); } -static void sa46_init_inany(union sa46 *sa, sa_family_t family) +static void ss_init_inany(struct sockaddr_storage *sa, sa_family_t family) { memset(sa, 0, sizeof(*sa)); - sa->family = family; - if (sa->family == AF_INET6) - sa->v6.sin6_addr = in6addr_any; + sa->ss_family = family; + if (sa->ss_family == AF_INET6) + ((struct sockaddr_in6 *)sa)->sin6_addr = in6addr_any; else - sa->v4.sin_addr.s_addr = INADDR_ANY; + ((struct sockaddr_in *)sa)->sin_addr.s_addr = INADDR_ANY; } static int read_int_sysctl(const char *sysctl) @@ -228,7 +224,7 @@ static void check_data(int type, sa_family_t family, const struct cmd *cmd, int cli_fd) { struct data_check expected = {}, result; - union sa46 cli_sa; + struct sockaddr_storage cli_sa; socklen_t addrlen; int err; @@ -251,26 +247,32 @@ static void check_data(int type, sa_family_t family, const struct cmd *cmd, } if (family == AF_INET6) { + struct sockaddr_in6 *srv_v6 = (struct sockaddr_in6 *)&srv_sa; + struct sockaddr_in6 *cli_v6 = (struct sockaddr_in6 *)&cli_sa; + expected.eth_protocol = htons(ETH_P_IPV6); - expected.bind_inany = !srv_sa.v6.sin6_addr.s6_addr32[3] && - !srv_sa.v6.sin6_addr.s6_addr32[2] && - !srv_sa.v6.sin6_addr.s6_addr32[1] && - !srv_sa.v6.sin6_addr.s6_addr32[0]; + expected.bind_inany = !srv_v6->sin6_addr.s6_addr32[3] && + !srv_v6->sin6_addr.s6_addr32[2] && + !srv_v6->sin6_addr.s6_addr32[1] && + 
!srv_v6->sin6_addr.s6_addr32[0]; - memcpy(&expected.skb_addrs[0], cli_sa.v6.sin6_addr.s6_addr32, - sizeof(cli_sa.v6.sin6_addr)); + memcpy(&expected.skb_addrs[0], cli_v6->sin6_addr.s6_addr32, + sizeof(cli_v6->sin6_addr)); memcpy(&expected.skb_addrs[4], &in6addr_loopback, sizeof(in6addr_loopback)); - expected.skb_ports[0] = cli_sa.v6.sin6_port; - expected.skb_ports[1] = srv_sa.v6.sin6_port; + expected.skb_ports[0] = cli_v6->sin6_port; + expected.skb_ports[1] = srv_v6->sin6_port; } else { + struct sockaddr_in *srv_v4 = (struct sockaddr_in *)&srv_sa; + struct sockaddr_in *cli_v4 = (struct sockaddr_in *)&cli_sa; + expected.eth_protocol = htons(ETH_P_IP); - expected.bind_inany = !srv_sa.v4.sin_addr.s_addr; + expected.bind_inany = !srv_v4->sin_addr.s_addr; - expected.skb_addrs[0] = cli_sa.v4.sin_addr.s_addr; + expected.skb_addrs[0] = cli_v4->sin_addr.s_addr; expected.skb_addrs[1] = htonl(INADDR_LOOPBACK); - expected.skb_ports[0] = cli_sa.v4.sin_port; - expected.skb_ports[1] = srv_sa.v4.sin_port; + expected.skb_ports[0] = cli_v4->sin_port; + expected.skb_ports[1] = srv_v4->sin_port; } if (memcmp(&result, &expected, offsetof(struct data_check, @@ -364,16 +366,15 @@ static void check_results(void) static int send_data(int type, sa_family_t family, void *data, size_t len, enum result expected) { - union sa46 cli_sa; + struct sockaddr_storage cli_sa; int fd, err; fd = socket(family, type, 0); RET_ERR(fd == -1, "socket()", "fd:%d errno:%d\n", fd, errno); - sa46_init_loopback(&cli_sa, family); + ss_init_loopback(&cli_sa, family); err = bind(fd, (struct sockaddr *)&cli_sa, sizeof(cli_sa)); RET_ERR(fd == -1, "bind(cli_sa)", "err:%d errno:%d\n", err, errno); - err = sendto(fd, data, len, MSG_FASTOPEN, (struct sockaddr *)&srv_sa, sizeof(srv_sa)); RET_ERR(err != len && expected >= PASS, @@ -589,9 +590,9 @@ static void prepare_sk_fds(int type, sa_family_t family, bool inany) socklen_t addrlen; if (inany) - sa46_init_inany(&srv_sa, family); + ss_init_inany(&srv_sa, family); else - 
sa46_init_loopback(&srv_sa, family); + ss_init_loopback(&srv_sa, family); addrlen = sizeof(srv_sa); /* diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c index 1702aa592c2c..7ac4d5a488aa 100644 --- a/tools/testing/selftests/bpf/prog_tests/send_signal.c +++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c @@ -206,6 +206,11 @@ destroy_skel: skel_open_load_failure: close(pipe_c2p[0]); close(pipe_p2c[1]); + /* + * Child is either about to exit cleanly or stuck in case of errors. + * Nudge it to exit. + */ + kill(pid, SIGKILL); wait(NULL); } diff --git a/tools/testing/selftests/bpf/prog_tests/sk_bypass_prot_mem.c b/tools/testing/selftests/bpf/prog_tests/sk_bypass_prot_mem.c new file mode 100644 index 000000000000..e4940583924b --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/sk_bypass_prot_mem.c @@ -0,0 +1,292 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright 2025 Google LLC */ + +#include <test_progs.h> +#include "sk_bypass_prot_mem.skel.h" +#include "network_helpers.h" + +#define NR_PAGES 32 +#define NR_SOCKETS 2 +#define BUF_TOTAL (NR_PAGES * 4096 / NR_SOCKETS) +#define BUF_SINGLE 1024 +#define NR_SEND (BUF_TOTAL / BUF_SINGLE) + +struct test_case { + char name[8]; + int family; + int type; + int (*create_sockets)(struct test_case *test_case, int sk[], int len); + long (*get_memory_allocated)(struct test_case *test_case, struct sk_bypass_prot_mem *skel); +}; + +static int tcp_create_sockets(struct test_case *test_case, int sk[], int len) +{ + int server, i, err = 0; + + server = start_server(test_case->family, test_case->type, NULL, 0, 0); + if (!ASSERT_GE(server, 0, "start_server_str")) + return server; + + /* Keep for-loop so we can change NR_SOCKETS easily. 
*/ + for (i = 0; i < len; i += 2) { + sk[i] = connect_to_fd(server, 0); + if (sk[i] < 0) { + ASSERT_GE(sk[i], 0, "connect_to_fd"); + err = sk[i]; + break; + } + + sk[i + 1] = accept(server, NULL, NULL); + if (sk[i + 1] < 0) { + ASSERT_GE(sk[i + 1], 0, "accept"); + err = sk[i + 1]; + break; + } + } + + close(server); + + return err; +} + +static int udp_create_sockets(struct test_case *test_case, int sk[], int len) +{ + int i, j, err, rcvbuf = BUF_TOTAL; + + /* Keep for-loop so we can change NR_SOCKETS easily. */ + for (i = 0; i < len; i += 2) { + sk[i] = start_server(test_case->family, test_case->type, NULL, 0, 0); + if (sk[i] < 0) { + ASSERT_GE(sk[i], 0, "start_server"); + return sk[i]; + } + + sk[i + 1] = connect_to_fd(sk[i], 0); + if (sk[i + 1] < 0) { + ASSERT_GE(sk[i + 1], 0, "connect_to_fd"); + return sk[i + 1]; + } + + err = connect_fd_to_fd(sk[i], sk[i + 1], 0); + if (err) { + ASSERT_EQ(err, 0, "connect_fd_to_fd"); + return err; + } + + for (j = 0; j < 2; j++) { + err = setsockopt(sk[i + j], SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(int)); + if (err) { + ASSERT_EQ(err, 0, "setsockopt(SO_RCVBUF)"); + return err; + } + } + } + + return 0; +} + +static long get_memory_allocated(struct test_case *test_case, + bool *activated, long *memory_allocated) +{ + int sk; + + *activated = true; + + /* AF_INET and AF_INET6 share the same memory_allocated. + * tcp_init_sock() is called by AF_INET and AF_INET6, + * but udp_lib_init_sock() is inline. 
+ */ + sk = socket(AF_INET, test_case->type, 0); + if (!ASSERT_GE(sk, 0, "get_memory_allocated")) + return -1; + + close(sk); + + return *memory_allocated; +} + +static long tcp_get_memory_allocated(struct test_case *test_case, struct sk_bypass_prot_mem *skel) +{ + return get_memory_allocated(test_case, + &skel->bss->tcp_activated, + &skel->bss->tcp_memory_allocated); +} + +static long udp_get_memory_allocated(struct test_case *test_case, struct sk_bypass_prot_mem *skel) +{ + return get_memory_allocated(test_case, + &skel->bss->udp_activated, + &skel->bss->udp_memory_allocated); +} + +static int check_bypass(struct test_case *test_case, + struct sk_bypass_prot_mem *skel, bool bypass) +{ + char buf[BUF_SINGLE] = {}; + long memory_allocated[2]; + int sk[NR_SOCKETS]; + int err, i, j; + + for (i = 0; i < ARRAY_SIZE(sk); i++) + sk[i] = -1; + + err = test_case->create_sockets(test_case, sk, ARRAY_SIZE(sk)); + if (err) + goto close; + + memory_allocated[0] = test_case->get_memory_allocated(test_case, skel); + + /* allocate pages >= NR_PAGES */ + for (i = 0; i < ARRAY_SIZE(sk); i++) { + for (j = 0; j < NR_SEND; j++) { + int bytes = send(sk[i], buf, sizeof(buf), 0); + + /* Avoid too noisy logs when something failed. */ + if (bytes != sizeof(buf)) { + ASSERT_EQ(bytes, sizeof(buf), "send"); + if (bytes < 0) { + err = bytes; + goto drain; + } + } + } + } + + memory_allocated[1] = test_case->get_memory_allocated(test_case, skel); + + if (bypass) + ASSERT_LE(memory_allocated[1], memory_allocated[0] + 10, "bypass"); + else + ASSERT_GT(memory_allocated[1], memory_allocated[0] + NR_PAGES, "no bypass"); + +drain: + if (test_case->type == SOCK_DGRAM) { + /* UDP starts purging sk->sk_receive_queue after one RCU + * grace period, then udp_memory_allocated goes down, + * so drain the queue before close(). 
+ */ + for (i = 0; i < ARRAY_SIZE(sk); i++) { + for (j = 0; j < NR_SEND; j++) { + int bytes = recv(sk[i], buf, 1, MSG_DONTWAIT | MSG_TRUNC); + + if (bytes == sizeof(buf)) + continue; + if (bytes != -1 || errno != EAGAIN) + PRINT_FAIL("bytes: %d, errno: %s\n", bytes, strerror(errno)); + break; + } + } + } + +close: + for (i = 0; i < ARRAY_SIZE(sk); i++) { + if (sk[i] < 0) + break; + + close(sk[i]); + } + + return err; +} + +static void run_test(struct test_case *test_case) +{ + struct sk_bypass_prot_mem *skel; + struct nstoken *nstoken; + int cgroup, err; + + skel = sk_bypass_prot_mem__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + skel->bss->nr_cpus = libbpf_num_possible_cpus(); + + err = sk_bypass_prot_mem__attach(skel); + if (!ASSERT_OK(err, "attach")) + goto destroy_skel; + + cgroup = test__join_cgroup("/sk_bypass_prot_mem"); + if (!ASSERT_GE(cgroup, 0, "join_cgroup")) + goto destroy_skel; + + err = make_netns("sk_bypass_prot_mem"); + if (!ASSERT_EQ(err, 0, "make_netns")) + goto close_cgroup; + + nstoken = open_netns("sk_bypass_prot_mem"); + if (!ASSERT_OK_PTR(nstoken, "open_netns")) + goto remove_netns; + + err = check_bypass(test_case, skel, false); + if (!ASSERT_EQ(err, 0, "test_bypass(false)")) + goto close_netns; + + err = write_sysctl("/proc/sys/net/core/bypass_prot_mem", "1"); + if (!ASSERT_EQ(err, 0, "write_sysctl(1)")) + goto close_netns; + + err = check_bypass(test_case, skel, true); + if (!ASSERT_EQ(err, 0, "test_bypass(true by sysctl)")) + goto close_netns; + + err = write_sysctl("/proc/sys/net/core/bypass_prot_mem", "0"); + if (!ASSERT_EQ(err, 0, "write_sysctl(0)")) + goto close_netns; + + skel->links.sock_create = bpf_program__attach_cgroup(skel->progs.sock_create, cgroup); + if (!ASSERT_OK_PTR(skel->links.sock_create, "attach_cgroup(sock_create)")) + goto close_netns; + + err = check_bypass(test_case, skel, true); + ASSERT_EQ(err, 0, "test_bypass(true by bpf)"); + +close_netns: + close_netns(nstoken); +remove_netns: + 
remove_netns("sk_bypass_prot_mem"); +close_cgroup: + close(cgroup); +destroy_skel: + sk_bypass_prot_mem__destroy(skel); +} + +static struct test_case test_cases[] = { + { + .name = "TCP ", + .family = AF_INET, + .type = SOCK_STREAM, + .create_sockets = tcp_create_sockets, + .get_memory_allocated = tcp_get_memory_allocated, + }, + { + .name = "UDP ", + .family = AF_INET, + .type = SOCK_DGRAM, + .create_sockets = udp_create_sockets, + .get_memory_allocated = udp_get_memory_allocated, + }, + { + .name = "TCPv6", + .family = AF_INET6, + .type = SOCK_STREAM, + .create_sockets = tcp_create_sockets, + .get_memory_allocated = tcp_get_memory_allocated, + }, + { + .name = "UDPv6", + .family = AF_INET6, + .type = SOCK_DGRAM, + .create_sockets = udp_create_sockets, + .get_memory_allocated = udp_get_memory_allocated, + }, +}; + +void serial_test_sk_bypass_prot_mem(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(test_cases); i++) { + if (test__start_subtest(test_cases[i].name)) + run_test(&test_cases[i]); + } +} diff --git a/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c b/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c index 4d66fad3c8bd..0f3bf594e7a5 100644 --- a/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c +++ b/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c @@ -20,7 +20,9 @@ static const char * const test_cases[] = { "strcspn_str", "strcspn_reject", "strstr", + "strcasestr", "strnstr", + "strncasestr", }; void run_too_long_tests(void) diff --git a/tools/testing/selftests/bpf/prog_tests/test_bpf_smc.c b/tools/testing/selftests/bpf/prog_tests/test_bpf_smc.c new file mode 100644 index 000000000000..de22734abc4d --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_bpf_smc.c @@ -0,0 +1,390 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <test_progs.h> +#include <linux/genetlink.h> +#include "network_helpers.h" +#include "bpf_smc.skel.h" + +#ifndef IPPROTO_SMC +#define IPPROTO_SMC 256 +#endif + +#define CLIENT_IP "127.0.0.1" +#define 
SERVER_IP "127.0.1.0" +#define SERVER_IP_VIA_RISK_PATH "127.0.2.0" + +#define SERVICE_1 80 +#define SERVICE_2 443 +#define SERVICE_3 8443 + +#define TEST_NS "bpf_smc_netns" + +static struct netns_obj *test_netns; + +struct smc_policy_ip_key { + __u32 sip; + __u32 dip; +}; + +struct smc_policy_ip_value { + __u8 mode; +}; + +#if defined(__s390x__) +/* s390x has default seid */ +static bool setup_ueid(void) { return true; } +static void cleanup_ueid(void) {} +#else +enum { + SMC_NETLINK_ADD_UEID = 10, + SMC_NETLINK_REMOVE_UEID +}; + +enum { + SMC_NLA_EID_TABLE_UNSPEC, + SMC_NLA_EID_TABLE_ENTRY, /* string */ +}; + +struct msgtemplate { + struct nlmsghdr n; + struct genlmsghdr g; + char buf[1024]; +}; + +#define GENLMSG_DATA(glh) ((void *)(NLMSG_DATA(glh) + GENL_HDRLEN)) +#define GENLMSG_PAYLOAD(glh) (NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN) +#define NLA_DATA(na) ((void *)((char *)(na) + NLA_HDRLEN)) +#define NLA_PAYLOAD(len) ((len) - NLA_HDRLEN) + +#define SMC_GENL_FAMILY_NAME "SMC_GEN_NETLINK" +#define SMC_BPFTEST_UEID "SMC-BPFTEST-UEID" + +static uint16_t smc_nl_family_id = -1; + +static int send_cmd(int fd, __u16 nlmsg_type, __u32 nlmsg_pid, + __u16 nlmsg_flags, __u8 genl_cmd, __u16 nla_type, + void *nla_data, int nla_len) +{ + struct nlattr *na; + struct sockaddr_nl nladdr; + int r, buflen; + char *buf; + + struct msgtemplate msg = {0}; + + msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN); + msg.n.nlmsg_type = nlmsg_type; + msg.n.nlmsg_flags = nlmsg_flags; + msg.n.nlmsg_seq = 0; + msg.n.nlmsg_pid = nlmsg_pid; + msg.g.cmd = genl_cmd; + msg.g.version = 1; + na = (struct nlattr *)GENLMSG_DATA(&msg); + na->nla_type = nla_type; + na->nla_len = nla_len + 1 + NLA_HDRLEN; + memcpy(NLA_DATA(na), nla_data, nla_len); + msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len); + + buf = (char *)&msg; + buflen = msg.n.nlmsg_len; + memset(&nladdr, 0, sizeof(nladdr)); + nladdr.nl_family = AF_NETLINK; + + while ((r = sendto(fd, buf, buflen, 0, (struct sockaddr *)&nladdr, + sizeof(nladdr))) < buflen) { 
+ if (r > 0) { + buf += r; + buflen -= r; + } else if (errno != EAGAIN) { + return -1; + } + } + return 0; +} + +static bool get_smc_nl_family_id(void) +{ + struct sockaddr_nl nl_src; + struct msgtemplate msg; + struct nlattr *nl; + int fd, ret; + pid_t pid; + + fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC); + if (!ASSERT_OK_FD(fd, "nl_family socket")) + return false; + + pid = getpid(); + + memset(&nl_src, 0, sizeof(nl_src)); + nl_src.nl_family = AF_NETLINK; + nl_src.nl_pid = pid; + + ret = bind(fd, (struct sockaddr *)&nl_src, sizeof(nl_src)); + if (!ASSERT_OK(ret, "nl_family bind")) + goto fail; + + ret = send_cmd(fd, GENL_ID_CTRL, pid, + NLM_F_REQUEST, CTRL_CMD_GETFAMILY, + CTRL_ATTR_FAMILY_NAME, (void *)SMC_GENL_FAMILY_NAME, + strlen(SMC_GENL_FAMILY_NAME)); + if (!ASSERT_OK(ret, "nl_family query")) + goto fail; + + ret = recv(fd, &msg, sizeof(msg), 0); + if (!ASSERT_FALSE(msg.n.nlmsg_type == NLMSG_ERROR || ret < 0 || + !NLMSG_OK(&msg.n, ret), "nl_family response")) + goto fail; + + nl = (struct nlattr *)GENLMSG_DATA(&msg); + nl = (struct nlattr *)((char *)nl + NLA_ALIGN(nl->nla_len)); + if (!ASSERT_EQ(nl->nla_type, CTRL_ATTR_FAMILY_ID, "nl_family nla type")) + goto fail; + + smc_nl_family_id = *(uint16_t *)NLA_DATA(nl); + close(fd); + return true; +fail: + close(fd); + return false; +} + +static bool smc_ueid(int op) +{ + struct sockaddr_nl nl_src; + struct msgtemplate msg; + struct nlmsgerr *err; + char test_ueid[32]; + int fd, ret; + pid_t pid; + + /* UEID required */ + memset(test_ueid, '\x20', sizeof(test_ueid)); + memcpy(test_ueid, SMC_BPFTEST_UEID, strlen(SMC_BPFTEST_UEID)); + fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC); + if (!ASSERT_OK_FD(fd, "ueid socket")) + return false; + + pid = getpid(); + memset(&nl_src, 0, sizeof(nl_src)); + nl_src.nl_family = AF_NETLINK; + nl_src.nl_pid = pid; + + ret = bind(fd, (struct sockaddr *)&nl_src, sizeof(nl_src)); + if (!ASSERT_OK(ret, "ueid bind")) + goto fail; + + ret = send_cmd(fd, smc_nl_family_id, pid, 
+ NLM_F_REQUEST | NLM_F_ACK, op, SMC_NLA_EID_TABLE_ENTRY, + (void *)test_ueid, sizeof(test_ueid)); + if (!ASSERT_OK(ret, "ueid cmd")) + goto fail; + + ret = recv(fd, &msg, sizeof(msg), 0); + if (!ASSERT_FALSE(ret < 0 || + !NLMSG_OK(&msg.n, ret), "ueid response")) + goto fail; + + if (msg.n.nlmsg_type == NLMSG_ERROR) { + err = NLMSG_DATA(&msg); + switch (op) { + case SMC_NETLINK_REMOVE_UEID: + if (!ASSERT_FALSE((err->error && err->error != -ENOENT), + "ueid remove")) + goto fail; + break; + case SMC_NETLINK_ADD_UEID: + if (!ASSERT_OK(err->error, "ueid add")) + goto fail; + break; + default: + break; + } + } + close(fd); + return true; +fail: + close(fd); + return false; +} + +static bool setup_ueid(void) +{ + /* get smc nl id */ + if (!get_smc_nl_family_id()) + return false; + /* clear old ueid for bpftest */ + smc_ueid(SMC_NETLINK_REMOVE_UEID); + /* smc-loopback required ueid */ + return smc_ueid(SMC_NETLINK_ADD_UEID); +} + +static void cleanup_ueid(void) +{ + smc_ueid(SMC_NETLINK_REMOVE_UEID); +} +#endif /* __s390x__ */ + +static bool setup_netns(void) +{ + test_netns = netns_new(TEST_NS, true); + if (!ASSERT_OK_PTR(test_netns, "open net namespace")) + goto fail_netns; + + SYS(fail_ip, "ip addr add 127.0.1.0/8 dev lo"); + SYS(fail_ip, "ip addr add 127.0.2.0/8 dev lo"); + + return true; +fail_ip: + netns_free(test_netns); +fail_netns: + return false; +} + +static void cleanup_netns(void) +{ + netns_free(test_netns); +} + +static bool setup_smc(void) +{ + if (!setup_ueid()) + return false; + + if (!setup_netns()) + goto fail_netns; + + return true; +fail_netns: + cleanup_ueid(); + return false; +} + +static int set_client_addr_cb(int fd, void *opts) +{ + const char *src = (const char *)opts; + struct sockaddr_in localaddr; + + localaddr.sin_family = AF_INET; + localaddr.sin_port = htons(0); + localaddr.sin_addr.s_addr = inet_addr(src); + return !ASSERT_OK(bind(fd, &localaddr, sizeof(localaddr)), "client bind"); +} + +static void run_link(const char *src, const char 
*dst, int port) +{ + struct network_helper_opts opts = {0}; + int server, client; + + server = start_server_str(AF_INET, SOCK_STREAM, dst, port, NULL); + if (!ASSERT_OK_FD(server, "start service_1")) + return; + + opts.proto = IPPROTO_TCP; + opts.post_socket_cb = set_client_addr_cb; + opts.cb_opts = (void *)src; + + client = connect_to_fd_opts(server, &opts); + if (!ASSERT_OK_FD(client, "start connect")) + goto fail_client; + + close(client); +fail_client: + close(server); +} + +static void block_link(int map_fd, const char *src, const char *dst) +{ + struct smc_policy_ip_value val = { .mode = /* block */ 0 }; + struct smc_policy_ip_key key = { + .sip = inet_addr(src), + .dip = inet_addr(dst), + }; + + bpf_map_update_elem(map_fd, &key, &val, BPF_ANY); +} + +/* + * This test describes a real-life service topology as follows: + * + * +-------------> service_1 + * link 1 | | + * +--------------------> server | link 2 + * | | V + * | +-------------> service_2 + * | link 3 + * client -------------------> server_via_unsafe_path -> service_3 + * + * Among them, + * 1. link-1 is very suitable for using SMC. + * 2. link-2 is not suitable for using SMC, because the mode of this link is + * kind of short-link services. + * 3. link-3 is also not suitable for using SMC, because the RDMA link is + * unavailable and needs to go through a long timeout before it can fallback + * to TCP. + * To achieve this goal, we use a customized SMC ip strategy via smc_hs_ctrl. 
+ */ +static void test_topo(void) +{ + struct bpf_smc *skel; + int rc, map_fd; + + skel = bpf_smc__open_and_load(); + if (!ASSERT_OK_PTR(skel, "bpf_smc__open_and_load")) + return; + + rc = bpf_smc__attach(skel); + if (!ASSERT_OK(rc, "bpf_smc__attach")) + goto fail; + + map_fd = bpf_map__fd(skel->maps.smc_policy_ip); + if (!ASSERT_OK_FD(map_fd, "bpf_map__fd")) + goto fail; + + /* Mock the process of transparent replacement, since we will modify + * protocol to ipproto_smc accropding to it via + * fmod_ret/update_socket_protocol. + */ + write_sysctl("/proc/sys/net/smc/hs_ctrl", "linkcheck"); + + /* Configure ip strat */ + block_link(map_fd, CLIENT_IP, SERVER_IP_VIA_RISK_PATH); + block_link(map_fd, SERVER_IP, SERVER_IP); + + /* should go with smc */ + run_link(CLIENT_IP, SERVER_IP, SERVICE_1); + /* should go with smc fallback */ + run_link(SERVER_IP, SERVER_IP, SERVICE_2); + + ASSERT_EQ(skel->bss->smc_cnt, 2, "smc count"); + ASSERT_EQ(skel->bss->fallback_cnt, 1, "fallback count"); + + /* should go with smc */ + run_link(CLIENT_IP, SERVER_IP, SERVICE_2); + + ASSERT_EQ(skel->bss->smc_cnt, 3, "smc count"); + ASSERT_EQ(skel->bss->fallback_cnt, 1, "fallback count"); + + /* should go with smc fallback */ + run_link(CLIENT_IP, SERVER_IP_VIA_RISK_PATH, SERVICE_3); + + ASSERT_EQ(skel->bss->smc_cnt, 4, "smc count"); + ASSERT_EQ(skel->bss->fallback_cnt, 2, "fallback count"); + +fail: + bpf_smc__destroy(skel); +} + +void test_bpf_smc(void) +{ + if (!setup_smc()) { + printf("setup for smc test failed, test SKIP:\n"); + test__skip(); + return; + } + + if (test__start_subtest("topo")) + test_topo(); + + cleanup_ueid(); + cleanup_netns(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/test_lsm.c b/tools/testing/selftests/bpf/prog_tests/test_lsm.c index 2a27f3714f5c..bdc4fc06bc5a 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_lsm.c +++ b/tools/testing/selftests/bpf/prog_tests/test_lsm.c @@ -139,7 +139,7 @@ static void test_lsm_tailcall(void) if (CHECK_FAIL(!err)) 
goto close_prog; - prog_fd = bpf_program__fd(skel->progs.lsm_file_alloc_security_prog); + prog_fd = bpf_program__fd(skel->progs.lsm_kernfs_init_security_prog); if (CHECK_FAIL(prog_fd < 0)) goto close_prog; diff --git a/tools/testing/selftests/bpf/prog_tests/test_tc_edt.c b/tools/testing/selftests/bpf/prog_tests/test_tc_edt.c new file mode 100644 index 000000000000..462512fb191f --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_tc_edt.c @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause + +/* + * BPF-based flow shaping + * + * The test brings up two veth in two isolated namespaces, attach some flow + * shaping program onto it, and ensures that a manual speedtest maximum + * value matches the rate set in the BPF shapers. + */ + +#include <asm-generic/socket.h> +#include <stdio.h> +#include <unistd.h> +#include <fcntl.h> +#include <math.h> +#include <sys/time.h> +#include <sys/socket.h> +#include <bpf/libbpf.h> +#include <pthread.h> +#include "test_progs.h" +#include "network_helpers.h" +#include "test_tc_edt.skel.h" + +#define SERVER_NS "tc-edt-server-ns" +#define CLIENT_NS "tc-edt-client-ns" +#define IP4_ADDR_VETH1 "192.168.1.1" +#define IP4_ADDR_VETH2 "192.168.1.2" +#define IP4_ADDR_VETH2_HEX 0xC0A80102 + +#define TIMEOUT_MS 2000 +#define TEST_PORT 9000 +#define TARGET_RATE_MBPS 5.0 +#define TX_BYTES_COUNT (1 * 1000 * 1000) +#define RATE_ERROR_PERCENT 2.0 + +struct connection { + int server_listen_fd; + int server_conn_fd; + int client_conn_fd; +}; + +static int setup(struct test_tc_edt *skel) +{ + struct nstoken *nstoken_client, *nstoken_server; + int ret; + + if (!ASSERT_OK(make_netns(CLIENT_NS), "create client ns")) + goto fail; + if (!ASSERT_OK(make_netns(SERVER_NS), "create server ns")) + goto fail_delete_client_ns; + + nstoken_client = open_netns(CLIENT_NS); + if (!ASSERT_OK_PTR(nstoken_client, "open client ns")) + goto fail_delete_server_ns; + SYS(fail_close_client_ns, "ip link add veth1 type veth peer name %s", + "veth2 
netns " SERVER_NS); + SYS(fail_close_client_ns, "ip -4 addr add " IP4_ADDR_VETH1 "/24 dev veth1"); + SYS(fail_close_client_ns, "ip link set veth1 up"); + + nstoken_server = open_netns(SERVER_NS); + if (!ASSERT_OK_PTR(nstoken_server, "enter server ns")) + goto fail_close_client_ns; + SYS(fail_close_server_ns, "ip -4 addr add " IP4_ADDR_VETH2 "/24 dev veth2"); + SYS(fail_close_server_ns, "ip link set veth2 up"); + SYS(fail_close_server_ns, "tc qdisc add dev veth2 root fq"); + ret = tc_prog_attach("veth2", -1, bpf_program__fd(skel->progs.tc_prog)); + if (!ASSERT_OK(ret, "attach bpf prog")) + goto fail_close_server_ns; + skel->bss->target_rate = TARGET_RATE_MBPS * 1000 * 1000; + close_netns(nstoken_server); + close_netns(nstoken_client); + + return 0; + +fail_close_server_ns: + close_netns(nstoken_server); +fail_close_client_ns: + close_netns(nstoken_client); +fail_delete_server_ns: + remove_netns(SERVER_NS); +fail_delete_client_ns: + remove_netns(CLIENT_NS); +fail: + return -1; +} + +static void cleanup(void) +{ + remove_netns(CLIENT_NS); + remove_netns(SERVER_NS); +} + +static void run_test(void) +{ + int server_fd, client_fd, err; + double rate_mbps, rate_error; + struct nstoken *nstoken; + __u64 ts_start, ts_end; + + nstoken = open_netns(SERVER_NS); + if (!ASSERT_OK_PTR(nstoken, "open server ns")) + return; + server_fd = start_server(AF_INET, SOCK_STREAM, IP4_ADDR_VETH2, + TEST_PORT, TIMEOUT_MS); + if (!ASSERT_OK_FD(server_fd, "start server")) + return; + + close_netns(nstoken); + nstoken = open_netns(CLIENT_NS); + if (!ASSERT_OK_PTR(nstoken, "open client ns")) + return; + client_fd = connect_to_fd(server_fd, 0); + if (!ASSERT_OK_FD(client_fd, "connect client")) + return; + + ts_start = get_time_ns(); + err = send_recv_data(server_fd, client_fd, TX_BYTES_COUNT); + ts_end = get_time_ns(); + close_netns(nstoken); + ASSERT_OK(err, "send_recv_data"); + + rate_mbps = TX_BYTES_COUNT / ((ts_end - ts_start) / 1000.0); + rate_error = + fabs((rate_mbps - TARGET_RATE_MBPS) * 
100.0 / TARGET_RATE_MBPS); + + ASSERT_LE(rate_error, RATE_ERROR_PERCENT, + "rate error is lower than threshold"); +} + +void test_tc_edt(void) +{ + struct test_tc_edt *skel; + + skel = test_tc_edt__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel open and load")) + return; + + if (!ASSERT_OK(setup(skel), "global setup")) + return; + + run_test(); + + cleanup(); + test_tc_edt__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/test_tc_tunnel.c b/tools/testing/selftests/bpf/prog_tests/test_tc_tunnel.c new file mode 100644 index 000000000000..0fe0a8f62486 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_tc_tunnel.c @@ -0,0 +1,714 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause + +/* + * End-to-end eBPF tunnel test suite + * The file tests BPF network tunnels implementation. For each tunnel + * type, the test validates that: + * - basic communication can first be established between the two veths + * - when adding a BPF-based encapsulation on client egress, it now fails + * to communicate with the server + * - when adding a kernel-based decapsulation on server ingress, client + * can now connect + * - when replacing the kernel-based decapsulation with a BPF-based one, + * the client can still connect + */ + +#include <stdio.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/socket.h> +#include <bpf/libbpf.h> + +#include "test_progs.h" +#include "network_helpers.h" +#include "test_tc_tunnel.skel.h" + +#define SERVER_NS "tc-tunnel-server-ns" +#define CLIENT_NS "tc-tunnel-client-ns" +#define MAC_ADDR_VETH1 "00:11:22:33:44:55" +#define IP4_ADDR_VETH1 "192.168.1.1" +#define IP6_ADDR_VETH1 "fd::1" +#define MAC_ADDR_VETH2 "66:77:88:99:AA:BB" +#define IP4_ADDR_VETH2 "192.168.1.2" +#define IP6_ADDR_VETH2 "fd::2" + +#define TEST_NAME_MAX_LEN 64 +#define PROG_NAME_MAX_LEN 64 +#define TUNNEL_ARGS_MAX_LEN 128 +#define BUFFER_LEN 2000 +#define DEFAULT_TEST_DATA_SIZE 100 +#define GSO_TEST_DATA_SIZE BUFFER_LEN + +#define TIMEOUT_MS 
1000 +#define TEST_PORT 8000 +#define UDP_PORT 5555 +#define MPLS_UDP_PORT 6635 +#define FOU_MPLS_PROTO 137 +#define VXLAN_ID 1 +#define VXLAN_PORT 8472 +#define MPLS_TABLE_ENTRIES_COUNT 65536 + +static char tx_buffer[BUFFER_LEN], rx_buffer[BUFFER_LEN]; + +struct subtest_cfg { + char *ebpf_tun_type; + char *iproute_tun_type; + char *mac_tun_type; + int ipproto; + void (*extra_decap_mod_args_cb)(struct subtest_cfg *cfg, char *dst); + bool tunnel_need_veth_mac; + bool configure_fou_rx_port; + char *tmode; + bool expect_kern_decap_failure; + bool configure_mpls; + bool test_gso; + char *tunnel_client_addr; + char *tunnel_server_addr; + char name[TEST_NAME_MAX_LEN]; + char *server_addr; + int client_egress_prog_fd; + int server_ingress_prog_fd; + char extra_decap_mod_args[TUNNEL_ARGS_MAX_LEN]; + int server_fd; +}; + +struct connection { + int client_fd; + int server_fd; +}; + +static int build_subtest_name(struct subtest_cfg *cfg, char *dst, size_t size) +{ + int ret; + + ret = snprintf(dst, size, "%s_%s", cfg->ebpf_tun_type, + cfg->mac_tun_type); + + return ret < 0 ? ret : 0; +} + +static int set_subtest_progs(struct subtest_cfg *cfg, struct test_tc_tunnel *skel) +{ + char prog_name[PROG_NAME_MAX_LEN]; + struct bpf_program *prog; + int ret; + + ret = snprintf(prog_name, PROG_NAME_MAX_LEN, "__encap_"); + if (ret < 0) + return ret; + ret = build_subtest_name(cfg, prog_name + ret, PROG_NAME_MAX_LEN - ret); + if (ret < 0) + return ret; + prog = bpf_object__find_program_by_name(skel->obj, prog_name); + if (!prog) + return -1; + + cfg->client_egress_prog_fd = bpf_program__fd(prog); + cfg->server_ingress_prog_fd = bpf_program__fd(skel->progs.decap_f); + return 0; +} + +static void set_subtest_addresses(struct subtest_cfg *cfg) +{ + if (cfg->ipproto == 6) + cfg->server_addr = IP6_ADDR_VETH2; + else + cfg->server_addr = IP4_ADDR_VETH2; + + /* Some specific tunnel types need specific addressing, it then + * has been already set in the configuration table. 
Otherwise, + * deduce the relevant addressing from the ipproto + */ + if (cfg->tunnel_client_addr && cfg->tunnel_server_addr) + return; + + if (cfg->ipproto == 6) { + cfg->tunnel_client_addr = IP6_ADDR_VETH1; + cfg->tunnel_server_addr = IP6_ADDR_VETH2; + } else { + cfg->tunnel_client_addr = IP4_ADDR_VETH1; + cfg->tunnel_server_addr = IP4_ADDR_VETH2; + } +} + +static int run_server(struct subtest_cfg *cfg) +{ + int family = cfg->ipproto == 6 ? AF_INET6 : AF_INET; + struct nstoken *nstoken; + struct network_helper_opts opts = { + .timeout_ms = TIMEOUT_MS + }; + + nstoken = open_netns(SERVER_NS); + if (!ASSERT_OK_PTR(nstoken, "open server ns")) + return -1; + + cfg->server_fd = start_server_str(family, SOCK_STREAM, cfg->server_addr, + TEST_PORT, &opts); + close_netns(nstoken); + if (!ASSERT_OK_FD(cfg->server_fd, "start server")) + return -1; + + return 0; +} + +static int check_server_rx_data(struct subtest_cfg *cfg, + struct connection *conn, int len) +{ + int err; + + memset(rx_buffer, 0, BUFFER_LEN); + err = recv(conn->server_fd, rx_buffer, len, 0); + if (!ASSERT_EQ(err, len, "check rx data len")) + return 1; + if (!ASSERT_MEMEQ(tx_buffer, rx_buffer, len, "check received data")) + return 1; + return 0; +} + +static struct connection *connect_client_to_server(struct subtest_cfg *cfg) +{ + struct network_helper_opts opts = {.timeout_ms = 500}; + int family = cfg->ipproto == 6 ? 
AF_INET6 : AF_INET; + struct connection *conn = NULL; + int client_fd, server_fd; + + conn = malloc(sizeof(struct connection)); + if (!conn) + return conn; + + client_fd = connect_to_addr_str(family, SOCK_STREAM, cfg->server_addr, + TEST_PORT, &opts); + + if (client_fd < 0) { + free(conn); + return NULL; + } + + server_fd = accept(cfg->server_fd, NULL, NULL); + if (server_fd < 0) { + close(client_fd); + free(conn); + return NULL; + } + + conn->server_fd = server_fd; + conn->client_fd = client_fd; + + return conn; +} + +static void disconnect_client_from_server(struct subtest_cfg *cfg, + struct connection *conn) +{ + close(conn->server_fd); + close(conn->client_fd); + free(conn); +} + +static int send_and_test_data(struct subtest_cfg *cfg, bool must_succeed) +{ + struct connection *conn; + int err, res = -1; + + conn = connect_client_to_server(cfg); + if (!must_succeed && !ASSERT_ERR_PTR(conn, "connection that must fail")) + goto end; + else if (!must_succeed) + return 0; + + if (!ASSERT_OK_PTR(conn, "connection that must succeed")) + return -1; + + err = send(conn->client_fd, tx_buffer, DEFAULT_TEST_DATA_SIZE, 0); + if (!ASSERT_EQ(err, DEFAULT_TEST_DATA_SIZE, "send data from client")) + goto end; + if (check_server_rx_data(cfg, conn, DEFAULT_TEST_DATA_SIZE)) + goto end; + + if (!cfg->test_gso) { + res = 0; + goto end; + } + + err = send(conn->client_fd, tx_buffer, GSO_TEST_DATA_SIZE, 0); + if (!ASSERT_EQ(err, GSO_TEST_DATA_SIZE, "send (large) data from client")) + goto end; + if (check_server_rx_data(cfg, conn, DEFAULT_TEST_DATA_SIZE)) + goto end; + + res = 0; +end: + disconnect_client_from_server(cfg, conn); + return res; +} + +static void vxlan_decap_mod_args_cb(struct subtest_cfg *cfg, char *dst) +{ + snprintf(dst, TUNNEL_ARGS_MAX_LEN, "id %d dstport %d udp6zerocsumrx", + VXLAN_ID, VXLAN_PORT); +} + +static void udp_decap_mod_args_cb(struct subtest_cfg *cfg, char *dst) +{ + bool is_mpls = !strcmp(cfg->mac_tun_type, "mpls"); + + snprintf(dst, TUNNEL_ARGS_MAX_LEN, 
+ "encap fou encap-sport auto encap-dport %d", + is_mpls ? MPLS_UDP_PORT : UDP_PORT); +} + +static int configure_fou_rx_port(struct subtest_cfg *cfg, bool add) +{ + bool is_mpls = strcmp(cfg->mac_tun_type, "mpls") == 0; + int fou_proto; + + if (is_mpls) + fou_proto = FOU_MPLS_PROTO; + else + fou_proto = cfg->ipproto == 6 ? 41 : 4; + + SYS(fail, "ip fou %s port %d ipproto %d%s", add ? "add" : "del", + is_mpls ? MPLS_UDP_PORT : UDP_PORT, fou_proto, + cfg->ipproto == 6 ? " -6" : ""); + + return 0; +fail: + return 1; +} + +static int add_fou_rx_port(struct subtest_cfg *cfg) +{ + return configure_fou_rx_port(cfg, true); +} + +static int del_fou_rx_port(struct subtest_cfg *cfg) +{ + return configure_fou_rx_port(cfg, false); +} + +static int update_tunnel_intf_addr(struct subtest_cfg *cfg) +{ + SYS(fail, "ip link set dev testtun0 address " MAC_ADDR_VETH2); + return 0; +fail: + return -1; +} + +static int configure_kernel_for_mpls(struct subtest_cfg *cfg) +{ + SYS(fail, "sysctl -qw net.mpls.platform_labels=%d", + MPLS_TABLE_ENTRIES_COUNT); + SYS(fail, "ip -f mpls route add 1000 dev lo"); + SYS(fail, "ip link set lo up"); + SYS(fail, "sysctl -qw net.mpls.conf.testtun0.input=1"); + SYS(fail, "sysctl -qw net.ipv4.conf.lo.rp_filter=0"); + return 0; +fail: + return -1; +} + +static int configure_encapsulation(struct subtest_cfg *cfg) +{ + int ret; + + ret = tc_prog_attach("veth1", -1, cfg->client_egress_prog_fd); + + return ret; +} + +static int configure_kernel_decapsulation(struct subtest_cfg *cfg) +{ + struct nstoken *nstoken = open_netns(SERVER_NS); + int ret = -1; + + if (!ASSERT_OK_PTR(nstoken, "open server ns")) + return ret; + + if (cfg->configure_fou_rx_port && + !ASSERT_OK(add_fou_rx_port(cfg), "configure FOU RX port")) + goto fail; + SYS(fail, "ip link add name testtun0 type %s %s remote %s local %s %s", + cfg->iproute_tun_type, cfg->tmode ? 
cfg->tmode : "", + cfg->tunnel_client_addr, cfg->tunnel_server_addr, + cfg->extra_decap_mod_args); + if (cfg->tunnel_need_veth_mac && + !ASSERT_OK(update_tunnel_intf_addr(cfg), "update testtun0 mac")) + goto fail; + if (cfg->configure_mpls && + (!ASSERT_OK(configure_kernel_for_mpls(cfg), + "configure MPLS decap"))) + goto fail; + SYS(fail, "sysctl -qw net.ipv4.conf.all.rp_filter=0"); + SYS(fail, "sysctl -qw net.ipv4.conf.testtun0.rp_filter=0"); + SYS(fail, "ip link set dev testtun0 up"); + + ret = 0; +fail: + close_netns(nstoken); + return ret; +} + +static void remove_kernel_decapsulation(struct subtest_cfg *cfg) +{ + SYS_NOFAIL("ip link del testtun0"); + if (cfg->configure_mpls) + SYS_NOFAIL("ip -f mpls route del 1000 dev lo"); + if (cfg->configure_fou_rx_port) + del_fou_rx_port(cfg); +} + +static int configure_ebpf_decapsulation(struct subtest_cfg *cfg) +{ + struct nstoken *nstoken = open_netns(SERVER_NS); + int ret = -1; + + if (!ASSERT_OK_PTR(nstoken, "open server ns")) + return ret; + + if (!cfg->expect_kern_decap_failure) + SYS(fail, "ip link del testtun0"); + + if (!ASSERT_OK(tc_prog_attach("veth2", cfg->server_ingress_prog_fd, -1), + "attach_program")) + goto fail; + + ret = 0; +fail: + close_netns(nstoken); + return ret; +} + +static void run_test(struct subtest_cfg *cfg) +{ + struct nstoken *nstoken; + + if (!ASSERT_OK(run_server(cfg), "run server")) + return; + + nstoken = open_netns(CLIENT_NS); + if (!ASSERT_OK_PTR(nstoken, "open client ns")) + goto fail; + + /* Basic communication must work */ + if (!ASSERT_OK(send_and_test_data(cfg, true), "connect without any encap")) + goto fail; + + /* Attach encapsulation program to client */ + if (!ASSERT_OK(configure_encapsulation(cfg), "configure encapsulation")) + goto fail; + + /* If supported, insert kernel decap module, connection must succeed */ + if (!cfg->expect_kern_decap_failure) { + if (!ASSERT_OK(configure_kernel_decapsulation(cfg), + "configure kernel decapsulation")) + goto fail; + if 
(!ASSERT_OK(send_and_test_data(cfg, true), + "connect with encap prog and kern decap")) + goto fail; + } + + /* Replace kernel decapsulation with BPF decapsulation, test must pass */ + if (!ASSERT_OK(configure_ebpf_decapsulation(cfg), "configure ebpf decapsulation")) + goto fail; + ASSERT_OK(send_and_test_data(cfg, true), "connect with encap and decap progs"); + +fail: + close_netns(nstoken); + close(cfg->server_fd); +} + +static int setup(void) +{ + struct nstoken *nstoken_client, *nstoken_server; + int fd, err; + + fd = open("/dev/urandom", O_RDONLY); + if (!ASSERT_OK_FD(fd, "open urandom")) + goto fail; + err = read(fd, tx_buffer, BUFFER_LEN); + close(fd); + + if (!ASSERT_EQ(err, BUFFER_LEN, "read random bytes")) + goto fail; + + /* Configure the testing network */ + if (!ASSERT_OK(make_netns(CLIENT_NS), "create client ns") || + !ASSERT_OK(make_netns(SERVER_NS), "create server ns")) + goto fail; + + nstoken_client = open_netns(CLIENT_NS); + if (!ASSERT_OK_PTR(nstoken_client, "open client ns")) + goto fail_delete_ns; + SYS(fail_close_ns_client, "ip link add %s type veth peer name %s", + "veth1 mtu 1500 netns " CLIENT_NS " address " MAC_ADDR_VETH1, + "veth2 mtu 1500 netns " SERVER_NS " address " MAC_ADDR_VETH2); + SYS(fail_close_ns_client, "ethtool -K veth1 tso off"); + SYS(fail_close_ns_client, "ip link set veth1 up"); + nstoken_server = open_netns(SERVER_NS); + if (!ASSERT_OK_PTR(nstoken_server, "open server ns")) + goto fail_close_ns_client; + SYS(fail_close_ns_server, "ip link set veth2 up"); + + close_netns(nstoken_server); + close_netns(nstoken_client); + return 0; + +fail_close_ns_server: + close_netns(nstoken_server); +fail_close_ns_client: + close_netns(nstoken_client); +fail_delete_ns: + SYS_NOFAIL("ip netns del " CLIENT_NS); + SYS_NOFAIL("ip netns del " SERVER_NS); +fail: + return -1; +} + +static int subtest_setup(struct test_tc_tunnel *skel, struct subtest_cfg *cfg) +{ + struct nstoken *nstoken_client, *nstoken_server; + int ret = -1; + + 
set_subtest_addresses(cfg); + if (!ASSERT_OK(set_subtest_progs(cfg, skel), + "find subtest progs")) + goto fail; + if (cfg->extra_decap_mod_args_cb) + cfg->extra_decap_mod_args_cb(cfg, cfg->extra_decap_mod_args); + + nstoken_client = open_netns(CLIENT_NS); + if (!ASSERT_OK_PTR(nstoken_client, "open client ns")) + goto fail; + SYS(fail_close_client_ns, + "ip -4 addr add " IP4_ADDR_VETH1 "/24 dev veth1"); + SYS(fail_close_client_ns, "ip -4 route flush table main"); + SYS(fail_close_client_ns, + "ip -4 route add " IP4_ADDR_VETH2 " mtu 1450 dev veth1"); + SYS(fail_close_client_ns, + "ip -6 addr add " IP6_ADDR_VETH1 "/64 dev veth1 nodad"); + SYS(fail_close_client_ns, "ip -6 route flush table main"); + SYS(fail_close_client_ns, + "ip -6 route add " IP6_ADDR_VETH2 " mtu 1430 dev veth1"); + nstoken_server = open_netns(SERVER_NS); + if (!ASSERT_OK_PTR(nstoken_server, "open server ns")) + goto fail_close_client_ns; + SYS(fail_close_server_ns, + "ip -4 addr add " IP4_ADDR_VETH2 "/24 dev veth2"); + SYS(fail_close_server_ns, + "ip -6 addr add " IP6_ADDR_VETH2 "/64 dev veth2 nodad"); + + ret = 0; + +fail_close_server_ns: + close_netns(nstoken_server); +fail_close_client_ns: + close_netns(nstoken_client); +fail: + return ret; +} + + +static void subtest_cleanup(struct subtest_cfg *cfg) +{ + struct nstoken *nstoken; + + nstoken = open_netns(CLIENT_NS); + if (ASSERT_OK_PTR(nstoken, "open clien ns")) { + SYS_NOFAIL("tc qdisc delete dev veth1 parent ffff:fff1"); + SYS_NOFAIL("ip a flush veth1"); + close_netns(nstoken); + } + nstoken = open_netns(SERVER_NS); + if (ASSERT_OK_PTR(nstoken, "open clien ns")) { + SYS_NOFAIL("tc qdisc delete dev veth2 parent ffff:fff1"); + SYS_NOFAIL("ip a flush veth2"); + if (!cfg->expect_kern_decap_failure) + remove_kernel_decapsulation(cfg); + close_netns(nstoken); + } +} + +static void cleanup(void) +{ + remove_netns(CLIENT_NS); + remove_netns(SERVER_NS); +} + +static struct subtest_cfg subtests_cfg[] = { + { + .ebpf_tun_type = "ipip", + .mac_tun_type = 
"none", + .iproute_tun_type = "ipip", + .ipproto = 4, + }, + { + .ebpf_tun_type = "ipip6", + .mac_tun_type = "none", + .iproute_tun_type = "ip6tnl", + .ipproto = 4, + .tunnel_client_addr = IP6_ADDR_VETH1, + .tunnel_server_addr = IP6_ADDR_VETH2, + }, + { + .ebpf_tun_type = "ip6tnl", + .iproute_tun_type = "ip6tnl", + .mac_tun_type = "none", + .ipproto = 6, + }, + { + .mac_tun_type = "none", + .ebpf_tun_type = "sit", + .iproute_tun_type = "sit", + .ipproto = 6, + .tunnel_client_addr = IP4_ADDR_VETH1, + .tunnel_server_addr = IP4_ADDR_VETH2, + }, + { + .ebpf_tun_type = "vxlan", + .mac_tun_type = "eth", + .iproute_tun_type = "vxlan", + .ipproto = 4, + .extra_decap_mod_args_cb = vxlan_decap_mod_args_cb, + .tunnel_need_veth_mac = true + }, + { + .ebpf_tun_type = "ip6vxlan", + .mac_tun_type = "eth", + .iproute_tun_type = "vxlan", + .ipproto = 6, + .extra_decap_mod_args_cb = vxlan_decap_mod_args_cb, + .tunnel_need_veth_mac = true + }, + { + .ebpf_tun_type = "gre", + .mac_tun_type = "none", + .iproute_tun_type = "gre", + .ipproto = 4, + .test_gso = true + }, + { + .ebpf_tun_type = "gre", + .mac_tun_type = "eth", + .iproute_tun_type = "gretap", + .ipproto = 4, + .tunnel_need_veth_mac = true, + .test_gso = true + }, + { + .ebpf_tun_type = "gre", + .mac_tun_type = "mpls", + .iproute_tun_type = "gre", + .ipproto = 4, + .configure_mpls = true, + .test_gso = true + }, + { + .ebpf_tun_type = "ip6gre", + .mac_tun_type = "none", + .iproute_tun_type = "ip6gre", + .ipproto = 6, + .test_gso = true, + }, + { + .ebpf_tun_type = "ip6gre", + .mac_tun_type = "eth", + .iproute_tun_type = "ip6gretap", + .ipproto = 6, + .tunnel_need_veth_mac = true, + .test_gso = true + }, + { + .ebpf_tun_type = "ip6gre", + .mac_tun_type = "mpls", + .iproute_tun_type = "ip6gre", + .ipproto = 6, + .configure_mpls = true, + .test_gso = true + }, + { + .ebpf_tun_type = "udp", + .mac_tun_type = "none", + .iproute_tun_type = "ipip", + .ipproto = 4, + .extra_decap_mod_args_cb = udp_decap_mod_args_cb, + 
.configure_fou_rx_port = true, + .test_gso = true + }, + { + .ebpf_tun_type = "udp", + .mac_tun_type = "eth", + .iproute_tun_type = "ipip", + .ipproto = 4, + .extra_decap_mod_args_cb = udp_decap_mod_args_cb, + .configure_fou_rx_port = true, + .expect_kern_decap_failure = true, + .test_gso = true + }, + { + .ebpf_tun_type = "udp", + .mac_tun_type = "mpls", + .iproute_tun_type = "ipip", + .ipproto = 4, + .extra_decap_mod_args_cb = udp_decap_mod_args_cb, + .configure_fou_rx_port = true, + .tmode = "mode any ttl 255", + .configure_mpls = true, + .test_gso = true + }, + { + .ebpf_tun_type = "ip6udp", + .mac_tun_type = "none", + .iproute_tun_type = "ip6tnl", + .ipproto = 6, + .extra_decap_mod_args_cb = udp_decap_mod_args_cb, + .configure_fou_rx_port = true, + .test_gso = true + }, + { + .ebpf_tun_type = "ip6udp", + .mac_tun_type = "eth", + .iproute_tun_type = "ip6tnl", + .ipproto = 6, + .extra_decap_mod_args_cb = udp_decap_mod_args_cb, + .configure_fou_rx_port = true, + .expect_kern_decap_failure = true, + .test_gso = true + }, + { + .ebpf_tun_type = "ip6udp", + .mac_tun_type = "mpls", + .iproute_tun_type = "ip6tnl", + .ipproto = 6, + .extra_decap_mod_args_cb = udp_decap_mod_args_cb, + .configure_fou_rx_port = true, + .tmode = "mode any ttl 255", + .expect_kern_decap_failure = true, + .test_gso = true + }, +}; + +void test_tc_tunnel(void) +{ + struct test_tc_tunnel *skel; + struct subtest_cfg *cfg; + int i, ret; + + skel = test_tc_tunnel__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel open and load")) + return; + + if (!ASSERT_OK(setup(), "global setup")) + return; + + for (i = 0; i < ARRAY_SIZE(subtests_cfg); i++) { + cfg = &subtests_cfg[i]; + ret = build_subtest_name(cfg, cfg->name, TEST_NAME_MAX_LEN); + if (ret < 0 || !test__start_subtest(cfg->name)) + continue; + if (subtest_setup(skel, cfg) == 0) + run_test(cfg); + subtest_cleanup(cfg); + } + cleanup(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c 
b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c index bae0e9de277d..eb9309931272 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c +++ b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c @@ -534,85 +534,6 @@ static void ping6_dev1(void) close_netns(nstoken); } -static int attach_tc_prog(int ifindex, int igr_fd, int egr_fd) -{ - DECLARE_LIBBPF_OPTS(bpf_tc_hook, hook, .ifindex = ifindex, - .attach_point = BPF_TC_INGRESS | BPF_TC_EGRESS); - DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts1, .handle = 1, - .priority = 1, .prog_fd = igr_fd); - DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts2, .handle = 1, - .priority = 1, .prog_fd = egr_fd); - int ret; - - ret = bpf_tc_hook_create(&hook); - if (!ASSERT_OK(ret, "create tc hook")) - return ret; - - if (igr_fd >= 0) { - hook.attach_point = BPF_TC_INGRESS; - ret = bpf_tc_attach(&hook, &opts1); - if (!ASSERT_OK(ret, "bpf_tc_attach")) { - bpf_tc_hook_destroy(&hook); - return ret; - } - } - - if (egr_fd >= 0) { - hook.attach_point = BPF_TC_EGRESS; - ret = bpf_tc_attach(&hook, &opts2); - if (!ASSERT_OK(ret, "bpf_tc_attach")) { - bpf_tc_hook_destroy(&hook); - return ret; - } - } - - return 0; -} - -static int generic_attach(const char *dev, int igr_fd, int egr_fd) -{ - int ifindex; - - if (!ASSERT_OK_FD(igr_fd, "check ingress fd")) - return -1; - if (!ASSERT_OK_FD(egr_fd, "check egress fd")) - return -1; - - ifindex = if_nametoindex(dev); - if (!ASSERT_NEQ(ifindex, 0, "get ifindex")) - return -1; - - return attach_tc_prog(ifindex, igr_fd, egr_fd); -} - -static int generic_attach_igr(const char *dev, int igr_fd) -{ - int ifindex; - - if (!ASSERT_OK_FD(igr_fd, "check ingress fd")) - return -1; - - ifindex = if_nametoindex(dev); - if (!ASSERT_NEQ(ifindex, 0, "get ifindex")) - return -1; - - return attach_tc_prog(ifindex, igr_fd, -1); -} - -static int generic_attach_egr(const char *dev, int egr_fd) -{ - int ifindex; - - if (!ASSERT_OK_FD(egr_fd, "check egress fd")) - return -1; - - ifindex = if_nametoindex(dev); - if 
(!ASSERT_NEQ(ifindex, 0, "get ifindex")) - return -1; - - return attach_tc_prog(ifindex, -1, egr_fd); -} - static void test_vxlan_tunnel(void) { struct test_tunnel_kern *skel = NULL; @@ -635,12 +556,12 @@ static void test_vxlan_tunnel(void) goto done; get_src_prog_fd = bpf_program__fd(skel->progs.vxlan_get_tunnel_src); set_src_prog_fd = bpf_program__fd(skel->progs.vxlan_set_tunnel_src); - if (generic_attach(VXLAN_TUNL_DEV1, get_src_prog_fd, set_src_prog_fd)) + if (tc_prog_attach(VXLAN_TUNL_DEV1, get_src_prog_fd, set_src_prog_fd)) goto done; /* load and attach bpf prog to veth dev tc hook point */ set_dst_prog_fd = bpf_program__fd(skel->progs.veth_set_outer_dst); - if (generic_attach_igr("veth1", set_dst_prog_fd)) + if (tc_prog_attach("veth1", set_dst_prog_fd, -1)) goto done; /* load and attach prog set_md to tunnel dev tc hook point at_ns0 */ @@ -648,7 +569,7 @@ static void test_vxlan_tunnel(void) if (!ASSERT_OK_PTR(nstoken, "setns src")) goto done; set_dst_prog_fd = bpf_program__fd(skel->progs.vxlan_set_tunnel_dst); - if (generic_attach_egr(VXLAN_TUNL_DEV0, set_dst_prog_fd)) + if (tc_prog_attach(VXLAN_TUNL_DEV0, -1, set_dst_prog_fd)) goto done; close_netns(nstoken); @@ -695,7 +616,7 @@ static void test_ip6vxlan_tunnel(void) goto done; get_src_prog_fd = bpf_program__fd(skel->progs.ip6vxlan_get_tunnel_src); set_src_prog_fd = bpf_program__fd(skel->progs.ip6vxlan_set_tunnel_src); - if (generic_attach(IP6VXLAN_TUNL_DEV1, get_src_prog_fd, set_src_prog_fd)) + if (tc_prog_attach(IP6VXLAN_TUNL_DEV1, get_src_prog_fd, set_src_prog_fd)) goto done; /* load and attach prog set_md to tunnel dev tc hook point at_ns0 */ @@ -703,7 +624,7 @@ static void test_ip6vxlan_tunnel(void) if (!ASSERT_OK_PTR(nstoken, "setns src")) goto done; set_dst_prog_fd = bpf_program__fd(skel->progs.ip6vxlan_set_tunnel_dst); - if (generic_attach_egr(IP6VXLAN_TUNL_DEV0, set_dst_prog_fd)) + if (tc_prog_attach(IP6VXLAN_TUNL_DEV0, -1, set_dst_prog_fd)) goto done; close_netns(nstoken); @@ -764,7 +685,7 @@ 
static void test_ipip_tunnel(enum ipip_encap encap) skel->progs.ipip_set_tunnel); } - if (generic_attach(IPIP_TUNL_DEV1, get_src_prog_fd, set_src_prog_fd)) + if (tc_prog_attach(IPIP_TUNL_DEV1, get_src_prog_fd, set_src_prog_fd)) goto done; ping_dev0(); @@ -797,7 +718,7 @@ static void test_xfrm_tunnel(void) /* attach tc prog to tunnel dev */ tc_prog_fd = bpf_program__fd(skel->progs.xfrm_get_state); - if (generic_attach_igr("veth1", tc_prog_fd)) + if (tc_prog_attach("veth1", tc_prog_fd, -1)) goto done; /* attach xdp prog to tunnel dev */ @@ -870,7 +791,7 @@ static void test_gre_tunnel(enum gre_test test) if (!ASSERT_OK(err, "add tunnel")) goto done; - if (generic_attach(GRE_TUNL_DEV1, get_fd, set_fd)) + if (tc_prog_attach(GRE_TUNL_DEV1, get_fd, set_fd)) goto done; ping_dev0(); @@ -911,7 +832,7 @@ static void test_ip6gre_tunnel(enum ip6gre_test test) set_fd = bpf_program__fd(skel->progs.ip6gretap_set_tunnel); get_fd = bpf_program__fd(skel->progs.ip6gretap_get_tunnel); - if (generic_attach(IP6GRE_TUNL_DEV1, get_fd, set_fd)) + if (tc_prog_attach(IP6GRE_TUNL_DEV1, get_fd, set_fd)) goto done; ping6_veth0(); @@ -954,7 +875,7 @@ static void test_erspan_tunnel(enum erspan_test test) set_fd = bpf_program__fd(skel->progs.erspan_set_tunnel); get_fd = bpf_program__fd(skel->progs.erspan_get_tunnel); - if (generic_attach(ERSPAN_TUNL_DEV1, get_fd, set_fd)) + if (tc_prog_attach(ERSPAN_TUNL_DEV1, get_fd, set_fd)) goto done; ping_dev0(); @@ -990,7 +911,7 @@ static void test_ip6erspan_tunnel(enum erspan_test test) set_fd = bpf_program__fd(skel->progs.ip4ip6erspan_set_tunnel); get_fd = bpf_program__fd(skel->progs.ip4ip6erspan_get_tunnel); - if (generic_attach(IP6ERSPAN_TUNL_DEV1, get_fd, set_fd)) + if (tc_prog_attach(IP6ERSPAN_TUNL_DEV1, get_fd, set_fd)) goto done; ping6_veth0(); @@ -1017,7 +938,7 @@ static void test_geneve_tunnel(void) set_fd = bpf_program__fd(skel->progs.geneve_set_tunnel); get_fd = bpf_program__fd(skel->progs.geneve_get_tunnel); - if (generic_attach(GENEVE_TUNL_DEV1, 
get_fd, set_fd)) + if (tc_prog_attach(GENEVE_TUNL_DEV1, get_fd, set_fd)) goto done; ping_dev0(); @@ -1044,7 +965,7 @@ static void test_ip6geneve_tunnel(void) set_fd = bpf_program__fd(skel->progs.ip6geneve_set_tunnel); get_fd = bpf_program__fd(skel->progs.ip6geneve_get_tunnel); - if (generic_attach(IP6GENEVE_TUNL_DEV1, get_fd, set_fd)) + if (tc_prog_attach(IP6GENEVE_TUNL_DEV1, get_fd, set_fd)) goto done; ping_dev0(); @@ -1083,7 +1004,7 @@ static void test_ip6tnl_tunnel(enum ip6tnl_test test) get_fd = bpf_program__fd(skel->progs.ip6ip6_get_tunnel); break; } - if (generic_attach(IP6TNL_TUNL_DEV1, get_fd, set_fd)) + if (tc_prog_attach(IP6TNL_TUNL_DEV1, get_fd, set_fd)) goto done; ping6_veth0(); diff --git a/tools/testing/selftests/bpf/prog_tests/test_xsk.c b/tools/testing/selftests/bpf/prog_tests/test_xsk.c new file mode 100644 index 000000000000..5af28f359cfd --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_xsk.c @@ -0,0 +1,2596 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <bpf/bpf.h> +#include <errno.h> +#include <linux/bitmap.h> +#include <linux/if_link.h> +#include <linux/mman.h> +#include <linux/netdev.h> +#include <poll.h> +#include <pthread.h> +#include <signal.h> +#include <string.h> +#include <sys/mman.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <unistd.h> + +#include "network_helpers.h" +#include "test_xsk.h" +#include "xsk_xdp_common.h" +#include "xsk_xdp_progs.skel.h" + +#define DEFAULT_BATCH_SIZE 64 +#define MIN_PKT_SIZE 64 +#define MAX_ETH_JUMBO_SIZE 9000 +#define MAX_INTERFACES 2 +#define MAX_TEARDOWN_ITER 10 +#define MAX_TX_BUDGET_DEFAULT 32 +#define PKT_DUMP_NB_TO_PRINT 16 +/* Just to align the data in the packet */ +#define PKT_HDR_SIZE (sizeof(struct ethhdr) + 2) +#define POLL_TMOUT 1000 +#define THREAD_TMOUT 3 +#define UMEM_HEADROOM_TEST_SIZE 128 +#define XSK_DESC__INVALID_OPTION (0xffff) +#define XSK_UMEM__INVALID_FRAME_SIZE (MAX_ETH_JUMBO_SIZE + 1) +#define XSK_UMEM__LARGE_FRAME_SIZE (3 * 1024) +#define 
XSK_UMEM__MAX_FRAME_SIZE (4 * 1024) + +static const u8 g_mac[ETH_ALEN] = {0x55, 0x44, 0x33, 0x22, 0x11, 0x00}; + +bool opt_verbose; +pthread_barrier_t barr; +pthread_mutex_t pacing_mutex = PTHREAD_MUTEX_INITIALIZER; + +int pkts_in_flight; + +/* The payload is a word consisting of a packet sequence number in the upper + * 16-bits and a intra packet data sequence number in the lower 16 bits. So the 3rd packet's + * 5th word of data will contain the number (2<<16) | 4 as they are numbered from 0. + */ +static void write_payload(void *dest, u32 pkt_nb, u32 start, u32 size) +{ + u32 *ptr = (u32 *)dest, i; + + start /= sizeof(*ptr); + size /= sizeof(*ptr); + for (i = 0; i < size; i++) + ptr[i] = htonl(pkt_nb << 16 | (i + start)); +} + +static void gen_eth_hdr(struct xsk_socket_info *xsk, struct ethhdr *eth_hdr) +{ + memcpy(eth_hdr->h_dest, xsk->dst_mac, ETH_ALEN); + memcpy(eth_hdr->h_source, xsk->src_mac, ETH_ALEN); + eth_hdr->h_proto = htons(ETH_P_LOOPBACK); +} + +static bool is_umem_valid(struct ifobject *ifobj) +{ + return !!ifobj->umem->umem; +} + +static u32 mode_to_xdp_flags(enum test_mode mode) +{ + return (mode == TEST_MODE_SKB) ? 
XDP_FLAGS_SKB_MODE : XDP_FLAGS_DRV_MODE; +} + +static u64 umem_size(struct xsk_umem_info *umem) +{ + return umem->num_frames * umem->frame_size; +} + +int xsk_configure_umem(struct ifobject *ifobj, struct xsk_umem_info *umem, void *buffer, + u64 size) +{ + struct xsk_umem_config cfg = { + .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, + .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, + .frame_size = umem->frame_size, + .frame_headroom = umem->frame_headroom, + .flags = XSK_UMEM__DEFAULT_FLAGS + }; + int ret; + + if (umem->fill_size) + cfg.fill_size = umem->fill_size; + + if (umem->comp_size) + cfg.comp_size = umem->comp_size; + + if (umem->unaligned_mode) + cfg.flags |= XDP_UMEM_UNALIGNED_CHUNK_FLAG; + + ret = xsk_umem__create(&umem->umem, buffer, size, + &umem->fq, &umem->cq, &cfg); + if (ret) + return ret; + + umem->buffer = buffer; + if (ifobj->shared_umem && ifobj->rx_on) { + umem->base_addr = umem_size(umem); + umem->next_buffer = umem_size(umem); + } + + return 0; +} + +static u64 umem_alloc_buffer(struct xsk_umem_info *umem) +{ + u64 addr; + + addr = umem->next_buffer; + umem->next_buffer += umem->frame_size; + if (umem->next_buffer >= umem->base_addr + umem_size(umem)) + umem->next_buffer = umem->base_addr; + + return addr; +} + +static void umem_reset_alloc(struct xsk_umem_info *umem) +{ + umem->next_buffer = 0; +} + +static int enable_busy_poll(struct xsk_socket_info *xsk) +{ + int sock_opt; + + sock_opt = 1; + if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_PREFER_BUSY_POLL, + (void *)&sock_opt, sizeof(sock_opt)) < 0) + return -errno; + + sock_opt = 20; + if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_BUSY_POLL, + (void *)&sock_opt, sizeof(sock_opt)) < 0) + return -errno; + + sock_opt = xsk->batch_size; + if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_BUSY_POLL_BUDGET, + (void *)&sock_opt, sizeof(sock_opt)) < 0) + return -errno; + + return 0; +} + +int xsk_configure_socket(struct xsk_socket_info *xsk, struct xsk_umem_info *umem, 
+ struct ifobject *ifobject, bool shared) +{ + struct xsk_socket_config cfg = {}; + struct xsk_ring_cons *rxr; + struct xsk_ring_prod *txr; + + xsk->umem = umem; + cfg.rx_size = xsk->rxqsize; + cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; + cfg.bind_flags = ifobject->bind_flags; + if (shared) + cfg.bind_flags |= XDP_SHARED_UMEM; + if (ifobject->mtu > MAX_ETH_PKT_SIZE) + cfg.bind_flags |= XDP_USE_SG; + if (umem->comp_size) + cfg.tx_size = umem->comp_size; + if (umem->fill_size) + cfg.rx_size = umem->fill_size; + + txr = ifobject->tx_on ? &xsk->tx : NULL; + rxr = ifobject->rx_on ? &xsk->rx : NULL; + return xsk_socket__create(&xsk->xsk, ifobject->ifindex, 0, umem->umem, rxr, txr, &cfg); +} + +#define MAX_SKB_FRAGS_PATH "/proc/sys/net/core/max_skb_frags" +static unsigned int get_max_skb_frags(void) +{ + unsigned int max_skb_frags = 0; + FILE *file; + + file = fopen(MAX_SKB_FRAGS_PATH, "r"); + if (!file) { + ksft_print_msg("Error opening %s\n", MAX_SKB_FRAGS_PATH); + return 0; + } + + if (fscanf(file, "%u", &max_skb_frags) != 1) + ksft_print_msg("Error reading %s\n", MAX_SKB_FRAGS_PATH); + + fclose(file); + return max_skb_frags; +} + +static int set_ring_size(struct ifobject *ifobj) +{ + int ret; + u32 ctr = 0; + + while (ctr++ < SOCK_RECONF_CTR) { + ret = set_hw_ring_size(ifobj->ifname, &ifobj->ring); + if (!ret) + break; + + /* Retry if it fails */ + if (ctr >= SOCK_RECONF_CTR || errno != EBUSY) + return -errno; + + usleep(USLEEP_MAX); + } + + return ret; +} + +int hw_ring_size_reset(struct ifobject *ifobj) +{ + ifobj->ring.tx_pending = ifobj->set_ring.default_tx; + ifobj->ring.rx_pending = ifobj->set_ring.default_rx; + return set_ring_size(ifobj); +} + +static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx, + struct ifobject *ifobj_rx) +{ + u32 i, j; + + for (i = 0; i < MAX_INTERFACES; i++) { + struct ifobject *ifobj = i ? 
ifobj_rx : ifobj_tx; + + ifobj->xsk = &ifobj->xsk_arr[0]; + ifobj->use_poll = false; + ifobj->use_fill_ring = true; + ifobj->release_rx = true; + ifobj->validation_func = NULL; + ifobj->use_metadata = false; + + if (i == 0) { + ifobj->rx_on = false; + ifobj->tx_on = true; + } else { + ifobj->rx_on = true; + ifobj->tx_on = false; + } + + memset(ifobj->umem, 0, sizeof(*ifobj->umem)); + ifobj->umem->num_frames = DEFAULT_UMEM_BUFFERS; + ifobj->umem->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE; + + for (j = 0; j < MAX_SOCKETS; j++) { + memset(&ifobj->xsk_arr[j], 0, sizeof(ifobj->xsk_arr[j])); + ifobj->xsk_arr[j].rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS; + ifobj->xsk_arr[j].batch_size = DEFAULT_BATCH_SIZE; + if (i == 0) + ifobj->xsk_arr[j].pkt_stream = test->tx_pkt_stream_default; + else + ifobj->xsk_arr[j].pkt_stream = test->rx_pkt_stream_default; + + memcpy(ifobj->xsk_arr[j].src_mac, g_mac, ETH_ALEN); + memcpy(ifobj->xsk_arr[j].dst_mac, g_mac, ETH_ALEN); + ifobj->xsk_arr[j].src_mac[5] += ((j * 2) + 0); + ifobj->xsk_arr[j].dst_mac[5] += ((j * 2) + 1); + } + } + + if (ifobj_tx->hw_ring_size_supp) + hw_ring_size_reset(ifobj_tx); + + test->ifobj_tx = ifobj_tx; + test->ifobj_rx = ifobj_rx; + test->current_step = 0; + test->total_steps = 1; + test->nb_sockets = 1; + test->fail = false; + test->set_ring = false; + test->adjust_tail = false; + test->adjust_tail_support = false; + test->mtu = MAX_ETH_PKT_SIZE; + test->xdp_prog_rx = ifobj_rx->xdp_progs->progs.xsk_def_prog; + test->xskmap_rx = ifobj_rx->xdp_progs->maps.xsk; + test->xdp_prog_tx = ifobj_tx->xdp_progs->progs.xsk_def_prog; + test->xskmap_tx = ifobj_tx->xdp_progs->maps.xsk; +} + +void test_init(struct test_spec *test, struct ifobject *ifobj_tx, + struct ifobject *ifobj_rx, enum test_mode mode, + const struct test_spec *test_to_run) +{ + struct pkt_stream *tx_pkt_stream; + struct pkt_stream *rx_pkt_stream; + u32 i; + + tx_pkt_stream = test->tx_pkt_stream_default; + rx_pkt_stream = test->rx_pkt_stream_default; + 
memset(test, 0, sizeof(*test)); + test->tx_pkt_stream_default = tx_pkt_stream; + test->rx_pkt_stream_default = rx_pkt_stream; + + for (i = 0; i < MAX_INTERFACES; i++) { + struct ifobject *ifobj = i ? ifobj_rx : ifobj_tx; + + ifobj->bind_flags = XDP_USE_NEED_WAKEUP; + if (mode == TEST_MODE_ZC) + ifobj->bind_flags |= XDP_ZEROCOPY; + else + ifobj->bind_flags |= XDP_COPY; + } + + memcpy(test->name, test_to_run->name, MAX_TEST_NAME_SIZE); + test->test_func = test_to_run->test_func; + test->mode = mode; + __test_spec_init(test, ifobj_tx, ifobj_rx); +} + +static void test_spec_reset(struct test_spec *test) +{ + __test_spec_init(test, test->ifobj_tx, test->ifobj_rx); +} + +static void test_spec_set_xdp_prog(struct test_spec *test, struct bpf_program *xdp_prog_rx, + struct bpf_program *xdp_prog_tx, struct bpf_map *xskmap_rx, + struct bpf_map *xskmap_tx) +{ + test->xdp_prog_rx = xdp_prog_rx; + test->xdp_prog_tx = xdp_prog_tx; + test->xskmap_rx = xskmap_rx; + test->xskmap_tx = xskmap_tx; +} + +static int test_spec_set_mtu(struct test_spec *test, int mtu) +{ + int err; + + if (test->ifobj_rx->mtu != mtu) { + err = xsk_set_mtu(test->ifobj_rx->ifindex, mtu); + if (err) + return err; + test->ifobj_rx->mtu = mtu; + } + if (test->ifobj_tx->mtu != mtu) { + err = xsk_set_mtu(test->ifobj_tx->ifindex, mtu); + if (err) + return err; + test->ifobj_tx->mtu = mtu; + } + + return 0; +} + +void pkt_stream_reset(struct pkt_stream *pkt_stream) +{ + if (pkt_stream) { + pkt_stream->current_pkt_nb = 0; + pkt_stream->nb_rx_pkts = 0; + } +} + +static struct pkt *pkt_stream_get_next_tx_pkt(struct pkt_stream *pkt_stream) +{ + if (pkt_stream->current_pkt_nb >= pkt_stream->nb_pkts) + return NULL; + + return &pkt_stream->pkts[pkt_stream->current_pkt_nb++]; +} + +static struct pkt *pkt_stream_get_next_rx_pkt(struct pkt_stream *pkt_stream, u32 *pkts_sent) +{ + while (pkt_stream->current_pkt_nb < pkt_stream->nb_pkts) { + (*pkts_sent)++; + if (pkt_stream->pkts[pkt_stream->current_pkt_nb].valid) + return 
&pkt_stream->pkts[pkt_stream->current_pkt_nb++]; + pkt_stream->current_pkt_nb++; + } + return NULL; +} + +void pkt_stream_delete(struct pkt_stream *pkt_stream) +{ + free(pkt_stream->pkts); + free(pkt_stream); +} + +void pkt_stream_restore_default(struct test_spec *test) +{ + struct pkt_stream *tx_pkt_stream = test->ifobj_tx->xsk->pkt_stream; + struct pkt_stream *rx_pkt_stream = test->ifobj_rx->xsk->pkt_stream; + + if (tx_pkt_stream != test->tx_pkt_stream_default) { + pkt_stream_delete(test->ifobj_tx->xsk->pkt_stream); + test->ifobj_tx->xsk->pkt_stream = test->tx_pkt_stream_default; + } + + if (rx_pkt_stream != test->rx_pkt_stream_default) { + pkt_stream_delete(test->ifobj_rx->xsk->pkt_stream); + test->ifobj_rx->xsk->pkt_stream = test->rx_pkt_stream_default; + } +} + +static struct pkt_stream *__pkt_stream_alloc(u32 nb_pkts) +{ + struct pkt_stream *pkt_stream; + + pkt_stream = calloc(1, sizeof(*pkt_stream)); + if (!pkt_stream) + return NULL; + + pkt_stream->pkts = calloc(nb_pkts, sizeof(*pkt_stream->pkts)); + if (!pkt_stream->pkts) { + free(pkt_stream); + return NULL; + } + + pkt_stream->nb_pkts = nb_pkts; + return pkt_stream; +} + +static u32 pkt_nb_frags(u32 frame_size, struct pkt_stream *pkt_stream, struct pkt *pkt) +{ + u32 nb_frags = 1, next_frag; + + if (!pkt) + return 1; + + if (!pkt_stream->verbatim) { + if (!pkt->valid || !pkt->len) + return 1; + return ceil_u32(pkt->len, frame_size); + } + + /* Search for the end of the packet in verbatim mode */ + if (!pkt_continues(pkt->options)) + return nb_frags; + + next_frag = pkt_stream->current_pkt_nb; + pkt++; + while (next_frag++ < pkt_stream->nb_pkts) { + nb_frags++; + if (!pkt_continues(pkt->options) || !pkt->valid) + break; + pkt++; + } + return nb_frags; +} + +static bool set_pkt_valid(int offset, u32 len) +{ + return len <= MAX_ETH_JUMBO_SIZE; +} + +static void pkt_set(struct pkt_stream *pkt_stream, struct pkt *pkt, int offset, u32 len) +{ + pkt->offset = offset; + pkt->len = len; + pkt->valid = 
set_pkt_valid(offset, len); +} + +static void pkt_stream_pkt_set(struct pkt_stream *pkt_stream, struct pkt *pkt, int offset, u32 len) +{ + bool prev_pkt_valid = pkt->valid; + + pkt_set(pkt_stream, pkt, offset, len); + pkt_stream->nb_valid_entries += pkt->valid - prev_pkt_valid; +} + +static u32 pkt_get_buffer_len(struct xsk_umem_info *umem, u32 len) +{ + return ceil_u32(len, umem->frame_size) * umem->frame_size; +} + +static struct pkt_stream *__pkt_stream_generate(u32 nb_pkts, u32 pkt_len, u32 nb_start, u32 nb_off) +{ + struct pkt_stream *pkt_stream; + u32 i; + + pkt_stream = __pkt_stream_alloc(nb_pkts); + if (!pkt_stream) + return NULL; + + pkt_stream->nb_pkts = nb_pkts; + pkt_stream->max_pkt_len = pkt_len; + for (i = 0; i < nb_pkts; i++) { + struct pkt *pkt = &pkt_stream->pkts[i]; + + pkt_stream_pkt_set(pkt_stream, pkt, 0, pkt_len); + pkt->pkt_nb = nb_start + i * nb_off; + } + + return pkt_stream; +} + +struct pkt_stream *pkt_stream_generate(u32 nb_pkts, u32 pkt_len) +{ + return __pkt_stream_generate(nb_pkts, pkt_len, 0, 1); +} + +static struct pkt_stream *pkt_stream_clone(struct pkt_stream *pkt_stream) +{ + return pkt_stream_generate(pkt_stream->nb_pkts, pkt_stream->pkts[0].len); +} + +static int pkt_stream_replace_ifobject(struct ifobject *ifobj, u32 nb_pkts, u32 pkt_len) +{ + ifobj->xsk->pkt_stream = pkt_stream_generate(nb_pkts, pkt_len); + + if (!ifobj->xsk->pkt_stream) + return -ENOMEM; + + return 0; +} + +static int pkt_stream_replace(struct test_spec *test, u32 nb_pkts, u32 pkt_len) +{ + int ret; + + ret = pkt_stream_replace_ifobject(test->ifobj_tx, nb_pkts, pkt_len); + if (ret) + return ret; + + return pkt_stream_replace_ifobject(test->ifobj_rx, nb_pkts, pkt_len); +} + +static int __pkt_stream_replace_half(struct ifobject *ifobj, u32 pkt_len, + int offset) +{ + struct pkt_stream *pkt_stream; + u32 i; + + pkt_stream = pkt_stream_clone(ifobj->xsk->pkt_stream); + if (!pkt_stream) + return -ENOMEM; + + for (i = 1; i < ifobj->xsk->pkt_stream->nb_pkts; i += 2) 
+ pkt_stream_pkt_set(pkt_stream, &pkt_stream->pkts[i], offset, pkt_len); + + ifobj->xsk->pkt_stream = pkt_stream; + + return 0; +} + +static int pkt_stream_replace_half(struct test_spec *test, u32 pkt_len, int offset) +{ + int ret = __pkt_stream_replace_half(test->ifobj_tx, pkt_len, offset); + + if (ret) + return ret; + + return __pkt_stream_replace_half(test->ifobj_rx, pkt_len, offset); +} + +static int pkt_stream_receive_half(struct test_spec *test) +{ + struct pkt_stream *pkt_stream = test->ifobj_tx->xsk->pkt_stream; + u32 i; + + if (test->ifobj_rx->xsk->pkt_stream != test->rx_pkt_stream_default) + /* Packet stream has already been replaced so we have to release this one. + * The newly created one will be freed by the restore_default() at the + * end of the test + */ + pkt_stream_delete(test->ifobj_rx->xsk->pkt_stream); + + test->ifobj_rx->xsk->pkt_stream = pkt_stream_generate(pkt_stream->nb_pkts, + pkt_stream->pkts[0].len); + if (!test->ifobj_rx->xsk->pkt_stream) + return -ENOMEM; + + pkt_stream = test->ifobj_rx->xsk->pkt_stream; + for (i = 1; i < pkt_stream->nb_pkts; i += 2) + pkt_stream->pkts[i].valid = false; + + pkt_stream->nb_valid_entries /= 2; + + return 0; +} + +static int pkt_stream_even_odd_sequence(struct test_spec *test) +{ + struct pkt_stream *pkt_stream; + u32 i; + + for (i = 0; i < test->nb_sockets; i++) { + pkt_stream = test->ifobj_tx->xsk_arr[i].pkt_stream; + pkt_stream = __pkt_stream_generate(pkt_stream->nb_pkts / 2, + pkt_stream->pkts[0].len, i, 2); + if (!pkt_stream) + return -ENOMEM; + test->ifobj_tx->xsk_arr[i].pkt_stream = pkt_stream; + + pkt_stream = test->ifobj_rx->xsk_arr[i].pkt_stream; + pkt_stream = __pkt_stream_generate(pkt_stream->nb_pkts / 2, + pkt_stream->pkts[0].len, i, 2); + if (!pkt_stream) + return -ENOMEM; + test->ifobj_rx->xsk_arr[i].pkt_stream = pkt_stream; + } + + return 0; +} + +static void release_even_odd_sequence(struct test_spec *test) +{ + struct pkt_stream *later_free_tx = test->ifobj_tx->xsk->pkt_stream; + struct 
pkt_stream *later_free_rx = test->ifobj_rx->xsk->pkt_stream; + int i; + + for (i = 0; i < test->nb_sockets; i++) { + /* later_free_{rx/tx} will be freed by restore_default() */ + if (test->ifobj_tx->xsk_arr[i].pkt_stream != later_free_tx) + pkt_stream_delete(test->ifobj_tx->xsk_arr[i].pkt_stream); + if (test->ifobj_rx->xsk_arr[i].pkt_stream != later_free_rx) + pkt_stream_delete(test->ifobj_rx->xsk_arr[i].pkt_stream); + } + +} + +static u64 pkt_get_addr(struct pkt *pkt, struct xsk_umem_info *umem) +{ + if (!pkt->valid) + return pkt->offset; + return pkt->offset + umem_alloc_buffer(umem); +} + +static void pkt_stream_cancel(struct pkt_stream *pkt_stream) +{ + pkt_stream->current_pkt_nb--; +} + +static void pkt_generate(struct xsk_socket_info *xsk, struct xsk_umem_info *umem, u64 addr, u32 len, + u32 pkt_nb, u32 bytes_written) +{ + void *data = xsk_umem__get_data(umem->buffer, addr); + + if (len < MIN_PKT_SIZE) + return; + + if (!bytes_written) { + gen_eth_hdr(xsk, data); + + len -= PKT_HDR_SIZE; + data += PKT_HDR_SIZE; + } else { + bytes_written -= PKT_HDR_SIZE; + } + + write_payload(data, pkt_nb, bytes_written, len); +} + +static struct pkt_stream *__pkt_stream_generate_custom(struct ifobject *ifobj, struct pkt *frames, + u32 nb_frames, bool verbatim) +{ + u32 i, len = 0, pkt_nb = 0, payload = 0; + struct pkt_stream *pkt_stream; + + pkt_stream = __pkt_stream_alloc(nb_frames); + if (!pkt_stream) + return NULL; + + for (i = 0; i < nb_frames; i++) { + struct pkt *pkt = &pkt_stream->pkts[pkt_nb]; + struct pkt *frame = &frames[i]; + + pkt->offset = frame->offset; + if (verbatim) { + *pkt = *frame; + pkt->pkt_nb = payload; + if (!frame->valid || !pkt_continues(frame->options)) + payload++; + } else { + if (frame->valid) + len += frame->len; + if (frame->valid && pkt_continues(frame->options)) + continue; + + pkt->pkt_nb = pkt_nb; + pkt->len = len; + pkt->valid = frame->valid; + pkt->options = 0; + + len = 0; + } + + print_verbose("offset: %d len: %u valid: %u options: %u 
pkt_nb: %u\n", + pkt->offset, pkt->len, pkt->valid, pkt->options, pkt->pkt_nb); + + if (pkt->valid && pkt->len > pkt_stream->max_pkt_len) + pkt_stream->max_pkt_len = pkt->len; + + if (pkt->valid) + pkt_stream->nb_valid_entries++; + + pkt_nb++; + } + + pkt_stream->nb_pkts = pkt_nb; + pkt_stream->verbatim = verbatim; + return pkt_stream; +} + +static int pkt_stream_generate_custom(struct test_spec *test, struct pkt *pkts, u32 nb_pkts) +{ + struct pkt_stream *pkt_stream; + + pkt_stream = __pkt_stream_generate_custom(test->ifobj_tx, pkts, nb_pkts, true); + if (!pkt_stream) + return -ENOMEM; + test->ifobj_tx->xsk->pkt_stream = pkt_stream; + + pkt_stream = __pkt_stream_generate_custom(test->ifobj_rx, pkts, nb_pkts, false); + if (!pkt_stream) + return -ENOMEM; + test->ifobj_rx->xsk->pkt_stream = pkt_stream; + + return 0; +} + +static void pkt_print_data(u32 *data, u32 cnt) +{ + u32 i; + + for (i = 0; i < cnt; i++) { + u32 seqnum, pkt_nb; + + seqnum = ntohl(*data) & 0xffff; + pkt_nb = ntohl(*data) >> 16; + ksft_print_msg("%u:%u ", pkt_nb, seqnum); + data++; + } +} + +static void pkt_dump(void *pkt, u32 len, bool eth_header) +{ + struct ethhdr *ethhdr = pkt; + u32 i, *data; + + if (eth_header) { + /*extract L2 frame */ + ksft_print_msg("DEBUG>> L2: dst mac: "); + for (i = 0; i < ETH_ALEN; i++) + ksft_print_msg("%02X", ethhdr->h_dest[i]); + + ksft_print_msg("\nDEBUG>> L2: src mac: "); + for (i = 0; i < ETH_ALEN; i++) + ksft_print_msg("%02X", ethhdr->h_source[i]); + + data = pkt + PKT_HDR_SIZE; + } else { + data = pkt; + } + + /*extract L5 frame */ + ksft_print_msg("\nDEBUG>> L5: seqnum: "); + pkt_print_data(data, PKT_DUMP_NB_TO_PRINT); + ksft_print_msg("...."); + if (len > PKT_DUMP_NB_TO_PRINT * sizeof(u32)) { + ksft_print_msg("\n.... 
"); + pkt_print_data(data + len / sizeof(u32) - PKT_DUMP_NB_TO_PRINT, + PKT_DUMP_NB_TO_PRINT); + } + ksft_print_msg("\n---------------------------------------\n"); +} + +static bool is_offset_correct(struct xsk_umem_info *umem, struct pkt *pkt, u64 addr) +{ + u32 headroom = umem->unaligned_mode ? 0 : umem->frame_headroom; + u32 offset = addr % umem->frame_size, expected_offset; + int pkt_offset = pkt->valid ? pkt->offset : 0; + + if (!umem->unaligned_mode) + pkt_offset = 0; + + expected_offset = (pkt_offset + headroom + XDP_PACKET_HEADROOM) % umem->frame_size; + + if (offset == expected_offset) + return true; + + ksft_print_msg("[%s] expected [%u], got [%u]\n", __func__, expected_offset, offset); + return false; +} + +static bool is_metadata_correct(struct pkt *pkt, void *buffer, u64 addr) +{ + void *data = xsk_umem__get_data(buffer, addr); + struct xdp_info *meta = data - sizeof(struct xdp_info); + + if (meta->count != pkt->pkt_nb) { + ksft_print_msg("[%s] expected meta_count [%d], got meta_count [%llu]\n", + __func__, pkt->pkt_nb, + (unsigned long long)meta->count); + return false; + } + + return true; +} + +static int is_adjust_tail_supported(struct xsk_xdp_progs *skel_rx, bool *supported) +{ + struct bpf_map *data_map; + int adjust_value = 0; + int key = 0; + int ret; + + data_map = bpf_object__find_map_by_name(skel_rx->obj, "xsk_xdp_.bss"); + if (!data_map || !bpf_map__is_internal(data_map)) { + ksft_print_msg("Error: could not find bss section of XDP program\n"); + return -EINVAL; + } + + ret = bpf_map_lookup_elem(bpf_map__fd(data_map), &key, &adjust_value); + if (ret) { + ksft_print_msg("Error: bpf_map_lookup_elem failed with error %d\n", ret); + return ret; + } + + /* Set the 'adjust_value' variable to -EOPNOTSUPP in the XDP program if the adjust_tail + * helper is not supported. Skip the adjust_tail test case in this scenario. 
+ */ + *supported = adjust_value != -EOPNOTSUPP; + + return 0; +} + +static bool is_frag_valid(struct xsk_umem_info *umem, u64 addr, u32 len, u32 expected_pkt_nb, + u32 bytes_processed) +{ + u32 seqnum, pkt_nb, *pkt_data, words_to_end, expected_seqnum; + void *data = xsk_umem__get_data(umem->buffer, addr); + + addr -= umem->base_addr; + + if (addr >= umem->num_frames * umem->frame_size || + addr + len > umem->num_frames * umem->frame_size) { + ksft_print_msg("Frag invalid addr: %llx len: %u\n", + (unsigned long long)addr, len); + return false; + } + if (!umem->unaligned_mode && addr % umem->frame_size + len > umem->frame_size) { + ksft_print_msg("Frag crosses frame boundary addr: %llx len: %u\n", + (unsigned long long)addr, len); + return false; + } + + pkt_data = data; + if (!bytes_processed) { + pkt_data += PKT_HDR_SIZE / sizeof(*pkt_data); + len -= PKT_HDR_SIZE; + } else { + bytes_processed -= PKT_HDR_SIZE; + } + + expected_seqnum = bytes_processed / sizeof(*pkt_data); + seqnum = ntohl(*pkt_data) & 0xffff; + pkt_nb = ntohl(*pkt_data) >> 16; + + if (expected_pkt_nb != pkt_nb) { + ksft_print_msg("[%s] expected pkt_nb [%u], got pkt_nb [%u]\n", + __func__, expected_pkt_nb, pkt_nb); + goto error; + } + if (expected_seqnum != seqnum) { + ksft_print_msg("[%s] expected seqnum at start [%u], got seqnum [%u]\n", + __func__, expected_seqnum, seqnum); + goto error; + } + + words_to_end = len / sizeof(*pkt_data) - 1; + pkt_data += words_to_end; + seqnum = ntohl(*pkt_data) & 0xffff; + expected_seqnum += words_to_end; + if (expected_seqnum != seqnum) { + ksft_print_msg("[%s] expected seqnum at end [%u], got seqnum [%u]\n", + __func__, expected_seqnum, seqnum); + goto error; + } + + return true; + +error: + pkt_dump(data, len, !bytes_processed); + return false; +} + +static bool is_pkt_valid(struct pkt *pkt, void *buffer, u64 addr, u32 len) +{ + if (pkt->len != len) { + ksft_print_msg("[%s] expected packet length [%d], got length [%d]\n", + __func__, pkt->len, len); + 
pkt_dump(xsk_umem__get_data(buffer, addr), len, true); + return false; + } + + return true; +} + +static u32 load_value(u32 *counter) +{ + return __atomic_load_n(counter, __ATOMIC_ACQUIRE); +} + +static bool kick_tx_with_check(struct xsk_socket_info *xsk, int *ret) +{ + u32 max_budget = MAX_TX_BUDGET_DEFAULT; + u32 cons, ready_to_send; + int delta; + + cons = load_value(xsk->tx.consumer); + ready_to_send = load_value(xsk->tx.producer) - cons; + *ret = sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); + + delta = load_value(xsk->tx.consumer) - cons; + /* By default, xsk should consume exact @max_budget descs at one + * send in this case where hitting the max budget limit in while + * loop is triggered in __xsk_generic_xmit(). Please make sure that + * the number of descs to be sent is larger than @max_budget, or + * else the tx.consumer will be updated in xskq_cons_peek_desc() + * in time which hides the issue we try to verify. + */ + if (ready_to_send > max_budget && delta != max_budget) + return false; + + return true; +} + +int kick_tx(struct xsk_socket_info *xsk) +{ + int ret; + + if (xsk->check_consumer) { + if (!kick_tx_with_check(xsk, &ret)) + return TEST_FAILURE; + } else { + ret = sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); + } + if (ret >= 0) + return TEST_PASS; + if (errno == ENOBUFS || errno == EAGAIN || errno == EBUSY || errno == ENETDOWN) { + usleep(100); + return TEST_PASS; + } + return TEST_FAILURE; +} + +int kick_rx(struct xsk_socket_info *xsk) +{ + int ret; + + ret = recvfrom(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, NULL); + if (ret < 0) + return TEST_FAILURE; + + return TEST_PASS; +} + +static int complete_pkts(struct xsk_socket_info *xsk, int batch_size) +{ + unsigned int rcvd; + u32 idx; + int ret; + + if (xsk_ring_prod__needs_wakeup(&xsk->tx)) { + ret = kick_tx(xsk); + if (ret) + return TEST_FAILURE; + } + + rcvd = xsk_ring_cons__peek(&xsk->umem->cq, batch_size, &idx); + if (rcvd) { + if (rcvd 
> xsk->outstanding_tx) { + u64 addr = *xsk_ring_cons__comp_addr(&xsk->umem->cq, idx + rcvd - 1); + + ksft_print_msg("[%s] Too many packets completed\n", __func__); + ksft_print_msg("Last completion address: %llx\n", + (unsigned long long)addr); + return TEST_FAILURE; + } + + xsk_ring_cons__release(&xsk->umem->cq, rcvd); + xsk->outstanding_tx -= rcvd; + } + + return TEST_PASS; +} + +static int __receive_pkts(struct test_spec *test, struct xsk_socket_info *xsk) +{ + u32 frags_processed = 0, nb_frags = 0, pkt_len = 0; + u32 idx_rx = 0, idx_fq = 0, rcvd, pkts_sent = 0; + struct pkt_stream *pkt_stream = xsk->pkt_stream; + struct ifobject *ifobj = test->ifobj_rx; + struct xsk_umem_info *umem = xsk->umem; + struct pollfd fds = { }; + struct pkt *pkt; + u64 first_addr = 0; + int ret; + + fds.fd = xsk_socket__fd(xsk->xsk); + fds.events = POLLIN; + + ret = kick_rx(xsk); + if (ret) + return TEST_FAILURE; + + if (ifobj->use_poll) { + ret = poll(&fds, 1, POLL_TMOUT); + if (ret < 0) + return TEST_FAILURE; + + if (!ret) { + if (!is_umem_valid(test->ifobj_tx)) + return TEST_PASS; + + ksft_print_msg("ERROR: [%s] Poll timed out\n", __func__); + return TEST_CONTINUE; + } + + if (!(fds.revents & POLLIN)) + return TEST_CONTINUE; + } + + rcvd = xsk_ring_cons__peek(&xsk->rx, xsk->batch_size, &idx_rx); + if (!rcvd) + return TEST_CONTINUE; + + if (ifobj->use_fill_ring) { + ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq); + while (ret != rcvd) { + if (xsk_ring_prod__needs_wakeup(&umem->fq)) { + ret = poll(&fds, 1, POLL_TMOUT); + if (ret < 0) + return TEST_FAILURE; + } + ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq); + } + } + + while (frags_processed < rcvd) { + const struct xdp_desc *desc = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++); + u64 addr = desc->addr, orig; + + orig = xsk_umem__extract_addr(addr); + addr = xsk_umem__add_offset_to_addr(addr); + + if (!nb_frags) { + pkt = pkt_stream_get_next_rx_pkt(pkt_stream, &pkts_sent); + if (!pkt) { + ksft_print_msg("[%s] received 
too many packets addr: %lx len %u\n", + __func__, addr, desc->len); + return TEST_FAILURE; + } + } + + print_verbose("Rx: addr: %lx len: %u options: %u pkt_nb: %u valid: %u\n", + addr, desc->len, desc->options, pkt->pkt_nb, pkt->valid); + + if (!is_frag_valid(umem, addr, desc->len, pkt->pkt_nb, pkt_len) || + !is_offset_correct(umem, pkt, addr) || (ifobj->use_metadata && + !is_metadata_correct(pkt, umem->buffer, addr))) + return TEST_FAILURE; + + if (!nb_frags++) + first_addr = addr; + frags_processed++; + pkt_len += desc->len; + if (ifobj->use_fill_ring) + *xsk_ring_prod__fill_addr(&umem->fq, idx_fq++) = orig; + + if (pkt_continues(desc->options)) + continue; + + /* The complete packet has been received */ + if (!is_pkt_valid(pkt, umem->buffer, first_addr, pkt_len) || + !is_offset_correct(umem, pkt, addr)) + return TEST_FAILURE; + + pkt_stream->nb_rx_pkts++; + nb_frags = 0; + pkt_len = 0; + } + + if (nb_frags) { + /* In the middle of a packet. Start over from beginning of packet. */ + idx_rx -= nb_frags; + xsk_ring_cons__cancel(&xsk->rx, nb_frags); + if (ifobj->use_fill_ring) { + idx_fq -= nb_frags; + xsk_ring_prod__cancel(&umem->fq, nb_frags); + } + frags_processed -= nb_frags; + } + + if (ifobj->use_fill_ring) + xsk_ring_prod__submit(&umem->fq, frags_processed); + if (ifobj->release_rx) + xsk_ring_cons__release(&xsk->rx, frags_processed); + + pthread_mutex_lock(&pacing_mutex); + pkts_in_flight -= pkts_sent; + pthread_mutex_unlock(&pacing_mutex); + pkts_sent = 0; + + return TEST_CONTINUE; +} + +bool all_packets_received(struct test_spec *test, struct xsk_socket_info *xsk, u32 sock_num, + unsigned long *bitmap) +{ + struct pkt_stream *pkt_stream = xsk->pkt_stream; + + if (!pkt_stream) { + __set_bit(sock_num, bitmap); + return false; + } + + if (pkt_stream->nb_rx_pkts == pkt_stream->nb_valid_entries) { + __set_bit(sock_num, bitmap); + if (bitmap_full(bitmap, test->nb_sockets)) + return true; + } + + return false; +} + +static int receive_pkts(struct test_spec *test) 
+{ + struct timeval tv_end, tv_now, tv_timeout = {THREAD_TMOUT, 0}; + DECLARE_BITMAP(bitmap, test->nb_sockets); + struct xsk_socket_info *xsk; + u32 sock_num = 0; + int res, ret; + + bitmap_zero(bitmap, test->nb_sockets); + + ret = gettimeofday(&tv_now, NULL); + if (ret) + return TEST_FAILURE; + + timeradd(&tv_now, &tv_timeout, &tv_end); + + while (1) { + xsk = &test->ifobj_rx->xsk_arr[sock_num]; + + if ((all_packets_received(test, xsk, sock_num, bitmap))) + break; + + res = __receive_pkts(test, xsk); + if (!(res == TEST_PASS || res == TEST_CONTINUE)) + return res; + + ret = gettimeofday(&tv_now, NULL); + if (ret) + return TEST_FAILURE; + + if (timercmp(&tv_now, &tv_end, >)) { + ksft_print_msg("ERROR: [%s] Receive loop timed out\n", __func__); + return TEST_FAILURE; + } + sock_num = (sock_num + 1) % test->nb_sockets; + } + + return TEST_PASS; +} + +static int __send_pkts(struct ifobject *ifobject, struct xsk_socket_info *xsk, bool timeout) +{ + u32 i, idx = 0, valid_pkts = 0, valid_frags = 0, buffer_len; + struct pkt_stream *pkt_stream = xsk->pkt_stream; + struct xsk_umem_info *umem = ifobject->umem; + bool use_poll = ifobject->use_poll; + struct pollfd fds = { }; + int ret; + + buffer_len = pkt_get_buffer_len(umem, pkt_stream->max_pkt_len); + /* pkts_in_flight might be negative if many invalid packets are sent */ + if (pkts_in_flight >= (int)((umem_size(umem) - xsk->batch_size * buffer_len) / + buffer_len)) { + ret = kick_tx(xsk); + if (ret) + return TEST_FAILURE; + return TEST_CONTINUE; + } + + fds.fd = xsk_socket__fd(xsk->xsk); + fds.events = POLLOUT; + + while (xsk_ring_prod__reserve(&xsk->tx, xsk->batch_size, &idx) < xsk->batch_size) { + if (use_poll) { + ret = poll(&fds, 1, POLL_TMOUT); + if (timeout) { + if (ret < 0) { + ksft_print_msg("ERROR: [%s] Poll error %d\n", + __func__, errno); + return TEST_FAILURE; + } + if (ret == 0) + return TEST_PASS; + break; + } + if (ret <= 0) { + ksft_print_msg("ERROR: [%s] Poll error %d\n", + __func__, errno); + return 
TEST_FAILURE; + } + } + + complete_pkts(xsk, xsk->batch_size); + } + + for (i = 0; i < xsk->batch_size; i++) { + struct pkt *pkt = pkt_stream_get_next_tx_pkt(pkt_stream); + u32 nb_frags_left, nb_frags, bytes_written = 0; + + if (!pkt) + break; + + nb_frags = pkt_nb_frags(umem->frame_size, pkt_stream, pkt); + if (nb_frags > xsk->batch_size - i) { + pkt_stream_cancel(pkt_stream); + xsk_ring_prod__cancel(&xsk->tx, xsk->batch_size - i); + break; + } + nb_frags_left = nb_frags; + + while (nb_frags_left--) { + struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx + i); + + tx_desc->addr = pkt_get_addr(pkt, ifobject->umem); + if (pkt_stream->verbatim) { + tx_desc->len = pkt->len; + tx_desc->options = pkt->options; + } else if (nb_frags_left) { + tx_desc->len = umem->frame_size; + tx_desc->options = XDP_PKT_CONTD; + } else { + tx_desc->len = pkt->len - bytes_written; + tx_desc->options = 0; + } + if (pkt->valid) + pkt_generate(xsk, umem, tx_desc->addr, tx_desc->len, pkt->pkt_nb, + bytes_written); + bytes_written += tx_desc->len; + + print_verbose("Tx addr: %llx len: %u options: %u pkt_nb: %u\n", + tx_desc->addr, tx_desc->len, tx_desc->options, pkt->pkt_nb); + + if (nb_frags_left) { + i++; + if (pkt_stream->verbatim) + pkt = pkt_stream_get_next_tx_pkt(pkt_stream); + } + } + + if (pkt && pkt->valid) { + valid_pkts++; + valid_frags += nb_frags; + } + } + + pthread_mutex_lock(&pacing_mutex); + pkts_in_flight += valid_pkts; + pthread_mutex_unlock(&pacing_mutex); + + xsk_ring_prod__submit(&xsk->tx, i); + xsk->outstanding_tx += valid_frags; + + if (use_poll) { + ret = poll(&fds, 1, POLL_TMOUT); + if (ret <= 0) { + if (ret == 0 && timeout) + return TEST_PASS; + + ksft_print_msg("ERROR: [%s] Poll error %d\n", __func__, ret); + return TEST_FAILURE; + } + } + + if (!timeout) { + if (complete_pkts(xsk, i)) + return TEST_FAILURE; + + usleep(10); + return TEST_PASS; + } + + return TEST_CONTINUE; +} + +static int wait_for_tx_completion(struct xsk_socket_info *xsk) +{ + struct 
timeval tv_end, tv_now, tv_timeout = {THREAD_TMOUT, 0}; + int ret; + + ret = gettimeofday(&tv_now, NULL); + if (ret) + return TEST_FAILURE; + timeradd(&tv_now, &tv_timeout, &tv_end); + + while (xsk->outstanding_tx) { + ret = gettimeofday(&tv_now, NULL); + if (ret) + return TEST_FAILURE; + if (timercmp(&tv_now, &tv_end, >)) { + ksft_print_msg("ERROR: [%s] Transmission loop timed out\n", __func__); + return TEST_FAILURE; + } + + complete_pkts(xsk, xsk->batch_size); + } + + return TEST_PASS; +} + +bool all_packets_sent(struct test_spec *test, unsigned long *bitmap) +{ + return bitmap_full(bitmap, test->nb_sockets); +} + +static int send_pkts(struct test_spec *test, struct ifobject *ifobject) +{ + bool timeout = !is_umem_valid(test->ifobj_rx); + DECLARE_BITMAP(bitmap, test->nb_sockets); + u32 i, ret; + + bitmap_zero(bitmap, test->nb_sockets); + + while (!(all_packets_sent(test, bitmap))) { + for (i = 0; i < test->nb_sockets; i++) { + struct pkt_stream *pkt_stream; + + pkt_stream = ifobject->xsk_arr[i].pkt_stream; + if (!pkt_stream || pkt_stream->current_pkt_nb >= pkt_stream->nb_pkts) { + __set_bit(i, bitmap); + continue; + } + ret = __send_pkts(ifobject, &ifobject->xsk_arr[i], timeout); + if (ret == TEST_CONTINUE && !test->fail) + continue; + + if ((ret || test->fail) && !timeout) + return TEST_FAILURE; + + if (ret == TEST_PASS && timeout) + return ret; + + ret = wait_for_tx_completion(&ifobject->xsk_arr[i]); + if (ret) + return TEST_FAILURE; + } + } + + return TEST_PASS; +} + +static int get_xsk_stats(struct xsk_socket *xsk, struct xdp_statistics *stats) +{ + int fd = xsk_socket__fd(xsk), err; + socklen_t optlen, expected_len; + + optlen = sizeof(*stats); + err = getsockopt(fd, SOL_XDP, XDP_STATISTICS, stats, &optlen); + if (err) { + ksft_print_msg("[%s] getsockopt(XDP_STATISTICS) error %u %s\n", + __func__, -err, strerror(-err)); + return TEST_FAILURE; + } + + expected_len = sizeof(struct xdp_statistics); + if (optlen != expected_len) { + ksft_print_msg("[%s] 
getsockopt optlen error. Expected: %u got: %u\n", + __func__, expected_len, optlen); + return TEST_FAILURE; + } + + return TEST_PASS; +} + +static int validate_rx_dropped(struct ifobject *ifobject) +{ + struct xsk_socket *xsk = ifobject->xsk->xsk; + struct xdp_statistics stats; + int err; + + err = kick_rx(ifobject->xsk); + if (err) + return TEST_FAILURE; + + err = get_xsk_stats(xsk, &stats); + if (err) + return TEST_FAILURE; + + /* The receiver calls getsockopt after receiving the last (valid) + * packet which is not the final packet sent in this test (valid and + * invalid packets are sent in alternating fashion with the final + * packet being invalid). Since the last packet may or may not have + * been dropped already, both outcomes must be allowed. + */ + if (stats.rx_dropped == ifobject->xsk->pkt_stream->nb_pkts / 2 || + stats.rx_dropped == ifobject->xsk->pkt_stream->nb_pkts / 2 - 1) + return TEST_PASS; + + return TEST_FAILURE; +} + +static int validate_rx_full(struct ifobject *ifobject) +{ + struct xsk_socket *xsk = ifobject->xsk->xsk; + struct xdp_statistics stats; + int err; + + usleep(1000); + err = kick_rx(ifobject->xsk); + if (err) + return TEST_FAILURE; + + err = get_xsk_stats(xsk, &stats); + if (err) + return TEST_FAILURE; + + if (stats.rx_ring_full) + return TEST_PASS; + + return TEST_FAILURE; +} + +static int validate_fill_empty(struct ifobject *ifobject) +{ + struct xsk_socket *xsk = ifobject->xsk->xsk; + struct xdp_statistics stats; + int err; + + usleep(1000); + err = kick_rx(ifobject->xsk); + if (err) + return TEST_FAILURE; + + err = get_xsk_stats(xsk, &stats); + if (err) + return TEST_FAILURE; + + if (stats.rx_fill_ring_empty_descs) + return TEST_PASS; + + return TEST_FAILURE; +} + +static int validate_tx_invalid_descs(struct ifobject *ifobject) +{ + struct xsk_socket *xsk = ifobject->xsk->xsk; + int fd = xsk_socket__fd(xsk); + struct xdp_statistics stats; + socklen_t optlen; + int err; + + optlen = sizeof(stats); + err = getsockopt(fd, SOL_XDP, 
XDP_STATISTICS, &stats, &optlen); + if (err) { + ksft_print_msg("[%s] getsockopt(XDP_STATISTICS) error %u %s\n", + __func__, -err, strerror(-err)); + return TEST_FAILURE; + } + + if (stats.tx_invalid_descs != ifobject->xsk->pkt_stream->nb_pkts / 2) { + ksft_print_msg("[%s] tx_invalid_descs incorrect. Got [%llu] expected [%u]\n", + __func__, + (unsigned long long)stats.tx_invalid_descs, + ifobject->xsk->pkt_stream->nb_pkts); + return TEST_FAILURE; + } + + return TEST_PASS; +} + +static int xsk_configure(struct test_spec *test, struct ifobject *ifobject, + struct xsk_umem_info *umem, bool tx) +{ + int i, ret; + + for (i = 0; i < test->nb_sockets; i++) { + bool shared = (ifobject->shared_umem && tx) ? true : !!i; + u32 ctr = 0; + + while (ctr++ < SOCK_RECONF_CTR) { + ret = xsk_configure_socket(&ifobject->xsk_arr[i], umem, + ifobject, shared); + if (!ret) + break; + + /* Retry if it fails as xsk_socket__create() is asynchronous */ + if (ctr >= SOCK_RECONF_CTR) + return ret; + usleep(USLEEP_MAX); + } + if (ifobject->busy_poll) { + ret = enable_busy_poll(&ifobject->xsk_arr[i]); + if (ret) + return ret; + } + } + + return 0; +} + +static int thread_common_ops_tx(struct test_spec *test, struct ifobject *ifobject) +{ + int ret = xsk_configure(test, ifobject, test->ifobj_rx->umem, true); + + if (ret) + return ret; + ifobject->xsk = &ifobject->xsk_arr[0]; + ifobject->xskmap = test->ifobj_rx->xskmap; + memcpy(ifobject->umem, test->ifobj_rx->umem, sizeof(struct xsk_umem_info)); + ifobject->umem->base_addr = 0; + + return 0; +} + +static int xsk_populate_fill_ring(struct xsk_umem_info *umem, struct pkt_stream *pkt_stream, + bool fill_up) +{ + u32 rx_frame_size = umem->frame_size - XDP_PACKET_HEADROOM; + u32 idx = 0, filled = 0, buffers_to_fill, nb_pkts; + int ret; + + if (umem->num_frames < XSK_RING_PROD__DEFAULT_NUM_DESCS) + buffers_to_fill = umem->num_frames; + else + buffers_to_fill = umem->fill_size; + + ret = xsk_ring_prod__reserve(&umem->fq, buffers_to_fill, &idx); + if 
(ret != buffers_to_fill) + return -ENOSPC; + + while (filled < buffers_to_fill) { + struct pkt *pkt = pkt_stream_get_next_rx_pkt(pkt_stream, &nb_pkts); + u64 addr; + u32 i; + + for (i = 0; i < pkt_nb_frags(rx_frame_size, pkt_stream, pkt); i++) { + if (!pkt) { + if (!fill_up) + break; + addr = filled * umem->frame_size + umem->base_addr; + } else if (pkt->offset >= 0) { + addr = pkt->offset % umem->frame_size + umem_alloc_buffer(umem); + } else { + addr = pkt->offset + umem_alloc_buffer(umem); + } + + *xsk_ring_prod__fill_addr(&umem->fq, idx++) = addr; + if (++filled >= buffers_to_fill) + break; + } + } + xsk_ring_prod__submit(&umem->fq, filled); + xsk_ring_prod__cancel(&umem->fq, buffers_to_fill - filled); + + pkt_stream_reset(pkt_stream); + umem_reset_alloc(umem); + + return 0; +} + +static int thread_common_ops(struct test_spec *test, struct ifobject *ifobject) +{ + LIBBPF_OPTS(bpf_xdp_query_opts, opts); + int mmap_flags; + u64 umem_sz; + void *bufs; + int ret; + u32 i; + + umem_sz = ifobject->umem->num_frames * ifobject->umem->frame_size; + mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE; + + if (ifobject->umem->unaligned_mode) + mmap_flags |= MAP_HUGETLB | MAP_HUGE_2MB; + + if (ifobject->shared_umem) + umem_sz *= 2; + + bufs = mmap(NULL, umem_sz, PROT_READ | PROT_WRITE, mmap_flags, -1, 0); + if (bufs == MAP_FAILED) + return -errno; + + ret = xsk_configure_umem(ifobject, ifobject->umem, bufs, umem_sz); + if (ret) + return ret; + + ret = xsk_configure(test, ifobject, ifobject->umem, false); + if (ret) + return ret; + + ifobject->xsk = &ifobject->xsk_arr[0]; + + if (!ifobject->rx_on) + return 0; + + ret = xsk_populate_fill_ring(ifobject->umem, ifobject->xsk->pkt_stream, + ifobject->use_fill_ring); + if (ret) + return ret; + + for (i = 0; i < test->nb_sockets; i++) { + ifobject->xsk = &ifobject->xsk_arr[i]; + ret = xsk_update_xskmap(ifobject->xskmap, ifobject->xsk->xsk, i); + if (ret) + return ret; + } + + return 0; +} + +void 
*worker_testapp_validate_tx(void *arg) +{ + struct test_spec *test = (struct test_spec *)arg; + struct ifobject *ifobject = test->ifobj_tx; + int err; + + if (test->current_step == 1) { + if (!ifobject->shared_umem) { + if (thread_common_ops(test, ifobject)) { + test->fail = true; + pthread_exit(NULL); + } + } else { + if (thread_common_ops_tx(test, ifobject)) { + test->fail = true; + pthread_exit(NULL); + } + } + } + + err = send_pkts(test, ifobject); + + if (!err && ifobject->validation_func) + err = ifobject->validation_func(ifobject); + if (err) + test->fail = true; + + pthread_exit(NULL); +} + +void *worker_testapp_validate_rx(void *arg) +{ + struct test_spec *test = (struct test_spec *)arg; + struct ifobject *ifobject = test->ifobj_rx; + int err; + + if (test->current_step == 1) { + err = thread_common_ops(test, ifobject); + } else { + xsk_clear_xskmap(ifobject->xskmap); + err = xsk_update_xskmap(ifobject->xskmap, ifobject->xsk->xsk, 0); + if (err) + ksft_print_msg("Error: Failed to update xskmap, error %s\n", + strerror(-err)); + } + + pthread_barrier_wait(&barr); + + /* We leave only now in case of error to avoid getting stuck in the barrier */ + if (err) { + test->fail = true; + pthread_exit(NULL); + } + + err = receive_pkts(test); + + if (!err && ifobject->validation_func) + err = ifobject->validation_func(ifobject); + + if (err) { + if (!test->adjust_tail) { + test->fail = true; + } else { + bool supported; + + if (is_adjust_tail_supported(ifobject->xdp_progs, &supported)) + test->fail = true; + else if (!supported) + test->adjust_tail_support = false; + else + test->fail = true; + } + } + + pthread_exit(NULL); +} + +static void testapp_clean_xsk_umem(struct ifobject *ifobj) +{ + u64 umem_sz = ifobj->umem->num_frames * ifobj->umem->frame_size; + + if (ifobj->shared_umem) + umem_sz *= 2; + + umem_sz = ceil_u64(umem_sz, HUGEPAGE_SIZE) * HUGEPAGE_SIZE; + xsk_umem__delete(ifobj->umem->umem); + munmap(ifobj->umem->buffer, umem_sz); +} + +static void 
handler(int signum) +{ + pthread_exit(NULL); +} + +static bool xdp_prog_changed_rx(struct test_spec *test) +{ + struct ifobject *ifobj = test->ifobj_rx; + + return ifobj->xdp_prog != test->xdp_prog_rx || ifobj->mode != test->mode; +} + +static bool xdp_prog_changed_tx(struct test_spec *test) +{ + struct ifobject *ifobj = test->ifobj_tx; + + return ifobj->xdp_prog != test->xdp_prog_tx || ifobj->mode != test->mode; +} + +static int xsk_reattach_xdp(struct ifobject *ifobj, struct bpf_program *xdp_prog, + struct bpf_map *xskmap, enum test_mode mode) +{ + int err; + + xsk_detach_xdp_program(ifobj->ifindex, mode_to_xdp_flags(ifobj->mode)); + err = xsk_attach_xdp_program(xdp_prog, ifobj->ifindex, mode_to_xdp_flags(mode)); + if (err) { + ksft_print_msg("Error attaching XDP program\n"); + return err; + } + + if (ifobj->mode != mode && (mode == TEST_MODE_DRV || mode == TEST_MODE_ZC)) + if (!xsk_is_in_mode(ifobj->ifindex, XDP_FLAGS_DRV_MODE)) { + ksft_print_msg("ERROR: XDP prog not in DRV mode\n"); + return -EINVAL; + } + + ifobj->xdp_prog = xdp_prog; + ifobj->xskmap = xskmap; + ifobj->mode = mode; + + return 0; +} + +static int xsk_attach_xdp_progs(struct test_spec *test, struct ifobject *ifobj_rx, + struct ifobject *ifobj_tx) +{ + int err = 0; + + if (xdp_prog_changed_rx(test)) { + err = xsk_reattach_xdp(ifobj_rx, test->xdp_prog_rx, test->xskmap_rx, test->mode); + if (err) + return err; + } + + if (!ifobj_tx || ifobj_tx->shared_umem) + return 0; + + if (xdp_prog_changed_tx(test)) + err = xsk_reattach_xdp(ifobj_tx, test->xdp_prog_tx, test->xskmap_tx, test->mode); + + return err; +} + +static void clean_sockets(struct test_spec *test, struct ifobject *ifobj) +{ + u32 i; + + if (!ifobj || !test) + return; + + for (i = 0; i < test->nb_sockets; i++) + xsk_socket__delete(ifobj->xsk_arr[i].xsk); +} + +static void clean_umem(struct test_spec *test, struct ifobject *ifobj1, struct ifobject *ifobj2) +{ + if (!ifobj1) + return; + + testapp_clean_xsk_umem(ifobj1); + if (ifobj2 && 
!ifobj2->shared_umem) + testapp_clean_xsk_umem(ifobj2); +} + +static int __testapp_validate_traffic(struct test_spec *test, struct ifobject *ifobj1, + struct ifobject *ifobj2) +{ + pthread_t t0, t1; + int err; + + if (test->mtu > MAX_ETH_PKT_SIZE) { + if (test->mode == TEST_MODE_ZC && (!ifobj1->multi_buff_zc_supp || + (ifobj2 && !ifobj2->multi_buff_zc_supp))) { + ksft_print_msg("Multi buffer for zero-copy not supported.\n"); + return TEST_SKIP; + } + if (test->mode != TEST_MODE_ZC && (!ifobj1->multi_buff_supp || + (ifobj2 && !ifobj2->multi_buff_supp))) { + ksft_print_msg("Multi buffer not supported.\n"); + return TEST_SKIP; + } + } + err = test_spec_set_mtu(test, test->mtu); + if (err) { + ksft_print_msg("Error, could not set mtu.\n"); + return TEST_FAILURE; + } + + if (ifobj2) { + if (pthread_barrier_init(&barr, NULL, 2)) + return TEST_FAILURE; + pkt_stream_reset(ifobj2->xsk->pkt_stream); + } + + test->current_step++; + pkt_stream_reset(ifobj1->xsk->pkt_stream); + pkts_in_flight = 0; + + signal(SIGUSR1, handler); + /*Spawn RX thread */ + pthread_create(&t0, NULL, ifobj1->func_ptr, test); + + if (ifobj2) { + pthread_barrier_wait(&barr); + if (pthread_barrier_destroy(&barr)) { + pthread_kill(t0, SIGUSR1); + clean_sockets(test, ifobj1); + clean_umem(test, ifobj1, NULL); + return TEST_FAILURE; + } + + /*Spawn TX thread */ + pthread_create(&t1, NULL, ifobj2->func_ptr, test); + + pthread_join(t1, NULL); + } + + if (!ifobj2) + pthread_kill(t0, SIGUSR1); + else + pthread_join(t0, NULL); + + if (test->total_steps == test->current_step || test->fail) { + clean_sockets(test, ifobj1); + clean_sockets(test, ifobj2); + clean_umem(test, ifobj1, ifobj2); + } + + if (test->fail) + return TEST_FAILURE; + + return TEST_PASS; +} + +static int testapp_validate_traffic(struct test_spec *test) +{ + struct ifobject *ifobj_rx = test->ifobj_rx; + struct ifobject *ifobj_tx = test->ifobj_tx; + + if ((ifobj_rx->umem->unaligned_mode && !ifobj_rx->unaligned_supp) || + 
(ifobj_tx->umem->unaligned_mode && !ifobj_tx->unaligned_supp)) { + ksft_print_msg("No huge pages present.\n"); + return TEST_SKIP; + } + + if (test->set_ring) { + if (ifobj_tx->hw_ring_size_supp) { + if (set_ring_size(ifobj_tx)) { + ksft_print_msg("Failed to change HW ring size.\n"); + return TEST_FAILURE; + } + } else { + ksft_print_msg("Changing HW ring size not supported.\n"); + return TEST_SKIP; + } + } + + if (xsk_attach_xdp_progs(test, ifobj_rx, ifobj_tx)) + return TEST_FAILURE; + return __testapp_validate_traffic(test, ifobj_rx, ifobj_tx); +} + +static int testapp_validate_traffic_single_thread(struct test_spec *test, struct ifobject *ifobj) +{ + return __testapp_validate_traffic(test, ifobj, NULL); +} + +int testapp_teardown(struct test_spec *test) +{ + int i; + + for (i = 0; i < MAX_TEARDOWN_ITER; i++) { + if (testapp_validate_traffic(test)) + return TEST_FAILURE; + test_spec_reset(test); + } + + return TEST_PASS; +} + +static void swap_directions(struct ifobject **ifobj1, struct ifobject **ifobj2) +{ + thread_func_t tmp_func_ptr = (*ifobj1)->func_ptr; + struct ifobject *tmp_ifobj = (*ifobj1); + + (*ifobj1)->func_ptr = (*ifobj2)->func_ptr; + (*ifobj2)->func_ptr = tmp_func_ptr; + + *ifobj1 = *ifobj2; + *ifobj2 = tmp_ifobj; +} + +int testapp_bidirectional(struct test_spec *test) +{ + int res; + + test->ifobj_tx->rx_on = true; + test->ifobj_rx->tx_on = true; + test->total_steps = 2; + if (testapp_validate_traffic(test)) + return TEST_FAILURE; + + print_verbose("Switching Tx/Rx direction\n"); + swap_directions(&test->ifobj_rx, &test->ifobj_tx); + res = __testapp_validate_traffic(test, test->ifobj_rx, test->ifobj_tx); + + swap_directions(&test->ifobj_rx, &test->ifobj_tx); + return res; +} + +static int swap_xsk_resources(struct test_spec *test) +{ + int ret; + + test->ifobj_tx->xsk_arr[0].pkt_stream = NULL; + test->ifobj_rx->xsk_arr[0].pkt_stream = NULL; + test->ifobj_tx->xsk_arr[1].pkt_stream = test->tx_pkt_stream_default; + 
test->ifobj_rx->xsk_arr[1].pkt_stream = test->rx_pkt_stream_default; + test->ifobj_tx->xsk = &test->ifobj_tx->xsk_arr[1]; + test->ifobj_rx->xsk = &test->ifobj_rx->xsk_arr[1]; + + ret = xsk_update_xskmap(test->ifobj_rx->xskmap, test->ifobj_rx->xsk->xsk, 0); + if (ret) + return TEST_FAILURE; + + return TEST_PASS; +} + +int testapp_xdp_prog_cleanup(struct test_spec *test) +{ + test->total_steps = 2; + test->nb_sockets = 2; + if (testapp_validate_traffic(test)) + return TEST_FAILURE; + + if (swap_xsk_resources(test)) { + clean_sockets(test, test->ifobj_rx); + clean_sockets(test, test->ifobj_tx); + clean_umem(test, test->ifobj_rx, test->ifobj_tx); + return TEST_FAILURE; + } + + return testapp_validate_traffic(test); +} + +int testapp_headroom(struct test_spec *test) +{ + test->ifobj_rx->umem->frame_headroom = UMEM_HEADROOM_TEST_SIZE; + return testapp_validate_traffic(test); +} + +int testapp_stats_rx_dropped(struct test_spec *test) +{ + if (test->mode == TEST_MODE_ZC) { + ksft_print_msg("Can not run RX_DROPPED test for ZC mode\n"); + return TEST_SKIP; + } + + if (pkt_stream_replace_half(test, MIN_PKT_SIZE * 4, 0)) + return TEST_FAILURE; + test->ifobj_rx->umem->frame_headroom = test->ifobj_rx->umem->frame_size - + XDP_PACKET_HEADROOM - MIN_PKT_SIZE * 3; + if (pkt_stream_receive_half(test)) + return TEST_FAILURE; + test->ifobj_rx->validation_func = validate_rx_dropped; + return testapp_validate_traffic(test); +} + +int testapp_stats_tx_invalid_descs(struct test_spec *test) +{ + if (pkt_stream_replace_half(test, XSK_UMEM__INVALID_FRAME_SIZE, 0)) + return TEST_FAILURE; + test->ifobj_tx->validation_func = validate_tx_invalid_descs; + return testapp_validate_traffic(test); +} + +int testapp_stats_rx_full(struct test_spec *test) +{ + if (pkt_stream_replace(test, DEFAULT_UMEM_BUFFERS + DEFAULT_UMEM_BUFFERS / 2, MIN_PKT_SIZE)) + return TEST_FAILURE; + test->ifobj_rx->xsk->pkt_stream = pkt_stream_generate(DEFAULT_UMEM_BUFFERS, MIN_PKT_SIZE); + + test->ifobj_rx->xsk->rxqsize = 
DEFAULT_UMEM_BUFFERS; + test->ifobj_rx->release_rx = false; + test->ifobj_rx->validation_func = validate_rx_full; + return testapp_validate_traffic(test); +} + +int testapp_stats_fill_empty(struct test_spec *test) +{ + if (pkt_stream_replace(test, DEFAULT_UMEM_BUFFERS + DEFAULT_UMEM_BUFFERS / 2, MIN_PKT_SIZE)) + return TEST_FAILURE; + test->ifobj_rx->xsk->pkt_stream = pkt_stream_generate(DEFAULT_UMEM_BUFFERS, MIN_PKT_SIZE); + + test->ifobj_rx->use_fill_ring = false; + test->ifobj_rx->validation_func = validate_fill_empty; + return testapp_validate_traffic(test); +} + +int testapp_send_receive_unaligned(struct test_spec *test) +{ + test->ifobj_tx->umem->unaligned_mode = true; + test->ifobj_rx->umem->unaligned_mode = true; + /* Let half of the packets straddle a 4K buffer boundary */ + if (pkt_stream_replace_half(test, MIN_PKT_SIZE, -MIN_PKT_SIZE / 2)) + return TEST_FAILURE; + + return testapp_validate_traffic(test); +} + +int testapp_send_receive_unaligned_mb(struct test_spec *test) +{ + test->mtu = MAX_ETH_JUMBO_SIZE; + test->ifobj_tx->umem->unaligned_mode = true; + test->ifobj_rx->umem->unaligned_mode = true; + if (pkt_stream_replace(test, DEFAULT_PKT_CNT, MAX_ETH_JUMBO_SIZE)) + return TEST_FAILURE; + return testapp_validate_traffic(test); +} + +int testapp_single_pkt(struct test_spec *test) +{ + struct pkt pkts[] = {{0, MIN_PKT_SIZE, 0, true}}; + + if (pkt_stream_generate_custom(test, pkts, ARRAY_SIZE(pkts))) + return TEST_FAILURE; + return testapp_validate_traffic(test); +} + +int testapp_send_receive_mb(struct test_spec *test) +{ + test->mtu = MAX_ETH_JUMBO_SIZE; + if (pkt_stream_replace(test, DEFAULT_PKT_CNT, MAX_ETH_JUMBO_SIZE)) + return TEST_FAILURE; + + return testapp_validate_traffic(test); +} + +int testapp_invalid_desc_mb(struct test_spec *test) +{ + struct xsk_umem_info *umem = test->ifobj_tx->umem; + u64 umem_size = umem->num_frames * umem->frame_size; + struct pkt pkts[] = { + /* Valid packet for synch to start with */ + {0, MIN_PKT_SIZE, 0, true, 0}, 
+ /* Zero frame len is not legal */ + {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, + {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, + {0, 0, 0, false, 0}, + /* Invalid address in the second frame */ + {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, + {umem_size, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, + /* Invalid len in the middle */ + {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, + {0, XSK_UMEM__INVALID_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, + /* Invalid options in the middle */ + {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, + {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XSK_DESC__INVALID_OPTION}, + /* Transmit 2 frags, receive 3 */ + {0, XSK_UMEM__MAX_FRAME_SIZE, 0, true, XDP_PKT_CONTD}, + {0, XSK_UMEM__MAX_FRAME_SIZE, 0, true, 0}, + /* Middle frame crosses chunk boundary with small length */ + {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, + {-MIN_PKT_SIZE / 2, MIN_PKT_SIZE, 0, false, 0}, + /* Valid packet for synch so that something is received */ + {0, MIN_PKT_SIZE, 0, true, 0}}; + + if (umem->unaligned_mode) { + /* Crossing a chunk boundary allowed */ + pkts[12].valid = true; + pkts[13].valid = true; + } + + test->mtu = MAX_ETH_JUMBO_SIZE; + if (pkt_stream_generate_custom(test, pkts, ARRAY_SIZE(pkts))) + return TEST_FAILURE; + return testapp_validate_traffic(test); +} + +int testapp_invalid_desc(struct test_spec *test) +{ + struct xsk_umem_info *umem = test->ifobj_tx->umem; + u64 umem_size = umem->num_frames * umem->frame_size; + struct pkt pkts[] = { + /* Zero packet address allowed */ + {0, MIN_PKT_SIZE, 0, true}, + /* Allowed packet */ + {0, MIN_PKT_SIZE, 0, true}, + /* Straddling the start of umem */ + {-2, MIN_PKT_SIZE, 0, false}, + /* Packet too large */ + {0, XSK_UMEM__INVALID_FRAME_SIZE, 0, false}, + /* Up to end of umem allowed */ + {umem_size - MIN_PKT_SIZE - 2 * umem->frame_size, MIN_PKT_SIZE, 0, true}, + /* After umem ends */ + {umem_size, MIN_PKT_SIZE, 0, false}, + 
/* Straddle the end of umem */ + {umem_size - MIN_PKT_SIZE / 2, MIN_PKT_SIZE, 0, false}, + /* Straddle a 4K boundary */ + {0x1000 - MIN_PKT_SIZE / 2, MIN_PKT_SIZE, 0, false}, + /* Straddle a 2K boundary */ + {0x800 - MIN_PKT_SIZE / 2, MIN_PKT_SIZE, 0, true}, + /* Valid packet for synch so that something is received */ + {0, MIN_PKT_SIZE, 0, true}}; + + if (umem->unaligned_mode) { + /* Crossing a page boundary allowed */ + pkts[7].valid = true; + } + if (umem->frame_size == XSK_UMEM__DEFAULT_FRAME_SIZE / 2) { + /* Crossing a 2K frame size boundary not allowed */ + pkts[8].valid = false; + } + + if (test->ifobj_tx->shared_umem) { + pkts[4].offset += umem_size; + pkts[5].offset += umem_size; + pkts[6].offset += umem_size; + } + + if (pkt_stream_generate_custom(test, pkts, ARRAY_SIZE(pkts))) + return TEST_FAILURE; + return testapp_validate_traffic(test); +} + +int testapp_xdp_drop(struct test_spec *test) +{ + struct xsk_xdp_progs *skel_rx = test->ifobj_rx->xdp_progs; + struct xsk_xdp_progs *skel_tx = test->ifobj_tx->xdp_progs; + + test_spec_set_xdp_prog(test, skel_rx->progs.xsk_xdp_drop, skel_tx->progs.xsk_xdp_drop, + skel_rx->maps.xsk, skel_tx->maps.xsk); + + if (pkt_stream_receive_half(test)) + return TEST_FAILURE; + return testapp_validate_traffic(test); +} + +int testapp_xdp_metadata_copy(struct test_spec *test) +{ + struct xsk_xdp_progs *skel_rx = test->ifobj_rx->xdp_progs; + struct xsk_xdp_progs *skel_tx = test->ifobj_tx->xdp_progs; + + test_spec_set_xdp_prog(test, skel_rx->progs.xsk_xdp_populate_metadata, + skel_tx->progs.xsk_xdp_populate_metadata, + skel_rx->maps.xsk, skel_tx->maps.xsk); + test->ifobj_rx->use_metadata = true; + + skel_rx->bss->count = 0; + + return testapp_validate_traffic(test); +} + +int testapp_xdp_shared_umem(struct test_spec *test) +{ + struct xsk_xdp_progs *skel_rx = test->ifobj_rx->xdp_progs; + struct xsk_xdp_progs *skel_tx = test->ifobj_tx->xdp_progs; + int ret; + + test->total_steps = 1; + test->nb_sockets = 2; + + 
test_spec_set_xdp_prog(test, skel_rx->progs.xsk_xdp_shared_umem, + skel_tx->progs.xsk_xdp_shared_umem, + skel_rx->maps.xsk, skel_tx->maps.xsk); + + if (pkt_stream_even_odd_sequence(test)) + return TEST_FAILURE; + + ret = testapp_validate_traffic(test); + + release_even_odd_sequence(test); + + return ret; +} + +int testapp_poll_txq_tmout(struct test_spec *test) +{ + test->ifobj_tx->use_poll = true; + /* create invalid frame by set umem frame_size and pkt length equal to 2048 */ + test->ifobj_tx->umem->frame_size = 2048; + if (pkt_stream_replace(test, 2 * DEFAULT_PKT_CNT, 2048)) + return TEST_FAILURE; + return testapp_validate_traffic_single_thread(test, test->ifobj_tx); +} + +int testapp_poll_rxq_tmout(struct test_spec *test) +{ + test->ifobj_rx->use_poll = true; + return testapp_validate_traffic_single_thread(test, test->ifobj_rx); +} + +int testapp_too_many_frags(struct test_spec *test) +{ + struct pkt *pkts; + u32 max_frags, i; + int ret = TEST_FAILURE; + + if (test->mode == TEST_MODE_ZC) { + max_frags = test->ifobj_tx->xdp_zc_max_segs; + } else { + max_frags = get_max_skb_frags(); + if (!max_frags) { + ksft_print_msg("Can't get MAX_SKB_FRAGS from system, using default (17)\n"); + max_frags = 17; + } + max_frags += 1; + } + + pkts = calloc(2 * max_frags + 2, sizeof(struct pkt)); + if (!pkts) + return TEST_FAILURE; + + test->mtu = MAX_ETH_JUMBO_SIZE; + + /* Valid packet for synch */ + pkts[0].len = MIN_PKT_SIZE; + pkts[0].valid = true; + + /* One valid packet with the max amount of frags */ + for (i = 1; i < max_frags + 1; i++) { + pkts[i].len = MIN_PKT_SIZE; + pkts[i].options = XDP_PKT_CONTD; + pkts[i].valid = true; + } + pkts[max_frags].options = 0; + + /* An invalid packet with the max amount of frags but signals packet + * continues on the last frag + */ + for (i = max_frags + 1; i < 2 * max_frags + 1; i++) { + pkts[i].len = MIN_PKT_SIZE; + pkts[i].options = XDP_PKT_CONTD; + pkts[i].valid = false; + } + + /* Valid packet for synch */ + pkts[2 * max_frags + 
1].len = MIN_PKT_SIZE; + pkts[2 * max_frags + 1].valid = true; + + if (pkt_stream_generate_custom(test, pkts, 2 * max_frags + 2)) { + free(pkts); + return TEST_FAILURE; + } + + ret = testapp_validate_traffic(test); + free(pkts); + return ret; +} + +static int xsk_load_xdp_programs(struct ifobject *ifobj) +{ + ifobj->xdp_progs = xsk_xdp_progs__open_and_load(); + if (libbpf_get_error(ifobj->xdp_progs)) + return libbpf_get_error(ifobj->xdp_progs); + + return 0; +} + +/* Simple test */ +static bool hugepages_present(void) +{ + size_t mmap_sz = 2 * DEFAULT_UMEM_BUFFERS * XSK_UMEM__DEFAULT_FRAME_SIZE; + void *bufs; + + bufs = mmap(NULL, mmap_sz, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, MAP_HUGE_2MB); + if (bufs == MAP_FAILED) + return false; + + mmap_sz = ceil_u64(mmap_sz, HUGEPAGE_SIZE) * HUGEPAGE_SIZE; + munmap(bufs, mmap_sz); + return true; +} + +int init_iface(struct ifobject *ifobj, thread_func_t func_ptr) +{ + LIBBPF_OPTS(bpf_xdp_query_opts, query_opts); + int err; + + ifobj->func_ptr = func_ptr; + + err = xsk_load_xdp_programs(ifobj); + if (err) { + ksft_print_msg("Error loading XDP program\n"); + return err; + } + + if (hugepages_present()) + ifobj->unaligned_supp = true; + + err = bpf_xdp_query(ifobj->ifindex, XDP_FLAGS_DRV_MODE, &query_opts); + if (err) { + ksft_print_msg("Error querying XDP capabilities\n"); + return err; + } + if (query_opts.feature_flags & NETDEV_XDP_ACT_RX_SG) + ifobj->multi_buff_supp = true; + if (query_opts.feature_flags & NETDEV_XDP_ACT_XSK_ZEROCOPY) { + if (query_opts.xdp_zc_max_segs > 1) { + ifobj->multi_buff_zc_supp = true; + ifobj->xdp_zc_max_segs = query_opts.xdp_zc_max_segs; + } else { + ifobj->xdp_zc_max_segs = 0; + } + } + + return 0; +} + +int testapp_send_receive(struct test_spec *test) +{ + return testapp_validate_traffic(test); +} + +int testapp_send_receive_2k_frame(struct test_spec *test) +{ + test->ifobj_tx->umem->frame_size = 2048; + test->ifobj_rx->umem->frame_size = 2048; + if 
(pkt_stream_replace(test, DEFAULT_PKT_CNT, MIN_PKT_SIZE)) + return TEST_FAILURE; + return testapp_validate_traffic(test); +} + +int testapp_poll_rx(struct test_spec *test) +{ + test->ifobj_rx->use_poll = true; + return testapp_validate_traffic(test); +} + +int testapp_poll_tx(struct test_spec *test) +{ + test->ifobj_tx->use_poll = true; + return testapp_validate_traffic(test); +} + +int testapp_aligned_inv_desc(struct test_spec *test) +{ + return testapp_invalid_desc(test); +} + +int testapp_aligned_inv_desc_2k_frame(struct test_spec *test) +{ + test->ifobj_tx->umem->frame_size = 2048; + test->ifobj_rx->umem->frame_size = 2048; + return testapp_invalid_desc(test); +} + +int testapp_unaligned_inv_desc(struct test_spec *test) +{ + test->ifobj_tx->umem->unaligned_mode = true; + test->ifobj_rx->umem->unaligned_mode = true; + return testapp_invalid_desc(test); +} + +int testapp_unaligned_inv_desc_4001_frame(struct test_spec *test) +{ + u64 page_size, umem_size; + + /* Odd frame size so the UMEM doesn't end near a page boundary. */ + test->ifobj_tx->umem->frame_size = 4001; + test->ifobj_rx->umem->frame_size = 4001; + test->ifobj_tx->umem->unaligned_mode = true; + test->ifobj_rx->umem->unaligned_mode = true; + /* This test exists to test descriptors that staddle the end of + * the UMEM but not a page. 
+ */ + page_size = sysconf(_SC_PAGESIZE); + umem_size = test->ifobj_tx->umem->num_frames * test->ifobj_tx->umem->frame_size; + assert(umem_size % page_size > MIN_PKT_SIZE); + assert(umem_size % page_size < page_size - MIN_PKT_SIZE); + + return testapp_invalid_desc(test); +} + +int testapp_aligned_inv_desc_mb(struct test_spec *test) +{ + return testapp_invalid_desc_mb(test); +} + +int testapp_unaligned_inv_desc_mb(struct test_spec *test) +{ + test->ifobj_tx->umem->unaligned_mode = true; + test->ifobj_rx->umem->unaligned_mode = true; + return testapp_invalid_desc_mb(test); +} + +int testapp_xdp_metadata(struct test_spec *test) +{ + return testapp_xdp_metadata_copy(test); +} + +int testapp_xdp_metadata_mb(struct test_spec *test) +{ + test->mtu = MAX_ETH_JUMBO_SIZE; + return testapp_xdp_metadata_copy(test); +} + +int testapp_hw_sw_min_ring_size(struct test_spec *test) +{ + int ret; + + test->set_ring = true; + test->total_steps = 2; + test->ifobj_tx->ring.tx_pending = DEFAULT_BATCH_SIZE; + test->ifobj_tx->ring.rx_pending = DEFAULT_BATCH_SIZE * 2; + test->ifobj_tx->xsk->batch_size = 1; + test->ifobj_rx->xsk->batch_size = 1; + ret = testapp_validate_traffic(test); + if (ret) + return ret; + + /* Set batch size to hw_ring_size - 1 */ + test->ifobj_tx->xsk->batch_size = DEFAULT_BATCH_SIZE - 1; + test->ifobj_rx->xsk->batch_size = DEFAULT_BATCH_SIZE - 1; + return testapp_validate_traffic(test); +} + +int testapp_hw_sw_max_ring_size(struct test_spec *test) +{ + u32 max_descs = XSK_RING_PROD__DEFAULT_NUM_DESCS * 4; + int ret; + + test->set_ring = true; + test->total_steps = 2; + test->ifobj_tx->ring.tx_pending = test->ifobj_tx->ring.tx_max_pending; + test->ifobj_tx->ring.rx_pending = test->ifobj_tx->ring.rx_max_pending; + test->ifobj_rx->umem->num_frames = max_descs; + test->ifobj_rx->umem->fill_size = max_descs; + test->ifobj_rx->umem->comp_size = max_descs; + test->ifobj_tx->xsk->batch_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; + test->ifobj_rx->xsk->batch_size = 
XSK_RING_PROD__DEFAULT_NUM_DESCS; + + ret = testapp_validate_traffic(test); + if (ret) + return ret; + + /* Set batch_size to 8152 for testing, as the ice HW ignores the 3 lowest bits when + * updating the Rx HW tail register. + */ + test->ifobj_tx->xsk->batch_size = test->ifobj_tx->ring.tx_max_pending - 8; + test->ifobj_rx->xsk->batch_size = test->ifobj_tx->ring.tx_max_pending - 8; + if (pkt_stream_replace(test, max_descs, MIN_PKT_SIZE)) { + clean_sockets(test, test->ifobj_tx); + clean_sockets(test, test->ifobj_rx); + clean_umem(test, test->ifobj_rx, test->ifobj_tx); + return TEST_FAILURE; + } + + return testapp_validate_traffic(test); +} + +static int testapp_xdp_adjust_tail(struct test_spec *test, int adjust_value) +{ + struct xsk_xdp_progs *skel_rx = test->ifobj_rx->xdp_progs; + struct xsk_xdp_progs *skel_tx = test->ifobj_tx->xdp_progs; + + test_spec_set_xdp_prog(test, skel_rx->progs.xsk_xdp_adjust_tail, + skel_tx->progs.xsk_xdp_adjust_tail, + skel_rx->maps.xsk, skel_tx->maps.xsk); + + skel_rx->bss->adjust_value = adjust_value; + + return testapp_validate_traffic(test); +} + +static int testapp_adjust_tail(struct test_spec *test, u32 value, u32 pkt_len) +{ + int ret; + + test->adjust_tail_support = true; + test->adjust_tail = true; + test->total_steps = 1; + + ret = pkt_stream_replace_ifobject(test->ifobj_tx, DEFAULT_BATCH_SIZE, pkt_len); + if (ret) + return TEST_FAILURE; + + ret = pkt_stream_replace_ifobject(test->ifobj_rx, DEFAULT_BATCH_SIZE, pkt_len + value); + if (ret) + return TEST_FAILURE; + + ret = testapp_xdp_adjust_tail(test, value); + if (ret) + return ret; + + if (!test->adjust_tail_support) { + ksft_print_msg("%s %sResize pkt with bpf_xdp_adjust_tail() not supported\n", + mode_string(test), busy_poll_string(test)); + return TEST_SKIP; + } + + return 0; +} + +int testapp_adjust_tail_shrink(struct test_spec *test) +{ + /* Shrink by 4 bytes for testing purpose */ + return testapp_adjust_tail(test, -4, MIN_PKT_SIZE * 2); +} + +int 
testapp_adjust_tail_shrink_mb(struct test_spec *test) +{ + test->mtu = MAX_ETH_JUMBO_SIZE; + /* Shrink by the frag size */ + return testapp_adjust_tail(test, -XSK_UMEM__MAX_FRAME_SIZE, XSK_UMEM__LARGE_FRAME_SIZE * 2); +} + +int testapp_adjust_tail_grow(struct test_spec *test) +{ + /* Grow by 4 bytes for testing purpose */ + return testapp_adjust_tail(test, 4, MIN_PKT_SIZE * 2); +} + +int testapp_adjust_tail_grow_mb(struct test_spec *test) +{ + test->mtu = MAX_ETH_JUMBO_SIZE; + /* Grow by (frag_size - last_frag_Size) - 1 to stay inside the last fragment */ + return testapp_adjust_tail(test, (XSK_UMEM__MAX_FRAME_SIZE / 2) - 1, + XSK_UMEM__LARGE_FRAME_SIZE * 2); +} + +int testapp_tx_queue_consumer(struct test_spec *test) +{ + int nr_packets; + + if (test->mode == TEST_MODE_ZC) { + ksft_print_msg("Can not run TX_QUEUE_CONSUMER test for ZC mode\n"); + return TEST_SKIP; + } + + nr_packets = MAX_TX_BUDGET_DEFAULT + 1; + if (pkt_stream_replace(test, nr_packets, MIN_PKT_SIZE)) + return TEST_FAILURE; + test->ifobj_tx->xsk->batch_size = nr_packets; + test->ifobj_tx->xsk->check_consumer = true; + + return testapp_validate_traffic(test); +} + +struct ifobject *ifobject_create(void) +{ + struct ifobject *ifobj; + + ifobj = calloc(1, sizeof(struct ifobject)); + if (!ifobj) + return NULL; + + ifobj->xsk_arr = calloc(MAX_SOCKETS, sizeof(*ifobj->xsk_arr)); + if (!ifobj->xsk_arr) + goto out_xsk_arr; + + ifobj->umem = calloc(1, sizeof(*ifobj->umem)); + if (!ifobj->umem) + goto out_umem; + + return ifobj; + +out_umem: + free(ifobj->xsk_arr); +out_xsk_arr: + free(ifobj); + return NULL; +} + +void ifobject_delete(struct ifobject *ifobj) +{ + free(ifobj->umem); + free(ifobj->xsk_arr); + free(ifobj); +} diff --git a/tools/testing/selftests/bpf/prog_tests/test_xsk.h b/tools/testing/selftests/bpf/prog_tests/test_xsk.h new file mode 100644 index 000000000000..8fc78a057de0 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_xsk.h @@ -0,0 +1,298 @@ +/* SPDX-License-Identifier: 
GPL-2.0 */ +#ifndef TEST_XSK_H_ +#define TEST_XSK_H_ + +#include <linux/ethtool.h> +#include <linux/if_xdp.h> + +#include "../kselftest.h" +#include "xsk.h" + +#ifndef SO_PREFER_BUSY_POLL +#define SO_PREFER_BUSY_POLL 69 +#endif + +#ifndef SO_BUSY_POLL_BUDGET +#define SO_BUSY_POLL_BUDGET 70 +#endif + +#define TEST_PASS 0 +#define TEST_FAILURE -1 +#define TEST_CONTINUE 1 +#define TEST_SKIP 2 + +#define DEFAULT_PKT_CNT (4 * 1024) +#define DEFAULT_UMEM_BUFFERS (DEFAULT_PKT_CNT / 4) +#define HUGEPAGE_SIZE (2 * 1024 * 1024) +#define MIN_PKT_SIZE 64 +#define MAX_ETH_PKT_SIZE 1518 +#define MAX_INTERFACE_NAME_CHARS 16 +#define MAX_TEST_NAME_SIZE 48 +#define SOCK_RECONF_CTR 10 +#define USLEEP_MAX 10000 + +extern bool opt_verbose; +#define print_verbose(x...) do { if (opt_verbose) ksft_print_msg(x); } while (0) + + +static inline u32 ceil_u32(u32 a, u32 b) +{ + return (a + b - 1) / b; +} + +static inline u64 ceil_u64(u64 a, u64 b) +{ + return (a + b - 1) / b; +} + +/* Simple test */ +enum test_mode { + TEST_MODE_SKB, + TEST_MODE_DRV, + TEST_MODE_ZC, + TEST_MODE_ALL +}; + +struct ifobject; +struct test_spec; +typedef int (*validation_func_t)(struct ifobject *ifobj); +typedef void *(*thread_func_t)(void *arg); +typedef int (*test_func_t)(struct test_spec *test); + +struct xsk_socket_info { + struct xsk_ring_cons rx; + struct xsk_ring_prod tx; + struct xsk_umem_info *umem; + struct xsk_socket *xsk; + struct pkt_stream *pkt_stream; + u32 outstanding_tx; + u32 rxqsize; + u32 batch_size; + u8 dst_mac[ETH_ALEN]; + u8 src_mac[ETH_ALEN]; + bool check_consumer; +}; + +int kick_rx(struct xsk_socket_info *xsk); +int kick_tx(struct xsk_socket_info *xsk); + +struct xsk_umem_info { + struct xsk_ring_prod fq; + struct xsk_ring_cons cq; + struct xsk_umem *umem; + u64 next_buffer; + u32 num_frames; + u32 frame_headroom; + void *buffer; + u32 frame_size; + u32 base_addr; + u32 fill_size; + u32 comp_size; + bool unaligned_mode; +}; + +struct set_hw_ring { + u32 default_tx; + u32 default_rx; +}; 
+ +int hw_ring_size_reset(struct ifobject *ifobj); + +struct ifobject { + char ifname[MAX_INTERFACE_NAME_CHARS]; + struct xsk_socket_info *xsk; + struct xsk_socket_info *xsk_arr; + struct xsk_umem_info *umem; + thread_func_t func_ptr; + validation_func_t validation_func; + struct xsk_xdp_progs *xdp_progs; + struct bpf_map *xskmap; + struct bpf_program *xdp_prog; + struct ethtool_ringparam ring; + struct set_hw_ring set_ring; + enum test_mode mode; + int ifindex; + int mtu; + u32 bind_flags; + u32 xdp_zc_max_segs; + bool tx_on; + bool rx_on; + bool use_poll; + bool busy_poll; + bool use_fill_ring; + bool release_rx; + bool shared_umem; + bool use_metadata; + bool unaligned_supp; + bool multi_buff_supp; + bool multi_buff_zc_supp; + bool hw_ring_size_supp; +}; +struct ifobject *ifobject_create(void); +void ifobject_delete(struct ifobject *ifobj); +int init_iface(struct ifobject *ifobj, thread_func_t func_ptr); + +int xsk_configure_umem(struct ifobject *ifobj, struct xsk_umem_info *umem, void *buffer, u64 size); +int xsk_configure_socket(struct xsk_socket_info *xsk, struct xsk_umem_info *umem, + struct ifobject *ifobject, bool shared); + + +struct pkt { + int offset; + u32 len; + u32 pkt_nb; + bool valid; + u16 options; +}; + +struct pkt_stream { + u32 nb_pkts; + u32 current_pkt_nb; + struct pkt *pkts; + u32 max_pkt_len; + u32 nb_rx_pkts; + u32 nb_valid_entries; + bool verbatim; +}; + +static inline bool pkt_continues(u32 options) +{ + return options & XDP_PKT_CONTD; +} + +struct pkt_stream *pkt_stream_generate(u32 nb_pkts, u32 pkt_len); +void pkt_stream_delete(struct pkt_stream *pkt_stream); +void pkt_stream_reset(struct pkt_stream *pkt_stream); +void pkt_stream_restore_default(struct test_spec *test); + +struct test_spec { + struct ifobject *ifobj_tx; + struct ifobject *ifobj_rx; + struct pkt_stream *tx_pkt_stream_default; + struct pkt_stream *rx_pkt_stream_default; + struct bpf_program *xdp_prog_rx; + struct bpf_program *xdp_prog_tx; + struct bpf_map *xskmap_rx; + 
struct bpf_map *xskmap_tx; + test_func_t test_func; + int mtu; + u16 total_steps; + u16 current_step; + u16 nb_sockets; + bool fail; + bool set_ring; + bool adjust_tail; + bool adjust_tail_support; + enum test_mode mode; + char name[MAX_TEST_NAME_SIZE]; +}; + +#define busy_poll_string(test) (test)->ifobj_tx->busy_poll ? "BUSY-POLL " : "" +static inline char *mode_string(struct test_spec *test) +{ + switch (test->mode) { + case TEST_MODE_SKB: + return "SKB"; + case TEST_MODE_DRV: + return "DRV"; + case TEST_MODE_ZC: + return "ZC"; + default: + return "BOGUS"; + } +} + +void test_init(struct test_spec *test, struct ifobject *ifobj_tx, + struct ifobject *ifobj_rx, enum test_mode mode, + const struct test_spec *test_to_run); + +int testapp_adjust_tail_grow(struct test_spec *test); +int testapp_adjust_tail_grow_mb(struct test_spec *test); +int testapp_adjust_tail_shrink(struct test_spec *test); +int testapp_adjust_tail_shrink_mb(struct test_spec *test); +int testapp_aligned_inv_desc(struct test_spec *test); +int testapp_aligned_inv_desc_2k_frame(struct test_spec *test); +int testapp_aligned_inv_desc_mb(struct test_spec *test); +int testapp_bidirectional(struct test_spec *test); +int testapp_headroom(struct test_spec *test); +int testapp_hw_sw_max_ring_size(struct test_spec *test); +int testapp_hw_sw_min_ring_size(struct test_spec *test); +int testapp_poll_rx(struct test_spec *test); +int testapp_poll_rxq_tmout(struct test_spec *test); +int testapp_poll_tx(struct test_spec *test); +int testapp_poll_txq_tmout(struct test_spec *test); +int testapp_send_receive(struct test_spec *test); +int testapp_send_receive_2k_frame(struct test_spec *test); +int testapp_send_receive_mb(struct test_spec *test); +int testapp_send_receive_unaligned(struct test_spec *test); +int testapp_send_receive_unaligned_mb(struct test_spec *test); +int testapp_single_pkt(struct test_spec *test); +int testapp_stats_fill_empty(struct test_spec *test); +int testapp_stats_rx_dropped(struct test_spec 
*test); +int testapp_stats_tx_invalid_descs(struct test_spec *test); +int testapp_stats_rx_full(struct test_spec *test); +int testapp_teardown(struct test_spec *test); +int testapp_too_many_frags(struct test_spec *test); +int testapp_tx_queue_consumer(struct test_spec *test); +int testapp_unaligned_inv_desc(struct test_spec *test); +int testapp_unaligned_inv_desc_4001_frame(struct test_spec *test); +int testapp_unaligned_inv_desc_mb(struct test_spec *test); +int testapp_xdp_drop(struct test_spec *test); +int testapp_xdp_metadata(struct test_spec *test); +int testapp_xdp_metadata_mb(struct test_spec *test); +int testapp_xdp_prog_cleanup(struct test_spec *test); +int testapp_xdp_shared_umem(struct test_spec *test); + +void *worker_testapp_validate_rx(void *arg); +void *worker_testapp_validate_tx(void *arg); + +static const struct test_spec tests[] = { + {.name = "SEND_RECEIVE", .test_func = testapp_send_receive}, + {.name = "SEND_RECEIVE_2K_FRAME", .test_func = testapp_send_receive_2k_frame}, + {.name = "SEND_RECEIVE_SINGLE_PKT", .test_func = testapp_single_pkt}, + {.name = "POLL_RX", .test_func = testapp_poll_rx}, + {.name = "POLL_TX", .test_func = testapp_poll_tx}, + {.name = "POLL_RXQ_FULL", .test_func = testapp_poll_rxq_tmout}, + {.name = "POLL_TXQ_FULL", .test_func = testapp_poll_txq_tmout}, + {.name = "ALIGNED_INV_DESC", .test_func = testapp_aligned_inv_desc}, + {.name = "ALIGNED_INV_DESC_2K_FRAME_SIZE", .test_func = testapp_aligned_inv_desc_2k_frame}, + {.name = "UMEM_HEADROOM", .test_func = testapp_headroom}, + {.name = "BIDIRECTIONAL", .test_func = testapp_bidirectional}, + {.name = "STAT_RX_DROPPED", .test_func = testapp_stats_rx_dropped}, + {.name = "STAT_TX_INVALID", .test_func = testapp_stats_tx_invalid_descs}, + {.name = "STAT_RX_FULL", .test_func = testapp_stats_rx_full}, + {.name = "STAT_FILL_EMPTY", .test_func = testapp_stats_fill_empty}, + {.name = "XDP_PROG_CLEANUP", .test_func = testapp_xdp_prog_cleanup}, + {.name = "XDP_DROP_HALF", .test_func = 
testapp_xdp_drop}, + {.name = "XDP_SHARED_UMEM", .test_func = testapp_xdp_shared_umem}, + {.name = "XDP_METADATA_COPY", .test_func = testapp_xdp_metadata}, + {.name = "XDP_METADATA_COPY_MULTI_BUFF", .test_func = testapp_xdp_metadata_mb}, + {.name = "ALIGNED_INV_DESC_MULTI_BUFF", .test_func = testapp_aligned_inv_desc_mb}, + {.name = "TOO_MANY_FRAGS", .test_func = testapp_too_many_frags}, + {.name = "XDP_ADJUST_TAIL_SHRINK", .test_func = testapp_adjust_tail_shrink}, + {.name = "TX_QUEUE_CONSUMER", .test_func = testapp_tx_queue_consumer}, + }; + +static const struct test_spec ci_skip_tests[] = { + /* Flaky tests */ + {.name = "XDP_ADJUST_TAIL_SHRINK_MULTI_BUFF", .test_func = testapp_adjust_tail_shrink_mb}, + {.name = "XDP_ADJUST_TAIL_GROW", .test_func = testapp_adjust_tail_grow}, + {.name = "XDP_ADJUST_TAIL_GROW_MULTI_BUFF", .test_func = testapp_adjust_tail_grow_mb}, + {.name = "SEND_RECEIVE_9K_PACKETS", .test_func = testapp_send_receive_mb}, + /* Tests with huge page dependency */ + {.name = "SEND_RECEIVE_UNALIGNED", .test_func = testapp_send_receive_unaligned}, + {.name = "UNALIGNED_INV_DESC", .test_func = testapp_unaligned_inv_desc}, + {.name = "UNALIGNED_INV_DESC_4001_FRAME_SIZE", + .test_func = testapp_unaligned_inv_desc_4001_frame}, + {.name = "SEND_RECEIVE_UNALIGNED_9K_PACKETS", + .test_func = testapp_send_receive_unaligned_mb}, + {.name = "UNALIGNED_INV_DESC_MULTI_BUFF", .test_func = testapp_unaligned_inv_desc_mb}, + /* Test with HW ring size dependency */ + {.name = "HW_SW_MIN_RING_SIZE", .test_func = testapp_hw_sw_min_ring_size}, + {.name = "HW_SW_MAX_RING_SIZE", .test_func = testapp_hw_sw_max_ring_size}, + /* Too long test */ + {.name = "TEARDOWN", .test_func = testapp_teardown}, +}; + + +#endif /* TEST_XSK_H_ */ diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 28e81161e6fc..4b4b081b46cc 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ 
b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -7,6 +7,7 @@ #include "verifier_arena.skel.h" #include "verifier_arena_large.skel.h" #include "verifier_array_access.skel.h" +#include "verifier_async_cb_context.skel.h" #include "verifier_basic_stack.skel.h" #include "verifier_bitfield_write.skel.h" #include "verifier_bounds.skel.h" @@ -34,6 +35,7 @@ #include "verifier_global_subprogs.skel.h" #include "verifier_global_ptr_args.skel.h" #include "verifier_gotol.skel.h" +#include "verifier_gotox.skel.h" #include "verifier_helper_access_var_len.skel.h" #include "verifier_helper_packet_access.skel.h" #include "verifier_helper_restricted.skel.h" @@ -172,6 +174,7 @@ void test_verifier_div_overflow(void) { RUN(verifier_div_overflow); } void test_verifier_global_subprogs(void) { RUN(verifier_global_subprogs); } void test_verifier_global_ptr_args(void) { RUN(verifier_global_ptr_args); } void test_verifier_gotol(void) { RUN(verifier_gotol); } +void test_verifier_gotox(void) { RUN(verifier_gotox); } void test_verifier_helper_access_var_len(void) { RUN(verifier_helper_access_var_len); } void test_verifier_helper_packet_access(void) { RUN(verifier_helper_packet_access); } void test_verifier_helper_restricted(void) { RUN(verifier_helper_restricted); } @@ -280,6 +283,7 @@ void test_verifier_array_access(void) verifier_array_access__elf_bytes, init_array_access_maps); } +void test_verifier_async_cb_context(void) { RUN(verifier_async_cb_context); } static int init_value_ptr_arith_maps(struct bpf_object *obj) { diff --git a/tools/testing/selftests/bpf/prog_tests/wq.c b/tools/testing/selftests/bpf/prog_tests/wq.c index 99e438fe12ac..15c67d23128b 100644 --- a/tools/testing/selftests/bpf/prog_tests/wq.c +++ b/tools/testing/selftests/bpf/prog_tests/wq.c @@ -38,3 +38,59 @@ void serial_test_failures_wq(void) { RUN_TESTS(wq_failures); } + +static void test_failure_map_no_btf(void) +{ + struct wq *skel = NULL; + char log[8192]; + const struct bpf_insn *insns; + size_t insn_cnt; + int 
ret, err, map_fd; + LIBBPF_OPTS(bpf_prog_load_opts, opts, .log_size = sizeof(log), .log_buf = log, + .log_level = 2); + + skel = wq__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + err = bpf_object__prepare(skel->obj); + if (!ASSERT_OK(err, "skel__prepare")) + goto out; + + map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "map_no_btf", sizeof(__u32), sizeof(__u64), 100, + NULL); + if (!ASSERT_GT(map_fd, -1, "map create")) + goto out; + + err = bpf_map__reuse_fd(skel->maps.array, map_fd); + if (!ASSERT_OK(err, "map reuse fd")) { + close(map_fd); + goto out; + } + + insns = bpf_program__insns(skel->progs.test_map_no_btf); + if (!ASSERT_OK_PTR(insns, "insns ptr")) + goto out; + + insn_cnt = bpf_program__insn_cnt(skel->progs.test_map_no_btf); + if (!ASSERT_GT(insn_cnt, 0u, "insn cnt")) + goto out; + + ret = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", insns, insn_cnt, &opts); + if (!ASSERT_LT(ret, 0, "prog load failed")) { + if (ret > 0) + close(ret); + goto out; + } + + ASSERT_HAS_SUBSTR(log, "map 'map_no_btf' has to have BTF in order to use bpf_wq", + "log complains no map BTF"); +out: + wq__destroy(skel); +} + +void test_wq_custom(void) +{ + if (test__start_subtest("test_failure_map_no_btf")) + test_failure_map_no_btf(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c index 178292d1251a..ee94c281888a 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c @@ -124,10 +124,10 @@ static int send_test_packet(int ifindex) int n, sock = -1; __u8 packet[sizeof(struct ethhdr) + TEST_PAYLOAD_LEN]; - /* The ethernet header is not relevant for this test and doesn't need to - * be meaningful. 
- */ - struct ethhdr eth = { 0 }; + /* We use the Ethernet header only to identify the test packet */ + struct ethhdr eth = { + .h_source = { 0x12, 0x34, 0xDE, 0xAD, 0xBE, 0xEF }, + }; memcpy(packet, ð, sizeof(eth)); memcpy(packet + sizeof(eth), test_payload, TEST_PAYLOAD_LEN); @@ -160,8 +160,16 @@ static int write_test_packet(int tap_fd) __u8 packet[sizeof(struct ethhdr) + TEST_PAYLOAD_LEN]; int n; - /* The ethernet header doesn't need to be valid for this test */ - memset(packet, 0, sizeof(struct ethhdr)); + /* The Ethernet header is mostly not relevant. We use it to identify the + * test packet and some BPF helpers we exercise expect to operate on + * Ethernet frames carrying IP packets. Pretend that's the case. + */ + struct ethhdr eth = { + .h_source = { 0x12, 0x34, 0xDE, 0xAD, 0xBE, 0xEF }, + .h_proto = htons(ETH_P_IP), + }; + + memcpy(packet, ð, sizeof(eth)); memcpy(packet + sizeof(struct ethhdr), test_payload, TEST_PAYLOAD_LEN); n = write(tap_fd, packet, sizeof(packet)); @@ -171,31 +179,19 @@ static int write_test_packet(int tap_fd) return 0; } -static void assert_test_result(const struct bpf_map *result_map) -{ - int err; - __u32 map_key = 0; - __u8 map_value[TEST_PAYLOAD_LEN]; - - err = bpf_map__lookup_elem(result_map, &map_key, sizeof(map_key), - &map_value, TEST_PAYLOAD_LEN, BPF_ANY); - if (!ASSERT_OK(err, "lookup test_result")) - return; - - ASSERT_MEMEQ(&map_value, &test_payload, TEST_PAYLOAD_LEN, - "test_result map contains test payload"); -} - -static bool clear_test_result(struct bpf_map *result_map) +static void dump_err_stream(const struct bpf_program *prog) { - const __u8 v[sizeof(test_payload)] = {}; - const __u32 k = 0; - int err; - - err = bpf_map__update_elem(result_map, &k, sizeof(k), v, sizeof(v), BPF_ANY); - ASSERT_OK(err, "update test_result"); + char buf[512]; + int ret; - return err == 0; + ret = 0; + do { + ret = bpf_prog_stream_read(bpf_program__fd(prog), + BPF_STREAM_STDERR, buf, sizeof(buf), + NULL); + if (ret > 0) + fwrite(buf, 
sizeof(buf[0]), ret, stderr); + } while (ret > 0); } void test_xdp_context_veth(void) @@ -270,11 +266,14 @@ void test_xdp_context_veth(void) if (!ASSERT_GE(tx_ifindex, 0, "if_nametoindex tx")) goto close; + skel->bss->test_pass = false; + ret = send_test_packet(tx_ifindex); if (!ASSERT_OK(ret, "send_test_packet")) goto close; - assert_test_result(skel->maps.test_result); + if (!ASSERT_TRUE(skel->bss->test_pass, "test_pass")) + dump_err_stream(tc_prog); close: close_netns(nstoken); @@ -286,7 +285,7 @@ close: static void test_tuntap(struct bpf_program *xdp_prog, struct bpf_program *tc_prio_1_prog, struct bpf_program *tc_prio_2_prog, - struct bpf_map *result_map) + bool *test_pass) { LIBBPF_OPTS(bpf_tc_hook, tc_hook, .attach_point = BPF_TC_INGRESS); LIBBPF_OPTS(bpf_tc_opts, tc_opts, .handle = 1, .priority = 1); @@ -295,8 +294,7 @@ static void test_tuntap(struct bpf_program *xdp_prog, int tap_ifindex; int ret; - if (!clear_test_result(result_map)) - return; + *test_pass = false; ns = netns_new(TAP_NETNS, true); if (!ASSERT_OK_PTR(ns, "create and open ns")) @@ -340,7 +338,8 @@ static void test_tuntap(struct bpf_program *xdp_prog, if (!ASSERT_OK(ret, "write_test_packet")) goto close; - assert_test_result(result_map); + if (!ASSERT_TRUE(*test_pass, "test_pass")) + dump_err_stream(tc_prio_2_prog ? 
: tc_prio_1_prog); close: if (tap_fd >= 0) @@ -411,7 +410,8 @@ static void test_tuntap_mirred(struct bpf_program *xdp_prog, if (!ASSERT_OK(ret, "write_test_packet")) goto close; - ASSERT_TRUE(*test_pass, "test_pass"); + if (!ASSERT_TRUE(*test_pass, "test_pass")) + dump_err_stream(tc_prog); close: if (tap_fd >= 0) @@ -431,61 +431,82 @@ void test_xdp_context_tuntap(void) test_tuntap(skel->progs.ing_xdp, skel->progs.ing_cls, NULL, /* tc prio 2 */ - skel->maps.test_result); + &skel->bss->test_pass); if (test__start_subtest("dynptr_read")) test_tuntap(skel->progs.ing_xdp, skel->progs.ing_cls_dynptr_read, NULL, /* tc prio 2 */ - skel->maps.test_result); + &skel->bss->test_pass); if (test__start_subtest("dynptr_slice")) test_tuntap(skel->progs.ing_xdp, skel->progs.ing_cls_dynptr_slice, NULL, /* tc prio 2 */ - skel->maps.test_result); + &skel->bss->test_pass); if (test__start_subtest("dynptr_write")) test_tuntap(skel->progs.ing_xdp_zalloc_meta, skel->progs.ing_cls_dynptr_write, skel->progs.ing_cls_dynptr_read, - skel->maps.test_result); + &skel->bss->test_pass); if (test__start_subtest("dynptr_slice_rdwr")) test_tuntap(skel->progs.ing_xdp_zalloc_meta, skel->progs.ing_cls_dynptr_slice_rdwr, skel->progs.ing_cls_dynptr_slice, - skel->maps.test_result); + &skel->bss->test_pass); if (test__start_subtest("dynptr_offset")) test_tuntap(skel->progs.ing_xdp_zalloc_meta, skel->progs.ing_cls_dynptr_offset_wr, skel->progs.ing_cls_dynptr_offset_rd, - skel->maps.test_result); + &skel->bss->test_pass); if (test__start_subtest("dynptr_offset_oob")) test_tuntap(skel->progs.ing_xdp, skel->progs.ing_cls_dynptr_offset_oob, skel->progs.ing_cls, - skel->maps.test_result); - if (test__start_subtest("clone_data_meta_empty_on_data_write")) + &skel->bss->test_pass); + if (test__start_subtest("clone_data_meta_survives_data_write")) test_tuntap_mirred(skel->progs.ing_xdp, - skel->progs.clone_data_meta_empty_on_data_write, + skel->progs.clone_data_meta_survives_data_write, &skel->bss->test_pass); - if 
(test__start_subtest("clone_data_meta_empty_on_meta_write")) + if (test__start_subtest("clone_data_meta_survives_meta_write")) test_tuntap_mirred(skel->progs.ing_xdp, - skel->progs.clone_data_meta_empty_on_meta_write, + skel->progs.clone_data_meta_survives_meta_write, &skel->bss->test_pass); - if (test__start_subtest("clone_dynptr_empty_on_data_slice_write")) + if (test__start_subtest("clone_meta_dynptr_survives_data_slice_write")) test_tuntap_mirred(skel->progs.ing_xdp, - skel->progs.clone_dynptr_empty_on_data_slice_write, + skel->progs.clone_meta_dynptr_survives_data_slice_write, &skel->bss->test_pass); - if (test__start_subtest("clone_dynptr_empty_on_meta_slice_write")) + if (test__start_subtest("clone_meta_dynptr_survives_meta_slice_write")) test_tuntap_mirred(skel->progs.ing_xdp, - skel->progs.clone_dynptr_empty_on_meta_slice_write, + skel->progs.clone_meta_dynptr_survives_meta_slice_write, &skel->bss->test_pass); - if (test__start_subtest("clone_dynptr_rdonly_before_data_dynptr_write")) + if (test__start_subtest("clone_meta_dynptr_rw_before_data_dynptr_write")) test_tuntap_mirred(skel->progs.ing_xdp, - skel->progs.clone_dynptr_rdonly_before_data_dynptr_write, + skel->progs.clone_meta_dynptr_rw_before_data_dynptr_write, &skel->bss->test_pass); - if (test__start_subtest("clone_dynptr_rdonly_before_meta_dynptr_write")) + if (test__start_subtest("clone_meta_dynptr_rw_before_meta_dynptr_write")) test_tuntap_mirred(skel->progs.ing_xdp, - skel->progs.clone_dynptr_rdonly_before_meta_dynptr_write, + skel->progs.clone_meta_dynptr_rw_before_meta_dynptr_write, &skel->bss->test_pass); + /* Tests for BPF helpers which touch headroom */ + if (test__start_subtest("helper_skb_vlan_push_pop")) + test_tuntap(skel->progs.ing_xdp, + skel->progs.helper_skb_vlan_push_pop, + NULL, /* tc prio 2 */ + &skel->bss->test_pass); + if (test__start_subtest("helper_skb_adjust_room")) + test_tuntap(skel->progs.ing_xdp, + skel->progs.helper_skb_adjust_room, + NULL, /* tc prio 2 */ + 
&skel->bss->test_pass); + if (test__start_subtest("helper_skb_change_head_tail")) + test_tuntap(skel->progs.ing_xdp, + skel->progs.helper_skb_change_head_tail, + NULL, /* tc prio 2 */ + &skel->bss->test_pass); + if (test__start_subtest("helper_skb_change_proto")) + test_tuntap(skel->progs.ing_xdp, + skel->progs.helper_skb_change_proto, + NULL, /* tc prio 2 */ + &skel->bss->test_pass); test_xdp_meta__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/xsk.c b/tools/testing/selftests/bpf/prog_tests/xsk.c new file mode 100644 index 000000000000..dd4c35c0e428 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/xsk.c @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <net/if.h> +#include <stdarg.h> + +#include "network_helpers.h" +#include "test_progs.h" +#include "test_xsk.h" +#include "xsk_xdp_progs.skel.h" + +#define VETH_RX "veth0" +#define VETH_TX "veth1" +#define MTU 1500 + +int setup_veth(bool busy_poll) +{ + SYS(fail, + "ip link add %s numtxqueues 4 numrxqueues 4 type veth peer name %s numtxqueues 4 numrxqueues 4", + VETH_RX, VETH_TX); + SYS(fail, "sysctl -wq net.ipv6.conf.%s.disable_ipv6=1", VETH_RX); + SYS(fail, "sysctl -wq net.ipv6.conf.%s.disable_ipv6=1", VETH_TX); + + if (busy_poll) { + SYS(fail, "echo 2 > /sys/class/net/%s/napi_defer_hard_irqs", VETH_RX); + SYS(fail, "echo 200000 > /sys/class/net/%s/gro_flush_timeout", VETH_RX); + SYS(fail, "echo 2 > /sys/class/net/%s/napi_defer_hard_irqs", VETH_TX); + SYS(fail, "echo 200000 > /sys/class/net/%s/gro_flush_timeout", VETH_TX); + } + + SYS(fail, "ip link set %s mtu %d", VETH_RX, MTU); + SYS(fail, "ip link set %s mtu %d", VETH_TX, MTU); + SYS(fail, "ip link set %s up", VETH_RX); + SYS(fail, "ip link set %s up", VETH_TX); + + return 0; + +fail: + return -1; +} + +void delete_veth(void) +{ + SYS_NOFAIL("ip link del %s", VETH_RX); + SYS_NOFAIL("ip link del %s", VETH_TX); +} + +int configure_ifobj(struct ifobject *tx, struct ifobject *rx) +{ + rx->ifindex = 
if_nametoindex(VETH_RX); + if (!ASSERT_OK_FD(rx->ifindex, "get RX ifindex")) + return -1; + + tx->ifindex = if_nametoindex(VETH_TX); + if (!ASSERT_OK_FD(tx->ifindex, "get TX ifindex")) + return -1; + + tx->shared_umem = false; + rx->shared_umem = false; + + + return 0; +} + +static void test_xsk(const struct test_spec *test_to_run, enum test_mode mode) +{ + struct ifobject *ifobj_tx, *ifobj_rx; + struct test_spec test; + int ret; + + ifobj_tx = ifobject_create(); + if (!ASSERT_OK_PTR(ifobj_tx, "create ifobj_tx")) + return; + + ifobj_rx = ifobject_create(); + if (!ASSERT_OK_PTR(ifobj_rx, "create ifobj_rx")) + goto delete_tx; + + if (!ASSERT_OK(configure_ifobj(ifobj_tx, ifobj_rx), "conigure ifobj")) + goto delete_rx; + + ret = get_hw_ring_size(ifobj_tx->ifname, &ifobj_tx->ring); + if (!ret) { + ifobj_tx->hw_ring_size_supp = true; + ifobj_tx->set_ring.default_tx = ifobj_tx->ring.tx_pending; + ifobj_tx->set_ring.default_rx = ifobj_tx->ring.rx_pending; + } + + if (!ASSERT_OK(init_iface(ifobj_rx, worker_testapp_validate_rx), "init RX")) + goto delete_rx; + if (!ASSERT_OK(init_iface(ifobj_tx, worker_testapp_validate_tx), "init TX")) + goto delete_rx; + + test_init(&test, ifobj_tx, ifobj_rx, 0, &tests[0]); + + test.tx_pkt_stream_default = pkt_stream_generate(DEFAULT_PKT_CNT, MIN_PKT_SIZE); + if (!ASSERT_OK_PTR(test.tx_pkt_stream_default, "TX pkt generation")) + goto delete_rx; + test.rx_pkt_stream_default = pkt_stream_generate(DEFAULT_PKT_CNT, MIN_PKT_SIZE); + if (!ASSERT_OK_PTR(test.rx_pkt_stream_default, "RX pkt generation")) + goto delete_rx; + + + test_init(&test, ifobj_tx, ifobj_rx, mode, test_to_run); + ret = test.test_func(&test); + if (ret != TEST_SKIP) + ASSERT_OK(ret, "Run test"); + pkt_stream_restore_default(&test); + + if (ifobj_tx->hw_ring_size_supp) + hw_ring_size_reset(ifobj_tx); + + pkt_stream_delete(test.tx_pkt_stream_default); + pkt_stream_delete(test.rx_pkt_stream_default); + xsk_xdp_progs__destroy(ifobj_tx->xdp_progs); + 
xsk_xdp_progs__destroy(ifobj_rx->xdp_progs); + +delete_rx: + ifobject_delete(ifobj_rx); +delete_tx: + ifobject_delete(ifobj_tx); +} + +void test_ns_xsk_skb(void) +{ + int i; + + if (!ASSERT_OK(setup_veth(false), "setup veth")) + return; + + for (i = 0; i < ARRAY_SIZE(tests); i++) { + if (test__start_subtest(tests[i].name)) + test_xsk(&tests[i], TEST_MODE_SKB); + } + + delete_veth(); +} + +void test_ns_xsk_drv(void) +{ + int i; + + if (!ASSERT_OK(setup_veth(false), "setup veth")) + return; + + for (i = 0; i < ARRAY_SIZE(tests); i++) { + if (test__start_subtest(tests[i].name)) + test_xsk(&tests[i], TEST_MODE_DRV); + } + + delete_veth(); +} + diff --git a/tools/testing/selftests/bpf/progs/arena_strsearch.c b/tools/testing/selftests/bpf/progs/arena_strsearch.c new file mode 100644 index 000000000000..ef6b76658f7f --- /dev/null +++ b/tools/testing/selftests/bpf/progs/arena_strsearch.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ +#include <vmlinux.h> +#include "bpf_experimental.h" + +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, 100); /* number of pages */ +} arena SEC(".maps"); + +#include "bpf_arena_strsearch.h" + +struct glob_test { + char const __arena *pat, *str; + bool expected; +}; + +static bool test(char const __arena *pat, char const __arena *str, bool expected) +{ + bool match = glob_match(pat, str); + bool success = match == expected; + + /* bpf_printk("glob_match %s %s res %d ok %d", pat, str, match, success); */ + return success; +} + +/* + * The tests are all jammed together in one array to make it simpler + * to place that array in the .init.rodata section. The obvious + * "array of structures containing char *" has no way to force the + * pointed-to strings to be in a particular section. + * + * Anyway, a test consists of: + * 1. Expected glob_match result: '1' or '0'. + * 2. Pattern to match: null-terminated string + * 3. 
String to match against: null-terminated string + * + * The list of tests is terminated with a final '\0' instead of + * a glob_match result character. + */ +static const char __arena glob_tests[] = + /* Some basic tests */ + "1" "a\0" "a\0" + "0" "a\0" "b\0" + "0" "a\0" "aa\0" + "0" "a\0" "\0" + "1" "\0" "\0" + "0" "\0" "a\0" + /* Simple character class tests */ + "1" "[a]\0" "a\0" + "0" "[a]\0" "b\0" + "0" "[!a]\0" "a\0" + "1" "[!a]\0" "b\0" + "1" "[ab]\0" "a\0" + "1" "[ab]\0" "b\0" + "0" "[ab]\0" "c\0" + "1" "[!ab]\0" "c\0" + "1" "[a-c]\0" "b\0" + "0" "[a-c]\0" "d\0" + /* Corner cases in character class parsing */ + "1" "[a-c-e-g]\0" "-\0" + "0" "[a-c-e-g]\0" "d\0" + "1" "[a-c-e-g]\0" "f\0" + "1" "[]a-ceg-ik[]\0" "a\0" + "1" "[]a-ceg-ik[]\0" "]\0" + "1" "[]a-ceg-ik[]\0" "[\0" + "1" "[]a-ceg-ik[]\0" "h\0" + "0" "[]a-ceg-ik[]\0" "f\0" + "0" "[!]a-ceg-ik[]\0" "h\0" + "0" "[!]a-ceg-ik[]\0" "]\0" + "1" "[!]a-ceg-ik[]\0" "f\0" + /* Simple wild cards */ + "1" "?\0" "a\0" + "0" "?\0" "aa\0" + "0" "??\0" "a\0" + "1" "?x?\0" "axb\0" + "0" "?x?\0" "abx\0" + "0" "?x?\0" "xab\0" + /* Asterisk wild cards (backtracking) */ + "0" "*??\0" "a\0" + "1" "*??\0" "ab\0" + "1" "*??\0" "abc\0" + "1" "*??\0" "abcd\0" + "0" "??*\0" "a\0" + "1" "??*\0" "ab\0" + "1" "??*\0" "abc\0" + "1" "??*\0" "abcd\0" + "0" "?*?\0" "a\0" + "1" "?*?\0" "ab\0" + "1" "?*?\0" "abc\0" + "1" "?*?\0" "abcd\0" + "1" "*b\0" "b\0" + "1" "*b\0" "ab\0" + "0" "*b\0" "ba\0" + "1" "*b\0" "bb\0" + "1" "*b\0" "abb\0" + "1" "*b\0" "bab\0" + "1" "*bc\0" "abbc\0" + "1" "*bc\0" "bc\0" + "1" "*bc\0" "bbc\0" + "1" "*bc\0" "bcbc\0" + /* Multiple asterisks (complex backtracking) */ + "1" "*ac*\0" "abacadaeafag\0" + "1" "*ac*ae*ag*\0" "abacadaeafag\0" + "1" "*a*b*[bc]*[ef]*g*\0" "abacadaeafag\0" + "0" "*a*b*[ef]*[cd]*g*\0" "abacadaeafag\0" + "1" "*abcd*\0" "abcabcabcabcdefg\0" + "1" "*ab*cd*\0" "abcabcabcabcdefg\0" + "1" "*abcd*abcdef*\0" "abcabcdabcdeabcdefg\0" + "0" "*abcd*\0" "abcabcabcabcefg\0" + "0" "*ab*cd*\0" 
"abcabcabcabcefg\0"; + +bool skip = false; + +SEC("syscall") +int arena_strsearch(void *ctx) +{ + unsigned successes = 0; + unsigned n = 0; + char const __arena *p = glob_tests; + + /* + * Tests are jammed together in a string. The first byte is '1' + * or '0' to indicate the expected outcome, or '\0' to indicate the + * end of the tests. Then come two null-terminated strings: the + * pattern and the string to match it against. + */ + while (*p) { + bool expected = *p++ & 1; + char const __arena *pat = p; + + cond_break; + p += bpf_arena_strlen(p) + 1; + successes += test(pat, p, expected); + p += bpf_arena_strlen(p) + 1; + n++; + } + + n -= successes; + /* bpf_printk("glob: %u self-tests passed, %u failed\n", successes, n); */ + + return n ? -1 : 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c index 4e51785e7606..9af19dfe4e80 100644 --- a/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c +++ b/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c @@ -22,10 +22,6 @@ #define TCP_PACING_CA_RATIO (120) #define TCP_REORDERING (12) -#define min(a, b) ((a) < (b) ? (a) : (b)) -#define max(a, b) ((a) > (b) ? 
(a) : (b)) -#define after(seq2, seq1) before(seq1, seq2) - extern void cubictcp_init(struct sock *sk) __ksym; extern void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) __ksym; extern __u32 cubictcp_recalc_ssthresh(struct sock *sk) __ksym; @@ -34,11 +30,6 @@ extern __u32 tcp_reno_undo_cwnd(struct sock *sk) __ksym; extern void cubictcp_acked(struct sock *sk, const struct ack_sample *sample) __ksym; extern void cubictcp_cong_avoid(struct sock *sk, __u32 ack, __u32 acked) __ksym; -static bool before(__u32 seq1, __u32 seq2) -{ - return (__s32)(seq1-seq2) < 0; -} - static __u64 div64_u64(__u64 dividend, __u64 divisor) { return dividend / divisor; diff --git a/tools/testing/selftests/bpf/progs/bpf_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cubic.c index f089faa97ae6..46fb2b37d3a7 100644 --- a/tools/testing/selftests/bpf/progs/bpf_cubic.c +++ b/tools/testing/selftests/bpf/progs/bpf_cubic.c @@ -20,13 +20,6 @@ char _license[] SEC("license") = "GPL"; #define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi) -#define min(a, b) ((a) < (b) ? (a) : (b)) -#define max(a, b) ((a) > (b) ? (a) : (b)) -static bool before(__u32 seq1, __u32 seq2) -{ - return (__s32)(seq1-seq2) < 0; -} -#define after(seq2, seq1) before(seq1, seq2) extern __u32 tcp_slow_start(struct tcp_sock *tp, __u32 acked) __ksym; extern void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked) __ksym; diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp.c b/tools/testing/selftests/bpf/progs/bpf_dctcp.c index 32c511bcd60b..1cc83140849f 100644 --- a/tools/testing/selftests/bpf/progs/bpf_dctcp.c +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp.c @@ -13,16 +13,10 @@ #ifndef EBUSY #define EBUSY 16 #endif -#define min(a, b) ((a) < (b) ? (a) : (b)) -#define max(a, b) ((a) > (b) ? (a) : (b)) #define min_not_zero(x, y) ({ \ typeof(x) __x = (x); \ typeof(y) __y = (y); \ __x == 0 ? __y : ((__y == 0) ? 
__x : min(__x, __y)); }) -static bool before(__u32 seq1, __u32 seq2) -{ - return (__s32)(seq1-seq2) < 0; -} char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/bpf_gotox.c b/tools/testing/selftests/bpf/progs/bpf_gotox.c new file mode 100644 index 000000000000..216c71b94c64 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_gotox.c @@ -0,0 +1,448 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> +#include "bpf_misc.h" + +__u64 in_user; +__u64 ret_user; + +int pid; + +/* + * Skip all the tests if compiler doesn't support indirect jumps. + * + * If tests are skipped, then all functions below are compiled as + * dummy, such that the skeleton looks the same, and the userspace + * program can avoid any checks rather than if data->skip is set. + */ +#ifdef __BPF_FEATURE_GOTOX +__u64 skip SEC(".data") = 0; +#else +__u64 skip = 1; +#endif + +struct simple_ctx { + __u64 x; +}; + +#ifdef __BPF_FEATURE_GOTOX +__u64 some_var; + +/* + * This function adds code which will be replaced by a different + * number of instructions by the verifier. This adds additional + * stress on testing the insn_array maps corresponding to indirect jumps. 
+ */ +static __always_inline void adjust_insns(__u64 x) +{ + some_var ^= x + bpf_jiffies64(); +} + +SEC("syscall") +int one_switch(struct simple_ctx *ctx) +{ + switch (ctx->x) { + case 0: + adjust_insns(ctx->x + 1); + ret_user = 2; + break; + case 1: + adjust_insns(ctx->x + 7); + ret_user = 3; + break; + case 2: + adjust_insns(ctx->x + 9); + ret_user = 4; + break; + case 3: + adjust_insns(ctx->x + 11); + ret_user = 5; + break; + case 4: + adjust_insns(ctx->x + 17); + ret_user = 7; + break; + default: + adjust_insns(ctx->x + 177); + ret_user = 19; + break; + } + + return 0; +} + +SEC("syscall") +int one_switch_non_zero_sec_off(struct simple_ctx *ctx) +{ + switch (ctx->x) { + case 0: + adjust_insns(ctx->x + 1); + ret_user = 2; + break; + case 1: + adjust_insns(ctx->x + 7); + ret_user = 3; + break; + case 2: + adjust_insns(ctx->x + 9); + ret_user = 4; + break; + case 3: + adjust_insns(ctx->x + 11); + ret_user = 5; + break; + case 4: + adjust_insns(ctx->x + 17); + ret_user = 7; + break; + default: + adjust_insns(ctx->x + 177); + ret_user = 19; + break; + } + + return 0; +} + +SEC("fentry/" SYS_PREFIX "sys_nanosleep") +int simple_test_other_sec(struct pt_regs *ctx) +{ + __u64 x = in_user; + + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + switch (x) { + case 0: + adjust_insns(x + 1); + ret_user = 2; + break; + case 1: + adjust_insns(x + 7); + ret_user = 3; + break; + case 2: + adjust_insns(x + 9); + ret_user = 4; + break; + case 3: + adjust_insns(x + 11); + ret_user = 5; + break; + case 4: + adjust_insns(x + 17); + ret_user = 7; + break; + default: + adjust_insns(x + 177); + ret_user = 19; + break; + } + + return 0; +} + +SEC("syscall") +int two_switches(struct simple_ctx *ctx) +{ + switch (ctx->x) { + case 0: + adjust_insns(ctx->x + 1); + ret_user = 2; + break; + case 1: + adjust_insns(ctx->x + 7); + ret_user = 3; + break; + case 2: + adjust_insns(ctx->x + 9); + ret_user = 4; + break; + case 3: + adjust_insns(ctx->x + 11); + ret_user = 5; + break; + case 
4: + adjust_insns(ctx->x + 17); + ret_user = 7; + break; + default: + adjust_insns(ctx->x + 177); + ret_user = 19; + break; + } + + switch (ctx->x + !!ret_user) { + case 1: + adjust_insns(ctx->x + 7); + ret_user = 103; + break; + case 2: + adjust_insns(ctx->x + 9); + ret_user = 104; + break; + case 3: + adjust_insns(ctx->x + 11); + ret_user = 107; + break; + case 4: + adjust_insns(ctx->x + 11); + ret_user = 205; + break; + case 5: + adjust_insns(ctx->x + 11); + ret_user = 115; + break; + default: + adjust_insns(ctx->x + 177); + ret_user = 1019; + break; + } + + return 0; +} + +SEC("syscall") +int big_jump_table(struct simple_ctx *ctx __attribute__((unused))) +{ + const void *const jt[256] = { + [0 ... 255] = &&default_label, + [0] = &&l0, + [11] = &&l11, + [27] = &&l27, + [31] = &&l31, + }; + + goto *jt[ctx->x & 0xff]; + +l0: + adjust_insns(ctx->x + 1); + ret_user = 2; + return 0; + +l11: + adjust_insns(ctx->x + 7); + ret_user = 3; + return 0; + +l27: + adjust_insns(ctx->x + 9); + ret_user = 4; + return 0; + +l31: + adjust_insns(ctx->x + 11); + ret_user = 5; + return 0; + +default_label: + adjust_insns(ctx->x + 177); + ret_user = 19; + return 0; +} + +SEC("syscall") +int one_jump_two_maps(struct simple_ctx *ctx __attribute__((unused))) +{ + __label__ l1, l2, l3, l4; + void *jt1[2] = { &&l1, &&l2 }; + void *jt2[2] = { &&l3, &&l4 }; + unsigned int a = ctx->x % 2; + unsigned int b = (ctx->x / 2) % 2; + volatile int ret = 0; + + if (!(a < 2 && b < 2)) + return 19; + + if (ctx->x % 2) + goto *jt1[a]; + else + goto *jt2[b]; + + l1: ret += 1; + l2: ret += 3; + l3: ret += 5; + l4: ret += 7; + + ret_user = ret; + return ret; +} + +SEC("syscall") +int one_map_two_jumps(struct simple_ctx *ctx __attribute__((unused))) +{ + __label__ l1, l2, l3; + void *jt[3] = { &&l1, &&l2, &&l3 }; + unsigned int a = (ctx->x >> 2) & 1; + unsigned int b = (ctx->x >> 3) & 1; + volatile int ret = 0; + + if (ctx->x % 2) + goto *jt[a]; + + if (ctx->x % 3) + goto *jt[a + b]; + + l1: ret += 3; + l2: 
ret += 5; + l3: ret += 7; + + ret_user = ret; + return ret; +} + +/* Just to introduce some non-zero offsets in .text */ +static __noinline int f0(volatile struct simple_ctx *ctx __arg_ctx) +{ + if (ctx) + return 1; + else + return 13; +} + +SEC("syscall") int f1(struct simple_ctx *ctx) +{ + ret_user = 0; + return f0(ctx); +} + +static __noinline int __static_global(__u64 x) +{ + switch (x) { + case 0: + adjust_insns(x + 1); + ret_user = 2; + break; + case 1: + adjust_insns(x + 7); + ret_user = 3; + break; + case 2: + adjust_insns(x + 9); + ret_user = 4; + break; + case 3: + adjust_insns(x + 11); + ret_user = 5; + break; + case 4: + adjust_insns(x + 17); + ret_user = 7; + break; + default: + adjust_insns(x + 177); + ret_user = 19; + break; + } + + return 0; +} + +SEC("syscall") +int use_static_global1(struct simple_ctx *ctx) +{ + ret_user = 0; + return __static_global(ctx->x); +} + +SEC("syscall") +int use_static_global2(struct simple_ctx *ctx) +{ + ret_user = 0; + adjust_insns(ctx->x + 1); + return __static_global(ctx->x); +} + +SEC("fentry/" SYS_PREFIX "sys_nanosleep") +int use_static_global_other_sec(void *ctx) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + return __static_global(in_user); +} + +__noinline int __nonstatic_global(__u64 x) +{ + switch (x) { + case 0: + adjust_insns(x + 1); + ret_user = 2; + break; + case 1: + adjust_insns(x + 7); + ret_user = 3; + break; + case 2: + adjust_insns(x + 9); + ret_user = 4; + break; + case 3: + adjust_insns(x + 11); + ret_user = 5; + break; + case 4: + adjust_insns(x + 17); + ret_user = 7; + break; + default: + adjust_insns(x + 177); + ret_user = 19; + break; + } + + return 0; +} + +SEC("syscall") +int use_nonstatic_global1(struct simple_ctx *ctx) +{ + ret_user = 0; + return __nonstatic_global(ctx->x); +} + +SEC("syscall") +int use_nonstatic_global2(struct simple_ctx *ctx) +{ + ret_user = 0; + adjust_insns(ctx->x + 1); + return __nonstatic_global(ctx->x); +} + +SEC("fentry/" SYS_PREFIX 
"sys_nanosleep") +int use_nonstatic_global_other_sec(void *ctx) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + return __nonstatic_global(in_user); +} + +#else /* __BPF_FEATURE_GOTOX */ + +#define SKIP_TEST(TEST_NAME) \ + SEC("syscall") int TEST_NAME(void *ctx) \ + { \ + return 0; \ + } + +SKIP_TEST(one_switch); +SKIP_TEST(one_switch_non_zero_sec_off); +SKIP_TEST(simple_test_other_sec); +SKIP_TEST(two_switches); +SKIP_TEST(big_jump_table); +SKIP_TEST(one_jump_two_maps); +SKIP_TEST(one_map_two_jumps); +SKIP_TEST(use_static_global1); +SKIP_TEST(use_static_global2); +SKIP_TEST(use_static_global_other_sec); +SKIP_TEST(use_nonstatic_global1); +SKIP_TEST(use_nonstatic_global2); +SKIP_TEST(use_nonstatic_global_other_sec); + +#endif /* __BPF_FEATURE_GOTOX */ + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_setsockopt.c b/tools/testing/selftests/bpf/progs/bpf_iter_setsockopt.c index 774d4dbe8189..a8aa5a71d846 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_setsockopt.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_setsockopt.c @@ -18,23 +18,10 @@ unsigned short reuse_listen_hport = 0; unsigned short listen_hport = 0; -char cubic_cc[TCP_CA_NAME_MAX] = "bpf_cubic"; +const char cubic_cc[] = "bpf_cubic"; char dctcp_cc[TCP_CA_NAME_MAX] = "bpf_dctcp"; bool random_retry = false; -static bool tcp_cc_eq(const char *a, const char *b) -{ - int i; - - for (i = 0; i < TCP_CA_NAME_MAX; i++) { - if (a[i] != b[i]) - return false; - if (!a[i]) - break; - } - - return true; -} SEC("iter/tcp") int change_tcp_cc(struct bpf_iter__tcp *ctx) @@ -58,7 +45,7 @@ int change_tcp_cc(struct bpf_iter__tcp *ctx) cur_cc, sizeof(cur_cc))) return 0; - if (!tcp_cc_eq(cur_cc, cubic_cc)) + if (bpf_strncmp(cur_cc, TCP_CA_NAME_MAX, cubic_cc)) return 0; if (random_retry && bpf_get_prandom_u32() % 4 == 1) diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c b/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c index 
164640db3a29..b1e509b231cd 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c @@ -99,13 +99,13 @@ static int dump_tcp_sock(struct seq_file *seq, struct tcp_sock *tp, icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; - timer_expires = icsk->icsk_retransmit_timer.expires; + timer_expires = sp->tcp_retransmit_timer.expires; } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; - timer_expires = icsk->icsk_retransmit_timer.expires; - } else if (timer_pending(&sp->sk_timer)) { + timer_expires = sp->tcp_retransmit_timer.expires; + } else if (timer_pending(&icsk->icsk_keepalive_timer)) { timer_active = 2; - timer_expires = sp->sk_timer.expires; + timer_expires = icsk->icsk_keepalive_timer.expires; } else { timer_active = 0; timer_expires = bpf_jiffies64(); diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c b/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c index 591c703f5032..dbc7166aee91 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c @@ -99,13 +99,13 @@ static int dump_tcp6_sock(struct seq_file *seq, struct tcp6_sock *tp, icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; - timer_expires = icsk->icsk_retransmit_timer.expires; + timer_expires = sp->tcp_retransmit_timer.expires; } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; - timer_expires = icsk->icsk_retransmit_timer.expires; - } else if (timer_pending(&sp->sk_timer)) { + timer_expires = sp->tcp_retransmit_timer.expires; + } else if (timer_pending(&icsk->icsk_keepalive_timer)) { timer_active = 2; - timer_expires = sp->sk_timer.expires; + timer_expires = icsk->icsk_keepalive_timer.expires; } else { timer_active = 0; timer_expires = bpf_jiffies64(); diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h 
b/tools/testing/selftests/bpf/progs/bpf_misc.h index a7a1a684eed1..c9bfbe1bafc1 100644 --- a/tools/testing/selftests/bpf/progs/bpf_misc.h +++ b/tools/testing/selftests/bpf/progs/bpf_misc.h @@ -126,6 +126,9 @@ * Several __arch_* annotations could be specified at once. * When test case is not run on current arch it is marked as skipped. * __caps_unpriv Specify the capabilities that should be set when running the test. + * + * __linear_size Specify the size of the linear area of non-linear skbs, or + * 0 for linear skbs. */ #define __msg(msg) __attribute__((btf_decl_tag("comment:test_expect_msg=" XSTR(__COUNTER__) "=" msg))) #define __not_msg(msg) __attribute__((btf_decl_tag("comment:test_expect_not_msg=" XSTR(__COUNTER__) "=" msg))) @@ -159,6 +162,7 @@ #define __stderr_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_stderr_unpriv=" XSTR(__COUNTER__) "=" msg))) #define __stdout(msg) __attribute__((btf_decl_tag("comment:test_expect_stdout=" XSTR(__COUNTER__) "=" msg))) #define __stdout_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_stdout_unpriv=" XSTR(__COUNTER__) "=" msg))) +#define __linear_size(sz) __attribute__((btf_decl_tag("comment:test_linear_size=" XSTR(sz)))) /* Define common capabilities tested using __caps_unpriv */ #define CAP_NET_ADMIN 12 diff --git a/tools/testing/selftests/bpf/progs/bpf_smc.c b/tools/testing/selftests/bpf/progs/bpf_smc.c new file mode 100644 index 000000000000..70d8b08f5914 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_smc.c @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" + +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include "bpf_tracing_net.h" + +char _license[] SEC("license") = "GPL"; + +enum { + BPF_SMC_LISTEN = 10, +}; + +struct smc_sock___local { + struct sock sk; + struct smc_sock *listen_smc; + bool use_fallback; +} __attribute__((preserve_access_index)); + +int smc_cnt = 0; +int fallback_cnt = 0; + +SEC("fentry/smc_release") +int 
BPF_PROG(bpf_smc_release, struct socket *sock) +{ + /* only count from one side (client) */ + if (sock->sk->__sk_common.skc_state == BPF_SMC_LISTEN) + return 0; + smc_cnt++; + return 0; +} + +SEC("fentry/smc_switch_to_fallback") +int BPF_PROG(bpf_smc_switch_to_fallback, struct smc_sock___local *smc) +{ + /* only count from one side (client) */ + if (smc && !smc->listen_smc) + fallback_cnt++; + return 0; +} + +/* go with default value if no strat was found */ +bool default_ip_strat_value = true; + +struct smc_policy_ip_key { + __u32 sip; + __u32 dip; +}; + +struct smc_policy_ip_value { + __u8 mode; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(struct smc_policy_ip_key)); + __uint(value_size, sizeof(struct smc_policy_ip_value)); + __uint(max_entries, 128); + __uint(map_flags, BPF_F_NO_PREALLOC); +} smc_policy_ip SEC(".maps"); + +static bool smc_check(__u32 src, __u32 dst) +{ + struct smc_policy_ip_value *value; + struct smc_policy_ip_key key = { + .sip = src, + .dip = dst, + }; + + value = bpf_map_lookup_elem(&smc_policy_ip, &key); + return value ? 
value->mode : default_ip_strat_value; +} + +SEC("fmod_ret/update_socket_protocol") +int BPF_PROG(smc_run, int family, int type, int protocol) +{ + struct task_struct *task; + + if (family != AF_INET && family != AF_INET6) + return protocol; + + if ((type & 0xf) != SOCK_STREAM) + return protocol; + + if (protocol != 0 && protocol != IPPROTO_TCP) + return protocol; + + task = bpf_get_current_task_btf(); + /* Prevent from affecting other tests */ + if (!task || !task->nsproxy->net_ns->smc.hs_ctrl) + return protocol; + + return IPPROTO_SMC; +} + +SEC("struct_ops") +int BPF_PROG(bpf_smc_set_tcp_option_cond, const struct tcp_sock *tp, + struct inet_request_sock *ireq) +{ + return smc_check(ireq->req.__req_common.skc_daddr, + ireq->req.__req_common.skc_rcv_saddr); +} + +SEC("struct_ops") +int BPF_PROG(bpf_smc_set_tcp_option, struct tcp_sock *tp) +{ + return smc_check(tp->inet_conn.icsk_inet.sk.__sk_common.skc_rcv_saddr, + tp->inet_conn.icsk_inet.sk.__sk_common.skc_daddr); +} + +SEC(".struct_ops") +struct smc_hs_ctrl linkcheck = { + .name = "linkcheck", + .syn_option = (void *)bpf_smc_set_tcp_option, + .synack_option = (void *)bpf_smc_set_tcp_option_cond, +}; diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h index 17db400f0e0d..d8dacef37c16 100644 --- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h +++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h @@ -146,6 +146,20 @@ #define tcp_jiffies32 ((__u32)bpf_jiffies64()) +#ifndef min +#define min(a, b) ((a) < (b) ? (a) : (b)) +#endif +#ifndef max +#define max(a, b) ((a) > (b) ? 
(a) : (b)) +#endif + +static inline bool before(__u32 seq1, __u32 seq2) +{ + return (__s32)(seq1 - seq2) < 0; +} + +#define after(seq2, seq1) before(seq1, seq2) + static inline struct inet_connection_sock *inet_csk(const struct sock *sk) { return (struct inet_connection_sock *)sk; diff --git a/tools/testing/selftests/bpf/progs/connect4_prog.c b/tools/testing/selftests/bpf/progs/connect4_prog.c index 9e9ebf27b878..9d158cfad981 100644 --- a/tools/testing/selftests/bpf/progs/connect4_prog.c +++ b/tools/testing/selftests/bpf/progs/connect4_prog.c @@ -34,6 +34,9 @@ #define SOL_TCP 6 #endif +const char reno[] = "reno"; +const char cubic[] = "cubic"; + __attribute__ ((noinline)) __weak int do_bind(struct bpf_sock_addr *ctx) { @@ -50,35 +53,27 @@ int do_bind(struct bpf_sock_addr *ctx) } static __inline int verify_cc(struct bpf_sock_addr *ctx, - char expected[TCP_CA_NAME_MAX]) + const char expected[]) { char buf[TCP_CA_NAME_MAX]; - int i; if (bpf_getsockopt(ctx, SOL_TCP, TCP_CONGESTION, &buf, sizeof(buf))) return 1; - for (i = 0; i < TCP_CA_NAME_MAX; i++) { - if (buf[i] != expected[i]) - return 1; - if (buf[i] == 0) - break; - } + if (bpf_strncmp(buf, TCP_CA_NAME_MAX, expected)) + return 1; return 0; } static __inline int set_cc(struct bpf_sock_addr *ctx) { - char reno[TCP_CA_NAME_MAX] = "reno"; - char cubic[TCP_CA_NAME_MAX] = "cubic"; - - if (bpf_setsockopt(ctx, SOL_TCP, TCP_CONGESTION, &reno, sizeof(reno))) + if (bpf_setsockopt(ctx, SOL_TCP, TCP_CONGESTION, (void *)reno, sizeof(reno))) return 1; if (verify_cc(ctx, reno)) return 1; - if (bpf_setsockopt(ctx, SOL_TCP, TCP_CONGESTION, &cubic, sizeof(cubic))) + if (bpf_setsockopt(ctx, SOL_TCP, TCP_CONGESTION, (void *)cubic, sizeof(cubic))) return 1; if (verify_cc(ctx, cubic)) return 1; diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c index 127dea342e5a..e0d672d93adf 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_success.c +++ 
b/tools/testing/selftests/bpf/progs/dynptr_success.c @@ -914,8 +914,8 @@ void *user_ptr; char expected_str[384]; __u32 test_len[7] = {0/* placeholder */, 0, 1, 2, 255, 256, 257}; -typedef int (*bpf_read_dynptr_fn_t)(struct bpf_dynptr *dptr, u32 off, - u32 size, const void *unsafe_ptr); +typedef int (*bpf_read_dynptr_fn_t)(struct bpf_dynptr *dptr, u64 off, + u64 size, const void *unsafe_ptr); /* Returns the offset just before the end of the maximum sized xdp fragment. * Any write larger than 32 bytes will be split between 2 fragments. @@ -1106,16 +1106,16 @@ int test_copy_from_user_str_dynptr(void *ctx) return 0; } -static int bpf_copy_data_from_user_task(struct bpf_dynptr *dptr, u32 off, - u32 size, const void *unsafe_ptr) +static int bpf_copy_data_from_user_task(struct bpf_dynptr *dptr, u64 off, + u64 size, const void *unsafe_ptr) { struct task_struct *task = bpf_get_current_task_btf(); return bpf_copy_from_user_task_dynptr(dptr, off, size, unsafe_ptr, task); } -static int bpf_copy_data_from_user_task_str(struct bpf_dynptr *dptr, u32 off, - u32 size, const void *unsafe_ptr) +static int bpf_copy_data_from_user_task_str(struct bpf_dynptr *dptr, u64 off, + u64 size, const void *unsafe_ptr) { struct task_struct *task = bpf_get_current_task_btf(); diff --git a/tools/testing/selftests/bpf/progs/file_reader.c b/tools/testing/selftests/bpf/progs/file_reader.c new file mode 100644 index 000000000000..4d756b623557 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/file_reader.c @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
*/ + +#include <vmlinux.h> +#include <string.h> +#include <stdbool.h> +#include <bpf/bpf_tracing.h> +#include "bpf_misc.h" +#include "errno.h" + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct elem); +} arrmap SEC(".maps"); + +struct elem { + struct file *file; + struct bpf_task_work tw; +}; + +char user_buf[256000]; +char tmp_buf[256000]; + +int pid = 0; +int err, run_success = 0; + +static int validate_file_read(struct file *file); +static int task_work_callback(struct bpf_map *map, void *key, void *value); + +SEC("lsm/file_open") +int on_open_expect_fault(void *c) +{ + struct bpf_dynptr dynptr; + struct file *file; + int local_err = 1; + __u32 user_buf_sz = sizeof(user_buf); + + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + file = bpf_get_task_exe_file(bpf_get_current_task_btf()); + if (!file) + return 0; + + if (bpf_dynptr_from_file(file, 0, &dynptr)) + goto out; + + local_err = bpf_dynptr_read(tmp_buf, user_buf_sz, &dynptr, user_buf_sz, 0); + if (local_err == -EFAULT) { /* Expect page fault */ + local_err = 0; + run_success = 1; + } +out: + bpf_dynptr_file_discard(&dynptr); + if (local_err) + err = local_err; + bpf_put_file(file); + return 0; +} + +SEC("lsm/file_open") +int on_open_validate_file_read(void *c) +{ + struct task_struct *task = bpf_get_current_task_btf(); + struct elem *work; + int key = 0; + + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + work = bpf_map_lookup_elem(&arrmap, &key); + if (!work) { + err = 1; + return 0; + } + bpf_task_work_schedule_signal_impl(task, &work->tw, &arrmap, task_work_callback, NULL); + return 0; +} + +/* Called in a sleepable context, read 256K bytes, cross check with user space read data */ +static int task_work_callback(struct bpf_map *map, void *key, void *value) +{ + struct task_struct *task = bpf_get_current_task_btf(); + struct file *file = bpf_get_task_exe_file(task); + + if 
(!file) + return 0; + + err = validate_file_read(file); + if (!err) + run_success = 1; + bpf_put_file(file); + return 0; +} + +static int verify_dynptr_read(struct bpf_dynptr *ptr, u32 off, char *user_buf, u32 len) +{ + int i; + + if (bpf_dynptr_read(tmp_buf, len, ptr, off, 0)) + return 1; + + /* Verify file contents read from BPF is the same as the one read from userspace */ + bpf_for(i, 0, len) + { + if (tmp_buf[i] != user_buf[i]) + return 1; + } + return 0; +} + +static int validate_file_read(struct file *file) +{ + struct bpf_dynptr dynptr; + int loc_err = 1, off; + __u32 user_buf_sz = sizeof(user_buf); + + if (bpf_dynptr_from_file(file, 0, &dynptr)) + goto cleanup; + + loc_err = verify_dynptr_read(&dynptr, 0, user_buf, user_buf_sz); + off = 1; + loc_err = loc_err ?: verify_dynptr_read(&dynptr, off, user_buf + off, user_buf_sz - off); + off = user_buf_sz - 1; + loc_err = loc_err ?: verify_dynptr_read(&dynptr, off, user_buf + off, user_buf_sz - off); + /* Read file with random offset and length */ + off = 4097; + loc_err = loc_err ?: verify_dynptr_read(&dynptr, off, user_buf + off, 100); + + /* Adjust dynptr, verify read */ + loc_err = loc_err ?: bpf_dynptr_adjust(&dynptr, off, off + 1); + loc_err = loc_err ?: verify_dynptr_read(&dynptr, 0, user_buf + off, 1); + /* Can't read more than 1 byte */ + loc_err = loc_err ?: verify_dynptr_read(&dynptr, 0, user_buf + off, 2) == 0; + /* Can't read with far offset */ + loc_err = loc_err ?: verify_dynptr_read(&dynptr, 1, user_buf + off, 1) == 0; + +cleanup: + bpf_dynptr_file_discard(&dynptr); + return loc_err; +} diff --git a/tools/testing/selftests/bpf/progs/file_reader_fail.c b/tools/testing/selftests/bpf/progs/file_reader_fail.c new file mode 100644 index 000000000000..32fe28ed2439 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/file_reader_fail.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
*/ + +#include <vmlinux.h> +#include <string.h> +#include <stdbool.h> +#include <bpf/bpf_tracing.h> +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +int err; +void *user_ptr; + +SEC("lsm/file_open") +__failure +__msg("Unreleased reference id=") +int on_nanosleep_unreleased_ref(void *ctx) +{ + struct task_struct *task = bpf_get_current_task_btf(); + struct file *file = bpf_get_task_exe_file(task); + struct bpf_dynptr dynptr; + + if (!file) + return 0; + + err = bpf_dynptr_from_file(file, 0, &dynptr); + return err ? 1 : 0; +} + +SEC("xdp") +__failure +__msg("Expected a dynptr of type file as arg #0") +int xdp_wrong_dynptr_type(struct xdp_md *xdp) +{ + struct bpf_dynptr dynptr; + + bpf_dynptr_from_xdp(xdp, 0, &dynptr); + bpf_dynptr_file_discard(&dynptr); + return 0; +} + +SEC("xdp") +__failure +__msg("Expected an initialized dynptr as arg #0") +int xdp_no_dynptr_type(struct xdp_md *xdp) +{ + struct bpf_dynptr dynptr; + + bpf_dynptr_file_discard(&dynptr); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/htab_update.c b/tools/testing/selftests/bpf/progs/htab_update.c index 7481bb30b29b..195d3b2fba00 100644 --- a/tools/testing/selftests/bpf/progs/htab_update.c +++ b/tools/testing/selftests/bpf/progs/htab_update.c @@ -6,24 +6,31 @@ char _license[] SEC("license") = "GPL"; +/* Map value type: has BTF-managed field (bpf_timer) */ +struct val { + struct bpf_timer t; + __u64 payload; +}; + struct { __uint(type, BPF_MAP_TYPE_HASH); __uint(max_entries, 1); - __uint(key_size, sizeof(__u32)); - __uint(value_size, sizeof(__u32)); + __type(key, __u32); + __type(value, struct val); } htab SEC(".maps"); int pid = 0; int update_err = 0; -SEC("?fentry/lookup_elem_raw") -int lookup_elem_raw(void *ctx) +SEC("?fentry/bpf_obj_free_fields") +int bpf_obj_free_fields(void *ctx) { - __u32 key = 0, value = 1; + __u32 key = 0; + struct val value = { .payload = 1 }; if ((bpf_get_current_pid_tgid() >> 32) != pid) return 0; - update_err = bpf_map_update_elem(&htab, 
&key, &value, 0); + update_err = bpf_map_update_elem(&htab, &key, &value, BPF_ANY); return 0; } diff --git a/tools/testing/selftests/bpf/progs/ip_check_defrag.c b/tools/testing/selftests/bpf/progs/ip_check_defrag.c index 645b2c9f7867..0e87ad1ebcfa 100644 --- a/tools/testing/selftests/bpf/progs/ip_check_defrag.c +++ b/tools/testing/selftests/bpf/progs/ip_check_defrag.c @@ -12,11 +12,6 @@ #define IP_OFFSET 0x1FFF #define NEXTHDR_FRAGMENT 44 -extern int bpf_dynptr_from_skb(struct __sk_buff *skb, __u64 flags, - struct bpf_dynptr *ptr__uninit) __ksym; -extern void *bpf_dynptr_slice(const struct bpf_dynptr *ptr, uint32_t offset, - void *buffer, uint32_t buffer__sz) __ksym; - volatile int shootdowns = 0; static bool is_frag_v4(struct iphdr *iph) diff --git a/tools/testing/selftests/bpf/progs/lsm.c b/tools/testing/selftests/bpf/progs/lsm.c index 0c13b7409947..7de173daf27b 100644 --- a/tools/testing/selftests/bpf/progs/lsm.c +++ b/tools/testing/selftests/bpf/progs/lsm.c @@ -89,14 +89,16 @@ SEC("lsm/file_mprotect") int BPF_PROG(test_int_hook, struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot, int ret) { - if (ret != 0) + struct mm_struct *mm = vma->vm_mm; + + if (ret != 0 || !mm) return ret; __s32 pid = bpf_get_current_pid_tgid() >> 32; int is_stack = 0; - is_stack = (vma->vm_start <= vma->vm_mm->start_stack && - vma->vm_end >= vma->vm_mm->start_stack); + is_stack = (vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack); if (is_stack && monitored_pid == pid) { mprotect_count++; diff --git a/tools/testing/selftests/bpf/progs/lsm_tailcall.c b/tools/testing/selftests/bpf/progs/lsm_tailcall.c index 49c075ce2d4c..6e7e58051e64 100644 --- a/tools/testing/selftests/bpf/progs/lsm_tailcall.c +++ b/tools/testing/selftests/bpf/progs/lsm_tailcall.c @@ -20,14 +20,14 @@ int lsm_file_permission_prog(void *ctx) return 0; } -SEC("lsm/file_alloc_security") -int lsm_file_alloc_security_prog(void *ctx) +SEC("lsm/kernfs_init_security") +int 
lsm_kernfs_init_security_prog(void *ctx) { return 0; } -SEC("lsm/file_alloc_security") -int lsm_file_alloc_security_entry(void *ctx) +SEC("lsm/kernfs_init_security") +int lsm_kernfs_init_security_entry(void *ctx) { bpf_tail_call_static(ctx, &jmp_table, 0); return 0; diff --git a/tools/testing/selftests/bpf/progs/rcu_read_lock.c b/tools/testing/selftests/bpf/progs/rcu_read_lock.c index 3a868a199349..d70c28824bbe 100644 --- a/tools/testing/selftests/bpf/progs/rcu_read_lock.c +++ b/tools/testing/selftests/bpf/progs/rcu_read_lock.c @@ -278,6 +278,46 @@ out: return 0; } +SEC("?fentry.s/" SYS_PREFIX "sys_nanosleep") +int nested_rcu_region_unbalanced_1(void *ctx) +{ + struct task_struct *task, *real_parent; + + /* nested rcu read lock regions */ + task = bpf_get_current_task_btf(); + bpf_rcu_read_lock(); + bpf_rcu_read_lock(); + real_parent = task->real_parent; + if (!real_parent) + goto out; + (void)bpf_task_storage_get(&map_a, real_parent, 0, 0); +out: + bpf_rcu_read_unlock(); + bpf_rcu_read_unlock(); + bpf_rcu_read_unlock(); + return 0; +} + +SEC("?fentry.s/" SYS_PREFIX "sys_nanosleep") +int nested_rcu_region_unbalanced_2(void *ctx) +{ + struct task_struct *task, *real_parent; + + /* nested rcu read lock regions */ + task = bpf_get_current_task_btf(); + bpf_rcu_read_lock(); + bpf_rcu_read_lock(); + bpf_rcu_read_lock(); + real_parent = task->real_parent; + if (!real_parent) + goto out; + (void)bpf_task_storage_get(&map_a, real_parent, 0, 0); +out: + bpf_rcu_read_unlock(); + bpf_rcu_read_unlock(); + return 0; +} + SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") int task_trusted_non_rcuptr(void *ctx) { diff --git a/tools/testing/selftests/bpf/progs/refcounted_kptr.c b/tools/testing/selftests/bpf/progs/refcounted_kptr.c index 893a4fdb4b6e..1aca85d86aeb 100644 --- a/tools/testing/selftests/bpf/progs/refcounted_kptr.c +++ b/tools/testing/selftests/bpf/progs/refcounted_kptr.c @@ -568,4 +568,64 @@ err_out: return 0; } +private(kptr_ref) u64 ref; + +static int 
probe_read_refcount(void) +{ + u32 refcount; + + bpf_probe_read_kernel(&refcount, sizeof(refcount), (void *) ref); + return refcount; +} + +static int __insert_in_list(struct bpf_list_head *head, struct bpf_spin_lock *lock, + struct node_data __kptr **node) +{ + struct node_data *node_new, *node_ref, *node_old; + + node_new = bpf_obj_new(typeof(*node_new)); + if (!node_new) + return -1; + + node_ref = bpf_refcount_acquire(node_new); + node_old = bpf_kptr_xchg(node, node_new); + if (node_old) { + bpf_obj_drop(node_old); + bpf_obj_drop(node_ref); + return -2; + } + + bpf_spin_lock(lock); + bpf_list_push_front(head, &node_ref->l); + ref = (u64)(void *) &node_ref->ref; + bpf_spin_unlock(lock); + return probe_read_refcount(); +} + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __type(key, int); + __type(value, struct map_value); + __uint(max_entries, 1); +} percpu_hash SEC(".maps"); + +SEC("tc") +int percpu_hash_refcount_leak(void *ctx) +{ + struct map_value *v; + int key = 0; + + v = bpf_map_lookup_elem(&percpu_hash, &key); + if (!v) + return 0; + + return __insert_in_list(&head, &lock, &v->node); +} + +SEC("tc") +int check_percpu_hash_refcount(void *ctx) +{ + return probe_read_refcount(); +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/ringbuf_bench.c b/tools/testing/selftests/bpf/progs/ringbuf_bench.c index 6a468496f539..d96c7d1e8fc2 100644 --- a/tools/testing/selftests/bpf/progs/ringbuf_bench.c +++ b/tools/testing/selftests/bpf/progs/ringbuf_bench.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2020 Facebook +#include <stdbool.h> #include <linux/bpf.h> #include <stdint.h> #include <bpf/bpf_helpers.h> @@ -14,9 +15,11 @@ struct { const volatile int batch_cnt = 0; const volatile long use_output = 0; +const volatile bool bench_producer = false; long sample_val = 42; long dropped __attribute__((aligned(128))) = 0; +long hits __attribute__((aligned(128))) = 0; const volatile long wakeup_data_size = 
0; @@ -24,6 +27,9 @@ static __always_inline long get_flags() { long sz; + if (bench_producer) + return BPF_RB_NO_WAKEUP; + if (!wakeup_data_size) return 0; @@ -47,6 +53,8 @@ int bench_ringbuf(void *ctx) *sample = sample_val; flags = get_flags(); bpf_ringbuf_submit(sample, flags); + if (bench_producer) + __sync_add_and_fetch(&hits, 1); } } } else { @@ -55,6 +63,9 @@ int bench_ringbuf(void *ctx) if (bpf_ringbuf_output(&ringbuf, &sample_val, sizeof(sample_val), flags)) __sync_add_and_fetch(&dropped, 1); + else if (bench_producer) + __sync_add_and_fetch(&hits, 1); + } } return 0; diff --git a/tools/testing/selftests/bpf/progs/sk_bypass_prot_mem.c b/tools/testing/selftests/bpf/progs/sk_bypass_prot_mem.c new file mode 100644 index 000000000000..09a00d11ffcc --- /dev/null +++ b/tools/testing/selftests/bpf/progs/sk_bypass_prot_mem.c @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright 2025 Google LLC */ + +#include "bpf_tracing_net.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <errno.h> + +extern int tcp_memory_per_cpu_fw_alloc __ksym; +extern int udp_memory_per_cpu_fw_alloc __ksym; + +int nr_cpus; +bool tcp_activated, udp_activated; +long tcp_memory_allocated, udp_memory_allocated; + +struct sk_prot { + long *memory_allocated; + int *memory_per_cpu_fw_alloc; +}; + +static int drain_memory_per_cpu_fw_alloc(__u32 i, struct sk_prot *sk_prot_ctx) +{ + int *memory_per_cpu_fw_alloc; + + memory_per_cpu_fw_alloc = bpf_per_cpu_ptr(sk_prot_ctx->memory_per_cpu_fw_alloc, i); + if (memory_per_cpu_fw_alloc) + *sk_prot_ctx->memory_allocated += *memory_per_cpu_fw_alloc; + + return 0; +} + +static long get_memory_allocated(struct sock *_sk, int *memory_per_cpu_fw_alloc) +{ + struct sock *sk = bpf_core_cast(_sk, struct sock); + struct sk_prot sk_prot_ctx; + long memory_allocated; + + /* net_aligned_data.{tcp,udp}_memory_allocated was not available. 
*/ + memory_allocated = sk->__sk_common.skc_prot->memory_allocated->counter; + + sk_prot_ctx.memory_allocated = &memory_allocated; + sk_prot_ctx.memory_per_cpu_fw_alloc = memory_per_cpu_fw_alloc; + + bpf_loop(nr_cpus, drain_memory_per_cpu_fw_alloc, &sk_prot_ctx, 0); + + return memory_allocated; +} + +static void fentry_init_sock(struct sock *sk, bool *activated, + long *memory_allocated, int *memory_per_cpu_fw_alloc) +{ + if (!*activated) + return; + + *memory_allocated = get_memory_allocated(sk, memory_per_cpu_fw_alloc); + *activated = false; +} + +SEC("fentry/tcp_init_sock") +int BPF_PROG(fentry_tcp_init_sock, struct sock *sk) +{ + fentry_init_sock(sk, &tcp_activated, + &tcp_memory_allocated, &tcp_memory_per_cpu_fw_alloc); + return 0; +} + +SEC("fentry/udp_init_sock") +int BPF_PROG(fentry_udp_init_sock, struct sock *sk) +{ + fentry_init_sock(sk, &udp_activated, + &udp_memory_allocated, &udp_memory_per_cpu_fw_alloc); + return 0; +} + +SEC("cgroup/sock_create") +int sock_create(struct bpf_sock *ctx) +{ + int err, val = 1; + + err = bpf_setsockopt(ctx, SOL_SOCKET, SK_BPF_BYPASS_PROT_MEM, + &val, sizeof(val)); + if (err) + goto err; + + val = 0; + + err = bpf_getsockopt(ctx, SOL_SOCKET, SK_BPF_BYPASS_PROT_MEM, + &val, sizeof(val)); + if (err) + goto err; + + if (val != 1) { + err = -EINVAL; + goto err; + } + + return 1; + +err: + bpf_set_retval(err); + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c b/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c index 99d72c68f76a..826e6b6aff7e 100644 --- a/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c +++ b/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c @@ -45,8 +45,12 @@ SEC("syscall") __retval(USER_PTR_ERR)int test_strcspn_null1(void *ctx) { return SEC("syscall") __retval(USER_PTR_ERR)int test_strcspn_null2(void *ctx) { return bpf_strcspn("hello", NULL); } SEC("syscall") __retval(USER_PTR_ERR)int 
test_strstr_null1(void *ctx) { return bpf_strstr(NULL, "hello"); } SEC("syscall") __retval(USER_PTR_ERR)int test_strstr_null2(void *ctx) { return bpf_strstr("hello", NULL); } +SEC("syscall") __retval(USER_PTR_ERR)int test_strcasestr_null1(void *ctx) { return bpf_strcasestr(NULL, "hello"); } +SEC("syscall") __retval(USER_PTR_ERR)int test_strcasestr_null2(void *ctx) { return bpf_strcasestr("hello", NULL); } SEC("syscall") __retval(USER_PTR_ERR)int test_strnstr_null1(void *ctx) { return bpf_strnstr(NULL, "hello", 1); } SEC("syscall") __retval(USER_PTR_ERR)int test_strnstr_null2(void *ctx) { return bpf_strnstr("hello", NULL, 1); } +SEC("syscall") __retval(USER_PTR_ERR)int test_strncasestr_null1(void *ctx) { return bpf_strncasestr(NULL, "hello", 1); } +SEC("syscall") __retval(USER_PTR_ERR)int test_strncasestr_null2(void *ctx) { return bpf_strncasestr("hello", NULL, 1); } /* Passing userspace ptr to string kfuncs */ SEC("syscall") __retval(USER_PTR_ERR) int test_strcmp_user_ptr1(void *ctx) { return bpf_strcmp(user_ptr, "hello"); } @@ -65,8 +69,12 @@ SEC("syscall") __retval(USER_PTR_ERR) int test_strcspn_user_ptr1(void *ctx) { re SEC("syscall") __retval(USER_PTR_ERR) int test_strcspn_user_ptr2(void *ctx) { return bpf_strcspn("hello", user_ptr); } SEC("syscall") __retval(USER_PTR_ERR) int test_strstr_user_ptr1(void *ctx) { return bpf_strstr(user_ptr, "hello"); } SEC("syscall") __retval(USER_PTR_ERR) int test_strstr_user_ptr2(void *ctx) { return bpf_strstr("hello", user_ptr); } +SEC("syscall") __retval(USER_PTR_ERR) int test_strcasestr_user_ptr1(void *ctx) { return bpf_strcasestr(user_ptr, "hello"); } +SEC("syscall") __retval(USER_PTR_ERR) int test_strcasestr_user_ptr2(void *ctx) { return bpf_strcasestr("hello", user_ptr); } SEC("syscall") __retval(USER_PTR_ERR) int test_strnstr_user_ptr1(void *ctx) { return bpf_strnstr(user_ptr, "hello", 1); } SEC("syscall") __retval(USER_PTR_ERR) int test_strnstr_user_ptr2(void *ctx) { return bpf_strnstr("hello", user_ptr, 1); } 
+SEC("syscall") __retval(USER_PTR_ERR) int test_strncasestr_user_ptr1(void *ctx) { return bpf_strncasestr(user_ptr, "hello", 1); } +SEC("syscall") __retval(USER_PTR_ERR) int test_strncasestr_user_ptr2(void *ctx) { return bpf_strncasestr("hello", user_ptr, 1); } #endif /* __TARGET_ARCH_s390 */ @@ -87,7 +95,11 @@ SEC("syscall") __retval(-EFAULT) int test_strcspn_pagefault1(void *ctx) { return SEC("syscall") __retval(-EFAULT) int test_strcspn_pagefault2(void *ctx) { return bpf_strcspn("hello", invalid_kern_ptr); } SEC("syscall") __retval(-EFAULT) int test_strstr_pagefault1(void *ctx) { return bpf_strstr(invalid_kern_ptr, "hello"); } SEC("syscall") __retval(-EFAULT) int test_strstr_pagefault2(void *ctx) { return bpf_strstr("hello", invalid_kern_ptr); } +SEC("syscall") __retval(-EFAULT) int test_strcasestr_pagefault1(void *ctx) { return bpf_strcasestr(invalid_kern_ptr, "hello"); } +SEC("syscall") __retval(-EFAULT) int test_strcasestr_pagefault2(void *ctx) { return bpf_strcasestr("hello", invalid_kern_ptr); } SEC("syscall") __retval(-EFAULT) int test_strnstr_pagefault1(void *ctx) { return bpf_strnstr(invalid_kern_ptr, "hello", 1); } SEC("syscall") __retval(-EFAULT) int test_strnstr_pagefault2(void *ctx) { return bpf_strnstr("hello", invalid_kern_ptr, 1); } +SEC("syscall") __retval(-EFAULT) int test_strncasestr_pagefault1(void *ctx) { return bpf_strncasestr(invalid_kern_ptr, "hello", 1); } +SEC("syscall") __retval(-EFAULT) int test_strncasestr_pagefault2(void *ctx) { return bpf_strncasestr("hello", invalid_kern_ptr, 1); } char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c b/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c index e41cc5601994..05e1da1f250f 100644 --- a/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c +++ b/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c @@ -19,6 +19,8 @@ SEC("syscall") int test_strspn_accept_too_long(void *ctx) { return bpf_strspn("b SEC("syscall") 
int test_strcspn_str_too_long(void *ctx) { return bpf_strcspn(long_str, "b"); } SEC("syscall") int test_strcspn_reject_too_long(void *ctx) { return bpf_strcspn("b", long_str); } SEC("syscall") int test_strstr_too_long(void *ctx) { return bpf_strstr(long_str, "hello"); } +SEC("syscall") int test_strcasestr_too_long(void *ctx) { return bpf_strcasestr(long_str, "hello"); } SEC("syscall") int test_strnstr_too_long(void *ctx) { return bpf_strnstr(long_str, "hello", sizeof(long_str)); } +SEC("syscall") int test_strncasestr_too_long(void *ctx) { return bpf_strncasestr(long_str, "hello", sizeof(long_str)); } char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_success.c b/tools/testing/selftests/bpf/progs/string_kfuncs_success.c index 2e3498e37b9c..a8513964516b 100644 --- a/tools/testing/selftests/bpf/progs/string_kfuncs_success.c +++ b/tools/testing/selftests/bpf/progs/string_kfuncs_success.c @@ -33,8 +33,11 @@ __test(11) int test_strnlen(void *ctx) { return bpf_strnlen(str, 12); } __test(5) int test_strspn(void *ctx) { return bpf_strspn(str, "ehlo"); } __test(2) int test_strcspn(void *ctx) { return bpf_strcspn(str, "lo"); } __test(6) int test_strstr_found(void *ctx) { return bpf_strstr(str, "world"); } +__test(6) int test_strcasestr_found(void *ctx) { return bpf_strcasestr(str, "woRLD"); } __test(-ENOENT) int test_strstr_notfound(void *ctx) { return bpf_strstr(str, "hi"); } +__test(-ENOENT) int test_strcasestr_notfound(void *ctx) { return bpf_strcasestr(str, "hi"); } __test(0) int test_strstr_empty(void *ctx) { return bpf_strstr(str, ""); } +__test(0) int test_strcasestr_empty(void *ctx) { return bpf_strcasestr(str, ""); } __test(0) int test_strnstr_found1(void *ctx) { return bpf_strnstr("", "", 0); } __test(0) int test_strnstr_found2(void *ctx) { return bpf_strnstr(str, "hello", 5); } __test(0) int test_strnstr_found3(void *ctx) { return bpf_strnstr(str, "hello", 6); } @@ -42,5 +45,12 @@ __test(-ENOENT) int 
test_strnstr_notfound1(void *ctx) { return bpf_strnstr(str, __test(-ENOENT) int test_strnstr_notfound2(void *ctx) { return bpf_strnstr(str, "hello", 4); } __test(-ENOENT) int test_strnstr_notfound3(void *ctx) { return bpf_strnstr("", "a", 0); } __test(0) int test_strnstr_empty(void *ctx) { return bpf_strnstr(str, "", 1); } +__test(0) int test_strncasestr_found1(void *ctx) { return bpf_strncasestr("", "", 0); } +__test(0) int test_strncasestr_found2(void *ctx) { return bpf_strncasestr(str, "heLLO", 5); } +__test(0) int test_strncasestr_found3(void *ctx) { return bpf_strncasestr(str, "heLLO", 6); } +__test(-ENOENT) int test_strncasestr_notfound1(void *ctx) { return bpf_strncasestr(str, "hi", 10); } +__test(-ENOENT) int test_strncasestr_notfound2(void *ctx) { return bpf_strncasestr(str, "hello", 4); } +__test(-ENOENT) int test_strncasestr_notfound3(void *ctx) { return bpf_strncasestr("", "a", 0); } +__test(0) int test_strncasestr_empty(void *ctx) { return bpf_strncasestr(str, "", 1); } char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/strobemeta.h b/tools/testing/selftests/bpf/progs/strobemeta.h index a5c74d31a244..6e1918deaf26 100644 --- a/tools/testing/selftests/bpf/progs/strobemeta.h +++ b/tools/testing/selftests/bpf/progs/strobemeta.h @@ -330,9 +330,9 @@ static void *calc_location(struct strobe_value_loc *loc, void *tls_base) } bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv); /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */ - return tls_ptr && tls_ptr != (void *)-1 - ? 
tls_ptr + tls_index.offset - : NULL; + if (!tls_ptr || tls_ptr == (void *)-1) + return NULL; + return tls_ptr + tls_index.offset; } #ifdef SUBPROGS diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c b/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c index a58b5194fc89..022291f21dfb 100644 --- a/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c +++ b/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c @@ -8,8 +8,6 @@ char _license[] SEC("license") = "GPL"; #define USEC_PER_SEC 1000000UL -#define min(a, b) ((a) < (b) ? (a) : (b)) - static unsigned int tcp_left_out(const struct tcp_sock *tp) { return tp->sacked_out + tp->lost_out; diff --git a/tools/testing/selftests/bpf/progs/test_check_mtu.c b/tools/testing/selftests/bpf/progs/test_check_mtu.c index 2ec1de11a3ae..7b6b2b342c1d 100644 --- a/tools/testing/selftests/bpf/progs/test_check_mtu.c +++ b/tools/testing/selftests/bpf/progs/test_check_mtu.c @@ -7,6 +7,7 @@ #include <stddef.h> #include <stdint.h> +#include <errno.h> char _license[] SEC("license") = "GPL"; @@ -288,3 +289,14 @@ int tc_input_len_exceed(struct __sk_buff *ctx) global_bpf_mtu_xdp = mtu_len; return retval; } + +SEC("tc") +int tc_chk_segs_flag(struct __sk_buff *ctx) +{ + __u32 mtu_len = 0; + int err; + + err = bpf_check_mtu(ctx, GLOBAL_USER_IFINDEX, &mtu_len, 0, BPF_MTU_CHK_SEGS); + + return err == -EINVAL ? 
BPF_OK : BPF_DROP; +} diff --git a/tools/testing/selftests/bpf/progs/test_perf_branches.c b/tools/testing/selftests/bpf/progs/test_perf_branches.c index a1ccc831c882..05ac9410cd68 100644 --- a/tools/testing/selftests/bpf/progs/test_perf_branches.c +++ b/tools/testing/selftests/bpf/progs/test_perf_branches.c @@ -8,6 +8,7 @@ #include <bpf/bpf_tracing.h> int valid = 0; +int run_cnt = 0; int required_size_out = 0; int written_stack_out = 0; int written_global_out = 0; @@ -24,6 +25,8 @@ int perf_branches(void *ctx) __u64 entries[4 * 3] = {0}; int required_size, written_stack, written_global; + ++run_cnt; + /* write to stack */ written_stack = bpf_read_branch_records(ctx, entries, sizeof(entries), 0); /* ignore spurious events */ diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf_overwrite.c b/tools/testing/selftests/bpf/progs/test_ringbuf_overwrite.c new file mode 100644 index 000000000000..ff4aa67ddacc --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_ringbuf_overwrite.c @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2025. 
Huawei Technologies Co., Ltd */ + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(map_flags, BPF_F_RB_OVERWRITE); +} ringbuf SEC(".maps"); + +int pid; + +const volatile unsigned long LEN1; +const volatile unsigned long LEN2; +const volatile unsigned long LEN3; +const volatile unsigned long LEN4; +const volatile unsigned long LEN5; + +long reserve1_fail = 0; +long reserve2_fail = 0; +long reserve3_fail = 0; +long reserve4_fail = 0; +long reserve5_fail = 0; + +unsigned long avail_data = 0; +unsigned long ring_size = 0; +unsigned long cons_pos = 0; +unsigned long prod_pos = 0; +unsigned long over_pos = 0; + +SEC("fentry/" SYS_PREFIX "sys_getpgid") +int test_overwrite_ringbuf(void *ctx) +{ + char *rec1, *rec2, *rec3, *rec4, *rec5; + int cur_pid = bpf_get_current_pid_tgid() >> 32; + + if (cur_pid != pid) + return 0; + + rec1 = bpf_ringbuf_reserve(&ringbuf, LEN1, 0); + if (!rec1) { + reserve1_fail = 1; + return 0; + } + + rec2 = bpf_ringbuf_reserve(&ringbuf, LEN2, 0); + if (!rec2) { + bpf_ringbuf_discard(rec1, 0); + reserve2_fail = 1; + return 0; + } + + rec3 = bpf_ringbuf_reserve(&ringbuf, LEN3, 0); + /* expect failure */ + if (!rec3) { + reserve3_fail = 1; + } else { + bpf_ringbuf_discard(rec1, 0); + bpf_ringbuf_discard(rec2, 0); + bpf_ringbuf_discard(rec3, 0); + return 0; + } + + rec4 = bpf_ringbuf_reserve(&ringbuf, LEN4, 0); + if (!rec4) { + reserve4_fail = 1; + bpf_ringbuf_discard(rec1, 0); + bpf_ringbuf_discard(rec2, 0); + return 0; + } + + bpf_ringbuf_submit(rec1, 0); + bpf_ringbuf_submit(rec2, 0); + bpf_ringbuf_submit(rec4, 0); + + rec5 = bpf_ringbuf_reserve(&ringbuf, LEN5, 0); + if (!rec5) { + reserve5_fail = 1; + return 0; + } + + for (int i = 0; i < LEN3; i++) + rec5[i] = 0xdd; + + bpf_ringbuf_submit(rec5, 0); + + ring_size = bpf_ringbuf_query(&ringbuf, BPF_RB_RING_SIZE); + avail_data = bpf_ringbuf_query(&ringbuf, 
BPF_RB_AVAIL_DATA); + cons_pos = bpf_ringbuf_query(&ringbuf, BPF_RB_CONS_POS); + prod_pos = bpf_ringbuf_query(&ringbuf, BPF_RB_PROD_POS); + over_pos = bpf_ringbuf_query(&ringbuf, BPF_RB_OVERWRITE_POS); + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_tc_edt.c b/tools/testing/selftests/bpf/progs/test_tc_edt.c index 950a70b61e74..4f6f03122d61 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_edt.c +++ b/tools/testing/selftests/bpf/progs/test_tc_edt.c @@ -14,7 +14,6 @@ #define TIME_HORIZON_NS (2000 * 1000 * 1000) #define NS_PER_SEC 1000000000 #define ECN_HORIZON_NS 5000000 -#define THROTTLE_RATE_BPS (5 * 1000 * 1000) /* flow_key => last_tstamp timestamp used */ struct { @@ -24,12 +23,13 @@ struct { __uint(max_entries, 1); } flow_map SEC(".maps"); +__uint64_t target_rate; + static inline int throttle_flow(struct __sk_buff *skb) { int key = 0; uint64_t *last_tstamp = bpf_map_lookup_elem(&flow_map, &key); - uint64_t delay_ns = ((uint64_t)skb->len) * NS_PER_SEC / - THROTTLE_RATE_BPS; + uint64_t delay_ns = ((uint64_t)skb->len) * NS_PER_SEC / target_rate; uint64_t now = bpf_ktime_get_ns(); uint64_t tstamp, next_tstamp = 0; @@ -70,7 +70,7 @@ static inline int handle_tcp(struct __sk_buff *skb, struct tcphdr *tcp) if ((void *)(tcp + 1) > data_end) return TC_ACT_SHOT; - if (tcp->dest == bpf_htons(9000)) + if (tcp->source == bpf_htons(9000)) return throttle_flow(skb); return TC_ACT_OK; @@ -99,7 +99,8 @@ static inline int handle_ipv4(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("cls_test") int tc_prog(struct __sk_buff *skb) +SEC("tc") +int tc_prog(struct __sk_buff *skb) { if (skb->protocol == bpf_htons(ETH_P_IP)) return handle_ipv4(skb); diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c index 404124a93892..7330c61b5730 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c @@ -2,23 +2,11 @@ /* In-place tunneling */ 
-#include <stdbool.h> -#include <string.h> - -#include <linux/stddef.h> -#include <linux/bpf.h> -#include <linux/if_ether.h> -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/ipv6.h> -#include <linux/mpls.h> -#include <linux/tcp.h> -#include <linux/udp.h> -#include <linux/pkt_cls.h> -#include <linux/types.h> +#include <vmlinux.h> -#include <bpf/bpf_endian.h> #include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> +#include "bpf_tracing_net.h" #include "bpf_compiler.h" #pragma GCC diagnostic ignored "-Waddress-of-packed-member" @@ -27,6 +15,14 @@ static const int cfg_port = 8000; static const int cfg_udp_src = 20000; +#define ETH_P_MPLS_UC 0x8847 +#define ETH_P_TEB 0x6558 + +#define MPLS_LS_S_MASK 0x00000100 +#define BPF_F_ADJ_ROOM_ENCAP_L2(len) \ + (((__u64)len & BPF_ADJ_ROOM_ENCAP_L2_MASK) \ + << BPF_ADJ_ROOM_ENCAP_L2_SHIFT) + #define L2_PAD_SZ (sizeof(struct vxlanhdr) + ETH_HLEN) #define UDP_PORT 5555 @@ -36,10 +32,9 @@ static const int cfg_udp_src = 20000; #define EXTPROTO_VXLAN 0x1 -#define VXLAN_N_VID (1u << 24) -#define VXLAN_VNI_MASK bpf_htonl((VXLAN_N_VID - 1) << 8) -#define VXLAN_FLAGS 0x8 -#define VXLAN_VNI 1 +#define VXLAN_FLAGS bpf_htonl(1<<27) +#define VNI_ID 1 +#define VXLAN_VNI bpf_htonl(VNI_ID << 8) #ifndef NEXTHDR_DEST #define NEXTHDR_DEST 60 @@ -48,12 +43,6 @@ static const int cfg_udp_src = 20000; /* MPLS label 1000 with S bit (last label) set and ttl of 255. 
*/ static const __u32 mpls_label = __bpf_constant_htonl(1000 << 12 | MPLS_LS_S_MASK | 0xff); - -struct vxlanhdr { - __be32 vx_flags; - __be32 vx_vni; -} __attribute__((packed)); - struct gre_hdr { __be16 flags; __be16 protocol; @@ -94,8 +83,8 @@ static __always_inline void set_ipv4_csum(struct iphdr *iph) static __always_inline int __encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, __u16 l2_proto, __u16 ext_proto) { + struct iphdr iph_inner = {0}; __u16 udp_dst = UDP_PORT; - struct iphdr iph_inner; struct v4hdr h_outer; struct tcphdr tcph; int olen, l2_len; @@ -122,7 +111,6 @@ static __always_inline int __encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, return TC_ACT_OK; /* Derive the IPv4 header fields from the IPv6 header */ - memset(&iph_inner, 0, sizeof(iph_inner)); iph_inner.version = 4; iph_inner.ihl = 5; iph_inner.tot_len = bpf_htons(sizeof(iph6_inner) + @@ -210,7 +198,7 @@ static __always_inline int __encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, struct vxlanhdr *vxlan_hdr = (struct vxlanhdr *)l2_hdr; vxlan_hdr->vx_flags = VXLAN_FLAGS; - vxlan_hdr->vx_vni = bpf_htonl((VXLAN_VNI & VXLAN_VNI_MASK) << 8); + vxlan_hdr->vx_vni = VXLAN_VNI; l2_hdr += sizeof(struct vxlanhdr); } @@ -340,7 +328,7 @@ static __always_inline int __encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, struct vxlanhdr *vxlan_hdr = (struct vxlanhdr *)l2_hdr; vxlan_hdr->vx_flags = VXLAN_FLAGS; - vxlan_hdr->vx_vni = bpf_htonl((VXLAN_VNI & VXLAN_VNI_MASK) << 8); + vxlan_hdr->vx_vni = VXLAN_VNI; l2_hdr += sizeof(struct vxlanhdr); } @@ -372,8 +360,8 @@ static __always_inline int __encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, static int encap_ipv6_ipip6(struct __sk_buff *skb) { + struct v6hdr h_outer = {0}; struct iphdr iph_inner; - struct v6hdr h_outer; struct tcphdr tcph; struct ethhdr eth; __u64 flags; @@ -400,13 +388,12 @@ static int encap_ipv6_ipip6(struct __sk_buff *skb) return TC_ACT_SHOT; /* prepare new outer network header */ - memset(&h_outer.ip, 0, 
sizeof(h_outer.ip)); h_outer.ip.version = 6; h_outer.ip.hop_limit = iph_inner.ttl; - h_outer.ip.saddr.s6_addr[1] = 0xfd; - h_outer.ip.saddr.s6_addr[15] = 1; - h_outer.ip.daddr.s6_addr[1] = 0xfd; - h_outer.ip.daddr.s6_addr[15] = 2; + h_outer.ip.saddr.in6_u.u6_addr8[1] = 0xfd; + h_outer.ip.saddr.in6_u.u6_addr8[15] = 1; + h_outer.ip.daddr.in6_u.u6_addr8[1] = 0xfd; + h_outer.ip.daddr.in6_u.u6_addr8[15] = 2; h_outer.ip.payload_len = iph_inner.tot_len; h_outer.ip.nexthdr = IPPROTO_IPIP; @@ -431,7 +418,7 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, return __encap_ipv6(skb, encap_proto, l2_proto, 0); } -SEC("encap_ipip_none") +SEC("tc") int __encap_ipip_none(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) @@ -440,7 +427,7 @@ int __encap_ipip_none(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("encap_gre_none") +SEC("tc") int __encap_gre_none(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) @@ -449,7 +436,7 @@ int __encap_gre_none(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("encap_gre_mpls") +SEC("tc") int __encap_gre_mpls(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) @@ -458,7 +445,7 @@ int __encap_gre_mpls(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("encap_gre_eth") +SEC("tc") int __encap_gre_eth(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) @@ -467,7 +454,7 @@ int __encap_gre_eth(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("encap_udp_none") +SEC("tc") int __encap_udp_none(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) @@ -476,7 +463,7 @@ int __encap_udp_none(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("encap_udp_mpls") +SEC("tc") int __encap_udp_mpls(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) @@ -485,7 +472,7 @@ int __encap_udp_mpls(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("encap_udp_eth") +SEC("tc") int 
__encap_udp_eth(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) @@ -494,7 +481,7 @@ int __encap_udp_eth(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("encap_vxlan_eth") +SEC("tc") int __encap_vxlan_eth(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) @@ -505,7 +492,7 @@ int __encap_vxlan_eth(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("encap_sit_none") +SEC("tc") int __encap_sit_none(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6)) @@ -514,7 +501,7 @@ int __encap_sit_none(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("encap_ip6tnl_none") +SEC("tc") int __encap_ip6tnl_none(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6)) @@ -523,7 +510,7 @@ int __encap_ip6tnl_none(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("encap_ipip6_none") +SEC("tc") int __encap_ipip6_none(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) @@ -532,7 +519,7 @@ int __encap_ipip6_none(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("encap_ip6gre_none") +SEC("tc") int __encap_ip6gre_none(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6)) @@ -541,7 +528,7 @@ int __encap_ip6gre_none(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("encap_ip6gre_mpls") +SEC("tc") int __encap_ip6gre_mpls(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6)) @@ -550,7 +537,7 @@ int __encap_ip6gre_mpls(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("encap_ip6gre_eth") +SEC("tc") int __encap_ip6gre_eth(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6)) @@ -559,7 +546,7 @@ int __encap_ip6gre_eth(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("encap_ip6udp_none") +SEC("tc") int __encap_ip6udp_none(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6)) @@ -568,7 +555,7 @@ int __encap_ip6udp_none(struct __sk_buff *skb) return TC_ACT_OK; } 
-SEC("encap_ip6udp_mpls") +SEC("tc") int __encap_ip6udp_mpls(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6)) @@ -577,7 +564,7 @@ int __encap_ip6udp_mpls(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("encap_ip6udp_eth") +SEC("tc") int __encap_ip6udp_eth(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6)) @@ -586,7 +573,7 @@ int __encap_ip6udp_eth(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("encap_ip6vxlan_eth") +SEC("tc") int __encap_ip6vxlan_eth(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6)) @@ -693,7 +680,7 @@ static int decap_ipv6(struct __sk_buff *skb) iph_outer.nexthdr); } -SEC("decap") +SEC("tc") int decap_f(struct __sk_buff *skb) { switch (skb->protocol) { diff --git a/tools/testing/selftests/bpf/progs/test_xdp_meta.c b/tools/testing/selftests/bpf/progs/test_xdp_meta.c index d79cb74b571e..0a0f371a2dec 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_meta.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_meta.c @@ -4,6 +4,7 @@ #include <linux/if_ether.h> #include <linux/pkt_cls.h> +#include <bpf/bpf_endian.h> #include <bpf/bpf_helpers.h> #include "bpf_kfuncs.h" @@ -11,37 +12,72 @@ #define ctx_ptr(ctx, mem) (void *)(unsigned long)ctx->mem -/* Demonstrates how metadata can be passed from an XDP program to a TC program - * using bpf_xdp_adjust_meta. - * For the sake of testing the metadata support in drivers, the XDP program uses - * a fixed-size payload after the Ethernet header as metadata. The TC program - * copies the metadata it receives into a map so it can be checked from - * userspace. +/* Demonstrate passing metadata from XDP to TC using bpf_xdp_adjust_meta. + * + * The XDP program extracts a fixed-size payload following the Ethernet header + * and stores it as packet metadata to test the driver's metadata support. The + * TC program then verifies if the passed metadata is correct. 
*/ -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 1); - __type(key, __u32); - __uint(value_size, META_SIZE); -} test_result SEC(".maps"); - bool test_pass; +static const __u8 smac_want[ETH_ALEN] = { + 0x12, 0x34, 0xDE, 0xAD, 0xBE, 0xEF, +}; + +static const __u8 meta_want[META_SIZE] = { + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, + 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, + 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, + 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, +}; + +static bool check_smac(const struct ethhdr *eth) +{ + return !__builtin_memcmp(eth->h_source, smac_want, ETH_ALEN); +} + +static bool check_metadata(const char *file, int line, __u8 *meta_have) +{ + if (!__builtin_memcmp(meta_have, meta_want, META_SIZE)) + return true; + + bpf_stream_printk(BPF_STREAM_STDERR, + "FAIL:%s:%d: metadata mismatch\n" + " have:\n %pI6\n %pI6\n" + " want:\n %pI6\n %pI6\n", + file, line, + &meta_have[0x00], &meta_have[0x10], + &meta_want[0x00], &meta_want[0x10]); + return false; +} + +#define check_metadata(meta_have) check_metadata(__FILE__, __LINE__, meta_have) + +static bool check_skb_metadata(const char *file, int line, struct __sk_buff *skb) +{ + __u8 *data_meta = ctx_ptr(skb, data_meta); + __u8 *data = ctx_ptr(skb, data); + + return data_meta + META_SIZE <= data && (check_metadata)(file, line, data_meta); +} + +#define check_skb_metadata(skb) check_skb_metadata(__FILE__, __LINE__, skb) + SEC("tc") int ing_cls(struct __sk_buff *ctx) { - __u8 *data, *data_meta; - __u32 key = 0; - - data_meta = ctx_ptr(ctx, data_meta); - data = ctx_ptr(ctx, data); + __u8 *meta_have = ctx_ptr(ctx, data_meta); + __u8 *data = ctx_ptr(ctx, data); - if (data_meta + META_SIZE > data) - return TC_ACT_SHOT; + if (meta_have + META_SIZE > data) + goto out; - bpf_map_update_elem(&test_result, &key, data_meta, BPF_ANY); + if (!check_metadata(meta_have)) + goto out; + test_pass = true; +out: return TC_ACT_SHOT; } @@ -49,17 +85,17 @@ int ing_cls(struct __sk_buff 
*ctx) SEC("tc") int ing_cls_dynptr_read(struct __sk_buff *ctx) { + __u8 meta_have[META_SIZE]; struct bpf_dynptr meta; - const __u32 zero = 0; - __u8 *dst; - - dst = bpf_map_lookup_elem(&test_result, &zero); - if (!dst) - return TC_ACT_SHOT; bpf_dynptr_from_skb_meta(ctx, 0, &meta); - bpf_dynptr_read(dst, META_SIZE, &meta, 0, 0); + bpf_dynptr_read(meta_have, META_SIZE, &meta, 0, 0); + if (!check_metadata(meta_have)) + goto out; + + test_pass = true; +out: return TC_ACT_SHOT; } @@ -86,20 +122,18 @@ SEC("tc") int ing_cls_dynptr_slice(struct __sk_buff *ctx) { struct bpf_dynptr meta; - const __u32 zero = 0; - __u8 *dst, *src; - - dst = bpf_map_lookup_elem(&test_result, &zero); - if (!dst) - return TC_ACT_SHOT; + __u8 *meta_have; bpf_dynptr_from_skb_meta(ctx, 0, &meta); - src = bpf_dynptr_slice(&meta, 0, NULL, META_SIZE); - if (!src) - return TC_ACT_SHOT; + meta_have = bpf_dynptr_slice(&meta, 0, NULL, META_SIZE); + if (!meta_have) + goto out; - __builtin_memcpy(dst, src, META_SIZE); + if (!check_metadata(meta_have)) + goto out; + test_pass = true; +out: return TC_ACT_SHOT; } @@ -129,14 +163,12 @@ int ing_cls_dynptr_slice_rdwr(struct __sk_buff *ctx) SEC("tc") int ing_cls_dynptr_offset_rd(struct __sk_buff *ctx) { - struct bpf_dynptr meta; const __u32 chunk_len = META_SIZE / 4; - const __u32 zero = 0; + __u8 meta_have[META_SIZE]; + struct bpf_dynptr meta; __u8 *dst, *src; - dst = bpf_map_lookup_elem(&test_result, &zero); - if (!dst) - return TC_ACT_SHOT; + dst = meta_have; /* 1. Regular read */ bpf_dynptr_from_skb_meta(ctx, 0, &meta); @@ -155,9 +187,14 @@ int ing_cls_dynptr_offset_rd(struct __sk_buff *ctx) /* 4. 
Read from a slice starting at an offset */ src = bpf_dynptr_slice(&meta, 2 * chunk_len, NULL, chunk_len); if (!src) - return TC_ACT_SHOT; + goto out; __builtin_memcpy(dst, src, chunk_len); + if (!check_metadata(meta_have)) + goto out; + + test_pass = true; +out: return TC_ACT_SHOT; } @@ -254,7 +291,7 @@ int ing_xdp_zalloc_meta(struct xdp_md *ctx) /* Drop any non-test packets */ if (eth + 1 > ctx_ptr(ctx, data_end)) return XDP_DROP; - if (eth->h_proto != 0) + if (!check_smac(eth)) return XDP_DROP; ret = bpf_xdp_adjust_meta(ctx, -META_SIZE); @@ -294,9 +331,9 @@ int ing_xdp(struct xdp_md *ctx) /* The Linux networking stack may send other packets on the test * interface that interfere with the test. Just drop them. - * The test packets can be recognized by their ethertype of zero. + * The test packets can be recognized by their source MAC address. */ - if (eth->h_proto != 0) + if (!check_smac(eth)) return XDP_DROP; __builtin_memcpy(data_meta, payload, META_SIZE); @@ -304,22 +341,25 @@ int ing_xdp(struct xdp_md *ctx) } /* - * Check that skb->data_meta..skb->data is empty if prog writes to packet - * _payload_ using packet pointers. Applies only to cloned skbs. + * Check that, when operating on a cloned packet, skb->data_meta..skb->data is + * kept intact if prog writes to packet _payload_ using packet pointers. 
*/ SEC("tc") -int clone_data_meta_empty_on_data_write(struct __sk_buff *ctx) +int clone_data_meta_survives_data_write(struct __sk_buff *ctx) { + __u8 *meta_have = ctx_ptr(ctx, data_meta); struct ethhdr *eth = ctx_ptr(ctx, data); if (eth + 1 > ctx_ptr(ctx, data_end)) goto out; /* Ignore non-test packets */ - if (eth->h_proto != 0) + if (!check_smac(eth)) + goto out; + + if (meta_have + META_SIZE > eth) goto out; - /* Expect no metadata */ - if (ctx->data_meta != ctx->data) + if (!check_metadata(meta_have)) goto out; /* Packet write to trigger unclone in prologue */ @@ -331,40 +371,44 @@ out: } /* - * Check that skb->data_meta..skb->data is empty if prog writes to packet - * _metadata_ using packet pointers. Applies only to cloned skbs. + * Check that, when operating on a cloned packet, skb->data_meta..skb->data is + * kept intact if prog writes to packet _metadata_ using packet pointers. */ SEC("tc") -int clone_data_meta_empty_on_meta_write(struct __sk_buff *ctx) +int clone_data_meta_survives_meta_write(struct __sk_buff *ctx) { + __u8 *meta_have = ctx_ptr(ctx, data_meta); struct ethhdr *eth = ctx_ptr(ctx, data); - __u8 *md = ctx_ptr(ctx, data_meta); if (eth + 1 > ctx_ptr(ctx, data_end)) goto out; /* Ignore non-test packets */ - if (eth->h_proto != 0) + if (!check_smac(eth)) goto out; - if (md + 1 > ctx_ptr(ctx, data)) { - /* Expect no metadata */ - test_pass = true; - } else { - /* Metadata write to trigger unclone in prologue */ - *md = 42; - } + if (meta_have + META_SIZE > eth) + goto out; + + if (!check_metadata(meta_have)) + goto out; + + /* Metadata write to trigger unclone in prologue */ + *meta_have = 42; + + test_pass = true; out: return TC_ACT_SHOT; } /* - * Check that skb_meta dynptr is writable but empty if prog writes to packet - * _payload_ using a dynptr slice. Applies only to cloned skbs. + * Check that, when operating on a cloned packet, metadata remains intact if + * prog creates a r/w slice to packet _payload_. 
*/ SEC("tc") -int clone_dynptr_empty_on_data_slice_write(struct __sk_buff *ctx) +int clone_meta_dynptr_survives_data_slice_write(struct __sk_buff *ctx) { struct bpf_dynptr data, meta; + __u8 meta_have[META_SIZE]; struct ethhdr *eth; bpf_dynptr_from_skb(ctx, 0, &data); @@ -372,51 +416,45 @@ int clone_dynptr_empty_on_data_slice_write(struct __sk_buff *ctx) if (!eth) goto out; /* Ignore non-test packets */ - if (eth->h_proto != 0) + if (!check_smac(eth)) goto out; - /* Expect no metadata */ bpf_dynptr_from_skb_meta(ctx, 0, &meta); - if (bpf_dynptr_is_rdonly(&meta) || bpf_dynptr_size(&meta) > 0) + bpf_dynptr_read(meta_have, META_SIZE, &meta, 0, 0); + if (!check_metadata(meta_have)) goto out; - /* Packet write to trigger unclone in prologue */ - eth->h_proto = 42; - test_pass = true; out: return TC_ACT_SHOT; } /* - * Check that skb_meta dynptr is writable but empty if prog writes to packet - * _metadata_ using a dynptr slice. Applies only to cloned skbs. + * Check that, when operating on a cloned packet, metadata remains intact if + * prog creates an r/w slice to packet _metadata_. 
*/ SEC("tc") -int clone_dynptr_empty_on_meta_slice_write(struct __sk_buff *ctx) +int clone_meta_dynptr_survives_meta_slice_write(struct __sk_buff *ctx) { struct bpf_dynptr data, meta; const struct ethhdr *eth; - __u8 *md; + __u8 *meta_have; bpf_dynptr_from_skb(ctx, 0, &data); eth = bpf_dynptr_slice(&data, 0, NULL, sizeof(*eth)); if (!eth) goto out; /* Ignore non-test packets */ - if (eth->h_proto != 0) + if (!check_smac(eth)) goto out; - /* Expect no metadata */ bpf_dynptr_from_skb_meta(ctx, 0, &meta); - if (bpf_dynptr_is_rdonly(&meta) || bpf_dynptr_size(&meta) > 0) + meta_have = bpf_dynptr_slice_rdwr(&meta, 0, NULL, META_SIZE); + if (!meta_have) goto out; - /* Metadata write to trigger unclone in prologue */ - bpf_dynptr_from_skb_meta(ctx, 0, &meta); - md = bpf_dynptr_slice_rdwr(&meta, 0, NULL, sizeof(*md)); - if (md) - *md = 42; + if (!check_metadata(meta_have)) + goto out; test_pass = true; out: @@ -424,34 +462,40 @@ out: } /* - * Check that skb_meta dynptr is read-only before prog writes to packet payload - * using dynptr_write helper. Applies only to cloned skbs. + * Check that, when operating on a cloned packet, skb_meta dynptr is read-write + * before prog writes to packet _payload_ using dynptr_write helper and metadata + * remains intact before and after the write. 
*/ SEC("tc") -int clone_dynptr_rdonly_before_data_dynptr_write(struct __sk_buff *ctx) +int clone_meta_dynptr_rw_before_data_dynptr_write(struct __sk_buff *ctx) { struct bpf_dynptr data, meta; + __u8 meta_have[META_SIZE]; const struct ethhdr *eth; + int err; bpf_dynptr_from_skb(ctx, 0, &data); eth = bpf_dynptr_slice(&data, 0, NULL, sizeof(*eth)); if (!eth) goto out; /* Ignore non-test packets */ - if (eth->h_proto != 0) + if (!check_smac(eth)) goto out; - /* Expect read-only metadata before unclone */ + /* Expect read-write metadata before unclone */ bpf_dynptr_from_skb_meta(ctx, 0, &meta); - if (!bpf_dynptr_is_rdonly(&meta) || bpf_dynptr_size(&meta) != META_SIZE) + if (bpf_dynptr_is_rdonly(&meta)) + goto out; + + err = bpf_dynptr_read(meta_have, META_SIZE, &meta, 0, 0); + if (err || !check_metadata(meta_have)) goto out; /* Helper write to payload will unclone the packet */ bpf_dynptr_write(&data, offsetof(struct ethhdr, h_proto), "x", 1, 0); - /* Expect no metadata after unclone */ - bpf_dynptr_from_skb_meta(ctx, 0, &meta); - if (bpf_dynptr_is_rdonly(&meta) || bpf_dynptr_size(&meta) != 0) + err = bpf_dynptr_read(meta_have, META_SIZE, &meta, 0, 0); + if (err || !check_metadata(meta_have)) goto out; test_pass = true; @@ -460,31 +504,165 @@ out: } /* - * Check that skb_meta dynptr is read-only if prog writes to packet - * metadata using dynptr_write helper. Applies only to cloned skbs. + * Check that, when operating on a cloned packet, skb_meta dynptr is read-write + * before prog writes to packet _metadata_ using dynptr_write helper and + * metadata remains intact before and after the write. 
*/ SEC("tc") -int clone_dynptr_rdonly_before_meta_dynptr_write(struct __sk_buff *ctx) +int clone_meta_dynptr_rw_before_meta_dynptr_write(struct __sk_buff *ctx) { struct bpf_dynptr data, meta; + __u8 meta_have[META_SIZE]; const struct ethhdr *eth; + int err; bpf_dynptr_from_skb(ctx, 0, &data); eth = bpf_dynptr_slice(&data, 0, NULL, sizeof(*eth)); if (!eth) goto out; /* Ignore non-test packets */ - if (eth->h_proto != 0) + if (!check_smac(eth)) goto out; - /* Expect read-only metadata */ + /* Expect read-write metadata before unclone */ bpf_dynptr_from_skb_meta(ctx, 0, &meta); - if (!bpf_dynptr_is_rdonly(&meta) || bpf_dynptr_size(&meta) != META_SIZE) + if (bpf_dynptr_is_rdonly(&meta)) goto out; - /* Metadata write. Expect failure. */ - bpf_dynptr_from_skb_meta(ctx, 0, &meta); - if (bpf_dynptr_write(&meta, 0, "x", 1, 0) != -EINVAL) + err = bpf_dynptr_read(meta_have, META_SIZE, &meta, 0, 0); + if (err || !check_metadata(meta_have)) + goto out; + + /* Helper write to metadata will unclone the packet */ + bpf_dynptr_write(&meta, 0, &meta_have[0], 1, 0); + + err = bpf_dynptr_read(meta_have, META_SIZE, &meta, 0, 0); + if (err || !check_metadata(meta_have)) + goto out; + + test_pass = true; +out: + return TC_ACT_SHOT; +} + +SEC("tc") +int helper_skb_vlan_push_pop(struct __sk_buff *ctx) +{ + int err; + + /* bpf_skb_vlan_push assumes HW offload for primary VLAN tag. Only + * secondary tag push triggers an actual MAC header modification. 
+ */ + err = bpf_skb_vlan_push(ctx, 0, 42); + if (err) + goto out; + err = bpf_skb_vlan_push(ctx, 0, 207); + if (err) + goto out; + + if (!check_skb_metadata(ctx)) + goto out; + + err = bpf_skb_vlan_pop(ctx); + if (err) + goto out; + err = bpf_skb_vlan_pop(ctx); + if (err) + goto out; + + if (!check_skb_metadata(ctx)) + goto out; + + test_pass = true; +out: + return TC_ACT_SHOT; +} + +SEC("tc") +int helper_skb_adjust_room(struct __sk_buff *ctx) +{ + int err; + + /* Grow a 1 byte hole after the MAC header */ + err = bpf_skb_adjust_room(ctx, 1, BPF_ADJ_ROOM_MAC, 0); + if (err) + goto out; + + if (!check_skb_metadata(ctx)) + goto out; + + /* Shrink a 1 byte hole after the MAC header */ + err = bpf_skb_adjust_room(ctx, -1, BPF_ADJ_ROOM_MAC, 0); + if (err) + goto out; + + if (!check_skb_metadata(ctx)) + goto out; + + /* Grow a 256 byte hole to trigger head reallocation */ + err = bpf_skb_adjust_room(ctx, 256, BPF_ADJ_ROOM_MAC, 0); + if (err) + goto out; + + if (!check_skb_metadata(ctx)) + goto out; + + test_pass = true; +out: + return TC_ACT_SHOT; +} + +SEC("tc") +int helper_skb_change_head_tail(struct __sk_buff *ctx) +{ + int err; + + /* Reserve 1 extra in the front for packet data */ + err = bpf_skb_change_head(ctx, 1, 0); + if (err) + goto out; + + if (!check_skb_metadata(ctx)) + goto out; + + /* Reserve 256 extra bytes in the front to trigger head reallocation */ + err = bpf_skb_change_head(ctx, 256, 0); + if (err) + goto out; + + if (!check_skb_metadata(ctx)) + goto out; + + /* Reserve 4k extra bytes in the back to trigger head reallocation */ + err = bpf_skb_change_tail(ctx, ctx->len + 4096, 0); + if (err) + goto out; + + if (!check_skb_metadata(ctx)) + goto out; + + test_pass = true; +out: + return TC_ACT_SHOT; +} + +SEC("tc") +int helper_skb_change_proto(struct __sk_buff *ctx) +{ + int err; + + err = bpf_skb_change_proto(ctx, bpf_htons(ETH_P_IPV6), 0); + if (err) + goto out; + + if (!check_skb_metadata(ctx)) + goto out; + + err = bpf_skb_change_proto(ctx, 
bpf_htons(ETH_P_IP), 0); + if (err) + goto out; + + if (!check_skb_metadata(ctx)) goto out; test_pass = true; diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c index 3d5f30c29ae3..2898b3749d07 100644 --- a/tools/testing/selftests/bpf/progs/trigger_bench.c +++ b/tools/testing/selftests/bpf/progs/trigger_bench.c @@ -42,12 +42,14 @@ int bench_trigger_uprobe_multi(void *ctx) const volatile int batch_iters = 0; SEC("?raw_tp") -int trigger_count(void *ctx) +int trigger_kernel_count(void *ctx) { int i; - for (i = 0; i < batch_iters; i++) + for (i = 0; i < batch_iters; i++) { inc_counter(); + bpf_get_numa_node_id(); + } return 0; } diff --git a/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c b/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c new file mode 100644 index 000000000000..7efa9521105e --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c @@ -0,0 +1,181 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
*/ + +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include "bpf_misc.h" +#include "bpf_experimental.h" + +char _license[] SEC("license") = "GPL"; + +/* Timer tests */ + +struct timer_elem { + struct bpf_timer t; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct timer_elem); +} timer_map SEC(".maps"); + +static int timer_cb(void *map, int *key, struct bpf_timer *timer) +{ + u32 data; + /* Timer callbacks are never sleepable, even from non-sleepable programs */ + bpf_copy_from_user(&data, sizeof(data), NULL); + return 0; +} + +SEC("fentry/bpf_fentry_test1") +__failure __msg("helper call might sleep in a non-sleepable prog") +int timer_non_sleepable_prog(void *ctx) +{ + struct timer_elem *val; + int key = 0; + + val = bpf_map_lookup_elem(&timer_map, &key); + if (!val) + return 0; + + bpf_timer_init(&val->t, &timer_map, 0); + bpf_timer_set_callback(&val->t, timer_cb); + return 0; +} + +SEC("lsm.s/file_open") +__failure __msg("helper call might sleep in a non-sleepable prog") +int timer_sleepable_prog(void *ctx) +{ + struct timer_elem *val; + int key = 0; + + val = bpf_map_lookup_elem(&timer_map, &key); + if (!val) + return 0; + + bpf_timer_init(&val->t, &timer_map, 0); + bpf_timer_set_callback(&val->t, timer_cb); + return 0; +} + +/* Workqueue tests */ + +struct wq_elem { + struct bpf_wq w; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct wq_elem); +} wq_map SEC(".maps"); + +static int wq_cb(void *map, int *key, void *value) +{ + u32 data; + /* Workqueue callbacks are always sleepable, even from non-sleepable programs */ + bpf_copy_from_user(&data, sizeof(data), NULL); + return 0; +} + +SEC("fentry/bpf_fentry_test1") +__success +int wq_non_sleepable_prog(void *ctx) +{ + struct wq_elem *val; + int key = 0; + + val = bpf_map_lookup_elem(&wq_map, &key); + if (!val) + return 0; + + if 
(bpf_wq_init(&val->w, &wq_map, 0) != 0) + return 0; + if (bpf_wq_set_callback_impl(&val->w, wq_cb, 0, NULL) != 0) + return 0; + return 0; +} + +SEC("lsm.s/file_open") +__success +int wq_sleepable_prog(void *ctx) +{ + struct wq_elem *val; + int key = 0; + + val = bpf_map_lookup_elem(&wq_map, &key); + if (!val) + return 0; + + if (bpf_wq_init(&val->w, &wq_map, 0) != 0) + return 0; + if (bpf_wq_set_callback_impl(&val->w, wq_cb, 0, NULL) != 0) + return 0; + return 0; +} + +/* Task work tests */ + +struct task_work_elem { + struct bpf_task_work tw; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct task_work_elem); +} task_work_map SEC(".maps"); + +static int task_work_cb(struct bpf_map *map, void *key, void *value) +{ + u32 data; + /* Task work callbacks are always sleepable, even from non-sleepable programs */ + bpf_copy_from_user(&data, sizeof(data), NULL); + return 0; +} + +SEC("fentry/bpf_fentry_test1") +__success +int task_work_non_sleepable_prog(void *ctx) +{ + struct task_work_elem *val; + struct task_struct *task; + int key = 0; + + val = bpf_map_lookup_elem(&task_work_map, &key); + if (!val) + return 0; + + task = bpf_get_current_task_btf(); + if (!task) + return 0; + + bpf_task_work_schedule_resume_impl(task, &val->tw, &task_work_map, task_work_cb, NULL); + return 0; +} + +SEC("lsm.s/file_open") +__success +int task_work_sleepable_prog(void *ctx) +{ + struct task_work_elem *val; + struct task_struct *task; + int key = 0; + + val = bpf_map_lookup_elem(&task_work_map, &key); + if (!val) + return 0; + + task = bpf_get_current_task_btf(); + if (!task) + return 0; + + bpf_task_work_schedule_resume_impl(task, &val->tw, &task_work_map, task_work_cb, NULL); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c index 0a72e0228ea9..411a18437d7e 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bounds.c +++ 
b/tools/testing/selftests/bpf/progs/verifier_bounds.c @@ -1709,4 +1709,158 @@ __naked void jeq_disagreeing_tnums(void *ctx) : __clobber_all); } +SEC("socket") +__description("conditional jump on same register, branch taken") +__not_msg("20: (b7) r0 = 1 {{.*}} R0=1") +__success __log_level(2) +__retval(0) __flag(BPF_F_TEST_REG_INVARIANTS) +__naked void condition_jump_on_same_register(void *ctx) +{ + asm volatile(" \ + call %[bpf_get_prandom_u32]; \ + w8 = 0x80000000; \ + r0 &= r8; \ + if r0 == r0 goto +1; \ + goto l1_%=; \ + if r0 >= r0 goto +1; \ + goto l1_%=; \ + if r0 s>= r0 goto +1; \ + goto l1_%=; \ + if r0 <= r0 goto +1; \ + goto l1_%=; \ + if r0 s<= r0 goto +1; \ + goto l1_%=; \ + if r0 != r0 goto l1_%=; \ + if r0 > r0 goto l1_%=; \ + if r0 s> r0 goto l1_%=; \ + if r0 < r0 goto l1_%=; \ + if r0 s< r0 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = 1; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("jset on same register, constant value branch taken") +__not_msg("7: (b7) r0 = 1 {{.*}} R0=1") +__success __log_level(2) +__retval(0) __flag(BPF_F_TEST_REG_INVARIANTS) +__naked void jset_on_same_register_1(void *ctx) +{ + asm volatile(" \ + r0 = 0; \ + if r0 & r0 goto l1_%=; \ + r0 = 1; \ + if r0 & r0 goto +1; \ + goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = 1; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("jset on same register, scalar value branch taken") +__not_msg("12: (b7) r0 = 1 {{.*}} R0=1") +__success __log_level(2) +__retval(0) __flag(BPF_F_TEST_REG_INVARIANTS) +__naked void jset_on_same_register_2(void *ctx) +{ + asm volatile(" \ + /* range [1;2] */ \ + call %[bpf_get_prandom_u32]; \ + r0 &= 0x1; \ + r0 += 1; \ + if r0 & r0 goto +1; \ + goto l1_%=; \ + /* range [-2;-1] */ \ + call %[bpf_get_prandom_u32]; \ + r0 &= 0x1; \ + r0 -= 2; \ + if r0 & r0 goto +1; \ + goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = 1; \ + exit; \ 
+" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("jset on same register, scalar value unknown branch 1") +__msg("3: (b7) r0 = 0 {{.*}} R0=0") +__msg("5: (b7) r0 = 1 {{.*}} R0=1") +__success __log_level(2) +__flag(BPF_F_TEST_REG_INVARIANTS) +__naked void jset_on_same_register_3(void *ctx) +{ + asm volatile(" \ + /* range [0;1] */ \ + call %[bpf_get_prandom_u32]; \ + r0 &= 0x1; \ + if r0 & r0 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = 1; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("jset on same register, scalar value unknown branch 2") +__msg("4: (b7) r0 = 0 {{.*}} R0=0") +__msg("6: (b7) r0 = 1 {{.*}} R0=1") +__success __log_level(2) +__flag(BPF_F_TEST_REG_INVARIANTS) +__naked void jset_on_same_register_4(void *ctx) +{ + asm volatile(" \ + /* range [-1;0] */ \ + call %[bpf_get_prandom_u32]; \ + r0 &= 0x1; \ + r0 -= 1; \ + if r0 & r0 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = 1; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("jset on same register, scalar value unknown branch 3") +__msg("4: (b7) r0 = 0 {{.*}} R0=0") +__msg("6: (b7) r0 = 1 {{.*}} R0=1") +__success __log_level(2) +__flag(BPF_F_TEST_REG_INVARIANTS) +__naked void jset_on_same_register_5(void *ctx) +{ + asm volatile(" \ + /* range [-1;1] */ \ + call %[bpf_get_prandom_u32]; \ + r0 &= 0x2; \ + r0 -= 1; \ + if r0 & r0 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = 1; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_direct_packet_access.c b/tools/testing/selftests/bpf/progs/verifier_direct_packet_access.c index 28b602ac9cbe..911caa8fd1b7 100644 --- a/tools/testing/selftests/bpf/progs/verifier_direct_packet_access.c +++ b/tools/testing/selftests/bpf/progs/verifier_direct_packet_access.c @@ -1,6 +1,7 @@ // 
SPDX-License-Identifier: GPL-2.0 /* Converted from tools/testing/selftests/bpf/verifier/direct_packet_access.c */ +#include <linux/if_ether.h> #include <linux/bpf.h> #include <bpf/bpf_helpers.h> #include "bpf_misc.h" @@ -800,4 +801,62 @@ l0_%=: /* exit(0) */ \ : __clobber_all); } +#define access_test_non_linear(name, type, desc, retval, linear_sz, off) \ + SEC(type) \ + __description("direct packet access: " #name " (non-linear, " type ", " desc ")") \ + __success __retval(retval) \ + __linear_size(linear_sz) \ + __naked void access_non_linear_##name(void) \ + { \ + asm volatile (" \ + r2 = *(u32*)(r1 + %[skb_data]); \ + r3 = *(u32*)(r1 + %[skb_data_end]); \ + r0 = r2; \ + r0 += %[offset]; \ + if r0 > r3 goto l0_%=; \ + r0 = *(u8*)(r0 - 1); \ + r0 = 0; \ + exit; \ + l0_%=: r0 = 1; \ + exit; \ + " : \ + : __imm_const(skb_data, offsetof(struct __sk_buff, data)), \ + __imm_const(skb_data_end, offsetof(struct __sk_buff, data_end)), \ + __imm_const(offset, off) \ + : __clobber_all); \ + } + +access_test_non_linear(test31, "tc", "too short eth", 1, ETH_HLEN, 22); +access_test_non_linear(test32, "tc", "too short 1", 1, 1, 22); +access_test_non_linear(test33, "tc", "long enough", 0, 22, 22); +access_test_non_linear(test34, "cgroup_skb/ingress", "too short eth", 1, ETH_HLEN, 8); +access_test_non_linear(test35, "cgroup_skb/ingress", "too short 1", 1, 1, 8); +access_test_non_linear(test36, "cgroup_skb/ingress", "long enough", 0, 22, 8); + +SEC("tc") +__description("direct packet access: test37 (non-linear, linearized)") +__success __retval(0) +__linear_size(ETH_HLEN) +__naked void access_non_linear_linearized(void) +{ + asm volatile (" \ + r6 = r1; \ + r2 = 22; \ + call %[bpf_skb_pull_data]; \ + r2 = *(u32*)(r6 + %[skb_data]); \ + r3 = *(u32*)(r6 + %[skb_data_end]); \ + r0 = r2; \ + r0 += 22; \ + if r0 > r3 goto l0_%=; \ + r0 = *(u8*)(r0 - 1); \ + exit; \ +l0_%=: r0 = 1; \ + exit; \ +" : + : __imm(bpf_skb_pull_data), + __imm_const(skb_data, offsetof(struct __sk_buff, data)), 
+ __imm_const(skb_data_end, offsetof(struct __sk_buff, data_end)) + : __clobber_all); +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_gotox.c b/tools/testing/selftests/bpf/progs/verifier_gotox.c new file mode 100644 index 000000000000..607dad058ca1 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_gotox.c @@ -0,0 +1,389 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Isovalent */ + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include "bpf_misc.h" +#include "../../../include/linux/filter.h" + +#if defined(__TARGET_ARCH_x86) || defined(__TARGET_ARCH_arm64) + +#define DEFINE_SIMPLE_JUMP_TABLE_PROG(NAME, SRC_REG, OFF, IMM, OUTCOME) \ + \ + SEC("socket") \ + OUTCOME \ + __naked void jump_table_ ## NAME(void) \ + { \ + asm volatile (" \ + .pushsection .jumptables,\"\",@progbits; \ + jt0_%=: \ + .quad ret0_%= - socket; \ + .quad ret1_%= - socket; \ + .size jt0_%=, 16; \ + .global jt0_%=; \ + .popsection; \ + \ + r0 = jt0_%= ll; \ + r0 += 8; \ + r0 = *(u64 *)(r0 + 0); \ + .8byte %[gotox_r0]; \ + ret0_%=: \ + r0 = 0; \ + exit; \ + ret1_%=: \ + r0 = 1; \ + exit; \ + " : \ + : __imm_insn(gotox_r0, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_0, (SRC_REG), (OFF) , (IMM))) \ + : __clobber_all); \ + } + +/* + * The first program which doesn't use reserved fields + * loads and works properly. The rest fail to load. 
+ */ +DEFINE_SIMPLE_JUMP_TABLE_PROG(ok, BPF_REG_0, 0, 0, __success __retval(1)) +DEFINE_SIMPLE_JUMP_TABLE_PROG(reserved_field_src_reg, BPF_REG_1, 0, 0, __failure __msg("BPF_JA|BPF_X uses reserved fields")) +DEFINE_SIMPLE_JUMP_TABLE_PROG(reserved_field_non_zero_off, BPF_REG_0, 1, 0, __failure __msg("BPF_JA|BPF_X uses reserved fields")) +DEFINE_SIMPLE_JUMP_TABLE_PROG(reserved_field_non_zero_imm, BPF_REG_0, 0, 1, __failure __msg("BPF_JA|BPF_X uses reserved fields")) + +/* + * Gotox is forbidden when there is no jump table loaded + * which points to the sub-function where the gotox is used + */ +SEC("socket") +__failure __msg("no jump tables found for subprog starting at 0") +__naked void jump_table_no_jump_table(void) +{ + asm volatile (" \ + .8byte %[gotox_r0]; \ + r0 = 1; \ + exit; \ +" : \ + : __imm_insn(gotox_r0, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_0, 0, 0 , 0)) + : __clobber_all); +} + +/* + * Incorrect type of the target register, only PTR_TO_INSN allowed + */ +SEC("socket") +__failure __msg("R1 has type scalar, expected PTR_TO_INSN") +__naked void jump_table_incorrect_dst_reg_type(void) +{ + asm volatile (" \ + .pushsection .jumptables,\"\",@progbits; \ +jt0_%=: \ + .quad ret0_%= - socket; \ + .quad ret1_%= - socket; \ + .size jt0_%=, 16; \ + .global jt0_%=; \ + .popsection; \ + \ + r0 = jt0_%= ll; \ + r0 += 8; \ + r0 = *(u64 *)(r0 + 0); \ + r1 = 42; \ + .8byte %[gotox_r1]; \ + ret0_%=: \ + r0 = 0; \ + exit; \ + ret1_%=: \ + r0 = 1; \ + exit; \ +" : \ + : __imm_insn(gotox_r1, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_1, 0, 0 , 0)) + : __clobber_all); +} + +#define DEFINE_INVALID_SIZE_PROG(READ_SIZE, OUTCOME) \ + \ + SEC("socket") \ + OUTCOME \ + __naked void jump_table_invalid_read_size_ ## READ_SIZE(void) \ + { \ + asm volatile (" \ + .pushsection .jumptables,\"\",@progbits; \ + jt0_%=: \ + .quad ret0_%= - socket; \ + .quad ret1_%= - socket; \ + .size jt0_%=, 16; \ + .global jt0_%=; \ + .popsection; \ + \ + r0 = jt0_%= ll; \ + r0 += 8; \ + r0 = 
*(" #READ_SIZE " *)(r0 + 0); \ + .8byte %[gotox_r0]; \ + ret0_%=: \ + r0 = 0; \ + exit; \ + ret1_%=: \ + r0 = 1; \ + exit; \ + " : \ + : __imm_insn(gotox_r0, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_0, 0, 0 , 0)) \ + : __clobber_all); \ + } + +DEFINE_INVALID_SIZE_PROG(u32, __failure __msg("Invalid read of 4 bytes from insn_array")) +DEFINE_INVALID_SIZE_PROG(u16, __failure __msg("Invalid read of 2 bytes from insn_array")) +DEFINE_INVALID_SIZE_PROG(u8, __failure __msg("Invalid read of 1 bytes from insn_array")) + +SEC("socket") +__failure __msg("misaligned value access off 0+1+0 size 8") +__naked void jump_table_misaligned_access(void) +{ + asm volatile (" \ + .pushsection .jumptables,\"\",@progbits; \ +jt0_%=: \ + .quad ret0_%= - socket; \ + .quad ret1_%= - socket; \ + .size jt0_%=, 16; \ + .global jt0_%=; \ + .popsection; \ + \ + r0 = jt0_%= ll; \ + r0 += 1; \ + r0 = *(u64 *)(r0 + 0); \ + .8byte %[gotox_r0]; \ + ret0_%=: \ + r0 = 0; \ + exit; \ + ret1_%=: \ + r0 = 1; \ + exit; \ +" : \ + : __imm_insn(gotox_r0, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_0, 0, 0 , 0)) + : __clobber_all); +} + +SEC("socket") +__failure __msg("invalid access to map value, value_size=16 off=24 size=8") +__naked void jump_table_invalid_mem_acceess_pos(void) +{ + asm volatile (" \ + .pushsection .jumptables,\"\",@progbits; \ +jt0_%=: \ + .quad ret0_%= - socket; \ + .quad ret1_%= - socket; \ + .size jt0_%=, 16; \ + .global jt0_%=; \ + .popsection; \ + \ + r0 = jt0_%= ll; \ + r0 += 24; \ + r0 = *(u64 *)(r0 + 0); \ + .8byte %[gotox_r0]; \ + ret0_%=: \ + r0 = 0; \ + exit; \ + ret1_%=: \ + r0 = 1; \ + exit; \ +" : \ + : __imm_insn(gotox_r0, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_0, 0, 0 , 0)) + : __clobber_all); +} + +SEC("socket") +__failure __msg("invalid access to map value, value_size=16 off=-24 size=8") +__naked void jump_table_invalid_mem_acceess_neg(void) +{ + asm volatile (" \ + .pushsection .jumptables,\"\",@progbits; \ +jt0_%=: \ + .quad ret0_%= - socket; \ + .quad 
ret1_%= - socket; \ + .size jt0_%=, 16; \ + .global jt0_%=; \ + .popsection; \ + \ + r0 = jt0_%= ll; \ + r0 -= 24; \ + r0 = *(u64 *)(r0 + 0); \ + .8byte %[gotox_r0]; \ + ret0_%=: \ + r0 = 0; \ + exit; \ + ret1_%=: \ + r0 = 1; \ + exit; \ +" : \ + : __imm_insn(gotox_r0, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_0, 0, 0 , 0)) + : __clobber_all); +} + +SEC("socket") +__success __retval(1) +__naked void jump_table_add_sub_ok(void) +{ + asm volatile (" \ + .pushsection .jumptables,\"\",@progbits; \ +jt0_%=: \ + .quad ret0_%= - socket; \ + .quad ret1_%= - socket; \ + .size jt0_%=, 16; \ + .global jt0_%=; \ + .popsection; \ + \ + r0 = jt0_%= ll; \ + r0 -= 24; \ + r0 += 32; \ + r0 = *(u64 *)(r0 + 0); \ + .8byte %[gotox_r0]; \ + ret0_%=: \ + r0 = 0; \ + exit; \ + ret1_%=: \ + r0 = 1; \ + exit; \ +" : \ + : __imm_insn(gotox_r0, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_0, 0, 0 , 0)) + : __clobber_all); +} + +SEC("socket") +__failure __msg("write into map forbidden, value_size=16 off=8 size=8") +__naked void jump_table_no_writes(void) +{ + asm volatile (" \ + .pushsection .jumptables,\"\",@progbits; \ +jt0_%=: \ + .quad ret0_%= - socket; \ + .quad ret1_%= - socket; \ + .size jt0_%=, 16; \ + .global jt0_%=; \ + .popsection; \ + \ + r0 = jt0_%= ll; \ + r0 += 8; \ + r1 = 0xbeef; \ + *(u64 *)(r0 + 0) = r1; \ + .8byte %[gotox_r0]; \ + ret0_%=: \ + r0 = 0; \ + exit; \ + ret1_%=: \ + r0 = 1; \ + exit; \ +" : \ + : __imm_insn(gotox_r0, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_0, 0, 0 , 0)) + : __clobber_all); +} + +#define DEFINE_JUMP_TABLE_USE_REG(REG) \ + SEC("socket") \ + __success __retval(1) \ + __naked void jump_table_use_reg_r ## REG(void) \ + { \ + asm volatile (" \ + .pushsection .jumptables,\"\",@progbits; \ + jt0_%=: \ + .quad ret0_%= - socket; \ + .quad ret1_%= - socket; \ + .size jt0_%=, 16; \ + .global jt0_%=; \ + .popsection; \ + \ + r0 = jt0_%= ll; \ + r0 += 8; \ + r" #REG " = *(u64 *)(r0 + 0); \ + .8byte %[gotox_rX]; \ + ret0_%=: \ + r0 = 0; \ + exit; 
\ + ret1_%=: \ + r0 = 1; \ + exit; \ + " : \ + : __imm_insn(gotox_rX, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_ ## REG, 0, 0 , 0)) \ + : __clobber_all); \ + } + +DEFINE_JUMP_TABLE_USE_REG(0) +DEFINE_JUMP_TABLE_USE_REG(1) +DEFINE_JUMP_TABLE_USE_REG(2) +DEFINE_JUMP_TABLE_USE_REG(3) +DEFINE_JUMP_TABLE_USE_REG(4) +DEFINE_JUMP_TABLE_USE_REG(5) +DEFINE_JUMP_TABLE_USE_REG(6) +DEFINE_JUMP_TABLE_USE_REG(7) +DEFINE_JUMP_TABLE_USE_REG(8) +DEFINE_JUMP_TABLE_USE_REG(9) + +__used static int test_subprog(void) +{ + return 0; +} + +SEC("socket") +__failure __msg("jump table for insn 4 points outside of the subprog [0,10]") +__naked void jump_table_outside_subprog(void) +{ + asm volatile (" \ + .pushsection .jumptables,\"\",@progbits; \ +jt0_%=: \ + .quad ret0_%= - socket; \ + .quad ret1_%= - socket; \ + .quad ret_out_%= - socket; \ + .size jt0_%=, 24; \ + .global jt0_%=; \ + .popsection; \ + \ + r0 = jt0_%= ll; \ + r0 += 8; \ + r0 = *(u64 *)(r0 + 0); \ + .8byte %[gotox_r0]; \ + ret0_%=: \ + r0 = 0; \ + exit; \ + ret1_%=: \ + r0 = 1; \ + call test_subprog; \ + exit; \ + ret_out_%=: \ +" : \ + : __imm_insn(gotox_r0, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_0, 0, 0 , 0)) + : __clobber_all); +} + +SEC("socket") +__success __retval(1) +__naked void jump_table_contains_non_unique_values(void) +{ + asm volatile (" \ + .pushsection .jumptables,\"\",@progbits; \ +jt0_%=: \ + .quad ret0_%= - socket; \ + .quad ret1_%= - socket; \ + .quad ret0_%= - socket; \ + .quad ret1_%= - socket; \ + .quad ret0_%= - socket; \ + .quad ret1_%= - socket; \ + .quad ret0_%= - socket; \ + .quad ret1_%= - socket; \ + .quad ret0_%= - socket; \ + .quad ret1_%= - socket; \ + .size jt0_%=, 80; \ + .global jt0_%=; \ + .popsection; \ + \ + r0 = jt0_%= ll; \ + r0 += 8; \ + r0 = *(u64 *)(r0 + 0); \ + .8byte %[gotox_r0]; \ + ret0_%=: \ + r0 = 0; \ + exit; \ + ret1_%=: \ + r0 = 1; \ + exit; \ +" : \ + : __imm_insn(gotox_r0, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_0, 0, 0 , 0)) + : __clobber_all); +} + 
+#endif /* __TARGET_ARCH_x86 || __TARGET_ARCH_arm64 */ + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_live_stack.c b/tools/testing/selftests/bpf/progs/verifier_live_stack.c index c0e808509268..2de105057bbc 100644 --- a/tools/testing/selftests/bpf/progs/verifier_live_stack.c +++ b/tools/testing/selftests/bpf/progs/verifier_live_stack.c @@ -292,3 +292,53 @@ __naked void syzbot_postorder_bug1(void) "exit;" ::: __clobber_all); } + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u32); +} map_array SEC(".maps"); + +SEC("socket") +__failure __msg("invalid read from stack R2 off=-1024 size=8") +__flag(BPF_F_TEST_STATE_FREQ) +__naked unsigned long caller_stack_write_tail_call(void) +{ + asm volatile ( + "r6 = r1;" + "*(u64 *)(r10 - 8) = -8;" + "call %[bpf_get_prandom_u32];" + "if r0 != 42 goto 1f;" + "goto 2f;" + "1:" + "*(u64 *)(r10 - 8) = -1024;" + "2:" + "r1 = r6;" + "r2 = r10;" + "r2 += -8;" + "call write_tail_call;" + "r1 = *(u64 *)(r10 - 8);" + "r2 = r10;" + "r2 += r1;" + "r0 = *(u64 *)(r2 + 0);" + "exit;" + :: __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +static __used __naked unsigned long write_tail_call(void) +{ + asm volatile ( + "r6 = r2;" + "r2 = %[map_array] ll;" + "r3 = 0;" + "call %[bpf_tail_call];" + "*(u64 *)(r6 + 0) = -16;" + "r0 = 0;" + "exit;" + : + : __imm(bpf_tail_call), + __imm_addr(map_array) + : __clobber_all); +} diff --git a/tools/testing/selftests/bpf/progs/verifier_lsm.c b/tools/testing/selftests/bpf/progs/verifier_lsm.c index 32e5e779cb96..6af9100a37ff 100644 --- a/tools/testing/selftests/bpf/progs/verifier_lsm.c +++ b/tools/testing/selftests/bpf/progs/verifier_lsm.c @@ -4,7 +4,7 @@ #include <bpf/bpf_helpers.h> #include "bpf_misc.h" -SEC("lsm/file_alloc_security") +SEC("lsm/file_permission") __description("lsm bpf prog with -4095~0 retval. 
test 1") __success __naked int errno_zero_retval_test1(void *ctx) @@ -15,7 +15,7 @@ __naked int errno_zero_retval_test1(void *ctx) ::: __clobber_all); } -SEC("lsm/file_alloc_security") +SEC("lsm/file_permission") __description("lsm bpf prog with -4095~0 retval. test 2") __success __naked int errno_zero_retval_test2(void *ctx) diff --git a/tools/testing/selftests/bpf/progs/verifier_netfilter_ctx.c b/tools/testing/selftests/bpf/progs/verifier_netfilter_ctx.c index ab9f9f2620ed..e2cbc5bda65e 100644 --- a/tools/testing/selftests/bpf/progs/verifier_netfilter_ctx.c +++ b/tools/testing/selftests/bpf/progs/verifier_netfilter_ctx.c @@ -79,11 +79,6 @@ int with_invalid_ctx_access_test5(struct bpf_nf_ctx *ctx) return NF_ACCEPT; } -extern int bpf_dynptr_from_skb(struct __sk_buff *skb, __u64 flags, - struct bpf_dynptr *ptr__uninit) __ksym; -extern void *bpf_dynptr_slice(const struct bpf_dynptr *ptr, uint32_t offset, - void *buffer, uint32_t buffer__sz) __ksym; - SEC("netfilter") __description("netfilter test prog with skb and state read access") __success __failure_unpriv diff --git a/tools/testing/selftests/bpf/progs/verifier_sock.c b/tools/testing/selftests/bpf/progs/verifier_sock.c index 2b4610b53382..a2132c72d3b8 100644 --- a/tools/testing/selftests/bpf/progs/verifier_sock.c +++ b/tools/testing/selftests/bpf/progs/verifier_sock.c @@ -1117,10 +1117,17 @@ int tail_call(struct __sk_buff *sk) return 0; } -/* Tail calls invalidate packet pointers. */ +static __noinline +int static_tail_call(struct __sk_buff *sk) +{ + bpf_tail_call_static(sk, &jmp_table, 0); + return 0; +} + +/* Tail calls in sub-programs invalidate packet pointers. 
*/ SEC("tc") __failure __msg("invalid mem access") -int invalidate_pkt_pointers_by_tail_call(struct __sk_buff *sk) +int invalidate_pkt_pointers_by_global_tail_call(struct __sk_buff *sk) { int *p = (void *)(long)sk->data; @@ -1131,4 +1138,32 @@ int invalidate_pkt_pointers_by_tail_call(struct __sk_buff *sk) return TCX_PASS; } +/* Tail calls in static sub-programs invalidate packet pointers. */ +SEC("tc") +__failure __msg("invalid mem access") +int invalidate_pkt_pointers_by_static_tail_call(struct __sk_buff *sk) +{ + int *p = (void *)(long)sk->data; + + if ((void *)(p + 1) > (void *)(long)sk->data_end) + return TCX_DROP; + static_tail_call(sk); + *p = 42; /* this is unsafe */ + return TCX_PASS; +} + +/* Direct tail calls do not invalidate packet pointers. */ +SEC("tc") +__success +int invalidate_pkt_pointers_by_tail_call(struct __sk_buff *sk) +{ + int *p = (void *)(long)sk->data; + + if ((void *)(p + 1) > (void *)(long)sk->data_end) + return TCX_DROP; + bpf_tail_call_static(sk, &jmp_table, 0); + *p = 42; /* this is NOT unsafe: tail calls don't return */ + return TCX_PASS; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c index ac3e418c2a96..61886ed554de 100644 --- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c +++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c @@ -793,4 +793,57 @@ __naked int stack_slot_aliases_precision(void) ); } +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u32); +} map_array SEC(".maps"); + +__naked __noinline __used +static unsigned long identity_tail_call(void) +{ + /* the simplest identity function involving a tail call */ + asm volatile ( + "r6 = r2;" + "r2 = %[map_array] ll;" + "r3 = 0;" + "call %[bpf_tail_call];" + "r0 = r6;" + "exit;" + : + : __imm(bpf_tail_call), + __imm_addr(map_array) + : 
__clobber_all); +} + +SEC("?raw_tp") +__failure __log_level(2) +__msg("13: (85) call bpf_tail_call#12") +__msg("mark_precise: frame1: last_idx 13 first_idx 0 subseq_idx -1 ") +__msg("returning from callee:") +__msg("frame1: R0=scalar() R6=3 R10=fp0") +__msg("to caller at 4:") +__msg("R0=scalar() R6=map_value(map=.data.vals,ks=4,vs=16) R10=fp0") +__msg("6: (0f) r1 += r0") +__msg("mark_precise: frame0: regs=r0 stack= before 5: (bf) r1 = r6") +__msg("mark_precise: frame0: regs=r0 stack= before 4: (27) r0 *= 4") +__msg("mark_precise: frame0: parent state regs=r0 stack=: R0=Pscalar() R6=map_value(map=.data.vals,ks=4,vs=16) R10=fp0") +__msg("math between map_value pointer and register with unbounded min value is not allowed") +__naked int subprog_result_tail_call(void) +{ + asm volatile ( + "r2 = 3;" + "call identity_tail_call;" + "r0 *= 4;" + "r1 = %[vals];" + "r1 += r0;" + "r0 = *(u32 *)(r1 + 0);" + "exit;" + : + : __imm_ptr(vals) + : __clobber_common + ); +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/wq.c b/tools/testing/selftests/bpf/progs/wq.c index 2f1ba08c293e..25be2cd9d42c 100644 --- a/tools/testing/selftests/bpf/progs/wq.c +++ b/tools/testing/selftests/bpf/progs/wq.c @@ -187,3 +187,20 @@ long test_call_lru_sleepable(void *ctx) return test_elem_callback(&lru, &key, wq_callback); } + +SEC("tc") +long test_map_no_btf(void *ctx) +{ + struct elem *val; + struct bpf_wq *wq; + int key = 42; + + val = bpf_map_lookup_elem(&array, &key); + if (!val) + return -2; + + wq = &val->w; + if (bpf_wq_init(wq, &array, 0) != 0) + return -3; + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/wq_failures.c b/tools/testing/selftests/bpf/progs/wq_failures.c index 4240211a1900..d06f6d40594a 100644 --- a/tools/testing/selftests/bpf/progs/wq_failures.c +++ b/tools/testing/selftests/bpf/progs/wq_failures.c @@ -142,3 +142,26 @@ long test_wrong_wq_pointer_offset(void *ctx) return -22; } + +SEC("tc") +__log_level(2) +__failure +__msg(": 
(85) call bpf_wq_init#") +__msg("R1 doesn't have constant offset. bpf_wq has to be at the constant offset") +long test_bad_wq_off(void *ctx) +{ + struct elem *val; + struct bpf_wq *wq; + int key = 42; + u64 unknown; + + val = bpf_map_lookup_elem(&array, &key); + if (!val) + return -2; + + unknown = bpf_get_prandom_u32(); + wq = &val->w + unknown; + if (bpf_wq_init(wq, &array, 0) != 0) + return -3; + return 0; +} diff --git a/tools/testing/selftests/bpf/test_bpftool_build.sh b/tools/testing/selftests/bpf/test_bpftool_build.sh index 1453a53ed547..b03a87571592 100755 --- a/tools/testing/selftests/bpf/test_bpftool_build.sh +++ b/tools/testing/selftests/bpf/test_bpftool_build.sh @@ -90,10 +90,6 @@ echo -e "... through kbuild\n" if [ -f ".config" ] ; then make_and_clean tools/bpf - ## "make tools/bpf" sets $(OUTPUT) to ...tools/bpf/runqslower for - ## runqslower, but the default (used for the "clean" target) is .output. - ## Let's make sure we clean runqslower's directory properly. - make -C tools/bpf/runqslower OUTPUT=${KDIR_ROOT_DIR}/tools/bpf/runqslower/ clean ## $OUTPUT is overwritten in kbuild Makefile, and thus cannot be passed ## down from toplevel Makefile to bpftool's Makefile. 
diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c b/tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c index 769206fc70e4..7b4ae5e81d32 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c @@ -5,6 +5,7 @@ #include <linux/delay.h> #include <linux/module.h> #include <linux/prandom.h> +#include <linux/ktime.h> #include <asm/rqspinlock.h> #include <linux/perf_event.h> #include <linux/kthread.h> @@ -22,48 +23,146 @@ static struct perf_event_attr hw_attr = { static rqspinlock_t lock_a; static rqspinlock_t lock_b; +static rqspinlock_t lock_c; + +#define RQSL_SLOW_THRESHOLD_MS 10 +static const unsigned int rqsl_hist_ms[] = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 12, 14, 16, 18, 20, 25, 30, 40, 50, 75, + 100, 150, 200, 250, 1000, +}; +#define RQSL_NR_HIST_BUCKETS ARRAY_SIZE(rqsl_hist_ms) + +enum rqsl_context { + RQSL_CTX_NORMAL = 0, + RQSL_CTX_NMI, + RQSL_CTX_MAX, +}; + +struct rqsl_cpu_hist { + atomic64_t hist[RQSL_CTX_MAX][RQSL_NR_HIST_BUCKETS]; + atomic64_t success[RQSL_CTX_MAX]; + atomic64_t failure[RQSL_CTX_MAX]; +}; + +static DEFINE_PER_CPU(struct rqsl_cpu_hist, rqsl_cpu_hists); + +enum rqsl_mode { + RQSL_MODE_AA = 0, + RQSL_MODE_ABBA, + RQSL_MODE_ABBCCA, +}; + +static int test_mode = RQSL_MODE_AA; +module_param(test_mode, int, 0644); +MODULE_PARM_DESC(test_mode, + "rqspinlock test mode: 0 = AA, 1 = ABBA, 2 = ABBCCA"); + +static int normal_delay = 20; +module_param(normal_delay, int, 0644); +MODULE_PARM_DESC(normal_delay, + "rqspinlock critical section length for normal context (20ms default)"); + +static int nmi_delay = 10; +module_param(nmi_delay, int, 0644); +MODULE_PARM_DESC(nmi_delay, + "rqspinlock critical section length for NMI context (10ms default)"); static struct perf_event **rqsl_evts; static int rqsl_nevts; -static bool test_ab = false; -module_param(test_ab, bool, 0644); -MODULE_PARM_DESC(test_ab, "Test ABBA situations instead of AA 
situations"); - static struct task_struct **rqsl_threads; static int rqsl_nthreads; static atomic_t rqsl_ready_cpus = ATOMIC_INIT(0); static int pause = 0; -static bool nmi_locks_a(int cpu) +static const char *rqsl_mode_names[] = { + [RQSL_MODE_AA] = "AA", + [RQSL_MODE_ABBA] = "ABBA", + [RQSL_MODE_ABBCCA] = "ABBCCA", +}; + +struct rqsl_lock_pair { + rqspinlock_t *worker_lock; + rqspinlock_t *nmi_lock; +}; + +static struct rqsl_lock_pair rqsl_get_lock_pair(int cpu) { - return (cpu & 1) && test_ab; + int mode = READ_ONCE(test_mode); + + switch (mode) { + default: + case RQSL_MODE_AA: + return (struct rqsl_lock_pair){ &lock_a, &lock_a }; + case RQSL_MODE_ABBA: + if (cpu & 1) + return (struct rqsl_lock_pair){ &lock_b, &lock_a }; + return (struct rqsl_lock_pair){ &lock_a, &lock_b }; + case RQSL_MODE_ABBCCA: + switch (cpu % 3) { + case 0: + return (struct rqsl_lock_pair){ &lock_a, &lock_b }; + case 1: + return (struct rqsl_lock_pair){ &lock_b, &lock_c }; + default: + return (struct rqsl_lock_pair){ &lock_c, &lock_a }; + } + } +} + +static u32 rqsl_hist_bucket_idx(u32 delta_ms) +{ + int i; + + for (i = 0; i < RQSL_NR_HIST_BUCKETS; i++) { + if (delta_ms <= rqsl_hist_ms[i]) + return i; + } + + return RQSL_NR_HIST_BUCKETS - 1; +} + +static void rqsl_record_lock_result(u64 delta_ns, enum rqsl_context ctx, int ret) +{ + struct rqsl_cpu_hist *hist = this_cpu_ptr(&rqsl_cpu_hists); + u32 delta_ms = DIV_ROUND_UP_ULL(delta_ns, NSEC_PER_MSEC); + u32 bucket = rqsl_hist_bucket_idx(delta_ms); + atomic64_t *buckets = hist->hist[ctx]; + + atomic64_inc(&buckets[bucket]); + if (!ret) + atomic64_inc(&hist->success[ctx]); + else + atomic64_inc(&hist->failure[ctx]); } static int rqspinlock_worker_fn(void *arg) { int cpu = smp_processor_id(); unsigned long flags; + u64 start_ns; int ret; if (cpu) { atomic_inc(&rqsl_ready_cpus); while (!kthread_should_stop()) { + struct rqsl_lock_pair locks = rqsl_get_lock_pair(cpu); + rqspinlock_t *worker_lock = locks.worker_lock; + if (READ_ONCE(pause)) { 
msleep(1000); continue; } - if (nmi_locks_a(cpu)) - ret = raw_res_spin_lock_irqsave(&lock_b, flags); - else - ret = raw_res_spin_lock_irqsave(&lock_a, flags); - mdelay(20); - if (nmi_locks_a(cpu) && !ret) - raw_res_spin_unlock_irqrestore(&lock_b, flags); - else if (!ret) - raw_res_spin_unlock_irqrestore(&lock_a, flags); + start_ns = ktime_get_mono_fast_ns(); + ret = raw_res_spin_lock_irqsave(worker_lock, flags); + rqsl_record_lock_result(ktime_get_mono_fast_ns() - start_ns, + RQSL_CTX_NORMAL, ret); + mdelay(normal_delay); + if (!ret) + raw_res_spin_unlock_irqrestore(worker_lock, flags); cpu_relax(); } return 0; @@ -91,24 +190,25 @@ static int rqspinlock_worker_fn(void *arg) static void nmi_cb(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { + struct rqsl_lock_pair locks; int cpu = smp_processor_id(); unsigned long flags; + u64 start_ns; int ret; if (!cpu || READ_ONCE(pause)) return; - if (nmi_locks_a(cpu)) - ret = raw_res_spin_lock_irqsave(&lock_a, flags); - else - ret = raw_res_spin_lock_irqsave(test_ab ? &lock_b : &lock_a, flags); + locks = rqsl_get_lock_pair(cpu); + start_ns = ktime_get_mono_fast_ns(); + ret = raw_res_spin_lock_irqsave(locks.nmi_lock, flags); + rqsl_record_lock_result(ktime_get_mono_fast_ns() - start_ns, + RQSL_CTX_NMI, ret); - mdelay(10); + mdelay(nmi_delay); - if (nmi_locks_a(cpu) && !ret) - raw_res_spin_unlock_irqrestore(&lock_a, flags); - else if (!ret) - raw_res_spin_unlock_irqrestore(test_ab ? &lock_b : &lock_a, flags); + if (!ret) + raw_res_spin_unlock_irqrestore(locks.nmi_lock, flags); } static void free_rqsl_threads(void) @@ -142,13 +242,19 @@ static int bpf_test_rqspinlock_init(void) int i, ret; int ncpus = num_online_cpus(); - pr_err("Mode = %s\n", test_ab ? 
"ABBA" : "AA"); + if (test_mode < RQSL_MODE_AA || test_mode > RQSL_MODE_ABBCCA) { + pr_err("Invalid mode %d\n", test_mode); + return -EINVAL; + } + + pr_err("Mode = %s\n", rqsl_mode_names[test_mode]); - if (ncpus < 3) + if (ncpus < test_mode + 2) return -ENOTSUPP; raw_res_spin_lock_init(&lock_a); raw_res_spin_lock_init(&lock_b); + raw_res_spin_lock_init(&lock_c); rqsl_evts = kcalloc(ncpus - 1, sizeof(*rqsl_evts), GFP_KERNEL); if (!rqsl_evts) @@ -196,10 +302,88 @@ err_perf_events: module_init(bpf_test_rqspinlock_init); +static void rqsl_print_histograms(void) +{ + int cpu, i; + + pr_err("rqspinlock acquisition latency histogram (ms):\n"); + + for_each_online_cpu(cpu) { + struct rqsl_cpu_hist *hist = per_cpu_ptr(&rqsl_cpu_hists, cpu); + u64 norm_counts[RQSL_NR_HIST_BUCKETS]; + u64 nmi_counts[RQSL_NR_HIST_BUCKETS]; + u64 total_counts[RQSL_NR_HIST_BUCKETS]; + u64 norm_success, nmi_success, success_total; + u64 norm_failure, nmi_failure, failure_total; + u64 norm_total = 0, nmi_total = 0, total = 0; + bool has_slow = false; + + for (i = 0; i < RQSL_NR_HIST_BUCKETS; i++) { + norm_counts[i] = atomic64_read(&hist->hist[RQSL_CTX_NORMAL][i]); + nmi_counts[i] = atomic64_read(&hist->hist[RQSL_CTX_NMI][i]); + total_counts[i] = norm_counts[i] + nmi_counts[i]; + norm_total += norm_counts[i]; + nmi_total += nmi_counts[i]; + total += total_counts[i]; + if (rqsl_hist_ms[i] > RQSL_SLOW_THRESHOLD_MS && + total_counts[i]) + has_slow = true; + } + + norm_success = atomic64_read(&hist->success[RQSL_CTX_NORMAL]); + nmi_success = atomic64_read(&hist->success[RQSL_CTX_NMI]); + norm_failure = atomic64_read(&hist->failure[RQSL_CTX_NORMAL]); + nmi_failure = atomic64_read(&hist->failure[RQSL_CTX_NMI]); + success_total = norm_success + nmi_success; + failure_total = norm_failure + nmi_failure; + + if (!total) + continue; + + if (!has_slow) { + pr_err(" cpu%d: total %llu (normal %llu, nmi %llu) | " + "success %llu (normal %llu, nmi %llu) | " + "failure %llu (normal %llu, nmi %llu), all within 
0-%ums\n", + cpu, total, norm_total, nmi_total, + success_total, norm_success, nmi_success, + failure_total, norm_failure, nmi_failure, + RQSL_SLOW_THRESHOLD_MS); + continue; + } + + pr_err(" cpu%d: total %llu (normal %llu, nmi %llu) | " + "success %llu (normal %llu, nmi %llu) | " + "failure %llu (normal %llu, nmi %llu)\n", + cpu, total, norm_total, nmi_total, + success_total, norm_success, nmi_success, + failure_total, norm_failure, nmi_failure); + for (i = 0; i < RQSL_NR_HIST_BUCKETS; i++) { + unsigned int start_ms; + + if (!total_counts[i]) + continue; + + start_ms = i == 0 ? 0 : rqsl_hist_ms[i - 1] + 1; + if (i == RQSL_NR_HIST_BUCKETS - 1) { + pr_err(" >= %ums: total %llu (normal %llu, nmi %llu)\n", + start_ms, total_counts[i], + norm_counts[i], nmi_counts[i]); + } else { + pr_err(" %u-%ums: total %llu (normal %llu, nmi %llu)\n", + start_ms, rqsl_hist_ms[i], + total_counts[i], + norm_counts[i], nmi_counts[i]); + } + } + } +} + static void bpf_test_rqspinlock_exit(void) { + WRITE_ONCE(pause, 1); free_rqsl_threads(); free_rqsl_evts(); + rqsl_print_histograms(); } module_exit(bpf_test_rqspinlock_exit); diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index ed0a4721d8fd..1669a7eeda26 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -926,7 +926,7 @@ __bpf_kfunc int bpf_kfunc_call_kernel_connect(struct addr_args *args) goto out; } - err = kernel_connect(sock, (struct sockaddr *)&args->addr, + err = kernel_connect(sock, (struct sockaddr_unsized *)&args->addr, args->addrlen, 0); out: mutex_unlock(&sock_lock); @@ -949,7 +949,7 @@ __bpf_kfunc int bpf_kfunc_call_kernel_bind(struct addr_args *args) goto out; } - err = kernel_bind(sock, (struct sockaddr *)&args->addr, args->addrlen); + err = kernel_bind(sock, (struct sockaddr_unsized *)&args->addr, args->addrlen); out: mutex_unlock(&sock_lock); diff --git 
a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index 74ecc281bb8c..338c035c3688 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -43,6 +43,7 @@ #define TEST_TAG_EXPECT_STDERR_PFX_UNPRIV "comment:test_expect_stderr_unpriv=" #define TEST_TAG_EXPECT_STDOUT_PFX "comment:test_expect_stdout=" #define TEST_TAG_EXPECT_STDOUT_PFX_UNPRIV "comment:test_expect_stdout_unpriv=" +#define TEST_TAG_LINEAR_SIZE "comment:test_linear_size=" /* Warning: duplicated in bpf_misc.h */ #define POINTER_VALUE 0xbadcafe @@ -89,6 +90,7 @@ struct test_spec { int mode_mask; int arch_mask; int load_mask; + int linear_sz; bool auxiliary; bool valid; }; @@ -633,6 +635,21 @@ static int parse_test_spec(struct test_loader *tester, &spec->unpriv.stdout); if (err) goto cleanup; + } else if (str_has_pfx(s, TEST_TAG_LINEAR_SIZE)) { + switch (bpf_program__type(prog)) { + case BPF_PROG_TYPE_SCHED_ACT: + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_CGROUP_SKB: + val = s + sizeof(TEST_TAG_LINEAR_SIZE) - 1; + err = parse_int(val, &spec->linear_sz, "test linear size"); + if (err) + goto cleanup; + break; + default: + PRINT_FAIL("__linear_size for unsupported program type"); + err = -EINVAL; + goto cleanup; + } } } @@ -1007,10 +1024,11 @@ static bool is_unpriv_capable_map(struct bpf_map *map) } } -static int do_prog_test_run(int fd_prog, int *retval, bool empty_opts) +static int do_prog_test_run(int fd_prog, int *retval, bool empty_opts, int linear_sz) { __u8 tmp_out[TEST_DATA_LEN << 2] = {}; __u8 tmp_in[TEST_DATA_LEN] = {}; + struct __sk_buff ctx = {}; int err, saved_errno; LIBBPF_OPTS(bpf_test_run_opts, topts, .data_in = tmp_in, @@ -1020,6 +1038,12 @@ static int do_prog_test_run(int fd_prog, int *retval, bool empty_opts) .repeat = 1, ); + if (linear_sz) { + ctx.data_end = linear_sz; + topts.ctx_in = &ctx; + topts.ctx_size_in = sizeof(ctx); + } + if (empty_opts) { memset(&topts, 0, sizeof(struct 
bpf_test_run_opts)); topts.sz = sizeof(struct bpf_test_run_opts); @@ -1269,7 +1293,8 @@ void run_subtest(struct test_loader *tester, } err = do_prog_test_run(bpf_program__fd(tprog), &retval, - bpf_program__type(tprog) == BPF_PROG_TYPE_SYSCALL ? true : false); + bpf_program__type(tprog) == BPF_PROG_TYPE_SYSCALL ? true : false, + spec->linear_sz); if (!err && retval != subspec->retval && subspec->retval != POINTER_VALUE) { PRINT_FAIL("Unexpected retval: %d != %d\n", retval, subspec->retval); goto tobj_cleanup; diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 3fae9ce46ca9..ccc5acd55ff9 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -1399,7 +1399,8 @@ static void test_map_stress(void) static bool can_retry(int err) { return (err == EAGAIN || err == EBUSY || - (err == ENOMEM && map_opts.map_flags == BPF_F_NO_PREALLOC)); + ((err == ENOMEM || err == E2BIG) && + map_opts.map_flags == BPF_F_NO_PREALLOC)); } int map_update_retriable(int map_fd, const void *key, const void *value, int flags, int attempts, diff --git a/tools/testing/selftests/bpf/test_tag.c b/tools/testing/selftests/bpf/test_tag.c index 5546b05a0486..f1300047c1e0 100644 --- a/tools/testing/selftests/bpf/test_tag.c +++ b/tools/testing/selftests/bpf/test_tag.c @@ -116,7 +116,7 @@ static void tag_from_alg(int insns, uint8_t *tag, uint32_t len) static const struct sockaddr_alg alg = { .salg_family = AF_ALG, .salg_type = "hash", - .salg_name = "sha1", + .salg_name = "sha256", }; int fd_base, fd_alg, ret; ssize_t size; diff --git a/tools/testing/selftests/bpf/test_tc_edt.sh b/tools/testing/selftests/bpf/test_tc_edt.sh deleted file mode 100755 index 76f0bd17061f..000000000000 --- a/tools/testing/selftests/bpf/test_tc_edt.sh +++ /dev/null @@ -1,100 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# This test installs a TC bpf program that throttles a TCP flow -# with dst port = 9000 down to 5MBps. 
Then it measures actual -# throughput of the flow. - -BPF_FILE="test_tc_edt.bpf.o" -if [[ $EUID -ne 0 ]]; then - echo "This script must be run as root" - echo "FAIL" - exit 1 -fi - -# check that nc, dd, and timeout are present -command -v nc >/dev/null 2>&1 || \ - { echo >&2 "nc is not available"; exit 1; } -command -v dd >/dev/null 2>&1 || \ - { echo >&2 "nc is not available"; exit 1; } -command -v timeout >/dev/null 2>&1 || \ - { echo >&2 "timeout is not available"; exit 1; } - -readonly NS_SRC="ns-src-$(mktemp -u XXXXXX)" -readonly NS_DST="ns-dst-$(mktemp -u XXXXXX)" - -readonly IP_SRC="172.16.1.100" -readonly IP_DST="172.16.2.100" - -cleanup() -{ - ip netns del ${NS_SRC} - ip netns del ${NS_DST} -} - -trap cleanup EXIT - -set -e # exit on error - -ip netns add "${NS_SRC}" -ip netns add "${NS_DST}" -ip link add veth_src type veth peer name veth_dst -ip link set veth_src netns ${NS_SRC} -ip link set veth_dst netns ${NS_DST} - -ip -netns ${NS_SRC} addr add ${IP_SRC}/24 dev veth_src -ip -netns ${NS_DST} addr add ${IP_DST}/24 dev veth_dst - -ip -netns ${NS_SRC} link set dev veth_src up -ip -netns ${NS_DST} link set dev veth_dst up - -ip -netns ${NS_SRC} route add ${IP_DST}/32 dev veth_src -ip -netns ${NS_DST} route add ${IP_SRC}/32 dev veth_dst - -# set up TC on TX -ip netns exec ${NS_SRC} tc qdisc add dev veth_src root fq -ip netns exec ${NS_SRC} tc qdisc add dev veth_src clsact -ip netns exec ${NS_SRC} tc filter add dev veth_src egress \ - bpf da obj ${BPF_FILE} sec cls_test - - -# start the listener -ip netns exec ${NS_DST} bash -c \ - "nc -4 -l -p 9000 >/dev/null &" -declare -i NC_PID=$! 
-sleep 1 - -declare -ir TIMEOUT=20 -declare -ir EXPECTED_BPS=5000000 - -# run the load, capture RX bytes on DST -declare -ir RX_BYTES_START=$( ip netns exec ${NS_DST} \ - cat /sys/class/net/veth_dst/statistics/rx_bytes ) - -set +e -ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero \ - bs=1000 count=1000000 > /dev/tcp/${IP_DST}/9000 2>/dev/null" -set -e - -declare -ir RX_BYTES_END=$( ip netns exec ${NS_DST} \ - cat /sys/class/net/veth_dst/statistics/rx_bytes ) - -declare -ir ACTUAL_BPS=$(( ($RX_BYTES_END - $RX_BYTES_START) / $TIMEOUT )) - -echo $TIMEOUT $ACTUAL_BPS $EXPECTED_BPS | \ - awk '{printf "elapsed: %d sec; bps difference: %.2f%%\n", - $1, ($2-$3)*100.0/$3}' - -# Pass the test if the actual bps is within 1% of the expected bps. -# The difference is usually about 0.1% on a 20-sec test, and ==> zero -# the longer the test runs. -declare -ir RES=$( echo $ACTUAL_BPS $EXPECTED_BPS | \ - awk 'function abs(x){return ((x < 0.0) ? -x : x)} - {if (abs(($1-$2)*100.0/$2) > 1.0) { print "1" } - else { print "0"} }' ) -if [ "${RES}" == "0" ] ; then - echo "PASS" -else - echo "FAIL" - exit 1 -fi diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh deleted file mode 100755 index cb55a908bb0d..000000000000 --- a/tools/testing/selftests/bpf/test_tc_tunnel.sh +++ /dev/null @@ -1,320 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# In-place tunneling - -BPF_FILE="test_tc_tunnel.bpf.o" -# must match the port that the bpf program filters on -readonly port=8000 - -readonly ns_prefix="ns-$$-" -readonly ns1="${ns_prefix}1" -readonly ns2="${ns_prefix}2" - -readonly ns1_v4=192.168.1.1 -readonly ns2_v4=192.168.1.2 -readonly ns1_v6=fd::1 -readonly ns2_v6=fd::2 - -# Must match port used by bpf program -readonly udpport=5555 -# MPLSoverUDP -readonly mplsudpport=6635 -readonly mplsproto=137 - -readonly infile="$(mktemp)" -readonly outfile="$(mktemp)" - -setup() { - ip netns add "${ns1}" - ip netns add 
"${ns2}" - - ip link add dev veth1 mtu 1500 netns "${ns1}" type veth \ - peer name veth2 mtu 1500 netns "${ns2}" - - ip netns exec "${ns1}" ethtool -K veth1 tso off - - ip -netns "${ns1}" link set veth1 up - ip -netns "${ns2}" link set veth2 up - - ip -netns "${ns1}" -4 addr add "${ns1_v4}/24" dev veth1 - ip -netns "${ns2}" -4 addr add "${ns2_v4}/24" dev veth2 - ip -netns "${ns1}" -6 addr add "${ns1_v6}/64" dev veth1 nodad - ip -netns "${ns2}" -6 addr add "${ns2_v6}/64" dev veth2 nodad - - # clamp route to reserve room for tunnel headers - ip -netns "${ns1}" -4 route flush table main - ip -netns "${ns1}" -6 route flush table main - ip -netns "${ns1}" -4 route add "${ns2_v4}" mtu 1450 dev veth1 - ip -netns "${ns1}" -6 route add "${ns2_v6}" mtu 1430 dev veth1 - - sleep 1 - - dd if=/dev/urandom of="${infile}" bs="${datalen}" count=1 status=none -} - -cleanup() { - ip netns del "${ns2}" - ip netns del "${ns1}" - - if [[ -f "${outfile}" ]]; then - rm "${outfile}" - fi - if [[ -f "${infile}" ]]; then - rm "${infile}" - fi - - if [[ -n $server_pid ]]; then - kill $server_pid 2> /dev/null - fi -} - -server_listen() { - ip netns exec "${ns2}" nc "${netcat_opt}" -l "${port}" > "${outfile}" & - server_pid=$! -} - -client_connect() { - ip netns exec "${ns1}" timeout 2 nc "${netcat_opt}" -w 1 "${addr2}" "${port}" < "${infile}" - echo $? 
-} - -verify_data() { - wait "${server_pid}" - server_pid= - # sha1sum returns two fields [sha1] [filepath] - # convert to bash array and access first elem - insum=($(sha1sum ${infile})) - outsum=($(sha1sum ${outfile})) - if [[ "${insum[0]}" != "${outsum[0]}" ]]; then - echo "data mismatch" - exit 1 - fi -} - -wait_for_port() { - for i in $(seq 20); do - if ip netns exec "${ns2}" ss ${2:--4}OHntl | grep -q "$1"; then - return 0 - fi - sleep 0.1 - done - return 1 -} - -set -e - -# no arguments: automated test, run all -if [[ "$#" -eq "0" ]]; then - echo "ipip" - $0 ipv4 ipip none 100 - - echo "ipip6" - $0 ipv4 ipip6 none 100 - - echo "ip6ip6" - $0 ipv6 ip6tnl none 100 - - echo "sit" - $0 ipv6 sit none 100 - - echo "ip4 vxlan" - $0 ipv4 vxlan eth 2000 - - echo "ip6 vxlan" - $0 ipv6 ip6vxlan eth 2000 - - for mac in none mpls eth ; do - echo "ip gre $mac" - $0 ipv4 gre $mac 100 - - echo "ip6 gre $mac" - $0 ipv6 ip6gre $mac 100 - - echo "ip gre $mac gso" - $0 ipv4 gre $mac 2000 - - echo "ip6 gre $mac gso" - $0 ipv6 ip6gre $mac 2000 - - echo "ip udp $mac" - $0 ipv4 udp $mac 100 - - echo "ip6 udp $mac" - $0 ipv6 ip6udp $mac 100 - - echo "ip udp $mac gso" - $0 ipv4 udp $mac 2000 - - echo "ip6 udp $mac gso" - $0 ipv6 ip6udp $mac 2000 - done - - echo "OK. 
All tests passed" - exit 0 -fi - -if [[ "$#" -ne "4" ]]; then - echo "Usage: $0" - echo " or: $0 <ipv4|ipv6> <tuntype> <none|mpls|eth> <data_len>" - exit 1 -fi - -case "$1" in -"ipv4") - readonly addr1="${ns1_v4}" - readonly addr2="${ns2_v4}" - readonly ipproto=4 - readonly netcat_opt=-${ipproto} - readonly foumod=fou - readonly foutype=ipip - readonly fouproto=4 - readonly fouproto_mpls=${mplsproto} - readonly gretaptype=gretap - ;; -"ipv6") - readonly addr1="${ns1_v6}" - readonly addr2="${ns2_v6}" - readonly ipproto=6 - readonly netcat_opt=-${ipproto} - readonly foumod=fou6 - readonly foutype=ip6tnl - readonly fouproto="41 -6" - readonly fouproto_mpls="${mplsproto} -6" - readonly gretaptype=ip6gretap - ;; -*) - echo "unknown arg: $1" - exit 1 - ;; -esac - -readonly tuntype=$2 -readonly mac=$3 -readonly datalen=$4 - -echo "encap ${addr1} to ${addr2}, type ${tuntype}, mac ${mac} len ${datalen}" - -trap cleanup EXIT - -setup - -# basic communication works -echo "test basic connectivity" -server_listen -wait_for_port ${port} ${netcat_opt} -client_connect -verify_data - -# clientside, insert bpf program to encap all TCP to port ${port} -# client can no longer connect -ip netns exec "${ns1}" tc qdisc add dev veth1 clsact -ip netns exec "${ns1}" tc filter add dev veth1 egress \ - bpf direct-action object-file ${BPF_FILE} \ - section "encap_${tuntype}_${mac}" -echo "test bpf encap without decap (expect failure)" -server_listen -wait_for_port ${port} ${netcat_opt} -! client_connect - -if [[ "$tuntype" =~ "udp" ]]; then - # Set up fou tunnel. - ttype="${foutype}" - targs="encap fou encap-sport auto encap-dport $udpport" - # fou may be a module; allow this to fail. 
- modprobe "${foumod}" ||true - if [[ "$mac" == "mpls" ]]; then - dport=${mplsudpport} - dproto=${fouproto_mpls} - tmode="mode any ttl 255" - else - dport=${udpport} - dproto=${fouproto} - fi - ip netns exec "${ns2}" ip fou add port $dport ipproto ${dproto} - targs="encap fou encap-sport auto encap-dport $dport" -elif [[ "$tuntype" =~ "gre" && "$mac" == "eth" ]]; then - ttype=$gretaptype -elif [[ "$tuntype" =~ "vxlan" && "$mac" == "eth" ]]; then - ttype="vxlan" - targs="id 1 dstport 8472 udp6zerocsumrx" -elif [[ "$tuntype" == "ipip6" ]]; then - ttype="ip6tnl" - targs="" -else - ttype=$tuntype - targs="" -fi - -# tunnel address family differs from inner for SIT -if [[ "${tuntype}" == "sit" ]]; then - link_addr1="${ns1_v4}" - link_addr2="${ns2_v4}" -elif [[ "${tuntype}" == "ipip6" ]]; then - link_addr1="${ns1_v6}" - link_addr2="${ns2_v6}" -else - link_addr1="${addr1}" - link_addr2="${addr2}" -fi - -# serverside, insert decap module -# server is still running -# client can connect again -ip netns exec "${ns2}" ip link add name testtun0 type "${ttype}" \ - ${tmode} remote "${link_addr1}" local "${link_addr2}" $targs - -expect_tun_fail=0 - -if [[ "$tuntype" == "ip6udp" && "$mac" == "mpls" ]]; then - # No support for MPLS IPv6 fou tunnel; expect failure. - expect_tun_fail=1 -elif [[ "$tuntype" =~ "udp" && "$mac" == "eth" ]]; then - # No support for TEB fou tunnel; expect failure. - expect_tun_fail=1 -elif [[ "$tuntype" =~ (gre|vxlan) && "$mac" == "eth" ]]; then - # Share ethernet address between tunnel/veth2 so L2 decap works. 
- ethaddr=$(ip netns exec "${ns2}" ip link show veth2 | \ - awk '/ether/ { print $2 }') - ip netns exec "${ns2}" ip link set testtun0 address $ethaddr -elif [[ "$mac" == "mpls" ]]; then - modprobe mpls_iptunnel ||true - modprobe mpls_gso ||true - ip netns exec "${ns2}" sysctl -qw net.mpls.platform_labels=65536 - ip netns exec "${ns2}" ip -f mpls route add 1000 dev lo - ip netns exec "${ns2}" ip link set lo up - ip netns exec "${ns2}" sysctl -qw net.mpls.conf.testtun0.input=1 - ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.lo.rp_filter=0 -fi - -# Because packets are decapped by the tunnel they arrive on testtun0 from -# the IP stack perspective. Ensure reverse path filtering is disabled -# otherwise we drop the TCP SYN as arriving on testtun0 instead of the -# expected veth2 (veth2 is where 192.168.1.2 is configured). -ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.rp_filter=0 -# rp needs to be disabled for both all and testtun0 as the rp value is -# selected as the max of the "all" and device-specific values. -ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.testtun0.rp_filter=0 -ip netns exec "${ns2}" ip link set dev testtun0 up -if [[ "$expect_tun_fail" == 1 ]]; then - # This tunnel mode is not supported, so we expect failure. - echo "test bpf encap with tunnel device decap (expect failure)" - ! 
client_connect -else - echo "test bpf encap with tunnel device decap" - client_connect - verify_data - server_listen - wait_for_port ${port} ${netcat_opt} -fi - -# serverside, use BPF for decap -ip netns exec "${ns2}" ip link del dev testtun0 -ip netns exec "${ns2}" tc qdisc add dev veth2 clsact -ip netns exec "${ns2}" tc filter add dev veth2 ingress \ - bpf direct-action object-file ${BPF_FILE} section decap -echo "test bpf encap with bpf decap" -client_connect -verify_data - -echo OK diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c index 352adc8df2d1..9234a58b0a97 100644 --- a/tools/testing/selftests/bpf/xskxceiver.c +++ b/tools/testing/selftests/bpf/xskxceiver.c @@ -74,31 +74,23 @@ #define _GNU_SOURCE #include <assert.h> #include <fcntl.h> -#include <errno.h> #include <getopt.h> #include <linux/if_link.h> #include <linux/if_ether.h> #include <linux/mman.h> #include <linux/netdev.h> -#include <linux/bitmap.h> #include <linux/ethtool.h> #include <arpa/inet.h> #include <net/if.h> #include <locale.h> -#include <poll.h> -#include <pthread.h> -#include <signal.h> #include <stdio.h> #include <stdlib.h> #include <libgen.h> -#include <string.h> #include <stddef.h> #include <sys/mman.h> -#include <sys/socket.h> -#include <sys/time.h> #include <sys/types.h> -#include <unistd.h> +#include "prog_tests/test_xsk.h" #include "xsk_xdp_progs.skel.h" #include "xsk.h" #include "xskxceiver.h" @@ -109,9 +101,6 @@ #include <network_helpers.h> -#define MAX_TX_BUDGET_DEFAULT 32 - -static bool opt_verbose; static bool opt_print_tests; static enum test_mode opt_mode = TEST_MODE_ALL; static u32 opt_run_test = RUN_ALL_TESTS; @@ -120,169 +109,12 @@ void test__fail(void) { /* for network_helpers.c */ } static void __exit_with_error(int error, const char *file, const char *func, int line) { - ksft_test_result_fail("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, error, - strerror(error)); + ksft_test_result_fail("[%s:%s:%i]: ERROR: 
%d/\"%s\"\n", file, func, line, + error, strerror(error)); ksft_exit_xfail(); } #define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, __LINE__) -#define busy_poll_string(test) (test)->ifobj_tx->busy_poll ? "BUSY-POLL " : "" -static char *mode_string(struct test_spec *test) -{ - switch (test->mode) { - case TEST_MODE_SKB: - return "SKB"; - case TEST_MODE_DRV: - return "DRV"; - case TEST_MODE_ZC: - return "ZC"; - default: - return "BOGUS"; - } -} - -static void report_failure(struct test_spec *test) -{ - if (test->fail) - return; - - ksft_test_result_fail("FAIL: %s %s%s\n", mode_string(test), busy_poll_string(test), - test->name); - test->fail = true; -} - -/* The payload is a word consisting of a packet sequence number in the upper - * 16-bits and a intra packet data sequence number in the lower 16 bits. So the 3rd packet's - * 5th word of data will contain the number (2<<16) | 4 as they are numbered from 0. - */ -static void write_payload(void *dest, u32 pkt_nb, u32 start, u32 size) -{ - u32 *ptr = (u32 *)dest, i; - - start /= sizeof(*ptr); - size /= sizeof(*ptr); - for (i = 0; i < size; i++) - ptr[i] = htonl(pkt_nb << 16 | (i + start)); -} - -static void gen_eth_hdr(struct xsk_socket_info *xsk, struct ethhdr *eth_hdr) -{ - memcpy(eth_hdr->h_dest, xsk->dst_mac, ETH_ALEN); - memcpy(eth_hdr->h_source, xsk->src_mac, ETH_ALEN); - eth_hdr->h_proto = htons(ETH_P_LOOPBACK); -} - -static bool is_umem_valid(struct ifobject *ifobj) -{ - return !!ifobj->umem->umem; -} - -static u32 mode_to_xdp_flags(enum test_mode mode) -{ - return (mode == TEST_MODE_SKB) ? 
XDP_FLAGS_SKB_MODE : XDP_FLAGS_DRV_MODE; -} - -static u64 umem_size(struct xsk_umem_info *umem) -{ - return umem->num_frames * umem->frame_size; -} - -static int xsk_configure_umem(struct ifobject *ifobj, struct xsk_umem_info *umem, void *buffer, - u64 size) -{ - struct xsk_umem_config cfg = { - .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, - .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, - .frame_size = umem->frame_size, - .frame_headroom = umem->frame_headroom, - .flags = XSK_UMEM__DEFAULT_FLAGS - }; - int ret; - - if (umem->fill_size) - cfg.fill_size = umem->fill_size; - - if (umem->comp_size) - cfg.comp_size = umem->comp_size; - - if (umem->unaligned_mode) - cfg.flags |= XDP_UMEM_UNALIGNED_CHUNK_FLAG; - - ret = xsk_umem__create(&umem->umem, buffer, size, - &umem->fq, &umem->cq, &cfg); - if (ret) - return ret; - - umem->buffer = buffer; - if (ifobj->shared_umem && ifobj->rx_on) { - umem->base_addr = umem_size(umem); - umem->next_buffer = umem_size(umem); - } - - return 0; -} - -static u64 umem_alloc_buffer(struct xsk_umem_info *umem) -{ - u64 addr; - - addr = umem->next_buffer; - umem->next_buffer += umem->frame_size; - if (umem->next_buffer >= umem->base_addr + umem_size(umem)) - umem->next_buffer = umem->base_addr; - - return addr; -} - -static void umem_reset_alloc(struct xsk_umem_info *umem) -{ - umem->next_buffer = 0; -} - -static void enable_busy_poll(struct xsk_socket_info *xsk) -{ - int sock_opt; - - sock_opt = 1; - if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_PREFER_BUSY_POLL, - (void *)&sock_opt, sizeof(sock_opt)) < 0) - exit_with_error(errno); - - sock_opt = 20; - if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_BUSY_POLL, - (void *)&sock_opt, sizeof(sock_opt)) < 0) - exit_with_error(errno); - - sock_opt = xsk->batch_size; - if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_BUSY_POLL_BUDGET, - (void *)&sock_opt, sizeof(sock_opt)) < 0) - exit_with_error(errno); -} - -static int __xsk_configure_socket(struct xsk_socket_info 
*xsk, struct xsk_umem_info *umem, - struct ifobject *ifobject, bool shared) -{ - struct xsk_socket_config cfg = {}; - struct xsk_ring_cons *rxr; - struct xsk_ring_prod *txr; - - xsk->umem = umem; - cfg.rx_size = xsk->rxqsize; - cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; - cfg.bind_flags = ifobject->bind_flags; - if (shared) - cfg.bind_flags |= XDP_SHARED_UMEM; - if (ifobject->mtu > MAX_ETH_PKT_SIZE) - cfg.bind_flags |= XDP_USE_SG; - if (umem->comp_size) - cfg.tx_size = umem->comp_size; - if (umem->fill_size) - cfg.rx_size = umem->fill_size; - - txr = ifobject->tx_on ? &xsk->tx : NULL; - rxr = ifobject->rx_on ? &xsk->rx : NULL; - return xsk_socket__create(&xsk->xsk, ifobject->ifindex, 0, umem->umem, rxr, txr, &cfg); -} static bool ifobj_zc_avail(struct ifobject *ifobject) { @@ -314,7 +146,7 @@ static bool ifobj_zc_avail(struct ifobject *ifobject) ifobject->bind_flags = XDP_USE_NEED_WAKEUP | XDP_ZEROCOPY; ifobject->rx_on = true; xsk->rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS; - ret = __xsk_configure_socket(xsk, umem, ifobject, false); + ret = xsk_configure_socket(xsk, umem, ifobject, false); if (!ret) zc_avail = true; @@ -327,25 +159,6 @@ out: return zc_avail; } -#define MAX_SKB_FRAGS_PATH "/proc/sys/net/core/max_skb_frags" -static unsigned int get_max_skb_frags(void) -{ - unsigned int max_skb_frags = 0; - FILE *file; - - file = fopen(MAX_SKB_FRAGS_PATH, "r"); - if (!file) { - ksft_print_msg("Error opening %s\n", MAX_SKB_FRAGS_PATH); - return 0; - } - - if (fscanf(file, "%u", &max_skb_frags) != 1) - ksft_print_msg("Error reading %s\n", MAX_SKB_FRAGS_PATH); - - fclose(file); - return max_skb_frags; -} - static struct option long_options[] = { {"interface", required_argument, 0, 'i'}, {"busy-poll", no_argument, 0, 'b'}, @@ -446,2256 +259,36 @@ static void parse_command_line(struct ifobject *ifobj_tx, struct ifobject *ifobj } } -static int set_ring_size(struct ifobject *ifobj) -{ - int ret; - u32 ctr = 0; - - while (ctr++ < SOCK_RECONF_CTR) { - ret = 
set_hw_ring_size(ifobj->ifname, &ifobj->ring); - if (!ret) - break; - - /* Retry if it fails */ - if (ctr >= SOCK_RECONF_CTR || errno != EBUSY) - return -errno; - - usleep(USLEEP_MAX); - } - - return ret; -} - -static int hw_ring_size_reset(struct ifobject *ifobj) -{ - ifobj->ring.tx_pending = ifobj->set_ring.default_tx; - ifobj->ring.rx_pending = ifobj->set_ring.default_rx; - return set_ring_size(ifobj); -} - -static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx, - struct ifobject *ifobj_rx) -{ - u32 i, j; - - for (i = 0; i < MAX_INTERFACES; i++) { - struct ifobject *ifobj = i ? ifobj_rx : ifobj_tx; - - ifobj->xsk = &ifobj->xsk_arr[0]; - ifobj->use_poll = false; - ifobj->use_fill_ring = true; - ifobj->release_rx = true; - ifobj->validation_func = NULL; - ifobj->use_metadata = false; - - if (i == 0) { - ifobj->rx_on = false; - ifobj->tx_on = true; - } else { - ifobj->rx_on = true; - ifobj->tx_on = false; - } - - memset(ifobj->umem, 0, sizeof(*ifobj->umem)); - ifobj->umem->num_frames = DEFAULT_UMEM_BUFFERS; - ifobj->umem->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE; - - for (j = 0; j < MAX_SOCKETS; j++) { - memset(&ifobj->xsk_arr[j], 0, sizeof(ifobj->xsk_arr[j])); - ifobj->xsk_arr[j].rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS; - ifobj->xsk_arr[j].batch_size = DEFAULT_BATCH_SIZE; - if (i == 0) - ifobj->xsk_arr[j].pkt_stream = test->tx_pkt_stream_default; - else - ifobj->xsk_arr[j].pkt_stream = test->rx_pkt_stream_default; - - memcpy(ifobj->xsk_arr[j].src_mac, g_mac, ETH_ALEN); - memcpy(ifobj->xsk_arr[j].dst_mac, g_mac, ETH_ALEN); - ifobj->xsk_arr[j].src_mac[5] += ((j * 2) + 0); - ifobj->xsk_arr[j].dst_mac[5] += ((j * 2) + 1); - } - } - - if (ifobj_tx->hw_ring_size_supp) - hw_ring_size_reset(ifobj_tx); - - test->ifobj_tx = ifobj_tx; - test->ifobj_rx = ifobj_rx; - test->current_step = 0; - test->total_steps = 1; - test->nb_sockets = 1; - test->fail = false; - test->set_ring = false; - test->adjust_tail = false; - test->adjust_tail_support = 
false; - test->mtu = MAX_ETH_PKT_SIZE; - test->xdp_prog_rx = ifobj_rx->xdp_progs->progs.xsk_def_prog; - test->xskmap_rx = ifobj_rx->xdp_progs->maps.xsk; - test->xdp_prog_tx = ifobj_tx->xdp_progs->progs.xsk_def_prog; - test->xskmap_tx = ifobj_tx->xdp_progs->maps.xsk; -} - -static void test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx, - struct ifobject *ifobj_rx, enum test_mode mode, - const struct test_spec *test_to_run) -{ - struct pkt_stream *tx_pkt_stream; - struct pkt_stream *rx_pkt_stream; - u32 i; - - tx_pkt_stream = test->tx_pkt_stream_default; - rx_pkt_stream = test->rx_pkt_stream_default; - memset(test, 0, sizeof(*test)); - test->tx_pkt_stream_default = tx_pkt_stream; - test->rx_pkt_stream_default = rx_pkt_stream; - - for (i = 0; i < MAX_INTERFACES; i++) { - struct ifobject *ifobj = i ? ifobj_rx : ifobj_tx; - - ifobj->bind_flags = XDP_USE_NEED_WAKEUP; - if (mode == TEST_MODE_ZC) - ifobj->bind_flags |= XDP_ZEROCOPY; - else - ifobj->bind_flags |= XDP_COPY; - } - - strncpy(test->name, test_to_run->name, MAX_TEST_NAME_SIZE); - test->test_func = test_to_run->test_func; - test->mode = mode; - __test_spec_init(test, ifobj_tx, ifobj_rx); -} - -static void test_spec_reset(struct test_spec *test) -{ - __test_spec_init(test, test->ifobj_tx, test->ifobj_rx); -} - -static void test_spec_set_xdp_prog(struct test_spec *test, struct bpf_program *xdp_prog_rx, - struct bpf_program *xdp_prog_tx, struct bpf_map *xskmap_rx, - struct bpf_map *xskmap_tx) -{ - test->xdp_prog_rx = xdp_prog_rx; - test->xdp_prog_tx = xdp_prog_tx; - test->xskmap_rx = xskmap_rx; - test->xskmap_tx = xskmap_tx; -} - -static int test_spec_set_mtu(struct test_spec *test, int mtu) -{ - int err; - - if (test->ifobj_rx->mtu != mtu) { - err = xsk_set_mtu(test->ifobj_rx->ifindex, mtu); - if (err) - return err; - test->ifobj_rx->mtu = mtu; - } - if (test->ifobj_tx->mtu != mtu) { - err = xsk_set_mtu(test->ifobj_tx->ifindex, mtu); - if (err) - return err; - test->ifobj_tx->mtu = mtu; - } - - return 
0; -} - -static void pkt_stream_reset(struct pkt_stream *pkt_stream) -{ - if (pkt_stream) { - pkt_stream->current_pkt_nb = 0; - pkt_stream->nb_rx_pkts = 0; - } -} - -static struct pkt *pkt_stream_get_next_tx_pkt(struct pkt_stream *pkt_stream) -{ - if (pkt_stream->current_pkt_nb >= pkt_stream->nb_pkts) - return NULL; - - return &pkt_stream->pkts[pkt_stream->current_pkt_nb++]; -} - -static struct pkt *pkt_stream_get_next_rx_pkt(struct pkt_stream *pkt_stream, u32 *pkts_sent) -{ - while (pkt_stream->current_pkt_nb < pkt_stream->nb_pkts) { - (*pkts_sent)++; - if (pkt_stream->pkts[pkt_stream->current_pkt_nb].valid) - return &pkt_stream->pkts[pkt_stream->current_pkt_nb++]; - pkt_stream->current_pkt_nb++; - } - return NULL; -} - -static void pkt_stream_delete(struct pkt_stream *pkt_stream) -{ - free(pkt_stream->pkts); - free(pkt_stream); -} - -static void pkt_stream_restore_default(struct test_spec *test) -{ - struct pkt_stream *tx_pkt_stream = test->ifobj_tx->xsk->pkt_stream; - struct pkt_stream *rx_pkt_stream = test->ifobj_rx->xsk->pkt_stream; - - if (tx_pkt_stream != test->tx_pkt_stream_default) { - pkt_stream_delete(test->ifobj_tx->xsk->pkt_stream); - test->ifobj_tx->xsk->pkt_stream = test->tx_pkt_stream_default; - } - - if (rx_pkt_stream != test->rx_pkt_stream_default) { - pkt_stream_delete(test->ifobj_rx->xsk->pkt_stream); - test->ifobj_rx->xsk->pkt_stream = test->rx_pkt_stream_default; - } -} - -static struct pkt_stream *__pkt_stream_alloc(u32 nb_pkts) -{ - struct pkt_stream *pkt_stream; - - pkt_stream = calloc(1, sizeof(*pkt_stream)); - if (!pkt_stream) - return NULL; - - pkt_stream->pkts = calloc(nb_pkts, sizeof(*pkt_stream->pkts)); - if (!pkt_stream->pkts) { - free(pkt_stream); - return NULL; - } - - pkt_stream->nb_pkts = nb_pkts; - return pkt_stream; -} - -static bool pkt_continues(u32 options) -{ - return options & XDP_PKT_CONTD; -} - -static u32 ceil_u32(u32 a, u32 b) -{ - return (a + b - 1) / b; -} - -static u32 pkt_nb_frags(u32 frame_size, struct pkt_stream 
*pkt_stream, struct pkt *pkt) -{ - u32 nb_frags = 1, next_frag; - - if (!pkt) - return 1; - - if (!pkt_stream->verbatim) { - if (!pkt->valid || !pkt->len) - return 1; - return ceil_u32(pkt->len, frame_size); - } - - /* Search for the end of the packet in verbatim mode */ - if (!pkt_continues(pkt->options)) - return nb_frags; - - next_frag = pkt_stream->current_pkt_nb; - pkt++; - while (next_frag++ < pkt_stream->nb_pkts) { - nb_frags++; - if (!pkt_continues(pkt->options) || !pkt->valid) - break; - pkt++; - } - return nb_frags; -} - -static bool set_pkt_valid(int offset, u32 len) -{ - return len <= MAX_ETH_JUMBO_SIZE; -} - -static void pkt_set(struct pkt_stream *pkt_stream, struct pkt *pkt, int offset, u32 len) -{ - pkt->offset = offset; - pkt->len = len; - pkt->valid = set_pkt_valid(offset, len); -} - -static void pkt_stream_pkt_set(struct pkt_stream *pkt_stream, struct pkt *pkt, int offset, u32 len) -{ - bool prev_pkt_valid = pkt->valid; - - pkt_set(pkt_stream, pkt, offset, len); - pkt_stream->nb_valid_entries += pkt->valid - prev_pkt_valid; -} - -static u32 pkt_get_buffer_len(struct xsk_umem_info *umem, u32 len) -{ - return ceil_u32(len, umem->frame_size) * umem->frame_size; -} - -static struct pkt_stream *__pkt_stream_generate(u32 nb_pkts, u32 pkt_len, u32 nb_start, u32 nb_off) -{ - struct pkt_stream *pkt_stream; - u32 i; - - pkt_stream = __pkt_stream_alloc(nb_pkts); - if (!pkt_stream) - exit_with_error(ENOMEM); - - pkt_stream->nb_pkts = nb_pkts; - pkt_stream->max_pkt_len = pkt_len; - for (i = 0; i < nb_pkts; i++) { - struct pkt *pkt = &pkt_stream->pkts[i]; - - pkt_stream_pkt_set(pkt_stream, pkt, 0, pkt_len); - pkt->pkt_nb = nb_start + i * nb_off; - } - - return pkt_stream; -} - -static struct pkt_stream *pkt_stream_generate(u32 nb_pkts, u32 pkt_len) -{ - return __pkt_stream_generate(nb_pkts, pkt_len, 0, 1); -} - -static struct pkt_stream *pkt_stream_clone(struct pkt_stream *pkt_stream) -{ - return pkt_stream_generate(pkt_stream->nb_pkts, 
pkt_stream->pkts[0].len); -} - -static void pkt_stream_replace_ifobject(struct ifobject *ifobj, u32 nb_pkts, u32 pkt_len) -{ - ifobj->xsk->pkt_stream = pkt_stream_generate(nb_pkts, pkt_len); -} - -static void pkt_stream_replace(struct test_spec *test, u32 nb_pkts, u32 pkt_len) -{ - pkt_stream_replace_ifobject(test->ifobj_tx, nb_pkts, pkt_len); - pkt_stream_replace_ifobject(test->ifobj_rx, nb_pkts, pkt_len); -} - -static void __pkt_stream_replace_half(struct ifobject *ifobj, u32 pkt_len, - int offset) -{ - struct pkt_stream *pkt_stream; - u32 i; - - pkt_stream = pkt_stream_clone(ifobj->xsk->pkt_stream); - for (i = 1; i < ifobj->xsk->pkt_stream->nb_pkts; i += 2) - pkt_stream_pkt_set(pkt_stream, &pkt_stream->pkts[i], offset, pkt_len); - - ifobj->xsk->pkt_stream = pkt_stream; -} - -static void pkt_stream_replace_half(struct test_spec *test, u32 pkt_len, int offset) -{ - __pkt_stream_replace_half(test->ifobj_tx, pkt_len, offset); - __pkt_stream_replace_half(test->ifobj_rx, pkt_len, offset); -} - -static void pkt_stream_receive_half(struct test_spec *test) -{ - struct pkt_stream *pkt_stream = test->ifobj_tx->xsk->pkt_stream; - u32 i; - - test->ifobj_rx->xsk->pkt_stream = pkt_stream_generate(pkt_stream->nb_pkts, - pkt_stream->pkts[0].len); - pkt_stream = test->ifobj_rx->xsk->pkt_stream; - for (i = 1; i < pkt_stream->nb_pkts; i += 2) - pkt_stream->pkts[i].valid = false; - - pkt_stream->nb_valid_entries /= 2; -} - -static void pkt_stream_even_odd_sequence(struct test_spec *test) -{ - struct pkt_stream *pkt_stream; - u32 i; - - for (i = 0; i < test->nb_sockets; i++) { - pkt_stream = test->ifobj_tx->xsk_arr[i].pkt_stream; - pkt_stream = __pkt_stream_generate(pkt_stream->nb_pkts / 2, - pkt_stream->pkts[0].len, i, 2); - test->ifobj_tx->xsk_arr[i].pkt_stream = pkt_stream; - - pkt_stream = test->ifobj_rx->xsk_arr[i].pkt_stream; - pkt_stream = __pkt_stream_generate(pkt_stream->nb_pkts / 2, - pkt_stream->pkts[0].len, i, 2); - test->ifobj_rx->xsk_arr[i].pkt_stream = pkt_stream; - } 
-} - -static u64 pkt_get_addr(struct pkt *pkt, struct xsk_umem_info *umem) -{ - if (!pkt->valid) - return pkt->offset; - return pkt->offset + umem_alloc_buffer(umem); -} - -static void pkt_stream_cancel(struct pkt_stream *pkt_stream) -{ - pkt_stream->current_pkt_nb--; -} - -static void pkt_generate(struct xsk_socket_info *xsk, struct xsk_umem_info *umem, u64 addr, u32 len, - u32 pkt_nb, u32 bytes_written) -{ - void *data = xsk_umem__get_data(umem->buffer, addr); - - if (len < MIN_PKT_SIZE) - return; - - if (!bytes_written) { - gen_eth_hdr(xsk, data); - - len -= PKT_HDR_SIZE; - data += PKT_HDR_SIZE; - } else { - bytes_written -= PKT_HDR_SIZE; - } - - write_payload(data, pkt_nb, bytes_written, len); -} - -static struct pkt_stream *__pkt_stream_generate_custom(struct ifobject *ifobj, struct pkt *frames, - u32 nb_frames, bool verbatim) -{ - u32 i, len = 0, pkt_nb = 0, payload = 0; - struct pkt_stream *pkt_stream; - - pkt_stream = __pkt_stream_alloc(nb_frames); - if (!pkt_stream) - exit_with_error(ENOMEM); - - for (i = 0; i < nb_frames; i++) { - struct pkt *pkt = &pkt_stream->pkts[pkt_nb]; - struct pkt *frame = &frames[i]; - - pkt->offset = frame->offset; - if (verbatim) { - *pkt = *frame; - pkt->pkt_nb = payload; - if (!frame->valid || !pkt_continues(frame->options)) - payload++; - } else { - if (frame->valid) - len += frame->len; - if (frame->valid && pkt_continues(frame->options)) - continue; - - pkt->pkt_nb = pkt_nb; - pkt->len = len; - pkt->valid = frame->valid; - pkt->options = 0; - - len = 0; - } - - print_verbose("offset: %d len: %u valid: %u options: %u pkt_nb: %u\n", - pkt->offset, pkt->len, pkt->valid, pkt->options, pkt->pkt_nb); - - if (pkt->valid && pkt->len > pkt_stream->max_pkt_len) - pkt_stream->max_pkt_len = pkt->len; - - if (pkt->valid) - pkt_stream->nb_valid_entries++; - - pkt_nb++; - } - - pkt_stream->nb_pkts = pkt_nb; - pkt_stream->verbatim = verbatim; - return pkt_stream; -} - -static void pkt_stream_generate_custom(struct test_spec *test, struct 
pkt *pkts, u32 nb_pkts) -{ - struct pkt_stream *pkt_stream; - - pkt_stream = __pkt_stream_generate_custom(test->ifobj_tx, pkts, nb_pkts, true); - test->ifobj_tx->xsk->pkt_stream = pkt_stream; - - pkt_stream = __pkt_stream_generate_custom(test->ifobj_rx, pkts, nb_pkts, false); - test->ifobj_rx->xsk->pkt_stream = pkt_stream; -} - -static void pkt_print_data(u32 *data, u32 cnt) -{ - u32 i; - - for (i = 0; i < cnt; i++) { - u32 seqnum, pkt_nb; - - seqnum = ntohl(*data) & 0xffff; - pkt_nb = ntohl(*data) >> 16; - ksft_print_msg("%u:%u ", pkt_nb, seqnum); - data++; - } -} - -static void pkt_dump(void *pkt, u32 len, bool eth_header) -{ - struct ethhdr *ethhdr = pkt; - u32 i, *data; - - if (eth_header) { - /*extract L2 frame */ - ksft_print_msg("DEBUG>> L2: dst mac: "); - for (i = 0; i < ETH_ALEN; i++) - ksft_print_msg("%02X", ethhdr->h_dest[i]); - - ksft_print_msg("\nDEBUG>> L2: src mac: "); - for (i = 0; i < ETH_ALEN; i++) - ksft_print_msg("%02X", ethhdr->h_source[i]); - - data = pkt + PKT_HDR_SIZE; - } else { - data = pkt; - } - - /*extract L5 frame */ - ksft_print_msg("\nDEBUG>> L5: seqnum: "); - pkt_print_data(data, PKT_DUMP_NB_TO_PRINT); - ksft_print_msg("...."); - if (len > PKT_DUMP_NB_TO_PRINT * sizeof(u32)) { - ksft_print_msg("\n.... "); - pkt_print_data(data + len / sizeof(u32) - PKT_DUMP_NB_TO_PRINT, - PKT_DUMP_NB_TO_PRINT); - } - ksft_print_msg("\n---------------------------------------\n"); -} - -static bool is_offset_correct(struct xsk_umem_info *umem, struct pkt *pkt, u64 addr) -{ - u32 headroom = umem->unaligned_mode ? 0 : umem->frame_headroom; - u32 offset = addr % umem->frame_size, expected_offset; - int pkt_offset = pkt->valid ? 
pkt->offset : 0; - - if (!umem->unaligned_mode) - pkt_offset = 0; - - expected_offset = (pkt_offset + headroom + XDP_PACKET_HEADROOM) % umem->frame_size; - - if (offset == expected_offset) - return true; - - ksft_print_msg("[%s] expected [%u], got [%u]\n", __func__, expected_offset, offset); - return false; -} - -static bool is_metadata_correct(struct pkt *pkt, void *buffer, u64 addr) -{ - void *data = xsk_umem__get_data(buffer, addr); - struct xdp_info *meta = data - sizeof(struct xdp_info); - - if (meta->count != pkt->pkt_nb) { - ksft_print_msg("[%s] expected meta_count [%d], got meta_count [%llu]\n", - __func__, pkt->pkt_nb, - (unsigned long long)meta->count); - return false; - } - - return true; -} - -static bool is_adjust_tail_supported(struct xsk_xdp_progs *skel_rx) -{ - struct bpf_map *data_map; - int adjust_value = 0; - int key = 0; - int ret; - - data_map = bpf_object__find_map_by_name(skel_rx->obj, "xsk_xdp_.bss"); - if (!data_map || !bpf_map__is_internal(data_map)) { - ksft_print_msg("Error: could not find bss section of XDP program\n"); - exit_with_error(errno); - } - - ret = bpf_map_lookup_elem(bpf_map__fd(data_map), &key, &adjust_value); - if (ret) { - ksft_print_msg("Error: bpf_map_lookup_elem failed with error %d\n", ret); - exit_with_error(errno); - } - - /* Set the 'adjust_value' variable to -EOPNOTSUPP in the XDP program if the adjust_tail - * helper is not supported. Skip the adjust_tail test case in this scenario. 
- */ - return adjust_value != -EOPNOTSUPP; -} - -static bool is_frag_valid(struct xsk_umem_info *umem, u64 addr, u32 len, u32 expected_pkt_nb, - u32 bytes_processed) -{ - u32 seqnum, pkt_nb, *pkt_data, words_to_end, expected_seqnum; - void *data = xsk_umem__get_data(umem->buffer, addr); - - addr -= umem->base_addr; - - if (addr >= umem->num_frames * umem->frame_size || - addr + len > umem->num_frames * umem->frame_size) { - ksft_print_msg("Frag invalid addr: %llx len: %u\n", - (unsigned long long)addr, len); - return false; - } - if (!umem->unaligned_mode && addr % umem->frame_size + len > umem->frame_size) { - ksft_print_msg("Frag crosses frame boundary addr: %llx len: %u\n", - (unsigned long long)addr, len); - return false; - } - - pkt_data = data; - if (!bytes_processed) { - pkt_data += PKT_HDR_SIZE / sizeof(*pkt_data); - len -= PKT_HDR_SIZE; - } else { - bytes_processed -= PKT_HDR_SIZE; - } - - expected_seqnum = bytes_processed / sizeof(*pkt_data); - seqnum = ntohl(*pkt_data) & 0xffff; - pkt_nb = ntohl(*pkt_data) >> 16; - - if (expected_pkt_nb != pkt_nb) { - ksft_print_msg("[%s] expected pkt_nb [%u], got pkt_nb [%u]\n", - __func__, expected_pkt_nb, pkt_nb); - goto error; - } - if (expected_seqnum != seqnum) { - ksft_print_msg("[%s] expected seqnum at start [%u], got seqnum [%u]\n", - __func__, expected_seqnum, seqnum); - goto error; - } - - words_to_end = len / sizeof(*pkt_data) - 1; - pkt_data += words_to_end; - seqnum = ntohl(*pkt_data) & 0xffff; - expected_seqnum += words_to_end; - if (expected_seqnum != seqnum) { - ksft_print_msg("[%s] expected seqnum at end [%u], got seqnum [%u]\n", - __func__, expected_seqnum, seqnum); - goto error; - } - - return true; - -error: - pkt_dump(data, len, !bytes_processed); - return false; -} - -static bool is_pkt_valid(struct pkt *pkt, void *buffer, u64 addr, u32 len) -{ - if (pkt->len != len) { - ksft_print_msg("[%s] expected packet length [%d], got length [%d]\n", - __func__, pkt->len, len); - 
pkt_dump(xsk_umem__get_data(buffer, addr), len, true); - return false; - } - - return true; -} - -static u32 load_value(u32 *counter) -{ - return __atomic_load_n(counter, __ATOMIC_ACQUIRE); -} - -static bool kick_tx_with_check(struct xsk_socket_info *xsk, int *ret) -{ - u32 max_budget = MAX_TX_BUDGET_DEFAULT; - u32 cons, ready_to_send; - int delta; - - cons = load_value(xsk->tx.consumer); - ready_to_send = load_value(xsk->tx.producer) - cons; - *ret = sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); - - delta = load_value(xsk->tx.consumer) - cons; - /* By default, xsk should consume exact @max_budget descs at one - * send in this case where hitting the max budget limit in while - * loop is triggered in __xsk_generic_xmit(). Please make sure that - * the number of descs to be sent is larger than @max_budget, or - * else the tx.consumer will be updated in xskq_cons_peek_desc() - * in time which hides the issue we try to verify. - */ - if (ready_to_send > max_budget && delta != max_budget) - return false; - - return true; -} - -static int kick_tx(struct xsk_socket_info *xsk) -{ - int ret; - - if (xsk->check_consumer) { - if (!kick_tx_with_check(xsk, &ret)) - return TEST_FAILURE; - } else { - ret = sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); - } - if (ret >= 0) - return TEST_PASS; - if (errno == ENOBUFS || errno == EAGAIN || errno == EBUSY || errno == ENETDOWN) { - usleep(100); - return TEST_PASS; - } - return TEST_FAILURE; -} - -static int kick_rx(struct xsk_socket_info *xsk) -{ - int ret; - - ret = recvfrom(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, NULL); - if (ret < 0) - return TEST_FAILURE; - - return TEST_PASS; -} - -static int complete_pkts(struct xsk_socket_info *xsk, int batch_size) -{ - unsigned int rcvd; - u32 idx; - int ret; - - if (xsk_ring_prod__needs_wakeup(&xsk->tx)) { - ret = kick_tx(xsk); - if (ret) - return TEST_FAILURE; - } - - rcvd = xsk_ring_cons__peek(&xsk->umem->cq, batch_size, &idx); - if 
(rcvd) { - if (rcvd > xsk->outstanding_tx) { - u64 addr = *xsk_ring_cons__comp_addr(&xsk->umem->cq, idx + rcvd - 1); - - ksft_print_msg("[%s] Too many packets completed\n", __func__); - ksft_print_msg("Last completion address: %llx\n", - (unsigned long long)addr); - return TEST_FAILURE; - } - - xsk_ring_cons__release(&xsk->umem->cq, rcvd); - xsk->outstanding_tx -= rcvd; - } - - return TEST_PASS; -} - -static int __receive_pkts(struct test_spec *test, struct xsk_socket_info *xsk) -{ - u32 frags_processed = 0, nb_frags = 0, pkt_len = 0; - u32 idx_rx = 0, idx_fq = 0, rcvd, pkts_sent = 0; - struct pkt_stream *pkt_stream = xsk->pkt_stream; - struct ifobject *ifobj = test->ifobj_rx; - struct xsk_umem_info *umem = xsk->umem; - struct pollfd fds = { }; - struct pkt *pkt; - u64 first_addr = 0; - int ret; - - fds.fd = xsk_socket__fd(xsk->xsk); - fds.events = POLLIN; - - ret = kick_rx(xsk); - if (ret) - return TEST_FAILURE; - - if (ifobj->use_poll) { - ret = poll(&fds, 1, POLL_TMOUT); - if (ret < 0) - return TEST_FAILURE; - - if (!ret) { - if (!is_umem_valid(test->ifobj_tx)) - return TEST_PASS; - - ksft_print_msg("ERROR: [%s] Poll timed out\n", __func__); - return TEST_CONTINUE; - } - - if (!(fds.revents & POLLIN)) - return TEST_CONTINUE; - } - - rcvd = xsk_ring_cons__peek(&xsk->rx, xsk->batch_size, &idx_rx); - if (!rcvd) - return TEST_CONTINUE; - - if (ifobj->use_fill_ring) { - ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq); - while (ret != rcvd) { - if (xsk_ring_prod__needs_wakeup(&umem->fq)) { - ret = poll(&fds, 1, POLL_TMOUT); - if (ret < 0) - return TEST_FAILURE; - } - ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq); - } - } - - while (frags_processed < rcvd) { - const struct xdp_desc *desc = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++); - u64 addr = desc->addr, orig; - - orig = xsk_umem__extract_addr(addr); - addr = xsk_umem__add_offset_to_addr(addr); - - if (!nb_frags) { - pkt = pkt_stream_get_next_rx_pkt(pkt_stream, &pkts_sent); - if (!pkt) { - 
ksft_print_msg("[%s] received too many packets addr: %lx len %u\n", - __func__, addr, desc->len); - return TEST_FAILURE; - } - } - - print_verbose("Rx: addr: %lx len: %u options: %u pkt_nb: %u valid: %u\n", - addr, desc->len, desc->options, pkt->pkt_nb, pkt->valid); - - if (!is_frag_valid(umem, addr, desc->len, pkt->pkt_nb, pkt_len) || - !is_offset_correct(umem, pkt, addr) || (ifobj->use_metadata && - !is_metadata_correct(pkt, umem->buffer, addr))) - return TEST_FAILURE; - - if (!nb_frags++) - first_addr = addr; - frags_processed++; - pkt_len += desc->len; - if (ifobj->use_fill_ring) - *xsk_ring_prod__fill_addr(&umem->fq, idx_fq++) = orig; - - if (pkt_continues(desc->options)) - continue; - - /* The complete packet has been received */ - if (!is_pkt_valid(pkt, umem->buffer, first_addr, pkt_len) || - !is_offset_correct(umem, pkt, addr)) - return TEST_FAILURE; - - pkt_stream->nb_rx_pkts++; - nb_frags = 0; - pkt_len = 0; - } - - if (nb_frags) { - /* In the middle of a packet. Start over from beginning of packet. 
*/ - idx_rx -= nb_frags; - xsk_ring_cons__cancel(&xsk->rx, nb_frags); - if (ifobj->use_fill_ring) { - idx_fq -= nb_frags; - xsk_ring_prod__cancel(&umem->fq, nb_frags); - } - frags_processed -= nb_frags; - } - - if (ifobj->use_fill_ring) - xsk_ring_prod__submit(&umem->fq, frags_processed); - if (ifobj->release_rx) - xsk_ring_cons__release(&xsk->rx, frags_processed); - - pthread_mutex_lock(&pacing_mutex); - pkts_in_flight -= pkts_sent; - pthread_mutex_unlock(&pacing_mutex); - pkts_sent = 0; - -return TEST_CONTINUE; -} - -bool all_packets_received(struct test_spec *test, struct xsk_socket_info *xsk, u32 sock_num, - unsigned long *bitmap) -{ - struct pkt_stream *pkt_stream = xsk->pkt_stream; - - if (!pkt_stream) { - __set_bit(sock_num, bitmap); - return false; - } - - if (pkt_stream->nb_rx_pkts == pkt_stream->nb_valid_entries) { - __set_bit(sock_num, bitmap); - if (bitmap_full(bitmap, test->nb_sockets)) - return true; - } - - return false; -} - -static int receive_pkts(struct test_spec *test) -{ - struct timeval tv_end, tv_now, tv_timeout = {THREAD_TMOUT, 0}; - DECLARE_BITMAP(bitmap, test->nb_sockets); - struct xsk_socket_info *xsk; - u32 sock_num = 0; - int res, ret; - - ret = gettimeofday(&tv_now, NULL); - if (ret) - exit_with_error(errno); - - timeradd(&tv_now, &tv_timeout, &tv_end); - - while (1) { - xsk = &test->ifobj_rx->xsk_arr[sock_num]; - - if ((all_packets_received(test, xsk, sock_num, bitmap))) - break; - - res = __receive_pkts(test, xsk); - if (!(res == TEST_PASS || res == TEST_CONTINUE)) - return res; - - ret = gettimeofday(&tv_now, NULL); - if (ret) - exit_with_error(errno); - - if (timercmp(&tv_now, &tv_end, >)) { - ksft_print_msg("ERROR: [%s] Receive loop timed out\n", __func__); - return TEST_FAILURE; - } - sock_num = (sock_num + 1) % test->nb_sockets; - } - - return TEST_PASS; -} - -static int __send_pkts(struct ifobject *ifobject, struct xsk_socket_info *xsk, bool timeout) -{ - u32 i, idx = 0, valid_pkts = 0, valid_frags = 0, buffer_len; - struct 
pkt_stream *pkt_stream = xsk->pkt_stream; - struct xsk_umem_info *umem = ifobject->umem; - bool use_poll = ifobject->use_poll; - struct pollfd fds = { }; - int ret; - - buffer_len = pkt_get_buffer_len(umem, pkt_stream->max_pkt_len); - /* pkts_in_flight might be negative if many invalid packets are sent */ - if (pkts_in_flight >= (int)((umem_size(umem) - xsk->batch_size * buffer_len) / - buffer_len)) { - ret = kick_tx(xsk); - if (ret) - return TEST_FAILURE; - return TEST_CONTINUE; - } - - fds.fd = xsk_socket__fd(xsk->xsk); - fds.events = POLLOUT; - - while (xsk_ring_prod__reserve(&xsk->tx, xsk->batch_size, &idx) < xsk->batch_size) { - if (use_poll) { - ret = poll(&fds, 1, POLL_TMOUT); - if (timeout) { - if (ret < 0) { - ksft_print_msg("ERROR: [%s] Poll error %d\n", - __func__, errno); - return TEST_FAILURE; - } - if (ret == 0) - return TEST_PASS; - break; - } - if (ret <= 0) { - ksft_print_msg("ERROR: [%s] Poll error %d\n", - __func__, errno); - return TEST_FAILURE; - } - } - - complete_pkts(xsk, xsk->batch_size); - } - - for (i = 0; i < xsk->batch_size; i++) { - struct pkt *pkt = pkt_stream_get_next_tx_pkt(pkt_stream); - u32 nb_frags_left, nb_frags, bytes_written = 0; - - if (!pkt) - break; - - nb_frags = pkt_nb_frags(umem->frame_size, pkt_stream, pkt); - if (nb_frags > xsk->batch_size - i) { - pkt_stream_cancel(pkt_stream); - xsk_ring_prod__cancel(&xsk->tx, xsk->batch_size - i); - break; - } - nb_frags_left = nb_frags; - - while (nb_frags_left--) { - struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx + i); - - tx_desc->addr = pkt_get_addr(pkt, ifobject->umem); - if (pkt_stream->verbatim) { - tx_desc->len = pkt->len; - tx_desc->options = pkt->options; - } else if (nb_frags_left) { - tx_desc->len = umem->frame_size; - tx_desc->options = XDP_PKT_CONTD; - } else { - tx_desc->len = pkt->len - bytes_written; - tx_desc->options = 0; - } - if (pkt->valid) - pkt_generate(xsk, umem, tx_desc->addr, tx_desc->len, pkt->pkt_nb, - bytes_written); - bytes_written += 
tx_desc->len; - - print_verbose("Tx addr: %llx len: %u options: %u pkt_nb: %u\n", - tx_desc->addr, tx_desc->len, tx_desc->options, pkt->pkt_nb); - - if (nb_frags_left) { - i++; - if (pkt_stream->verbatim) - pkt = pkt_stream_get_next_tx_pkt(pkt_stream); - } - } - - if (pkt && pkt->valid) { - valid_pkts++; - valid_frags += nb_frags; - } - } - - pthread_mutex_lock(&pacing_mutex); - pkts_in_flight += valid_pkts; - pthread_mutex_unlock(&pacing_mutex); - - xsk_ring_prod__submit(&xsk->tx, i); - xsk->outstanding_tx += valid_frags; - - if (use_poll) { - ret = poll(&fds, 1, POLL_TMOUT); - if (ret <= 0) { - if (ret == 0 && timeout) - return TEST_PASS; - - ksft_print_msg("ERROR: [%s] Poll error %d\n", __func__, ret); - return TEST_FAILURE; - } - } - - if (!timeout) { - if (complete_pkts(xsk, i)) - return TEST_FAILURE; - - usleep(10); - return TEST_PASS; - } - - return TEST_CONTINUE; -} - -static int wait_for_tx_completion(struct xsk_socket_info *xsk) -{ - struct timeval tv_end, tv_now, tv_timeout = {THREAD_TMOUT, 0}; - int ret; - - ret = gettimeofday(&tv_now, NULL); - if (ret) - exit_with_error(errno); - timeradd(&tv_now, &tv_timeout, &tv_end); - - while (xsk->outstanding_tx) { - ret = gettimeofday(&tv_now, NULL); - if (ret) - exit_with_error(errno); - if (timercmp(&tv_now, &tv_end, >)) { - ksft_print_msg("ERROR: [%s] Transmission loop timed out\n", __func__); - return TEST_FAILURE; - } - - complete_pkts(xsk, xsk->batch_size); - } - - return TEST_PASS; -} - -bool all_packets_sent(struct test_spec *test, unsigned long *bitmap) -{ - return bitmap_full(bitmap, test->nb_sockets); -} - -static int send_pkts(struct test_spec *test, struct ifobject *ifobject) -{ - bool timeout = !is_umem_valid(test->ifobj_rx); - DECLARE_BITMAP(bitmap, test->nb_sockets); - u32 i, ret; - - while (!(all_packets_sent(test, bitmap))) { - for (i = 0; i < test->nb_sockets; i++) { - struct pkt_stream *pkt_stream; - - pkt_stream = ifobject->xsk_arr[i].pkt_stream; - if (!pkt_stream || 
pkt_stream->current_pkt_nb >= pkt_stream->nb_pkts) { - __set_bit(i, bitmap); - continue; - } - ret = __send_pkts(ifobject, &ifobject->xsk_arr[i], timeout); - if (ret == TEST_CONTINUE && !test->fail) - continue; - - if ((ret || test->fail) && !timeout) - return TEST_FAILURE; - - if (ret == TEST_PASS && timeout) - return ret; - - ret = wait_for_tx_completion(&ifobject->xsk_arr[i]); - if (ret) - return TEST_FAILURE; - } - } - - return TEST_PASS; -} - -static int get_xsk_stats(struct xsk_socket *xsk, struct xdp_statistics *stats) -{ - int fd = xsk_socket__fd(xsk), err; - socklen_t optlen, expected_len; - - optlen = sizeof(*stats); - err = getsockopt(fd, SOL_XDP, XDP_STATISTICS, stats, &optlen); - if (err) { - ksft_print_msg("[%s] getsockopt(XDP_STATISTICS) error %u %s\n", - __func__, -err, strerror(-err)); - return TEST_FAILURE; - } - - expected_len = sizeof(struct xdp_statistics); - if (optlen != expected_len) { - ksft_print_msg("[%s] getsockopt optlen error. Expected: %u got: %u\n", - __func__, expected_len, optlen); - return TEST_FAILURE; - } - - return TEST_PASS; -} - -static int validate_rx_dropped(struct ifobject *ifobject) -{ - struct xsk_socket *xsk = ifobject->xsk->xsk; - struct xdp_statistics stats; - int err; - - err = kick_rx(ifobject->xsk); - if (err) - return TEST_FAILURE; - - err = get_xsk_stats(xsk, &stats); - if (err) - return TEST_FAILURE; - - /* The receiver calls getsockopt after receiving the last (valid) - * packet which is not the final packet sent in this test (valid and - * invalid packets are sent in alternating fashion with the final - * packet being invalid). Since the last packet may or may not have - * been dropped already, both outcomes must be allowed. 
- */ - if (stats.rx_dropped == ifobject->xsk->pkt_stream->nb_pkts / 2 || - stats.rx_dropped == ifobject->xsk->pkt_stream->nb_pkts / 2 - 1) - return TEST_PASS; - - return TEST_FAILURE; -} - -static int validate_rx_full(struct ifobject *ifobject) -{ - struct xsk_socket *xsk = ifobject->xsk->xsk; - struct xdp_statistics stats; - int err; - - usleep(1000); - err = kick_rx(ifobject->xsk); - if (err) - return TEST_FAILURE; - - err = get_xsk_stats(xsk, &stats); - if (err) - return TEST_FAILURE; - - if (stats.rx_ring_full) - return TEST_PASS; - - return TEST_FAILURE; -} - -static int validate_fill_empty(struct ifobject *ifobject) -{ - struct xsk_socket *xsk = ifobject->xsk->xsk; - struct xdp_statistics stats; - int err; - - usleep(1000); - err = kick_rx(ifobject->xsk); - if (err) - return TEST_FAILURE; - - err = get_xsk_stats(xsk, &stats); - if (err) - return TEST_FAILURE; - - if (stats.rx_fill_ring_empty_descs) - return TEST_PASS; - - return TEST_FAILURE; -} - -static int validate_tx_invalid_descs(struct ifobject *ifobject) -{ - struct xsk_socket *xsk = ifobject->xsk->xsk; - int fd = xsk_socket__fd(xsk); - struct xdp_statistics stats; - socklen_t optlen; - int err; - - optlen = sizeof(stats); - err = getsockopt(fd, SOL_XDP, XDP_STATISTICS, &stats, &optlen); - if (err) { - ksft_print_msg("[%s] getsockopt(XDP_STATISTICS) error %u %s\n", - __func__, -err, strerror(-err)); - return TEST_FAILURE; - } - - if (stats.tx_invalid_descs != ifobject->xsk->pkt_stream->nb_pkts / 2) { - ksft_print_msg("[%s] tx_invalid_descs incorrect. Got [%llu] expected [%u]\n", - __func__, - (unsigned long long)stats.tx_invalid_descs, - ifobject->xsk->pkt_stream->nb_pkts); - return TEST_FAILURE; - } - - return TEST_PASS; -} - -static void xsk_configure_socket(struct test_spec *test, struct ifobject *ifobject, - struct xsk_umem_info *umem, bool tx) -{ - int i, ret; - - for (i = 0; i < test->nb_sockets; i++) { - bool shared = (ifobject->shared_umem && tx) ? 
true : !!i; - u32 ctr = 0; - - while (ctr++ < SOCK_RECONF_CTR) { - ret = __xsk_configure_socket(&ifobject->xsk_arr[i], umem, - ifobject, shared); - if (!ret) - break; - - /* Retry if it fails as xsk_socket__create() is asynchronous */ - if (ctr >= SOCK_RECONF_CTR) - exit_with_error(-ret); - usleep(USLEEP_MAX); - } - if (ifobject->busy_poll) - enable_busy_poll(&ifobject->xsk_arr[i]); - } -} - -static void thread_common_ops_tx(struct test_spec *test, struct ifobject *ifobject) -{ - xsk_configure_socket(test, ifobject, test->ifobj_rx->umem, true); - ifobject->xsk = &ifobject->xsk_arr[0]; - ifobject->xskmap = test->ifobj_rx->xskmap; - memcpy(ifobject->umem, test->ifobj_rx->umem, sizeof(struct xsk_umem_info)); - ifobject->umem->base_addr = 0; -} - -static void xsk_populate_fill_ring(struct xsk_umem_info *umem, struct pkt_stream *pkt_stream, - bool fill_up) -{ - u32 rx_frame_size = umem->frame_size - XDP_PACKET_HEADROOM; - u32 idx = 0, filled = 0, buffers_to_fill, nb_pkts; - int ret; - - if (umem->num_frames < XSK_RING_PROD__DEFAULT_NUM_DESCS) - buffers_to_fill = umem->num_frames; - else - buffers_to_fill = umem->fill_size; - - ret = xsk_ring_prod__reserve(&umem->fq, buffers_to_fill, &idx); - if (ret != buffers_to_fill) - exit_with_error(ENOSPC); - - while (filled < buffers_to_fill) { - struct pkt *pkt = pkt_stream_get_next_rx_pkt(pkt_stream, &nb_pkts); - u64 addr; - u32 i; - - for (i = 0; i < pkt_nb_frags(rx_frame_size, pkt_stream, pkt); i++) { - if (!pkt) { - if (!fill_up) - break; - addr = filled * umem->frame_size + umem->base_addr; - } else if (pkt->offset >= 0) { - addr = pkt->offset % umem->frame_size + umem_alloc_buffer(umem); - } else { - addr = pkt->offset + umem_alloc_buffer(umem); - } - - *xsk_ring_prod__fill_addr(&umem->fq, idx++) = addr; - if (++filled >= buffers_to_fill) - break; - } - } - xsk_ring_prod__submit(&umem->fq, filled); - xsk_ring_prod__cancel(&umem->fq, buffers_to_fill - filled); - - pkt_stream_reset(pkt_stream); - umem_reset_alloc(umem); -} - 
-static void thread_common_ops(struct test_spec *test, struct ifobject *ifobject) -{ - u64 umem_sz = ifobject->umem->num_frames * ifobject->umem->frame_size; - int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE; - LIBBPF_OPTS(bpf_xdp_query_opts, opts); - void *bufs; - int ret; - u32 i; - - if (ifobject->umem->unaligned_mode) - mmap_flags |= MAP_HUGETLB | MAP_HUGE_2MB; - - if (ifobject->shared_umem) - umem_sz *= 2; - - bufs = mmap(NULL, umem_sz, PROT_READ | PROT_WRITE, mmap_flags, -1, 0); - if (bufs == MAP_FAILED) - exit_with_error(errno); - - ret = xsk_configure_umem(ifobject, ifobject->umem, bufs, umem_sz); - if (ret) - exit_with_error(-ret); - - xsk_configure_socket(test, ifobject, ifobject->umem, false); - - ifobject->xsk = &ifobject->xsk_arr[0]; - - if (!ifobject->rx_on) - return; - - xsk_populate_fill_ring(ifobject->umem, ifobject->xsk->pkt_stream, ifobject->use_fill_ring); - - for (i = 0; i < test->nb_sockets; i++) { - ifobject->xsk = &ifobject->xsk_arr[i]; - ret = xsk_update_xskmap(ifobject->xskmap, ifobject->xsk->xsk, i); - if (ret) - exit_with_error(errno); - } -} - -static void *worker_testapp_validate_tx(void *arg) -{ - struct test_spec *test = (struct test_spec *)arg; - struct ifobject *ifobject = test->ifobj_tx; - int err; - - if (test->current_step == 1) { - if (!ifobject->shared_umem) - thread_common_ops(test, ifobject); - else - thread_common_ops_tx(test, ifobject); - } - - err = send_pkts(test, ifobject); - - if (!err && ifobject->validation_func) - err = ifobject->validation_func(ifobject); - if (err) - report_failure(test); - - pthread_exit(NULL); -} - -static void *worker_testapp_validate_rx(void *arg) -{ - struct test_spec *test = (struct test_spec *)arg; - struct ifobject *ifobject = test->ifobj_rx; - int err; - - if (test->current_step == 1) { - thread_common_ops(test, ifobject); - } else { - xsk_clear_xskmap(ifobject->xskmap); - err = xsk_update_xskmap(ifobject->xskmap, ifobject->xsk->xsk, 0); - if (err) { - ksft_print_msg("Error: 
Failed to update xskmap, error %s\n", - strerror(-err)); - exit_with_error(-err); - } - } - - pthread_barrier_wait(&barr); - - err = receive_pkts(test); - - if (!err && ifobject->validation_func) - err = ifobject->validation_func(ifobject); - - if (err) { - if (test->adjust_tail && !is_adjust_tail_supported(ifobject->xdp_progs)) - test->adjust_tail_support = false; - else - report_failure(test); - } - - pthread_exit(NULL); -} - -static u64 ceil_u64(u64 a, u64 b) -{ - return (a + b - 1) / b; -} - -static void testapp_clean_xsk_umem(struct ifobject *ifobj) -{ - u64 umem_sz = ifobj->umem->num_frames * ifobj->umem->frame_size; - - if (ifobj->shared_umem) - umem_sz *= 2; - - umem_sz = ceil_u64(umem_sz, HUGEPAGE_SIZE) * HUGEPAGE_SIZE; - xsk_umem__delete(ifobj->umem->umem); - munmap(ifobj->umem->buffer, umem_sz); -} - -static void handler(int signum) -{ - pthread_exit(NULL); -} - -static bool xdp_prog_changed_rx(struct test_spec *test) -{ - struct ifobject *ifobj = test->ifobj_rx; - - return ifobj->xdp_prog != test->xdp_prog_rx || ifobj->mode != test->mode; -} - -static bool xdp_prog_changed_tx(struct test_spec *test) -{ - struct ifobject *ifobj = test->ifobj_tx; - - return ifobj->xdp_prog != test->xdp_prog_tx || ifobj->mode != test->mode; -} - -static void xsk_reattach_xdp(struct ifobject *ifobj, struct bpf_program *xdp_prog, - struct bpf_map *xskmap, enum test_mode mode) -{ - int err; - - xsk_detach_xdp_program(ifobj->ifindex, mode_to_xdp_flags(ifobj->mode)); - err = xsk_attach_xdp_program(xdp_prog, ifobj->ifindex, mode_to_xdp_flags(mode)); - if (err) { - ksft_print_msg("Error attaching XDP program\n"); - exit_with_error(-err); - } - - if (ifobj->mode != mode && (mode == TEST_MODE_DRV || mode == TEST_MODE_ZC)) - if (!xsk_is_in_mode(ifobj->ifindex, XDP_FLAGS_DRV_MODE)) { - ksft_print_msg("ERROR: XDP prog not in DRV mode\n"); - exit_with_error(EINVAL); - } - - ifobj->xdp_prog = xdp_prog; - ifobj->xskmap = xskmap; - ifobj->mode = mode; -} - -static void 
xsk_attach_xdp_progs(struct test_spec *test, struct ifobject *ifobj_rx, - struct ifobject *ifobj_tx) -{ - if (xdp_prog_changed_rx(test)) - xsk_reattach_xdp(ifobj_rx, test->xdp_prog_rx, test->xskmap_rx, test->mode); - - if (!ifobj_tx || ifobj_tx->shared_umem) - return; - - if (xdp_prog_changed_tx(test)) - xsk_reattach_xdp(ifobj_tx, test->xdp_prog_tx, test->xskmap_tx, test->mode); -} - -static int __testapp_validate_traffic(struct test_spec *test, struct ifobject *ifobj1, - struct ifobject *ifobj2) -{ - pthread_t t0, t1; - int err; - - if (test->mtu > MAX_ETH_PKT_SIZE) { - if (test->mode == TEST_MODE_ZC && (!ifobj1->multi_buff_zc_supp || - (ifobj2 && !ifobj2->multi_buff_zc_supp))) { - ksft_test_result_skip("Multi buffer for zero-copy not supported.\n"); - return TEST_SKIP; - } - if (test->mode != TEST_MODE_ZC && (!ifobj1->multi_buff_supp || - (ifobj2 && !ifobj2->multi_buff_supp))) { - ksft_test_result_skip("Multi buffer not supported.\n"); - return TEST_SKIP; - } - } - err = test_spec_set_mtu(test, test->mtu); - if (err) { - ksft_print_msg("Error, could not set mtu.\n"); - exit_with_error(err); - } - - if (ifobj2) { - if (pthread_barrier_init(&barr, NULL, 2)) - exit_with_error(errno); - pkt_stream_reset(ifobj2->xsk->pkt_stream); - } - - test->current_step++; - pkt_stream_reset(ifobj1->xsk->pkt_stream); - pkts_in_flight = 0; - - signal(SIGUSR1, handler); - /*Spawn RX thread */ - pthread_create(&t0, NULL, ifobj1->func_ptr, test); - - if (ifobj2) { - pthread_barrier_wait(&barr); - if (pthread_barrier_destroy(&barr)) - exit_with_error(errno); - - /*Spawn TX thread */ - pthread_create(&t1, NULL, ifobj2->func_ptr, test); - - pthread_join(t1, NULL); - } - - if (!ifobj2) - pthread_kill(t0, SIGUSR1); - else - pthread_join(t0, NULL); - - if (test->total_steps == test->current_step || test->fail) { - u32 i; - - if (ifobj2) - for (i = 0; i < test->nb_sockets; i++) - xsk_socket__delete(ifobj2->xsk_arr[i].xsk); - - for (i = 0; i < test->nb_sockets; i++) - 
xsk_socket__delete(ifobj1->xsk_arr[i].xsk); - - testapp_clean_xsk_umem(ifobj1); - if (ifobj2 && !ifobj2->shared_umem) - testapp_clean_xsk_umem(ifobj2); - } - - return !!test->fail; -} - -static int testapp_validate_traffic(struct test_spec *test) -{ - struct ifobject *ifobj_rx = test->ifobj_rx; - struct ifobject *ifobj_tx = test->ifobj_tx; - - if ((ifobj_rx->umem->unaligned_mode && !ifobj_rx->unaligned_supp) || - (ifobj_tx->umem->unaligned_mode && !ifobj_tx->unaligned_supp)) { - ksft_test_result_skip("No huge pages present.\n"); - return TEST_SKIP; - } - - if (test->set_ring) { - if (ifobj_tx->hw_ring_size_supp) { - if (set_ring_size(ifobj_tx)) { - ksft_test_result_skip("Failed to change HW ring size.\n"); - return TEST_FAILURE; - } - } else { - ksft_test_result_skip("Changing HW ring size not supported.\n"); - return TEST_SKIP; - } - } - - xsk_attach_xdp_progs(test, ifobj_rx, ifobj_tx); - return __testapp_validate_traffic(test, ifobj_rx, ifobj_tx); -} - -static int testapp_validate_traffic_single_thread(struct test_spec *test, struct ifobject *ifobj) -{ - return __testapp_validate_traffic(test, ifobj, NULL); -} - -static int testapp_teardown(struct test_spec *test) -{ - int i; - - for (i = 0; i < MAX_TEARDOWN_ITER; i++) { - if (testapp_validate_traffic(test)) - return TEST_FAILURE; - test_spec_reset(test); - } - - return TEST_PASS; -} - -static void swap_directions(struct ifobject **ifobj1, struct ifobject **ifobj2) -{ - thread_func_t tmp_func_ptr = (*ifobj1)->func_ptr; - struct ifobject *tmp_ifobj = (*ifobj1); - - (*ifobj1)->func_ptr = (*ifobj2)->func_ptr; - (*ifobj2)->func_ptr = tmp_func_ptr; - - *ifobj1 = *ifobj2; - *ifobj2 = tmp_ifobj; -} - -static int testapp_bidirectional(struct test_spec *test) -{ - int res; - - test->ifobj_tx->rx_on = true; - test->ifobj_rx->tx_on = true; - test->total_steps = 2; - if (testapp_validate_traffic(test)) - return TEST_FAILURE; - - print_verbose("Switching Tx/Rx direction\n"); - swap_directions(&test->ifobj_rx, 
&test->ifobj_tx); - res = __testapp_validate_traffic(test, test->ifobj_rx, test->ifobj_tx); - - swap_directions(&test->ifobj_rx, &test->ifobj_tx); - return res; -} - -static int swap_xsk_resources(struct test_spec *test) -{ - int ret; - - test->ifobj_tx->xsk_arr[0].pkt_stream = NULL; - test->ifobj_rx->xsk_arr[0].pkt_stream = NULL; - test->ifobj_tx->xsk_arr[1].pkt_stream = test->tx_pkt_stream_default; - test->ifobj_rx->xsk_arr[1].pkt_stream = test->rx_pkt_stream_default; - test->ifobj_tx->xsk = &test->ifobj_tx->xsk_arr[1]; - test->ifobj_rx->xsk = &test->ifobj_rx->xsk_arr[1]; - - ret = xsk_update_xskmap(test->ifobj_rx->xskmap, test->ifobj_rx->xsk->xsk, 0); - if (ret) - return TEST_FAILURE; - - return TEST_PASS; -} - -static int testapp_xdp_prog_cleanup(struct test_spec *test) -{ - test->total_steps = 2; - test->nb_sockets = 2; - if (testapp_validate_traffic(test)) - return TEST_FAILURE; - - if (swap_xsk_resources(test)) - return TEST_FAILURE; - return testapp_validate_traffic(test); -} - -static int testapp_headroom(struct test_spec *test) -{ - test->ifobj_rx->umem->frame_headroom = UMEM_HEADROOM_TEST_SIZE; - return testapp_validate_traffic(test); -} - -static int testapp_stats_rx_dropped(struct test_spec *test) -{ - if (test->mode == TEST_MODE_ZC) { - ksft_test_result_skip("Can not run RX_DROPPED test for ZC mode\n"); - return TEST_SKIP; - } - - pkt_stream_replace_half(test, MIN_PKT_SIZE * 4, 0); - test->ifobj_rx->umem->frame_headroom = test->ifobj_rx->umem->frame_size - - XDP_PACKET_HEADROOM - MIN_PKT_SIZE * 3; - pkt_stream_receive_half(test); - test->ifobj_rx->validation_func = validate_rx_dropped; - return testapp_validate_traffic(test); -} - -static int testapp_stats_tx_invalid_descs(struct test_spec *test) -{ - pkt_stream_replace_half(test, XSK_UMEM__INVALID_FRAME_SIZE, 0); - test->ifobj_tx->validation_func = validate_tx_invalid_descs; - return testapp_validate_traffic(test); -} - -static int testapp_stats_rx_full(struct test_spec *test) -{ - 
pkt_stream_replace(test, DEFAULT_UMEM_BUFFERS + DEFAULT_UMEM_BUFFERS / 2, MIN_PKT_SIZE); - test->ifobj_rx->xsk->pkt_stream = pkt_stream_generate(DEFAULT_UMEM_BUFFERS, MIN_PKT_SIZE); - - test->ifobj_rx->xsk->rxqsize = DEFAULT_UMEM_BUFFERS; - test->ifobj_rx->release_rx = false; - test->ifobj_rx->validation_func = validate_rx_full; - return testapp_validate_traffic(test); -} - -static int testapp_stats_fill_empty(struct test_spec *test) -{ - pkt_stream_replace(test, DEFAULT_UMEM_BUFFERS + DEFAULT_UMEM_BUFFERS / 2, MIN_PKT_SIZE); - test->ifobj_rx->xsk->pkt_stream = pkt_stream_generate(DEFAULT_UMEM_BUFFERS, MIN_PKT_SIZE); - - test->ifobj_rx->use_fill_ring = false; - test->ifobj_rx->validation_func = validate_fill_empty; - return testapp_validate_traffic(test); -} - -static int testapp_send_receive_unaligned(struct test_spec *test) -{ - test->ifobj_tx->umem->unaligned_mode = true; - test->ifobj_rx->umem->unaligned_mode = true; - /* Let half of the packets straddle a 4K buffer boundary */ - pkt_stream_replace_half(test, MIN_PKT_SIZE, -MIN_PKT_SIZE / 2); - - return testapp_validate_traffic(test); -} - -static int testapp_send_receive_unaligned_mb(struct test_spec *test) -{ - test->mtu = MAX_ETH_JUMBO_SIZE; - test->ifobj_tx->umem->unaligned_mode = true; - test->ifobj_rx->umem->unaligned_mode = true; - pkt_stream_replace(test, DEFAULT_PKT_CNT, MAX_ETH_JUMBO_SIZE); - return testapp_validate_traffic(test); -} - -static int testapp_single_pkt(struct test_spec *test) -{ - struct pkt pkts[] = {{0, MIN_PKT_SIZE, 0, true}}; - - pkt_stream_generate_custom(test, pkts, ARRAY_SIZE(pkts)); - return testapp_validate_traffic(test); -} - -static int testapp_send_receive_mb(struct test_spec *test) -{ - test->mtu = MAX_ETH_JUMBO_SIZE; - pkt_stream_replace(test, DEFAULT_PKT_CNT, MAX_ETH_JUMBO_SIZE); - - return testapp_validate_traffic(test); -} - -static int testapp_invalid_desc_mb(struct test_spec *test) -{ - struct xsk_umem_info *umem = test->ifobj_tx->umem; - u64 umem_size = 
umem->num_frames * umem->frame_size; - struct pkt pkts[] = { - /* Valid packet for synch to start with */ - {0, MIN_PKT_SIZE, 0, true, 0}, - /* Zero frame len is not legal */ - {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, - {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, - {0, 0, 0, false, 0}, - /* Invalid address in the second frame */ - {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, - {umem_size, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, - /* Invalid len in the middle */ - {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, - {0, XSK_UMEM__INVALID_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, - /* Invalid options in the middle */ - {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, - {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XSK_DESC__INVALID_OPTION}, - /* Transmit 2 frags, receive 3 */ - {0, XSK_UMEM__MAX_FRAME_SIZE, 0, true, XDP_PKT_CONTD}, - {0, XSK_UMEM__MAX_FRAME_SIZE, 0, true, 0}, - /* Middle frame crosses chunk boundary with small length */ - {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, - {-MIN_PKT_SIZE / 2, MIN_PKT_SIZE, 0, false, 0}, - /* Valid packet for synch so that something is received */ - {0, MIN_PKT_SIZE, 0, true, 0}}; - - if (umem->unaligned_mode) { - /* Crossing a chunk boundary allowed */ - pkts[12].valid = true; - pkts[13].valid = true; - } - - test->mtu = MAX_ETH_JUMBO_SIZE; - pkt_stream_generate_custom(test, pkts, ARRAY_SIZE(pkts)); - return testapp_validate_traffic(test); -} - -static int testapp_invalid_desc(struct test_spec *test) -{ - struct xsk_umem_info *umem = test->ifobj_tx->umem; - u64 umem_size = umem->num_frames * umem->frame_size; - struct pkt pkts[] = { - /* Zero packet address allowed */ - {0, MIN_PKT_SIZE, 0, true}, - /* Allowed packet */ - {0, MIN_PKT_SIZE, 0, true}, - /* Straddling the start of umem */ - {-2, MIN_PKT_SIZE, 0, false}, - /* Packet too large */ - {0, XSK_UMEM__INVALID_FRAME_SIZE, 0, false}, - /* Up to end of umem allowed */ - {umem_size - 
MIN_PKT_SIZE - 2 * umem->frame_size, MIN_PKT_SIZE, 0, true}, - /* After umem ends */ - {umem_size, MIN_PKT_SIZE, 0, false}, - /* Straddle the end of umem */ - {umem_size - MIN_PKT_SIZE / 2, MIN_PKT_SIZE, 0, false}, - /* Straddle a 4K boundary */ - {0x1000 - MIN_PKT_SIZE / 2, MIN_PKT_SIZE, 0, false}, - /* Straddle a 2K boundary */ - {0x800 - MIN_PKT_SIZE / 2, MIN_PKT_SIZE, 0, true}, - /* Valid packet for synch so that something is received */ - {0, MIN_PKT_SIZE, 0, true}}; - - if (umem->unaligned_mode) { - /* Crossing a page boundary allowed */ - pkts[7].valid = true; - } - if (umem->frame_size == XSK_UMEM__DEFAULT_FRAME_SIZE / 2) { - /* Crossing a 2K frame size boundary not allowed */ - pkts[8].valid = false; - } - - if (test->ifobj_tx->shared_umem) { - pkts[4].offset += umem_size; - pkts[5].offset += umem_size; - pkts[6].offset += umem_size; - } - - pkt_stream_generate_custom(test, pkts, ARRAY_SIZE(pkts)); - return testapp_validate_traffic(test); -} - -static int testapp_xdp_drop(struct test_spec *test) -{ - struct xsk_xdp_progs *skel_rx = test->ifobj_rx->xdp_progs; - struct xsk_xdp_progs *skel_tx = test->ifobj_tx->xdp_progs; - - test_spec_set_xdp_prog(test, skel_rx->progs.xsk_xdp_drop, skel_tx->progs.xsk_xdp_drop, - skel_rx->maps.xsk, skel_tx->maps.xsk); - - pkt_stream_receive_half(test); - return testapp_validate_traffic(test); -} - -static int testapp_xdp_metadata_copy(struct test_spec *test) -{ - struct xsk_xdp_progs *skel_rx = test->ifobj_rx->xdp_progs; - struct xsk_xdp_progs *skel_tx = test->ifobj_tx->xdp_progs; - - test_spec_set_xdp_prog(test, skel_rx->progs.xsk_xdp_populate_metadata, - skel_tx->progs.xsk_xdp_populate_metadata, - skel_rx->maps.xsk, skel_tx->maps.xsk); - test->ifobj_rx->use_metadata = true; - - skel_rx->bss->count = 0; - - return testapp_validate_traffic(test); -} - -static int testapp_xdp_shared_umem(struct test_spec *test) -{ - struct xsk_xdp_progs *skel_rx = test->ifobj_rx->xdp_progs; - struct xsk_xdp_progs *skel_tx = 
test->ifobj_tx->xdp_progs; - - test->total_steps = 1; - test->nb_sockets = 2; - - test_spec_set_xdp_prog(test, skel_rx->progs.xsk_xdp_shared_umem, - skel_tx->progs.xsk_xdp_shared_umem, - skel_rx->maps.xsk, skel_tx->maps.xsk); - - pkt_stream_even_odd_sequence(test); - - return testapp_validate_traffic(test); -} - -static int testapp_poll_txq_tmout(struct test_spec *test) -{ - test->ifobj_tx->use_poll = true; - /* create invalid frame by set umem frame_size and pkt length equal to 2048 */ - test->ifobj_tx->umem->frame_size = 2048; - pkt_stream_replace(test, 2 * DEFAULT_PKT_CNT, 2048); - return testapp_validate_traffic_single_thread(test, test->ifobj_tx); -} - -static int testapp_poll_rxq_tmout(struct test_spec *test) -{ - test->ifobj_rx->use_poll = true; - return testapp_validate_traffic_single_thread(test, test->ifobj_rx); -} - -static int testapp_too_many_frags(struct test_spec *test) -{ - struct pkt *pkts; - u32 max_frags, i; - int ret; - - if (test->mode == TEST_MODE_ZC) { - max_frags = test->ifobj_tx->xdp_zc_max_segs; - } else { - max_frags = get_max_skb_frags(); - if (!max_frags) { - ksft_print_msg("Couldn't retrieve MAX_SKB_FRAGS from system, using default (17) value\n"); - max_frags = 17; - } - max_frags += 1; - } - - pkts = calloc(2 * max_frags + 2, sizeof(struct pkt)); - if (!pkts) - return TEST_FAILURE; - - test->mtu = MAX_ETH_JUMBO_SIZE; - - /* Valid packet for synch */ - pkts[0].len = MIN_PKT_SIZE; - pkts[0].valid = true; - - /* One valid packet with the max amount of frags */ - for (i = 1; i < max_frags + 1; i++) { - pkts[i].len = MIN_PKT_SIZE; - pkts[i].options = XDP_PKT_CONTD; - pkts[i].valid = true; - } - pkts[max_frags].options = 0; - - /* An invalid packet with the max amount of frags but signals packet - * continues on the last frag - */ - for (i = max_frags + 1; i < 2 * max_frags + 1; i++) { - pkts[i].len = MIN_PKT_SIZE; - pkts[i].options = XDP_PKT_CONTD; - pkts[i].valid = false; - } - - /* Valid packet for synch */ - pkts[2 * max_frags + 1].len 
= MIN_PKT_SIZE; - pkts[2 * max_frags + 1].valid = true; - - pkt_stream_generate_custom(test, pkts, 2 * max_frags + 2); - ret = testapp_validate_traffic(test); - - free(pkts); - return ret; -} - -static int xsk_load_xdp_programs(struct ifobject *ifobj) -{ - ifobj->xdp_progs = xsk_xdp_progs__open_and_load(); - if (libbpf_get_error(ifobj->xdp_progs)) - return libbpf_get_error(ifobj->xdp_progs); - - return 0; -} - static void xsk_unload_xdp_programs(struct ifobject *ifobj) { xsk_xdp_progs__destroy(ifobj->xdp_progs); } -/* Simple test */ -static bool hugepages_present(void) -{ - size_t mmap_sz = 2 * DEFAULT_UMEM_BUFFERS * XSK_UMEM__DEFAULT_FRAME_SIZE; - void *bufs; - - bufs = mmap(NULL, mmap_sz, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, MAP_HUGE_2MB); - if (bufs == MAP_FAILED) - return false; - - mmap_sz = ceil_u64(mmap_sz, HUGEPAGE_SIZE) * HUGEPAGE_SIZE; - munmap(bufs, mmap_sz); - return true; -} - -static void init_iface(struct ifobject *ifobj, thread_func_t func_ptr) -{ - LIBBPF_OPTS(bpf_xdp_query_opts, query_opts); - int err; - - ifobj->func_ptr = func_ptr; - - err = xsk_load_xdp_programs(ifobj); - if (err) { - ksft_print_msg("Error loading XDP program\n"); - exit_with_error(err); - } - - if (hugepages_present()) - ifobj->unaligned_supp = true; - - err = bpf_xdp_query(ifobj->ifindex, XDP_FLAGS_DRV_MODE, &query_opts); - if (err) { - ksft_print_msg("Error querying XDP capabilities\n"); - exit_with_error(-err); - } - if (query_opts.feature_flags & NETDEV_XDP_ACT_RX_SG) - ifobj->multi_buff_supp = true; - if (query_opts.feature_flags & NETDEV_XDP_ACT_XSK_ZEROCOPY) { - if (query_opts.xdp_zc_max_segs > 1) { - ifobj->multi_buff_zc_supp = true; - ifobj->xdp_zc_max_segs = query_opts.xdp_zc_max_segs; - } else { - ifobj->xdp_zc_max_segs = 0; - } - } -} - -static int testapp_send_receive(struct test_spec *test) -{ - return testapp_validate_traffic(test); -} - -static int testapp_send_receive_2k_frame(struct test_spec *test) -{ - 
test->ifobj_tx->umem->frame_size = 2048; - test->ifobj_rx->umem->frame_size = 2048; - pkt_stream_replace(test, DEFAULT_PKT_CNT, MIN_PKT_SIZE); - return testapp_validate_traffic(test); -} - -static int testapp_poll_rx(struct test_spec *test) -{ - test->ifobj_rx->use_poll = true; - return testapp_validate_traffic(test); -} - -static int testapp_poll_tx(struct test_spec *test) -{ - test->ifobj_tx->use_poll = true; - return testapp_validate_traffic(test); -} - -static int testapp_aligned_inv_desc(struct test_spec *test) -{ - return testapp_invalid_desc(test); -} - -static int testapp_aligned_inv_desc_2k_frame(struct test_spec *test) -{ - test->ifobj_tx->umem->frame_size = 2048; - test->ifobj_rx->umem->frame_size = 2048; - return testapp_invalid_desc(test); -} - -static int testapp_unaligned_inv_desc(struct test_spec *test) -{ - test->ifobj_tx->umem->unaligned_mode = true; - test->ifobj_rx->umem->unaligned_mode = true; - return testapp_invalid_desc(test); -} - -static int testapp_unaligned_inv_desc_4001_frame(struct test_spec *test) -{ - u64 page_size, umem_size; - - /* Odd frame size so the UMEM doesn't end near a page boundary. */ - test->ifobj_tx->umem->frame_size = 4001; - test->ifobj_rx->umem->frame_size = 4001; - test->ifobj_tx->umem->unaligned_mode = true; - test->ifobj_rx->umem->unaligned_mode = true; - /* This test exists to test descriptors that staddle the end of - * the UMEM but not a page. 
- */ - page_size = sysconf(_SC_PAGESIZE); - umem_size = test->ifobj_tx->umem->num_frames * test->ifobj_tx->umem->frame_size; - assert(umem_size % page_size > MIN_PKT_SIZE); - assert(umem_size % page_size < page_size - MIN_PKT_SIZE); - - return testapp_invalid_desc(test); -} - -static int testapp_aligned_inv_desc_mb(struct test_spec *test) -{ - return testapp_invalid_desc_mb(test); -} - -static int testapp_unaligned_inv_desc_mb(struct test_spec *test) -{ - test->ifobj_tx->umem->unaligned_mode = true; - test->ifobj_rx->umem->unaligned_mode = true; - return testapp_invalid_desc_mb(test); -} - -static int testapp_xdp_metadata(struct test_spec *test) -{ - return testapp_xdp_metadata_copy(test); -} - -static int testapp_xdp_metadata_mb(struct test_spec *test) -{ - test->mtu = MAX_ETH_JUMBO_SIZE; - return testapp_xdp_metadata_copy(test); -} - -static int testapp_hw_sw_min_ring_size(struct test_spec *test) -{ - int ret; - - test->set_ring = true; - test->total_steps = 2; - test->ifobj_tx->ring.tx_pending = DEFAULT_BATCH_SIZE; - test->ifobj_tx->ring.rx_pending = DEFAULT_BATCH_SIZE * 2; - test->ifobj_tx->xsk->batch_size = 1; - test->ifobj_rx->xsk->batch_size = 1; - ret = testapp_validate_traffic(test); - if (ret) - return ret; - - /* Set batch size to hw_ring_size - 1 */ - test->ifobj_tx->xsk->batch_size = DEFAULT_BATCH_SIZE - 1; - test->ifobj_rx->xsk->batch_size = DEFAULT_BATCH_SIZE - 1; - return testapp_validate_traffic(test); -} - -static int testapp_hw_sw_max_ring_size(struct test_spec *test) -{ - u32 max_descs = XSK_RING_PROD__DEFAULT_NUM_DESCS * 4; - int ret; - - test->set_ring = true; - test->total_steps = 2; - test->ifobj_tx->ring.tx_pending = test->ifobj_tx->ring.tx_max_pending; - test->ifobj_tx->ring.rx_pending = test->ifobj_tx->ring.rx_max_pending; - test->ifobj_rx->umem->num_frames = max_descs; - test->ifobj_rx->umem->fill_size = max_descs; - test->ifobj_rx->umem->comp_size = max_descs; - test->ifobj_tx->xsk->batch_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; - 
test->ifobj_rx->xsk->batch_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; - - ret = testapp_validate_traffic(test); - if (ret) - return ret; - - /* Set batch_size to 8152 for testing, as the ice HW ignores the 3 lowest bits when - * updating the Rx HW tail register. - */ - test->ifobj_tx->xsk->batch_size = test->ifobj_tx->ring.tx_max_pending - 8; - test->ifobj_rx->xsk->batch_size = test->ifobj_tx->ring.tx_max_pending - 8; - pkt_stream_replace(test, max_descs, MIN_PKT_SIZE); - return testapp_validate_traffic(test); -} - -static int testapp_xdp_adjust_tail(struct test_spec *test, int adjust_value) -{ - struct xsk_xdp_progs *skel_rx = test->ifobj_rx->xdp_progs; - struct xsk_xdp_progs *skel_tx = test->ifobj_tx->xdp_progs; - - test_spec_set_xdp_prog(test, skel_rx->progs.xsk_xdp_adjust_tail, - skel_tx->progs.xsk_xdp_adjust_tail, - skel_rx->maps.xsk, skel_tx->maps.xsk); - - skel_rx->bss->adjust_value = adjust_value; - - return testapp_validate_traffic(test); -} - -static int testapp_adjust_tail(struct test_spec *test, u32 value, u32 pkt_len) -{ - int ret; - - test->adjust_tail_support = true; - test->adjust_tail = true; - test->total_steps = 1; - - pkt_stream_replace_ifobject(test->ifobj_tx, DEFAULT_BATCH_SIZE, pkt_len); - pkt_stream_replace_ifobject(test->ifobj_rx, DEFAULT_BATCH_SIZE, pkt_len + value); - - ret = testapp_xdp_adjust_tail(test, value); - if (ret) - return ret; - - if (!test->adjust_tail_support) { - ksft_test_result_skip("%s %sResize pkt with bpf_xdp_adjust_tail() not supported\n", - mode_string(test), busy_poll_string(test)); - return TEST_SKIP; - } - - return 0; -} - -static int testapp_adjust_tail_shrink(struct test_spec *test) -{ - /* Shrink by 4 bytes for testing purpose */ - return testapp_adjust_tail(test, -4, MIN_PKT_SIZE * 2); -} - -static int testapp_adjust_tail_shrink_mb(struct test_spec *test) -{ - test->mtu = MAX_ETH_JUMBO_SIZE; - /* Shrink by the frag size */ - return testapp_adjust_tail(test, -XSK_UMEM__MAX_FRAME_SIZE, XSK_UMEM__LARGE_FRAME_SIZE * 
2); -} - -static int testapp_adjust_tail_grow(struct test_spec *test) -{ - /* Grow by 4 bytes for testing purpose */ - return testapp_adjust_tail(test, 4, MIN_PKT_SIZE * 2); -} - -static int testapp_adjust_tail_grow_mb(struct test_spec *test) -{ - test->mtu = MAX_ETH_JUMBO_SIZE; - /* Grow by (frag_size - last_frag_Size) - 1 to stay inside the last fragment */ - return testapp_adjust_tail(test, (XSK_UMEM__MAX_FRAME_SIZE / 2) - 1, - XSK_UMEM__LARGE_FRAME_SIZE * 2); -} - -static int testapp_tx_queue_consumer(struct test_spec *test) -{ - int nr_packets; - - if (test->mode == TEST_MODE_ZC) { - ksft_test_result_skip("Can not run TX_QUEUE_CONSUMER test for ZC mode\n"); - return TEST_SKIP; - } - - nr_packets = MAX_TX_BUDGET_DEFAULT + 1; - pkt_stream_replace(test, nr_packets, MIN_PKT_SIZE); - test->ifobj_tx->xsk->batch_size = nr_packets; - test->ifobj_tx->xsk->check_consumer = true; - - return testapp_validate_traffic(test); -} - static void run_pkt_test(struct test_spec *test) { int ret; ret = test->test_func(test); - if (ret == TEST_PASS) + switch (ret) { + case TEST_PASS: ksft_test_result_pass("PASS: %s %s%s\n", mode_string(test), busy_poll_string(test), test->name); - pkt_stream_restore_default(test); -} - -static struct ifobject *ifobject_create(void) -{ - struct ifobject *ifobj; - - ifobj = calloc(1, sizeof(struct ifobject)); - if (!ifobj) - return NULL; - - ifobj->xsk_arr = calloc(MAX_SOCKETS, sizeof(*ifobj->xsk_arr)); - if (!ifobj->xsk_arr) - goto out_xsk_arr; - - ifobj->umem = calloc(1, sizeof(*ifobj->umem)); - if (!ifobj->umem) - goto out_umem; - - return ifobj; - -out_umem: - free(ifobj->xsk_arr); -out_xsk_arr: - free(ifobj); - return NULL; -} + break; + case TEST_SKIP: + ksft_test_result_skip("SKIP: %s %s%s\n", mode_string(test), busy_poll_string(test), + test->name); + break; + case TEST_FAILURE: + ksft_test_result_fail("FAIL: %s %s%s\n", mode_string(test), busy_poll_string(test), + test->name); + break; + default: + ksft_test_result_fail("FAIL: %s %s%s -- 
Unexpected returned value (%d)\n", + mode_string(test), busy_poll_string(test), test->name, ret); + } -static void ifobject_delete(struct ifobject *ifobj) -{ - free(ifobj->umem); - free(ifobj->xsk_arr); - free(ifobj); + pkt_stream_restore_default(test); } static bool is_xdp_supported(int ifindex) @@ -2726,47 +319,6 @@ static bool is_xdp_supported(int ifindex) return true; } -static const struct test_spec tests[] = { - {.name = "SEND_RECEIVE", .test_func = testapp_send_receive}, - {.name = "SEND_RECEIVE_2K_FRAME", .test_func = testapp_send_receive_2k_frame}, - {.name = "SEND_RECEIVE_SINGLE_PKT", .test_func = testapp_single_pkt}, - {.name = "POLL_RX", .test_func = testapp_poll_rx}, - {.name = "POLL_TX", .test_func = testapp_poll_tx}, - {.name = "POLL_RXQ_FULL", .test_func = testapp_poll_rxq_tmout}, - {.name = "POLL_TXQ_FULL", .test_func = testapp_poll_txq_tmout}, - {.name = "SEND_RECEIVE_UNALIGNED", .test_func = testapp_send_receive_unaligned}, - {.name = "ALIGNED_INV_DESC", .test_func = testapp_aligned_inv_desc}, - {.name = "ALIGNED_INV_DESC_2K_FRAME_SIZE", .test_func = testapp_aligned_inv_desc_2k_frame}, - {.name = "UNALIGNED_INV_DESC", .test_func = testapp_unaligned_inv_desc}, - {.name = "UNALIGNED_INV_DESC_4001_FRAME_SIZE", - .test_func = testapp_unaligned_inv_desc_4001_frame}, - {.name = "UMEM_HEADROOM", .test_func = testapp_headroom}, - {.name = "TEARDOWN", .test_func = testapp_teardown}, - {.name = "BIDIRECTIONAL", .test_func = testapp_bidirectional}, - {.name = "STAT_RX_DROPPED", .test_func = testapp_stats_rx_dropped}, - {.name = "STAT_TX_INVALID", .test_func = testapp_stats_tx_invalid_descs}, - {.name = "STAT_RX_FULL", .test_func = testapp_stats_rx_full}, - {.name = "STAT_FILL_EMPTY", .test_func = testapp_stats_fill_empty}, - {.name = "XDP_PROG_CLEANUP", .test_func = testapp_xdp_prog_cleanup}, - {.name = "XDP_DROP_HALF", .test_func = testapp_xdp_drop}, - {.name = "XDP_SHARED_UMEM", .test_func = testapp_xdp_shared_umem}, - {.name = "XDP_METADATA_COPY", 
.test_func = testapp_xdp_metadata}, - {.name = "XDP_METADATA_COPY_MULTI_BUFF", .test_func = testapp_xdp_metadata_mb}, - {.name = "SEND_RECEIVE_9K_PACKETS", .test_func = testapp_send_receive_mb}, - {.name = "SEND_RECEIVE_UNALIGNED_9K_PACKETS", - .test_func = testapp_send_receive_unaligned_mb}, - {.name = "ALIGNED_INV_DESC_MULTI_BUFF", .test_func = testapp_aligned_inv_desc_mb}, - {.name = "UNALIGNED_INV_DESC_MULTI_BUFF", .test_func = testapp_unaligned_inv_desc_mb}, - {.name = "TOO_MANY_FRAGS", .test_func = testapp_too_many_frags}, - {.name = "HW_SW_MIN_RING_SIZE", .test_func = testapp_hw_sw_min_ring_size}, - {.name = "HW_SW_MAX_RING_SIZE", .test_func = testapp_hw_sw_max_ring_size}, - {.name = "XDP_ADJUST_TAIL_SHRINK", .test_func = testapp_adjust_tail_shrink}, - {.name = "XDP_ADJUST_TAIL_SHRINK_MULTI_BUFF", .test_func = testapp_adjust_tail_shrink_mb}, - {.name = "XDP_ADJUST_TAIL_GROW", .test_func = testapp_adjust_tail_grow}, - {.name = "XDP_ADJUST_TAIL_GROW_MULTI_BUFF", .test_func = testapp_adjust_tail_grow_mb}, - {.name = "TX_QUEUE_CONSUMER", .test_func = testapp_tx_queue_consumer}, - }; - static void print_tests(void) { u32 i; @@ -2774,10 +326,13 @@ static void print_tests(void) printf("Tests:\n"); for (i = 0; i < ARRAY_SIZE(tests); i++) printf("%u: %s\n", i, tests[i].name); + for (i = ARRAY_SIZE(tests); i < ARRAY_SIZE(tests) + ARRAY_SIZE(ci_skip_tests); i++) + printf("%u: %s\n", i, ci_skip_tests[i - ARRAY_SIZE(tests)].name); } int main(int argc, char **argv) { + const size_t total_tests = ARRAY_SIZE(tests) + ARRAY_SIZE(ci_skip_tests); struct pkt_stream *rx_pkt_stream_default; struct pkt_stream *tx_pkt_stream_default; struct ifobject *ifobj_tx, *ifobj_rx; @@ -2805,7 +360,7 @@ int main(int argc, char **argv) print_tests(); ksft_exit_xpass(); } - if (opt_run_test != RUN_ALL_TESTS && opt_run_test >= ARRAY_SIZE(tests)) { + if (opt_run_test != RUN_ALL_TESTS && opt_run_test >= total_tests) { ksft_print_msg("Error: test %u does not exist.\n", opt_run_test); 
ksft_exit_xfail(); } @@ -2830,10 +385,13 @@ int main(int argc, char **argv) ifobj_tx->set_ring.default_rx = ifobj_tx->ring.rx_pending; } - init_iface(ifobj_rx, worker_testapp_validate_rx); - init_iface(ifobj_tx, worker_testapp_validate_tx); + if (init_iface(ifobj_rx, worker_testapp_validate_rx) || + init_iface(ifobj_tx, worker_testapp_validate_tx)) { + ksft_print_msg("Error : can't initialize interfaces\n"); + ksft_exit_xfail(); + } - test_spec_init(&test, ifobj_tx, ifobj_rx, 0, &tests[0]); + test_init(&test, ifobj_tx, ifobj_rx, 0, &tests[0]); tx_pkt_stream_default = pkt_stream_generate(DEFAULT_PKT_CNT, MIN_PKT_SIZE); rx_pkt_stream_default = pkt_stream_generate(DEFAULT_PKT_CNT, MIN_PKT_SIZE); if (!tx_pkt_stream_default || !rx_pkt_stream_default) @@ -2842,7 +400,7 @@ int main(int argc, char **argv) test.rx_pkt_stream_default = rx_pkt_stream_default; if (opt_run_test == RUN_ALL_TESTS) - nb_tests = ARRAY_SIZE(tests); + nb_tests = total_tests; else nb_tests = 1; if (opt_mode == TEST_MODE_ALL) { @@ -2864,11 +422,15 @@ int main(int argc, char **argv) if (opt_mode != TEST_MODE_ALL && i != opt_mode) continue; - for (j = 0; j < ARRAY_SIZE(tests); j++) { + for (j = 0; j < total_tests; j++) { if (opt_run_test != RUN_ALL_TESTS && j != opt_run_test) continue; - test_spec_init(&test, ifobj_tx, ifobj_rx, i, &tests[j]); + if (j < ARRAY_SIZE(tests)) + test_init(&test, ifobj_tx, ifobj_rx, i, &tests[j]); + else + test_init(&test, ifobj_tx, ifobj_rx, i, + &ci_skip_tests[j - ARRAY_SIZE(tests)]); run_pkt_test(&test); usleep(USLEEP_MAX); diff --git a/tools/testing/selftests/bpf/xskxceiver.h b/tools/testing/selftests/bpf/xskxceiver.h index 4df3a5d329ac..3ca518df23ad 100644 --- a/tools/testing/selftests/bpf/xskxceiver.h +++ b/tools/testing/selftests/bpf/xskxceiver.h @@ -22,169 +22,13 @@ #define PF_XDP AF_XDP #endif -#ifndef SO_BUSY_POLL_BUDGET -#define SO_BUSY_POLL_BUDGET 70 -#endif - -#ifndef SO_PREFER_BUSY_POLL -#define SO_PREFER_BUSY_POLL 69 -#endif - -#define TEST_PASS 0 -#define 
TEST_FAILURE -1 -#define TEST_CONTINUE 1 -#define TEST_SKIP 2 -#define MAX_INTERFACES 2 -#define MAX_INTERFACE_NAME_CHARS 16 -#define MAX_TEST_NAME_SIZE 48 #define MAX_TEARDOWN_ITER 10 -#define PKT_HDR_SIZE (sizeof(struct ethhdr) + 2) /* Just to align the data in the packet */ -#define MIN_PKT_SIZE 64 -#define MAX_ETH_PKT_SIZE 1518 #define MAX_ETH_JUMBO_SIZE 9000 -#define USLEEP_MAX 10000 #define SOCK_RECONF_CTR 10 -#define DEFAULT_BATCH_SIZE 64 -#define POLL_TMOUT 1000 -#define THREAD_TMOUT 3 -#define DEFAULT_PKT_CNT (4 * 1024) -#define DEFAULT_UMEM_BUFFERS (DEFAULT_PKT_CNT / 4) #define RX_FULL_RXQSIZE 32 #define UMEM_HEADROOM_TEST_SIZE 128 #define XSK_UMEM__INVALID_FRAME_SIZE (MAX_ETH_JUMBO_SIZE + 1) -#define XSK_UMEM__LARGE_FRAME_SIZE (3 * 1024) -#define XSK_UMEM__MAX_FRAME_SIZE (4 * 1024) -#define XSK_DESC__INVALID_OPTION (0xffff) -#define HUGEPAGE_SIZE (2 * 1024 * 1024) -#define PKT_DUMP_NB_TO_PRINT 16 #define RUN_ALL_TESTS UINT_MAX #define NUM_MAC_ADDRESSES 4 -#define print_verbose(x...) 
do { if (opt_verbose) ksft_print_msg(x); } while (0) - -enum test_mode { - TEST_MODE_SKB, - TEST_MODE_DRV, - TEST_MODE_ZC, - TEST_MODE_ALL -}; - -struct xsk_umem_info { - struct xsk_ring_prod fq; - struct xsk_ring_cons cq; - struct xsk_umem *umem; - u64 next_buffer; - u32 num_frames; - u32 frame_headroom; - void *buffer; - u32 frame_size; - u32 base_addr; - u32 fill_size; - u32 comp_size; - bool unaligned_mode; -}; - -struct xsk_socket_info { - struct xsk_ring_cons rx; - struct xsk_ring_prod tx; - struct xsk_umem_info *umem; - struct xsk_socket *xsk; - struct pkt_stream *pkt_stream; - u32 outstanding_tx; - u32 rxqsize; - u32 batch_size; - u8 dst_mac[ETH_ALEN]; - u8 src_mac[ETH_ALEN]; - bool check_consumer; -}; - -struct pkt { - int offset; - u32 len; - u32 pkt_nb; - bool valid; - u16 options; -}; - -struct pkt_stream { - u32 nb_pkts; - u32 current_pkt_nb; - struct pkt *pkts; - u32 max_pkt_len; - u32 nb_rx_pkts; - u32 nb_valid_entries; - bool verbatim; -}; - -struct set_hw_ring { - u32 default_tx; - u32 default_rx; -}; - -struct ifobject; -struct test_spec; -typedef int (*validation_func_t)(struct ifobject *ifobj); -typedef void *(*thread_func_t)(void *arg); -typedef int (*test_func_t)(struct test_spec *test); - -struct ifobject { - char ifname[MAX_INTERFACE_NAME_CHARS]; - struct xsk_socket_info *xsk; - struct xsk_socket_info *xsk_arr; - struct xsk_umem_info *umem; - thread_func_t func_ptr; - validation_func_t validation_func; - struct xsk_xdp_progs *xdp_progs; - struct bpf_map *xskmap; - struct bpf_program *xdp_prog; - struct ethtool_ringparam ring; - struct set_hw_ring set_ring; - enum test_mode mode; - int ifindex; - int mtu; - u32 bind_flags; - u32 xdp_zc_max_segs; - bool tx_on; - bool rx_on; - bool use_poll; - bool busy_poll; - bool use_fill_ring; - bool release_rx; - bool shared_umem; - bool use_metadata; - bool unaligned_supp; - bool multi_buff_supp; - bool multi_buff_zc_supp; - bool hw_ring_size_supp; -}; - -struct test_spec { - struct ifobject *ifobj_tx; - 
struct ifobject *ifobj_rx; - struct pkt_stream *tx_pkt_stream_default; - struct pkt_stream *rx_pkt_stream_default; - struct bpf_program *xdp_prog_rx; - struct bpf_program *xdp_prog_tx; - struct bpf_map *xskmap_rx; - struct bpf_map *xskmap_tx; - test_func_t test_func; - int mtu; - u16 total_steps; - u16 current_step; - u16 nb_sockets; - bool fail; - bool set_ring; - bool adjust_tail; - bool adjust_tail_support; - enum test_mode mode; - char name[MAX_TEST_NAME_SIZE]; -}; - -pthread_barrier_t barr; -pthread_mutex_t pacing_mutex = PTHREAD_MUTEX_INITIALIZER; - -int pkts_in_flight; - -static const u8 g_mac[ETH_ALEN] = {0x55, 0x44, 0x33, 0x22, 0x11, 0x00}; - #endif /* XSKXCEIVER_H_ */ diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c index a360e2eb2eef..1d778c8b7764 100644 --- a/tools/testing/selftests/cgroup/test_core.c +++ b/tools/testing/selftests/cgroup/test_core.c @@ -923,8 +923,10 @@ struct corecg_test { int main(int argc, char *argv[]) { char root[PATH_MAX]; - int i, ret = EXIT_SUCCESS; + int i; + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(tests)); if (cg_find_unified_root(root, sizeof(root), &nsdelegate)) { if (setup_named_v1_root(root, sizeof(root), CG_NAMED_NAME)) ksft_exit_skip("cgroup v2 isn't mounted and could not setup named v1 hierarchy\n"); @@ -946,12 +948,11 @@ post_v2_setup: ksft_test_result_skip("%s\n", tests[i].name); break; default: - ret = EXIT_FAILURE; ksft_test_result_fail("%s\n", tests[i].name); break; } } cleanup_named_v1_root(root); - return ret; + ksft_finished(); } diff --git a/tools/testing/selftests/cgroup/test_cpu.c b/tools/testing/selftests/cgroup/test_cpu.c index d54e2317efff..b1b30e82dd7c 100644 --- a/tools/testing/selftests/cgroup/test_cpu.c +++ b/tools/testing/selftests/cgroup/test_cpu.c @@ -796,8 +796,10 @@ struct cpucg_test { int main(int argc, char *argv[]) { char root[PATH_MAX]; - int i, ret = EXIT_SUCCESS; + int i; + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(tests)); if 
(cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); @@ -814,11 +816,10 @@ int main(int argc, char *argv[]) ksft_test_result_skip("%s\n", tests[i].name); break; default: - ret = EXIT_FAILURE; ksft_test_result_fail("%s\n", tests[i].name); break; } } - return ret; + ksft_finished(); } diff --git a/tools/testing/selftests/cgroup/test_cpuset.c b/tools/testing/selftests/cgroup/test_cpuset.c index 4034d14ba69a..8086d2ea394f 100644 --- a/tools/testing/selftests/cgroup/test_cpuset.c +++ b/tools/testing/selftests/cgroup/test_cpuset.c @@ -247,8 +247,10 @@ struct cpuset_test { int main(int argc, char *argv[]) { char root[PATH_MAX]; - int i, ret = EXIT_SUCCESS; + int i; + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(tests)); if (cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); @@ -265,11 +267,10 @@ int main(int argc, char *argv[]) ksft_test_result_skip("%s\n", tests[i].name); break; default: - ret = EXIT_FAILURE; ksft_test_result_fail("%s\n", tests[i].name); break; } } - return ret; + ksft_finished(); } diff --git a/tools/testing/selftests/cgroup/test_freezer.c b/tools/testing/selftests/cgroup/test_freezer.c index dfb763819581..465cdad2bfca 100644 --- a/tools/testing/selftests/cgroup/test_freezer.c +++ b/tools/testing/selftests/cgroup/test_freezer.c @@ -1488,8 +1488,10 @@ struct cgfreezer_test { int main(int argc, char *argv[]) { char root[PATH_MAX]; - int i, ret = EXIT_SUCCESS; + int i; + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(tests)); if (cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); for (i = 0; i < ARRAY_SIZE(tests); i++) { @@ -1501,11 +1503,10 @@ int main(int argc, char *argv[]) ksft_test_result_skip("%s\n", tests[i].name); break; default: - ret = EXIT_FAILURE; ksft_test_result_fail("%s\n", tests[i].name); break; } } - return ret; + ksft_finished(); } diff --git a/tools/testing/selftests/cgroup/test_kill.c 
b/tools/testing/selftests/cgroup/test_kill.c index 0e5bb6c7307a..ed590b150a17 100644 --- a/tools/testing/selftests/cgroup/test_kill.c +++ b/tools/testing/selftests/cgroup/test_kill.c @@ -274,8 +274,10 @@ struct cgkill_test { int main(int argc, char *argv[]) { char root[PATH_MAX]; - int i, ret = EXIT_SUCCESS; + int i; + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(tests)); if (cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); for (i = 0; i < ARRAY_SIZE(tests); i++) { @@ -287,11 +289,10 @@ int main(int argc, char *argv[]) ksft_test_result_skip("%s\n", tests[i].name); break; default: - ret = EXIT_FAILURE; ksft_test_result_fail("%s\n", tests[i].name); break; } } - return ret; + ksft_finished(); } diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c index 63b3c9aad399..d4c4a514ee43 100644 --- a/tools/testing/selftests/cgroup/test_kmem.c +++ b/tools/testing/selftests/cgroup/test_kmem.c @@ -421,8 +421,10 @@ struct kmem_test { int main(int argc, char **argv) { char root[PATH_MAX]; - int i, ret = EXIT_SUCCESS; + int i; + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(tests)); if (cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); @@ -446,11 +448,10 @@ int main(int argc, char **argv) ksft_test_result_skip("%s\n", tests[i].name); break; default: - ret = EXIT_FAILURE; ksft_test_result_fail("%s\n", tests[i].name); break; } } - return ret; + ksft_finished(); } diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index a680f773f2d5..b117325c0439 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -1650,8 +1650,10 @@ struct memcg_test { int main(int argc, char **argv) { char root[PATH_MAX]; - int i, proc_status, ret = EXIT_SUCCESS; + int i, proc_status; + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(tests)); if 
(cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); @@ -1685,11 +1687,10 @@ int main(int argc, char **argv) ksft_test_result_skip("%s\n", tests[i].name); break; default: - ret = EXIT_FAILURE; ksft_test_result_fail("%s\n", tests[i].name); break; } } - return ret; + ksft_finished(); } diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index e1f578ca2841..86a8930b47e3 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -597,8 +597,10 @@ static bool zswap_configured(void) int main(int argc, char **argv) { char root[PATH_MAX]; - int i, ret = EXIT_SUCCESS; + int i; + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(tests)); if (cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); @@ -625,11 +627,10 @@ int main(int argc, char **argv) ksft_test_result_skip("%s\n", tests[i].name); break; default: - ret = EXIT_FAILURE; ksft_test_result_fail("%s\n", tests[i].name); break; } } - return ret; + ksft_finished(); } diff --git a/tools/testing/selftests/coredump/.gitignore b/tools/testing/selftests/coredump/.gitignore new file mode 100644 index 000000000000..097f52db0be9 --- /dev/null +++ b/tools/testing/selftests/coredump/.gitignore @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only +stackdump_test +coredump_socket_test +coredump_socket_protocol_test diff --git a/tools/testing/selftests/coredump/Makefile b/tools/testing/selftests/coredump/Makefile index 77b3665c73c7..dece1a31d561 100644 --- a/tools/testing/selftests/coredump/Makefile +++ b/tools/testing/selftests/coredump/Makefile @@ -1,7 +1,13 @@ # SPDX-License-Identifier: GPL-2.0-only CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES) -TEST_GEN_PROGS := stackdump_test +TEST_GEN_PROGS := stackdump_test \ + coredump_socket_test \ + coredump_socket_protocol_test TEST_FILES := stackdump include ../lib.mk + +$(OUTPUT)/stackdump_test: 
coredump_test_helpers.c +$(OUTPUT)/coredump_socket_test: coredump_test_helpers.c +$(OUTPUT)/coredump_socket_protocol_test: coredump_test_helpers.c diff --git a/tools/testing/selftests/coredump/coredump_socket_protocol_test.c b/tools/testing/selftests/coredump/coredump_socket_protocol_test.c new file mode 100644 index 000000000000..d19b6717c53e --- /dev/null +++ b/tools/testing/selftests/coredump/coredump_socket_protocol_test.c @@ -0,0 +1,1568 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <sys/stat.h> +#include <sys/epoll.h> +#include <sys/socket.h> +#include <sys/un.h> + +#include "coredump_test.h" + +#define NUM_CRASHING_COREDUMPS 5 + +FIXTURE_SETUP(coredump) +{ + FILE *file; + int ret; + + self->pid_coredump_server = -ESRCH; + self->fd_tmpfs_detached = -1; + file = fopen("/proc/sys/kernel/core_pattern", "r"); + ASSERT_NE(NULL, file); + + ret = fread(self->original_core_pattern, 1, sizeof(self->original_core_pattern), file); + ASSERT_TRUE(ret || feof(file)); + ASSERT_LT(ret, sizeof(self->original_core_pattern)); + + self->original_core_pattern[ret] = '\0'; + self->fd_tmpfs_detached = create_detached_tmpfs(); + ASSERT_GE(self->fd_tmpfs_detached, 0); + + ret = fclose(file); + ASSERT_EQ(0, ret); +} + +FIXTURE_TEARDOWN(coredump) +{ + const char *reason; + FILE *file; + int ret, status; + + if (self->pid_coredump_server > 0) { + kill(self->pid_coredump_server, SIGTERM); + waitpid(self->pid_coredump_server, &status, 0); + } + unlink("/tmp/coredump.file"); + unlink("/tmp/coredump.socket"); + + file = fopen("/proc/sys/kernel/core_pattern", "w"); + if (!file) { + reason = "Unable to open core_pattern"; + goto fail; + } + + ret = fprintf(file, "%s", self->original_core_pattern); + if (ret < 0) { + reason = "Unable to write to core_pattern"; + goto fail; + } + + ret = fclose(file); + if (ret) { + reason = "Unable to close core_pattern"; + goto fail; + } + + if (self->fd_tmpfs_detached >= 0) { + ret = close(self->fd_tmpfs_detached); + if (ret < 0) { + reason = "Unable 
to close detached tmpfs"; + goto fail; + } + self->fd_tmpfs_detached = -1; + } + + return; +fail: + /* This should never happen */ + fprintf(stderr, "Failed to cleanup coredump test: %s\n", reason); +} + +TEST_F(coredump, socket_request_kernel) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct stat st; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_core_file = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_request_kernel: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_request_kernel: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_request_kernel: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_request_kernel: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_request_kernel: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_request_kernel: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_request_kernel: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + fd_core_file = 
creat("/tmp/coredump.file", 0644); + if (fd_core_file < 0) { + fprintf(stderr, "socket_request_kernel: creat coredump file failed: %m\n"); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_request_kernel: read_coredump_req failed\n"); + goto out; + } + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) { + fprintf(stderr, "socket_request_kernel: check_coredump_req failed\n"); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_KERNEL | COREDUMP_WAIT, 0)) { + fprintf(stderr, "socket_request_kernel: send_coredump_ack failed\n"); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) { + fprintf(stderr, "socket_request_kernel: read_marker COREDUMP_MARK_REQACK failed\n"); + goto out; + } + + for (;;) { + char buffer[4096]; + ssize_t bytes_read, bytes_write; + + bytes_read = read(fd_coredump, buffer, sizeof(buffer)); + if (bytes_read < 0) { + fprintf(stderr, "socket_request_kernel: read from coredump socket failed: %m\n"); + goto out; + } + + if (bytes_read == 0) + break; + + bytes_write = write(fd_core_file, buffer, bytes_read); + if (bytes_read != bytes_write) { + if (bytes_write < 0 && errno == ENOSPC) + continue; + fprintf(stderr, "socket_request_kernel: write to core file failed (read=%zd, write=%zd): %m\n", + bytes_read, bytes_write); + goto out; + } + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_request_kernel: completed successfully\n"); +out: + if (fd_core_file >= 0) + close(fd_core_file); + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + 
crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_TRUE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); + + ASSERT_EQ(stat("/tmp/coredump.file", &st), 0); + ASSERT_GT(st.st_size, 0); + system("file /tmp/coredump.file"); +} + +TEST_F(coredump, socket_request_userspace) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_request_userspace: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_request_userspace: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_request_userspace: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_request_userspace: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_request_userspace: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & 
PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_request_userspace: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_request_userspace: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_request_userspace: read_coredump_req failed\n"); + goto out; + } + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) { + fprintf(stderr, "socket_request_userspace: check_coredump_req failed\n"); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_USERSPACE | COREDUMP_WAIT, 0)) { + fprintf(stderr, "socket_request_userspace: send_coredump_ack failed\n"); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) { + fprintf(stderr, "socket_request_userspace: read_marker COREDUMP_MARK_REQACK failed\n"); + goto out; + } + + for (;;) { + char buffer[4096]; + ssize_t bytes_read; + + bytes_read = read(fd_coredump, buffer, sizeof(buffer)); + if (bytes_read > 0) { + fprintf(stderr, "socket_request_userspace: unexpected data received (expected no coredump data)\n"); + goto out; + } + + if (bytes_read < 0) { + fprintf(stderr, "socket_request_userspace: read from coredump socket failed: %m\n"); + goto out; + } + + if (bytes_read == 0) + break; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_request_userspace: completed successfully\n"); +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + 
ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_TRUE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_F(coredump, socket_request_reject) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_request_reject: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_request_reject: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_request_reject: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_request_reject: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_request_reject: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_request_reject: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, 
"socket_request_reject: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_request_reject: read_coredump_req failed\n"); + goto out; + } + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) { + fprintf(stderr, "socket_request_reject: check_coredump_req failed\n"); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_REJECT | COREDUMP_WAIT, 0)) { + fprintf(stderr, "socket_request_reject: send_coredump_ack failed\n"); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) { + fprintf(stderr, "socket_request_reject: read_marker COREDUMP_MARK_REQACK failed\n"); + goto out; + } + + for (;;) { + char buffer[4096]; + ssize_t bytes_read; + + bytes_read = read(fd_coredump, buffer, sizeof(buffer)); + if (bytes_read > 0) { + fprintf(stderr, "socket_request_reject: unexpected data received (expected no coredump data for REJECT)\n"); + goto out; + } + + if (bytes_read < 0) { + fprintf(stderr, "socket_request_reject: read from coredump socket failed: %m\n"); + goto out; + } + + if (bytes_read == 0) + break; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_request_reject: completed successfully\n"); +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_FALSE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 
0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_F(coredump, socket_request_invalid_flag_combination) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_request_invalid_flag_combination: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_request_invalid_flag_combination: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_request_invalid_flag_combination: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_request_invalid_flag_combination: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_request_invalid_flag_combination: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_request_invalid_flag_combination: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_request_invalid_flag_combination: PIDFD_COREDUMPED not set in coredump_mask\n"); + 
goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_request_invalid_flag_combination: read_coredump_req failed\n"); + goto out; + } + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) { + fprintf(stderr, "socket_request_invalid_flag_combination: check_coredump_req failed\n"); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_KERNEL | COREDUMP_REJECT | COREDUMP_WAIT, 0)) { + fprintf(stderr, "socket_request_invalid_flag_combination: send_coredump_ack failed\n"); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_CONFLICTING)) { + fprintf(stderr, "socket_request_invalid_flag_combination: read_marker COREDUMP_MARK_CONFLICTING failed\n"); + goto out; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_request_invalid_flag_combination: completed successfully\n"); +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_FALSE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_F(coredump, socket_request_unknown_flag) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = 
socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_request_unknown_flag: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_request_unknown_flag: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_request_unknown_flag: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_request_unknown_flag: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_request_unknown_flag: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_request_unknown_flag: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_request_unknown_flag: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_request_unknown_flag: read_coredump_req failed\n"); + goto out; + } + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) { + fprintf(stderr, "socket_request_unknown_flag: check_coredump_req failed\n"); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, (1ULL << 63), 0)) { + fprintf(stderr, 
"socket_request_unknown_flag: send_coredump_ack failed\n"); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_UNSUPPORTED)) { + fprintf(stderr, "socket_request_unknown_flag: read_marker COREDUMP_MARK_UNSUPPORTED failed\n"); + goto out; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_request_unknown_flag: completed successfully\n"); +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_FALSE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_F(coredump, socket_request_invalid_size_small) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_request_invalid_size_small: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if 
(write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_request_invalid_size_small: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_request_invalid_size_small: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_request_invalid_size_small: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_request_invalid_size_small: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_request_invalid_size_small: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_request_invalid_size_small: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_request_invalid_size_small: read_coredump_req failed\n"); + goto out; + } + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) { + fprintf(stderr, "socket_request_invalid_size_small: check_coredump_req failed\n"); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_REJECT | COREDUMP_WAIT, + COREDUMP_ACK_SIZE_VER0 / 2)) { + fprintf(stderr, "socket_request_invalid_size_small: send_coredump_ack failed\n"); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_MINSIZE)) { + fprintf(stderr, "socket_request_invalid_size_small: read_marker COREDUMP_MARK_MINSIZE failed\n"); + goto out; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_request_invalid_size_small: completed successfully\n"); +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + 
close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_FALSE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_F(coredump, socket_request_invalid_size_large) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_request_invalid_size_large: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_request_invalid_size_large: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_request_invalid_size_large: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + 
fprintf(stderr, "socket_request_invalid_size_large: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_request_invalid_size_large: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_request_invalid_size_large: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_request_invalid_size_large: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_request_invalid_size_large: read_coredump_req failed\n"); + goto out; + } + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) { + fprintf(stderr, "socket_request_invalid_size_large: check_coredump_req failed\n"); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_REJECT | COREDUMP_WAIT, + COREDUMP_ACK_SIZE_VER0 + PAGE_SIZE)) { + fprintf(stderr, "socket_request_invalid_size_large: send_coredump_ack failed\n"); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_MAXSIZE)) { + fprintf(stderr, "socket_request_invalid_size_large: read_marker COREDUMP_MARK_MAXSIZE failed\n"); + goto out; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_request_invalid_size_large: completed successfully\n"); +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + 
ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_FALSE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +/* + * Test: PIDFD_INFO_COREDUMP_SIGNAL via socket coredump with SIGSEGV + * + * Verify that when using socket-based coredump protocol, + * the coredump_signal field is correctly exposed as SIGSEGV. + */ +TEST_F(coredump, socket_coredump_signal_sigsegv) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: get_pidfd_info failed\n"); + goto out; + } + + if 
(!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + /* Verify coredump_signal is available and correct */ + if (!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_INFO_COREDUMP_SIGNAL not set in mask\n"); + goto out; + } + + if (info.coredump_signal != SIGSEGV) { + fprintf(stderr, "socket_coredump_signal_sigsegv: coredump_signal=%d, expected SIGSEGV=%d\n", + info.coredump_signal, SIGSEGV); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: read_coredump_req failed\n"); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_REJECT | COREDUMP_WAIT, 0)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: send_coredump_ack failed\n"); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: read_marker COREDUMP_MARK_REQACK failed\n"); + goto out; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_coredump_signal_sigsegv: completed successfully\n"); +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(WTERMSIG(status), SIGSEGV); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_TRUE(!!(info.mask 
& PIDFD_INFO_COREDUMP)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)); + ASSERT_EQ(info.coredump_signal, SIGSEGV); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +/* + * Test: PIDFD_INFO_COREDUMP_SIGNAL via socket coredump with SIGABRT + * + * Verify that when using socket-based coredump protocol, + * the coredump_signal field is correctly exposed as SIGABRT. + */ +TEST_F(coredump, socket_coredump_signal_sigabrt) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: 
PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + /* Verify coredump_signal is available and correct */ + if (!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_INFO_COREDUMP_SIGNAL not set in mask\n"); + goto out; + } + + if (info.coredump_signal != SIGABRT) { + fprintf(stderr, "socket_coredump_signal_sigabrt: coredump_signal=%d, expected SIGABRT=%d\n", + info.coredump_signal, SIGABRT); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: read_coredump_req failed\n"); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_REJECT | COREDUMP_WAIT, 0)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: send_coredump_ack failed\n"); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: read_marker COREDUMP_MARK_REQACK failed\n"); + goto out; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_coredump_signal_sigabrt: completed successfully\n"); +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + abort(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(WTERMSIG(status), SIGABRT); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)); + 
ASSERT_EQ(info.coredump_signal, SIGABRT); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_F_TIMEOUT(coredump, socket_multiple_crashing_coredumps, 500) +{ + int pidfd[NUM_CRASHING_COREDUMPS], status[NUM_CRASHING_COREDUMPS]; + pid_t pid[NUM_CRASHING_COREDUMPS], pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1; + int exit_code = EXIT_FAILURE; + struct coredump_req req = {}; + + close(ipc_sockets[0]); + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "Failed to create and listen on unix socket\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "Failed to notify parent via ipc socket\n"); + goto out; + } + close(ipc_sockets[1]); + + for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "get_peer_pidfd failed for fd %d: %m\n", fd_coredump); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "get_pidfd_info failed for fd %d\n", fd_peer_pidfd); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "pidfd info missing PIDFD_INFO_COREDUMP for fd %d\n", fd_peer_pidfd); + goto out; + } + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "pidfd info missing PIDFD_COREDUMPED for fd %d\n", fd_peer_pidfd); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + 
fprintf(stderr, "read_coredump_req failed for fd %d\n", fd_coredump); + goto out; + } + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) { + fprintf(stderr, "check_coredump_req failed for fd %d\n", fd_coredump); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_KERNEL | COREDUMP_WAIT, 0)) { + fprintf(stderr, "send_coredump_ack failed for fd %d\n", fd_coredump); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) { + fprintf(stderr, "read_marker failed for fd %d\n", fd_coredump); + goto out; + } + + fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached); + if (fd_core_file < 0) { + fprintf(stderr, "%m - open_coredump_tmpfile failed for fd %d\n", fd_coredump); + goto out; + } + + for (;;) { + char buffer[4096]; + ssize_t bytes_read, bytes_write; + + bytes_read = read(fd_coredump, buffer, sizeof(buffer)); + if (bytes_read < 0) { + fprintf(stderr, "read failed for fd %d: %m\n", fd_coredump); + goto out; + } + + if (bytes_read == 0) + break; + + bytes_write = write(fd_core_file, buffer, bytes_read); + if (bytes_read != bytes_write) { + if (bytes_write < 0 && errno == ENOSPC) + continue; + fprintf(stderr, "write failed for fd %d: %m\n", fd_core_file); + goto out; + } + } + + close(fd_core_file); + close(fd_peer_pidfd); + close(fd_coredump); + fd_peer_pidfd = -1; + fd_coredump = -1; + } + + exit_code = EXIT_SUCCESS; +out: + if (fd_core_file >= 0) + close(fd_core_file); + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { + pid[i] = fork(); + ASSERT_GE(pid[i], 0); + if (pid[i] == 0) + 
crashing_child(); + pidfd[i] = sys_pidfd_open(pid[i], 0); + ASSERT_GE(pidfd[i], 0); + } + + for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { + waitpid(pid[i], &status[i], 0); + ASSERT_TRUE(WIFSIGNALED(status[i])); + ASSERT_TRUE(WCOREDUMP(status[i])); + } + + for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { + info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP; + ASSERT_EQ(ioctl(pidfd[i], PIDFD_GET_INFO, &info), 0); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + } + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_F_TIMEOUT(coredump, socket_multiple_crashing_coredumps_epoll_workers, 500) +{ + int pidfd[NUM_CRASHING_COREDUMPS], status[NUM_CRASHING_COREDUMPS]; + pid_t pid[NUM_CRASHING_COREDUMPS], pid_coredump_server, worker_pids[NUM_CRASHING_COREDUMPS]; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + int fd_server = -1, exit_code = EXIT_FAILURE, n_conns = 0; + fd_server = -1; + exit_code = EXIT_FAILURE; + n_conns = 0; + close(ipc_sockets[0]); + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: write_nointr to ipc socket failed: %m\n"); + goto out; + } + close(ipc_sockets[1]); + + while (n_conns < NUM_CRASHING_COREDUMPS) { + int fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1; + struct coredump_req req = {}; + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + if (errno == EAGAIN || 
errno == EWOULDBLOCK) + continue; + fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: accept4 failed: %m\n"); + goto out; + } + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: get_peer_pidfd failed\n"); + goto out; + } + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: get_pidfd_info failed\n"); + goto out; + } + if (!(info.mask & PIDFD_INFO_COREDUMP) || !(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: missing PIDFD_INFO_COREDUMP or PIDFD_COREDUMPED\n"); + goto out; + } + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: read_coredump_req failed\n"); + goto out; + } + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) { + fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: check_coredump_req failed\n"); + goto out; + } + if (!send_coredump_ack(fd_coredump, &req, COREDUMP_KERNEL | COREDUMP_WAIT, 0)) { + fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: send_coredump_ack failed\n"); + goto out; + } + if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) { + fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: read_marker failed\n"); + goto out; + } + fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached); + if (fd_core_file < 0) { + fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: open_coredump_tmpfile failed: %m\n"); + goto out; + } + pid_t worker = fork(); + if (worker == 0) { + close(fd_server); + process_coredump_worker(fd_coredump, fd_peer_pidfd, fd_core_file); + } + worker_pids[n_conns] = worker; + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_core_file >= 0) + 
close(fd_core_file); + n_conns++; + } + exit_code = EXIT_SUCCESS; +out: + if (fd_server >= 0) + close(fd_server); + + // Reap all worker processes + for (int i = 0; i < n_conns; i++) { + int wstatus; + if (waitpid(worker_pids[i], &wstatus, 0) < 0) { + fprintf(stderr, "Failed to wait for worker %d: %m\n", worker_pids[i]); + } else if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) != EXIT_SUCCESS) { + fprintf(stderr, "Worker %d exited with error code %d\n", worker_pids[i], WEXITSTATUS(wstatus)); + exit_code = EXIT_FAILURE; + } + } + + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { + pid[i] = fork(); + ASSERT_GE(pid[i], 0); + if (pid[i] == 0) + crashing_child(); + pidfd[i] = sys_pidfd_open(pid[i], 0); + ASSERT_GE(pidfd[i], 0); + } + + for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { + ASSERT_GE(waitpid(pid[i], &status[i], 0), 0); + ASSERT_TRUE(WIFSIGNALED(status[i])); + ASSERT_TRUE(WCOREDUMP(status[i])); + } + + for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { + info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP; + ASSERT_EQ(ioctl(pidfd[i], PIDFD_GET_INFO, &info), 0); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + } + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/coredump/coredump_socket_test.c b/tools/testing/selftests/coredump/coredump_socket_test.c new file mode 100644 index 000000000000..7e26d4a6a15d --- /dev/null +++ b/tools/testing/selftests/coredump/coredump_socket_test.c @@ -0,0 +1,742 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <sys/stat.h> +#include <sys/epoll.h> +#include <sys/socket.h> +#include <sys/un.h> + +#include "coredump_test.h" + +FIXTURE_SETUP(coredump) +{ + FILE *file; + int ret; + 
+ self->pid_coredump_server = -ESRCH; + self->fd_tmpfs_detached = -1; + file = fopen("/proc/sys/kernel/core_pattern", "r"); + ASSERT_NE(NULL, file); + + ret = fread(self->original_core_pattern, 1, sizeof(self->original_core_pattern), file); + ASSERT_TRUE(ret || feof(file)); + ASSERT_LT(ret, sizeof(self->original_core_pattern)); + + self->original_core_pattern[ret] = '\0'; + self->fd_tmpfs_detached = create_detached_tmpfs(); + ASSERT_GE(self->fd_tmpfs_detached, 0); + + ret = fclose(file); + ASSERT_EQ(0, ret); +} + +FIXTURE_TEARDOWN(coredump) +{ + const char *reason; + FILE *file; + int ret, status; + + if (self->pid_coredump_server > 0) { + kill(self->pid_coredump_server, SIGTERM); + waitpid(self->pid_coredump_server, &status, 0); + } + unlink("/tmp/coredump.file"); + unlink("/tmp/coredump.socket"); + + file = fopen("/proc/sys/kernel/core_pattern", "w"); + if (!file) { + reason = "Unable to open core_pattern"; + goto fail; + } + + ret = fprintf(file, "%s", self->original_core_pattern); + if (ret < 0) { + reason = "Unable to write to core_pattern"; + goto fail; + } + + ret = fclose(file); + if (ret) { + reason = "Unable to close core_pattern"; + goto fail; + } + + if (self->fd_tmpfs_detached >= 0) { + ret = close(self->fd_tmpfs_detached); + if (ret < 0) { + reason = "Unable to close detached tmpfs"; + goto fail; + } + self->fd_tmpfs_detached = -1; + } + + return; +fail: + /* This should never happen */ + fprintf(stderr, "Failed to cleanup coredump test: %s\n", reason); +} + +TEST_F(coredump, socket) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct stat st; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, 
fd_core_file = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket test: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket test: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket test: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket test: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket test: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket test: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket test: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + fd_core_file = creat("/tmp/coredump.file", 0644); + if (fd_core_file < 0) { + fprintf(stderr, "socket test: creat coredump file failed: %m\n"); + goto out; + } + + for (;;) { + char buffer[4096]; + ssize_t bytes_read, bytes_write; + + bytes_read = read(fd_coredump, buffer, sizeof(buffer)); + if (bytes_read < 0) { + fprintf(stderr, "socket test: read from coredump socket failed: %m\n"); + goto out; + } + + if (bytes_read == 0) + break; + + bytes_write = write(fd_core_file, buffer, bytes_read); + if (bytes_read != bytes_write) { + if (bytes_write < 0 && errno == ENOSPC) + continue; + fprintf(stderr, "socket test: write to core file failed (read=%zd, write=%zd): %m\n", bytes_read, bytes_write); + goto out; + } + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket test: completed successfully\n"); +out: + if (fd_core_file >= 0) 
+ close(fd_core_file); + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_TRUE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); + + ASSERT_EQ(stat("/tmp/coredump.file", &st), 0); + ASSERT_GT(st.st_size, 0); +} + +TEST_F(coredump, socket_detect_userspace_client) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct stat st; + struct pidfd_info info = { + .mask = PIDFD_INFO_COREDUMP, + }; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_detect_userspace_client: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_detect_userspace_client: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if 
(fd_coredump < 0) { + fprintf(stderr, "socket_detect_userspace_client: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_detect_userspace_client: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_detect_userspace_client: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_detect_userspace_client: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (info.coredump_mask & PIDFD_COREDUMPED) { + fprintf(stderr, "socket_detect_userspace_client: PIDFD_COREDUMPED incorrectly set (should be userspace client)\n"); + goto out; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_detect_userspace_client: completed successfully\n"); +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) { + int fd_socket; + ssize_t ret; + const struct sockaddr_un coredump_sk = { + .sun_family = AF_UNIX, + .sun_path = "/tmp/coredump.socket", + }; + size_t coredump_sk_len = + offsetof(struct sockaddr_un, sun_path) + + sizeof("/tmp/coredump.socket"); + + fd_socket = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd_socket < 0) { + fprintf(stderr, "socket_detect_userspace_client (client): socket failed: %m\n"); + _exit(EXIT_FAILURE); + } + + ret = connect(fd_socket, (const struct sockaddr *)&coredump_sk, coredump_sk_len); + if (ret < 0) { + fprintf(stderr, "socket_detect_userspace_client (client): connect failed: %m\n"); + _exit(EXIT_FAILURE); + } + + close(fd_socket); + pause(); + fprintf(stderr, 
"socket_detect_userspace_client (client): completed successfully\n"); + _exit(EXIT_SUCCESS); + } + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_EQ((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); + + ASSERT_EQ(sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0), 0); + ASSERT_EQ(close(pidfd), 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(WTERMSIG(status), SIGKILL); + + ASSERT_NE(stat("/tmp/coredump.file", &st), 0); + ASSERT_EQ(errno, ENOENT); +} + +TEST_F(coredump, socket_enoent) +{ + int pidfd, status; + pid_t pid; + + ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_FALSE(WCOREDUMP(status)); +} + +TEST_F(coredump, socket_no_listener) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + int ipc_sockets[2]; + char c; + const struct sockaddr_un coredump_sk = { + .sun_family = AF_UNIX, + .sun_path = "/tmp/coredump.socket", + }; + size_t coredump_sk_len = offsetof(struct sockaddr_un, sun_path) + + sizeof("/tmp/coredump.socket"); + + ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + int fd_server = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (fd_server < 0) { + fprintf(stderr, "socket_no_listener: socket failed: %m\n"); + goto out; + } + + ret = bind(fd_server, (const struct sockaddr *)&coredump_sk, coredump_sk_len); + if (ret < 0) { + 
fprintf(stderr, "socket_no_listener: bind failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_no_listener: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_no_listener: completed successfully\n"); +out: + if (fd_server >= 0) + close(fd_server); + close(ipc_sockets[1]); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_FALSE(WCOREDUMP(status)); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +/* + * Test: PIDFD_INFO_COREDUMP_SIGNAL via simple socket coredump + * + * Verify that when using simple socket-based coredump (@ pattern), + * the coredump_signal field is correctly exposed as SIGSEGV. 
+ */ +TEST_F(coredump, socket_coredump_signal_sigsegv) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + /* Verify coredump_signal is available and correct */ + if (!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_INFO_COREDUMP_SIGNAL not set in mask\n"); + 
goto out; + } + + if (info.coredump_signal != SIGSEGV) { + fprintf(stderr, "socket_coredump_signal_sigsegv: coredump_signal=%d, expected SIGSEGV=%d\n", + info.coredump_signal, SIGSEGV); + goto out; + } + + fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached); + if (fd_core_file < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: open_coredump_tmpfile failed: %m\n"); + goto out; + } + + for (;;) { + char buffer[4096]; + ssize_t bytes_read, bytes_write; + + bytes_read = read(fd_coredump, buffer, sizeof(buffer)); + if (bytes_read < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: read from coredump socket failed: %m\n"); + goto out; + } + + if (bytes_read == 0) + break; + + bytes_write = write(fd_core_file, buffer, bytes_read); + if (bytes_read != bytes_write) { + fprintf(stderr, "socket_coredump_signal_sigsegv: write to core file failed (read=%zd, write=%zd): %m\n", + bytes_read, bytes_write); + goto out; + } + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_coredump_signal_sigsegv: completed successfully\n"); +out: + if (fd_core_file >= 0) + close(fd_core_file); + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(WTERMSIG(status), SIGSEGV); + ASSERT_TRUE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)); + ASSERT_EQ(info.coredump_signal, SIGSEGV); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, 
self); +} + +/* + * Test: PIDFD_INFO_COREDUMP_SIGNAL via simple socket coredump with SIGABRT + * + * Verify that when using simple socket-based coredump (@ pattern), + * the coredump_signal field is correctly exposed as SIGABRT. + */ +TEST_F(coredump, socket_coredump_signal_sigabrt) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_COREDUMPED not set in 
coredump_mask\n"); + goto out; + } + + /* Verify coredump_signal is available and correct */ + if (!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_INFO_COREDUMP_SIGNAL not set in mask\n"); + goto out; + } + + if (info.coredump_signal != SIGABRT) { + fprintf(stderr, "socket_coredump_signal_sigabrt: coredump_signal=%d, expected SIGABRT=%d\n", + info.coredump_signal, SIGABRT); + goto out; + } + + fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached); + if (fd_core_file < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: open_coredump_tmpfile failed: %m\n"); + goto out; + } + + for (;;) { + char buffer[4096]; + ssize_t bytes_read, bytes_write; + + bytes_read = read(fd_coredump, buffer, sizeof(buffer)); + if (bytes_read < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: read from coredump socket failed: %m\n"); + goto out; + } + + if (bytes_read == 0) + break; + + bytes_write = write(fd_core_file, buffer, bytes_read); + if (bytes_read != bytes_write) { + fprintf(stderr, "socket_coredump_signal_sigabrt: write to core file failed (read=%zd, write=%zd): %m\n", + bytes_read, bytes_write); + goto out; + } + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_coredump_signal_sigabrt: completed successfully\n"); +out: + if (fd_core_file >= 0) + close(fd_core_file); + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + abort(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(WTERMSIG(status), SIGABRT); + ASSERT_TRUE(WCOREDUMP(status)); + + 
ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)); + ASSERT_EQ(info.coredump_signal, SIGABRT); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_F(coredump, socket_invalid_paths) +{ + ASSERT_FALSE(set_core_pattern("@ /tmp/coredump.socket")); + ASSERT_FALSE(set_core_pattern("@/tmp/../coredump.socket")); + ASSERT_FALSE(set_core_pattern("@../coredump.socket")); + ASSERT_FALSE(set_core_pattern("@/tmp/coredump.socket/..")); + ASSERT_FALSE(set_core_pattern("@..")); + + ASSERT_FALSE(set_core_pattern("@@ /tmp/coredump.socket")); + ASSERT_FALSE(set_core_pattern("@@/tmp/../coredump.socket")); + ASSERT_FALSE(set_core_pattern("@@../coredump.socket")); + ASSERT_FALSE(set_core_pattern("@@/tmp/coredump.socket/..")); + ASSERT_FALSE(set_core_pattern("@@..")); + + ASSERT_FALSE(set_core_pattern("@@@/tmp/coredump.socket")); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/coredump/coredump_test.h b/tools/testing/selftests/coredump/coredump_test.h new file mode 100644 index 000000000000..ed47f01fa53c --- /dev/null +++ b/tools/testing/selftests/coredump/coredump_test.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __COREDUMP_TEST_H +#define __COREDUMP_TEST_H + +#include <stdbool.h> +#include <sys/types.h> +#include <linux/coredump.h> + +#include "../kselftest_harness.h" +#include "../pidfd/pidfd.h" + +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif + +#define NUM_THREAD_SPAWN 128 + +/* Coredump fixture */ +FIXTURE(coredump) +{ + char original_core_pattern[256]; + pid_t pid_coredump_server; + int fd_tmpfs_detached; +}; + +/* Shared helper function declarations */ +void *do_nothing(void *arg); +void crashing_child(void); +int create_detached_tmpfs(void); +int create_and_listen_unix_socket(const char *path); +bool set_core_pattern(const char *pattern); +int get_peer_pidfd(int fd); +bool get_pidfd_info(int 
fd_peer_pidfd, struct pidfd_info *info); + +/* Inline helper that uses harness types */ +static inline void wait_and_check_coredump_server(pid_t pid_coredump_server, + struct __test_metadata *const _metadata, + FIXTURE_DATA(coredump) *self) +{ + int status; + waitpid(pid_coredump_server, &status, 0); + self->pid_coredump_server = -ESRCH; + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); +} + +/* Protocol helper function declarations */ +ssize_t recv_marker(int fd); +bool read_marker(int fd, enum coredump_mark mark); +bool read_coredump_req(int fd, struct coredump_req *req); +bool send_coredump_ack(int fd, const struct coredump_req *req, + __u64 mask, size_t size_ack); +bool check_coredump_req(const struct coredump_req *req, size_t min_size, + __u64 required_mask); +int open_coredump_tmpfile(int fd_tmpfs_detached); +void process_coredump_worker(int fd_coredump, int fd_peer_pidfd, int fd_core_file); + +#endif /* __COREDUMP_TEST_H */ diff --git a/tools/testing/selftests/coredump/coredump_test_helpers.c b/tools/testing/selftests/coredump/coredump_test_helpers.c new file mode 100644 index 000000000000..a6f6d5f2ae07 --- /dev/null +++ b/tools/testing/selftests/coredump/coredump_test_helpers.c @@ -0,0 +1,383 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <assert.h> +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <linux/coredump.h> +#include <linux/fs.h> +#include <pthread.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/epoll.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <sys/types.h> +#include <sys/un.h> +#include <sys/wait.h> +#include <unistd.h> + +#include "../filesystems/wrappers.h" +#include "../pidfd/pidfd.h" + +/* Forward declarations to avoid including harness header */ +struct __test_metadata; + +/* Match the fixture definition from coredump_test.h */ +struct _fixture_coredump_data { + char original_core_pattern[256]; + pid_t pid_coredump_server; 
+ int fd_tmpfs_detached; +}; + +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif + +#define NUM_THREAD_SPAWN 128 + +void *do_nothing(void *arg) +{ + (void)arg; + while (1) + pause(); + + return NULL; +} + +void crashing_child(void) +{ + pthread_t thread; + int i; + + for (i = 0; i < NUM_THREAD_SPAWN; ++i) + pthread_create(&thread, NULL, do_nothing, NULL); + + /* crash on purpose */ + i = *(int *)NULL; +} + +int create_detached_tmpfs(void) +{ + int fd_context, fd_tmpfs; + + fd_context = sys_fsopen("tmpfs", 0); + if (fd_context < 0) + return -1; + + if (sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) + return -1; + + fd_tmpfs = sys_fsmount(fd_context, 0, 0); + close(fd_context); + return fd_tmpfs; +} + +int create_and_listen_unix_socket(const char *path) +{ + struct sockaddr_un addr = { + .sun_family = AF_UNIX, + }; + assert(strlen(path) < sizeof(addr.sun_path) - 1); + strncpy(addr.sun_path, path, sizeof(addr.sun_path) - 1); + size_t addr_len = + offsetof(struct sockaddr_un, sun_path) + strlen(path) + 1; + int fd, ret; + + fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (fd < 0) + goto out; + + ret = bind(fd, (const struct sockaddr *)&addr, addr_len); + if (ret < 0) + goto out; + + ret = listen(fd, 128); + if (ret < 0) + goto out; + + return fd; + +out: + if (fd >= 0) + close(fd); + return -1; +} + +bool set_core_pattern(const char *pattern) +{ + int fd; + ssize_t ret; + + fd = open("/proc/sys/kernel/core_pattern", O_WRONLY | O_CLOEXEC); + if (fd < 0) + return false; + + ret = write(fd, pattern, strlen(pattern)); + close(fd); + if (ret < 0) + return false; + + fprintf(stderr, "Set core_pattern to '%s' | %zu == %zu\n", pattern, ret, strlen(pattern)); + return ret == strlen(pattern); +} + +int get_peer_pidfd(int fd) +{ + int fd_peer_pidfd; + socklen_t fd_peer_pidfd_len = sizeof(fd_peer_pidfd); + int ret = getsockopt(fd, SOL_SOCKET, SO_PEERPIDFD, &fd_peer_pidfd, + &fd_peer_pidfd_len); + if (ret < 0) { + fprintf(stderr, "get_peer_pidfd: 
getsockopt(SO_PEERPIDFD) failed: %m\n"); + return -1; + } + fprintf(stderr, "get_peer_pidfd: successfully retrieved pidfd %d\n", fd_peer_pidfd); + return fd_peer_pidfd; +} + +bool get_pidfd_info(int fd_peer_pidfd, struct pidfd_info *info) +{ + int ret; + memset(info, 0, sizeof(*info)); + info->mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP | PIDFD_INFO_COREDUMP_SIGNAL; + ret = ioctl(fd_peer_pidfd, PIDFD_GET_INFO, info); + if (ret < 0) { + fprintf(stderr, "get_pidfd_info: ioctl(PIDFD_GET_INFO) failed: %m\n"); + return false; + } + fprintf(stderr, "get_pidfd_info: mask=0x%llx, coredump_mask=0x%x, coredump_signal=%d\n", + (unsigned long long)info->mask, info->coredump_mask, info->coredump_signal); + return true; +} + +/* Protocol helper functions */ + +ssize_t recv_marker(int fd) +{ + enum coredump_mark mark = COREDUMP_MARK_REQACK; + ssize_t ret; + + ret = recv(fd, &mark, sizeof(mark), MSG_WAITALL); + if (ret != sizeof(mark)) + return -1; + + switch (mark) { + case COREDUMP_MARK_REQACK: + fprintf(stderr, "Received marker: ReqAck\n"); + return COREDUMP_MARK_REQACK; + case COREDUMP_MARK_MINSIZE: + fprintf(stderr, "Received marker: MinSize\n"); + return COREDUMP_MARK_MINSIZE; + case COREDUMP_MARK_MAXSIZE: + fprintf(stderr, "Received marker: MaxSize\n"); + return COREDUMP_MARK_MAXSIZE; + case COREDUMP_MARK_UNSUPPORTED: + fprintf(stderr, "Received marker: Unsupported\n"); + return COREDUMP_MARK_UNSUPPORTED; + case COREDUMP_MARK_CONFLICTING: + fprintf(stderr, "Received marker: Conflicting\n"); + return COREDUMP_MARK_CONFLICTING; + default: + fprintf(stderr, "Received unknown marker: %u\n", mark); + break; + } + return -1; +} + +bool read_marker(int fd, enum coredump_mark mark) +{ + ssize_t ret; + + ret = recv_marker(fd); + if (ret < 0) + return false; + return ret == mark; +} + +bool read_coredump_req(int fd, struct coredump_req *req) +{ + ssize_t ret; + size_t field_size, user_size, ack_size, kernel_size, remaining_size; + + memset(req, 0, sizeof(*req)); + field_size = 
sizeof(req->size); + + /* Peek the size of the coredump request. */ + ret = recv(fd, req, field_size, MSG_PEEK | MSG_WAITALL); + if (ret != field_size) { + fprintf(stderr, "read_coredump_req: peek failed (got %zd, expected %zu): %m\n", + ret, field_size); + return false; + } + kernel_size = req->size; + + if (kernel_size < COREDUMP_ACK_SIZE_VER0) { + fprintf(stderr, "read_coredump_req: kernel_size %zu < min %d\n", + kernel_size, COREDUMP_ACK_SIZE_VER0); + return false; + } + if (kernel_size >= PAGE_SIZE) { + fprintf(stderr, "read_coredump_req: kernel_size %zu >= PAGE_SIZE %d\n", + kernel_size, PAGE_SIZE); + return false; + } + + /* Use the minimum of user and kernel size to read the full request. */ + user_size = sizeof(struct coredump_req); + ack_size = user_size < kernel_size ? user_size : kernel_size; + ret = recv(fd, req, ack_size, MSG_WAITALL); + if (ret != ack_size) + return false; + + fprintf(stderr, "Read coredump request with size %u and mask 0x%llx\n", + req->size, (unsigned long long)req->mask); + + if (user_size > kernel_size) + remaining_size = user_size - kernel_size; + else + remaining_size = kernel_size - user_size; + + if (PAGE_SIZE <= remaining_size) + return false; + + /* + * Discard any additional data if the kernel's request was larger than + * what we knew about or cared about. + */ + if (remaining_size) { + char buffer[PAGE_SIZE]; + + ret = recv(fd, buffer, sizeof(buffer), MSG_WAITALL); + if (ret != remaining_size) + return false; + fprintf(stderr, "Discarded %zu bytes of data after coredump request\n", remaining_size); + } + + return true; +} + +bool send_coredump_ack(int fd, const struct coredump_req *req, + __u64 mask, size_t size_ack) +{ + ssize_t ret; + /* + * Wrap struct coredump_ack in a larger struct so we can + * simulate sending to much data to the kernel. 
+ */ + struct large_ack_for_size_testing { + struct coredump_ack ack; + char buffer[PAGE_SIZE]; + } large_ack = {}; + + if (!size_ack) + size_ack = sizeof(struct coredump_ack) < req->size_ack ? + sizeof(struct coredump_ack) : + req->size_ack; + large_ack.ack.mask = mask; + large_ack.ack.size = size_ack; + ret = send(fd, &large_ack, size_ack, MSG_NOSIGNAL); + if (ret != size_ack) + return false; + + fprintf(stderr, "Sent coredump ack with size %zu and mask 0x%llx\n", + size_ack, (unsigned long long)mask); + return true; +} + +bool check_coredump_req(const struct coredump_req *req, size_t min_size, + __u64 required_mask) +{ + if (req->size < min_size) + return false; + if ((req->mask & required_mask) != required_mask) + return false; + if (req->mask & ~required_mask) + return false; + return true; +} + +int open_coredump_tmpfile(int fd_tmpfs_detached) +{ + return openat(fd_tmpfs_detached, ".", O_TMPFILE | O_RDWR | O_EXCL, 0600); +} + +void process_coredump_worker(int fd_coredump, int fd_peer_pidfd, int fd_core_file) +{ + int epfd = -1; + int exit_code = EXIT_FAILURE; + struct epoll_event ev; + int flags; + + /* Set socket to non-blocking mode for edge-triggered epoll */ + flags = fcntl(fd_coredump, F_GETFL, 0); + if (flags < 0) { + fprintf(stderr, "Worker: fcntl(F_GETFL) failed: %m\n"); + goto out; + } + if (fcntl(fd_coredump, F_SETFL, flags | O_NONBLOCK) < 0) { + fprintf(stderr, "Worker: fcntl(F_SETFL, O_NONBLOCK) failed: %m\n"); + goto out; + } + + epfd = epoll_create1(0); + if (epfd < 0) { + fprintf(stderr, "Worker: epoll_create1() failed: %m\n"); + goto out; + } + + ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET; + ev.data.fd = fd_coredump; + if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd_coredump, &ev) < 0) { + fprintf(stderr, "Worker: epoll_ctl(EPOLL_CTL_ADD) failed: %m\n"); + goto out; + } + + for (;;) { + struct epoll_event events[1]; + int n = epoll_wait(epfd, events, 1, -1); + if (n < 0) { + fprintf(stderr, "Worker: epoll_wait() failed: %m\n"); + break; + } + + if 
(events[0].events & (EPOLLIN | EPOLLRDHUP)) { + for (;;) { + char buffer[4096]; + ssize_t bytes_read = read(fd_coredump, buffer, sizeof(buffer)); + if (bytes_read < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK) + break; + fprintf(stderr, "Worker: read() failed: %m\n"); + goto out; + } + if (bytes_read == 0) + goto done; + ssize_t bytes_write = write(fd_core_file, buffer, bytes_read); + if (bytes_write != bytes_read) { + if (bytes_write < 0 && errno == ENOSPC) + continue; + fprintf(stderr, "Worker: write() failed (read=%zd, write=%zd): %m\n", + bytes_read, bytes_write); + goto out; + } + } + } + } + +done: + exit_code = EXIT_SUCCESS; + fprintf(stderr, "Worker: completed successfully\n"); +out: + if (epfd >= 0) + close(epfd); + if (fd_core_file >= 0) + close(fd_core_file); + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + _exit(exit_code); +} diff --git a/tools/testing/selftests/coredump/stackdump_test.c b/tools/testing/selftests/coredump/stackdump_test.c index a4ac80bb1003..c2e895bcc160 100644 --- a/tools/testing/selftests/coredump/stackdump_test.c +++ b/tools/testing/selftests/coredump/stackdump_test.c @@ -23,57 +23,15 @@ #include "../filesystems/wrappers.h" #include "../pidfd/pidfd.h" +#include "coredump_test.h" + #define STACKDUMP_FILE "stack_values" #define STACKDUMP_SCRIPT "stackdump" -#define NUM_THREAD_SPAWN 128 #ifndef PAGE_SIZE #define PAGE_SIZE 4096 #endif -static void *do_nothing(void *) -{ - while (1) - pause(); - - return NULL; -} - -static void crashing_child(void) -{ - pthread_t thread; - int i; - - for (i = 0; i < NUM_THREAD_SPAWN; ++i) - pthread_create(&thread, NULL, do_nothing, NULL); - - /* crash on purpose */ - i = *(int *)NULL; -} - -FIXTURE(coredump) -{ - char original_core_pattern[256]; - pid_t pid_coredump_server; - int fd_tmpfs_detached; -}; - -static int create_detached_tmpfs(void) -{ - int fd_context, fd_tmpfs; - - fd_context = sys_fsopen("tmpfs", 0); - if (fd_context < 0) - return -1; 
- - if (sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) - return -1; - - fd_tmpfs = sys_fsmount(fd_context, 0, 0); - close(fd_context); - return fd_tmpfs; -} - FIXTURE_SETUP(coredump) { FILE *file; @@ -208,1620 +166,4 @@ TEST_F_TIMEOUT(coredump, stackdump, 120) fclose(file); } -static int create_and_listen_unix_socket(const char *path) -{ - struct sockaddr_un addr = { - .sun_family = AF_UNIX, - }; - assert(strlen(path) < sizeof(addr.sun_path) - 1); - strncpy(addr.sun_path, path, sizeof(addr.sun_path) - 1); - size_t addr_len = - offsetof(struct sockaddr_un, sun_path) + strlen(path) + 1; - int fd, ret; - - fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); - if (fd < 0) - goto out; - - ret = bind(fd, (const struct sockaddr *)&addr, addr_len); - if (ret < 0) - goto out; - - ret = listen(fd, 128); - if (ret < 0) - goto out; - - return fd; - -out: - if (fd >= 0) - close(fd); - return -1; -} - -static bool set_core_pattern(const char *pattern) -{ - int fd; - ssize_t ret; - - fd = open("/proc/sys/kernel/core_pattern", O_WRONLY | O_CLOEXEC); - if (fd < 0) - return false; - - ret = write(fd, pattern, strlen(pattern)); - close(fd); - if (ret < 0) - return false; - - fprintf(stderr, "Set core_pattern to '%s' | %zu == %zu\n", pattern, ret, strlen(pattern)); - return ret == strlen(pattern); -} - -static int get_peer_pidfd(int fd) -{ - int fd_peer_pidfd; - socklen_t fd_peer_pidfd_len = sizeof(fd_peer_pidfd); - int ret = getsockopt(fd, SOL_SOCKET, SO_PEERPIDFD, &fd_peer_pidfd, - &fd_peer_pidfd_len); - if (ret < 0) { - fprintf(stderr, "%m - Failed to retrieve peer pidfd for coredump socket connection\n"); - return -1; - } - return fd_peer_pidfd; -} - -static bool get_pidfd_info(int fd_peer_pidfd, struct pidfd_info *info) -{ - memset(info, 0, sizeof(*info)); - info->mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP; - return ioctl(fd_peer_pidfd, PIDFD_GET_INFO, info) == 0; -} - -static void -wait_and_check_coredump_server(pid_t pid_coredump_server, - struct 
__test_metadata *const _metadata, - FIXTURE_DATA(coredump)* self) -{ - int status; - waitpid(pid_coredump_server, &status, 0); - self->pid_coredump_server = -ESRCH; - ASSERT_TRUE(WIFEXITED(status)); - ASSERT_EQ(WEXITSTATUS(status), 0); -} - -TEST_F(coredump, socket) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - struct stat st; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - close(ipc_sockets[1]); - - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) - goto out; - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - - if (!(info.mask & PIDFD_INFO_COREDUMP)) - goto out; - - if (!(info.coredump_mask & PIDFD_COREDUMPED)) - goto out; - - fd_core_file = creat("/tmp/coredump.file", 0644); - if (fd_core_file < 0) - goto out; - - for (;;) { - char buffer[4096]; - ssize_t bytes_read, bytes_write; - - bytes_read = read(fd_coredump, buffer, sizeof(buffer)); - if (bytes_read < 0) - goto out; - - if (bytes_read == 0) - break; - - bytes_write = write(fd_core_file, buffer, bytes_read); - if (bytes_read != bytes_write) - goto out; - } - - exit_code = EXIT_SUCCESS; -out: - if (fd_core_file >= 0) - close(fd_core_file); - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - 
_exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_TRUE(WCOREDUMP(status)); - - ASSERT_TRUE(get_pidfd_info(pidfd, &info)); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); - - ASSERT_EQ(stat("/tmp/coredump.file", &st), 0); - ASSERT_GT(st.st_size, 0); - system("file /tmp/coredump.file"); -} - -TEST_F(coredump, socket_detect_userspace_client) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - struct stat st; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - close(ipc_sockets[1]); - - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) - goto out; - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - - if (!(info.mask & PIDFD_INFO_COREDUMP)) - goto out; - - if (info.coredump_mask & PIDFD_COREDUMPED) - goto out; - - exit_code = EXIT_SUCCESS; -out: - if (fd_peer_pidfd >= 0) - 
close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) { - int fd_socket; - ssize_t ret; - const struct sockaddr_un coredump_sk = { - .sun_family = AF_UNIX, - .sun_path = "/tmp/coredump.socket", - }; - size_t coredump_sk_len = - offsetof(struct sockaddr_un, sun_path) + - sizeof("/tmp/coredump.socket"); - - fd_socket = socket(AF_UNIX, SOCK_STREAM, 0); - if (fd_socket < 0) - _exit(EXIT_FAILURE); - - ret = connect(fd_socket, (const struct sockaddr *)&coredump_sk, coredump_sk_len); - if (ret < 0) - _exit(EXIT_FAILURE); - - close(fd_socket); - _exit(EXIT_SUCCESS); - } - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFEXITED(status)); - ASSERT_EQ(WEXITSTATUS(status), 0); - - ASSERT_TRUE(get_pidfd_info(pidfd, &info)); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_EQ((info.coredump_mask & PIDFD_COREDUMPED), 0); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); - - ASSERT_NE(stat("/tmp/coredump.file", &st), 0); - ASSERT_EQ(errno, ENOENT); -} - -TEST_F(coredump, socket_enoent) -{ - int pidfd, status; - pid_t pid; - - ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_FALSE(WCOREDUMP(status)); -} - -TEST_F(coredump, socket_no_listener) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - int ipc_sockets[2]; - char c; - const struct sockaddr_un coredump_sk = { - .sun_family = AF_UNIX, - .sun_path = "/tmp/coredump.socket", - }; - size_t coredump_sk_len 
= offsetof(struct sockaddr_un, sun_path) + - sizeof("/tmp/coredump.socket"); - - ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - int fd_server = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); - if (fd_server < 0) - goto out; - - ret = bind(fd_server, (const struct sockaddr *)&coredump_sk, coredump_sk_len); - if (ret < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - exit_code = EXIT_SUCCESS; -out: - if (fd_server >= 0) - close(fd_server); - close(ipc_sockets[1]); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_FALSE(WCOREDUMP(status)); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); -} - -static ssize_t recv_marker(int fd) -{ - enum coredump_mark mark = COREDUMP_MARK_REQACK; - ssize_t ret; - - ret = recv(fd, &mark, sizeof(mark), MSG_WAITALL); - if (ret != sizeof(mark)) - return -1; - - switch (mark) { - case COREDUMP_MARK_REQACK: - fprintf(stderr, "Received marker: ReqAck\n"); - return COREDUMP_MARK_REQACK; - case COREDUMP_MARK_MINSIZE: - fprintf(stderr, "Received marker: MinSize\n"); - return COREDUMP_MARK_MINSIZE; - case COREDUMP_MARK_MAXSIZE: - fprintf(stderr, "Received marker: MaxSize\n"); - return COREDUMP_MARK_MAXSIZE; - case COREDUMP_MARK_UNSUPPORTED: - fprintf(stderr, "Received marker: Unsupported\n"); - return COREDUMP_MARK_UNSUPPORTED; - case 
COREDUMP_MARK_CONFLICTING: - fprintf(stderr, "Received marker: Conflicting\n"); - return COREDUMP_MARK_CONFLICTING; - default: - fprintf(stderr, "Received unknown marker: %u\n", mark); - break; - } - return -1; -} - -static bool read_marker(int fd, enum coredump_mark mark) -{ - ssize_t ret; - - ret = recv_marker(fd); - if (ret < 0) - return false; - return ret == mark; -} - -static bool read_coredump_req(int fd, struct coredump_req *req) -{ - ssize_t ret; - size_t field_size, user_size, ack_size, kernel_size, remaining_size; - - memset(req, 0, sizeof(*req)); - field_size = sizeof(req->size); - - /* Peek the size of the coredump request. */ - ret = recv(fd, req, field_size, MSG_PEEK | MSG_WAITALL); - if (ret != field_size) - return false; - kernel_size = req->size; - - if (kernel_size < COREDUMP_ACK_SIZE_VER0) - return false; - if (kernel_size >= PAGE_SIZE) - return false; - - /* Use the minimum of user and kernel size to read the full request. */ - user_size = sizeof(struct coredump_req); - ack_size = user_size < kernel_size ? user_size : kernel_size; - ret = recv(fd, req, ack_size, MSG_WAITALL); - if (ret != ack_size) - return false; - - fprintf(stderr, "Read coredump request with size %u and mask 0x%llx\n", - req->size, (unsigned long long)req->mask); - - if (user_size > kernel_size) - remaining_size = user_size - kernel_size; - else - remaining_size = kernel_size - user_size; - - if (PAGE_SIZE <= remaining_size) - return false; - - /* - * Discard any additional data if the kernel's request was larger than - * what we knew about or cared about. 
- */ - if (remaining_size) { - char buffer[PAGE_SIZE]; - - ret = recv(fd, buffer, sizeof(buffer), MSG_WAITALL); - if (ret != remaining_size) - return false; - fprintf(stderr, "Discarded %zu bytes of data after coredump request\n", remaining_size); - } - - return true; -} - -static bool send_coredump_ack(int fd, const struct coredump_req *req, - __u64 mask, size_t size_ack) -{ - ssize_t ret; - /* - * Wrap struct coredump_ack in a larger struct so we can - * simulate sending to much data to the kernel. - */ - struct large_ack_for_size_testing { - struct coredump_ack ack; - char buffer[PAGE_SIZE]; - } large_ack = {}; - - if (!size_ack) - size_ack = sizeof(struct coredump_ack) < req->size_ack ? - sizeof(struct coredump_ack) : - req->size_ack; - large_ack.ack.mask = mask; - large_ack.ack.size = size_ack; - ret = send(fd, &large_ack, size_ack, MSG_NOSIGNAL); - if (ret != size_ack) - return false; - - fprintf(stderr, "Sent coredump ack with size %zu and mask 0x%llx\n", - size_ack, (unsigned long long)mask); - return true; -} - -static bool check_coredump_req(const struct coredump_req *req, size_t min_size, - __u64 required_mask) -{ - if (req->size < min_size) - return false; - if ((req->mask & required_mask) != required_mask) - return false; - if (req->mask & ~required_mask) - return false; - return true; -} - -TEST_F(coredump, socket_request_kernel) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - struct stat st; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - struct coredump_req req = {}; - int fd_server = -1, fd_coredump = -1, fd_core_file = -1, fd_peer_pidfd = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = 
create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - close(ipc_sockets[1]); - - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) - goto out; - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - - if (!(info.mask & PIDFD_INFO_COREDUMP)) - goto out; - - if (!(info.coredump_mask & PIDFD_COREDUMPED)) - goto out; - - fd_core_file = creat("/tmp/coredump.file", 0644); - if (fd_core_file < 0) - goto out; - - if (!read_coredump_req(fd_coredump, &req)) - goto out; - - if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, - COREDUMP_KERNEL | COREDUMP_USERSPACE | - COREDUMP_REJECT | COREDUMP_WAIT)) - goto out; - - if (!send_coredump_ack(fd_coredump, &req, - COREDUMP_KERNEL | COREDUMP_WAIT, 0)) - goto out; - - if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) - goto out; - - for (;;) { - char buffer[4096]; - ssize_t bytes_read, bytes_write; - - bytes_read = read(fd_coredump, buffer, sizeof(buffer)); - if (bytes_read < 0) - goto out; - - if (bytes_read == 0) - break; - - bytes_write = write(fd_core_file, buffer, bytes_read); - if (bytes_read != bytes_write) - goto out; - } - - exit_code = EXIT_SUCCESS; -out: - if (fd_core_file >= 0) - close(fd_core_file); - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_TRUE(WCOREDUMP(status)); - - 
ASSERT_TRUE(get_pidfd_info(pidfd, &info)); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); - - ASSERT_EQ(stat("/tmp/coredump.file", &st), 0); - ASSERT_GT(st.st_size, 0); - system("file /tmp/coredump.file"); -} - -TEST_F(coredump, socket_request_userspace) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - struct coredump_req req = {}; - int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - close(ipc_sockets[1]); - - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) - goto out; - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - - if (!(info.mask & PIDFD_INFO_COREDUMP)) - goto out; - - if (!(info.coredump_mask & PIDFD_COREDUMPED)) - goto out; - - if (!read_coredump_req(fd_coredump, &req)) - goto out; - - if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, - COREDUMP_KERNEL | COREDUMP_USERSPACE | - COREDUMP_REJECT | COREDUMP_WAIT)) - goto out; - - if (!send_coredump_ack(fd_coredump, &req, - COREDUMP_USERSPACE | COREDUMP_WAIT, 0)) - goto out; - - if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) - goto out; - - for (;;) { - char buffer[4096]; - ssize_t bytes_read; - - bytes_read = read(fd_coredump, buffer, sizeof(buffer)); - if (bytes_read > 0) 
- goto out; - - if (bytes_read < 0) - goto out; - - if (bytes_read == 0) - break; - } - - exit_code = EXIT_SUCCESS; -out: - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_TRUE(WCOREDUMP(status)); - - ASSERT_TRUE(get_pidfd_info(pidfd, &info)); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); -} - -TEST_F(coredump, socket_request_reject) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - struct coredump_req req = {}; - int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - close(ipc_sockets[1]); - - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) - goto out; - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - - if (!(info.mask & PIDFD_INFO_COREDUMP)) - 
goto out; - - if (!(info.coredump_mask & PIDFD_COREDUMPED)) - goto out; - - if (!read_coredump_req(fd_coredump, &req)) - goto out; - - if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, - COREDUMP_KERNEL | COREDUMP_USERSPACE | - COREDUMP_REJECT | COREDUMP_WAIT)) - goto out; - - if (!send_coredump_ack(fd_coredump, &req, - COREDUMP_REJECT | COREDUMP_WAIT, 0)) - goto out; - - if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) - goto out; - - for (;;) { - char buffer[4096]; - ssize_t bytes_read; - - bytes_read = read(fd_coredump, buffer, sizeof(buffer)); - if (bytes_read > 0) - goto out; - - if (bytes_read < 0) - goto out; - - if (bytes_read == 0) - break; - } - - exit_code = EXIT_SUCCESS; -out: - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_FALSE(WCOREDUMP(status)); - - ASSERT_TRUE(get_pidfd_info(pidfd, &info)); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); -} - -TEST_F(coredump, socket_request_invalid_flag_combination) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - struct coredump_req req = {}; 
- int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - close(ipc_sockets[1]); - - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) - goto out; - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - - if (!(info.mask & PIDFD_INFO_COREDUMP)) - goto out; - - if (!(info.coredump_mask & PIDFD_COREDUMPED)) - goto out; - - if (!read_coredump_req(fd_coredump, &req)) - goto out; - - if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, - COREDUMP_KERNEL | COREDUMP_USERSPACE | - COREDUMP_REJECT | COREDUMP_WAIT)) - goto out; - - if (!send_coredump_ack(fd_coredump, &req, - COREDUMP_KERNEL | COREDUMP_REJECT | COREDUMP_WAIT, 0)) - goto out; - - if (!read_marker(fd_coredump, COREDUMP_MARK_CONFLICTING)) - goto out; - - exit_code = EXIT_SUCCESS; -out: - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_FALSE(WCOREDUMP(status)); - - ASSERT_TRUE(get_pidfd_info(pidfd, &info)); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); -} - -TEST_F(coredump, socket_request_unknown_flag) -{ - int pidfd, ret, status; - 
pid_t pid, pid_coredump_server; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - struct coredump_req req = {}; - int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - close(ipc_sockets[1]); - - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) - goto out; - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - - if (!(info.mask & PIDFD_INFO_COREDUMP)) - goto out; - - if (!(info.coredump_mask & PIDFD_COREDUMPED)) - goto out; - - if (!read_coredump_req(fd_coredump, &req)) - goto out; - - if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, - COREDUMP_KERNEL | COREDUMP_USERSPACE | - COREDUMP_REJECT | COREDUMP_WAIT)) - goto out; - - if (!send_coredump_ack(fd_coredump, &req, (1ULL << 63), 0)) - goto out; - - if (!read_marker(fd_coredump, COREDUMP_MARK_UNSUPPORTED)) - goto out; - - exit_code = EXIT_SUCCESS; -out: - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); 
- ASSERT_FALSE(WCOREDUMP(status)); - - ASSERT_TRUE(get_pidfd_info(pidfd, &info)); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); -} - -TEST_F(coredump, socket_request_invalid_size_small) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - struct coredump_req req = {}; - int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - close(ipc_sockets[1]); - - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) - goto out; - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - - if (!(info.mask & PIDFD_INFO_COREDUMP)) - goto out; - - if (!(info.coredump_mask & PIDFD_COREDUMPED)) - goto out; - - if (!read_coredump_req(fd_coredump, &req)) - goto out; - - if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, - COREDUMP_KERNEL | COREDUMP_USERSPACE | - COREDUMP_REJECT | COREDUMP_WAIT)) - goto out; - - if (!send_coredump_ack(fd_coredump, &req, - COREDUMP_REJECT | COREDUMP_WAIT, - COREDUMP_ACK_SIZE_VER0 / 2)) - goto out; - - if (!read_marker(fd_coredump, COREDUMP_MARK_MINSIZE)) - goto out; - - exit_code = EXIT_SUCCESS; -out: - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - 
_exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_FALSE(WCOREDUMP(status)); - - ASSERT_TRUE(get_pidfd_info(pidfd, &info)); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); -} - -TEST_F(coredump, socket_request_invalid_size_large) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - struct coredump_req req = {}; - int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - close(ipc_sockets[1]); - - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) - goto out; - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - - if (!(info.mask & PIDFD_INFO_COREDUMP)) - goto out; - - if (!(info.coredump_mask & PIDFD_COREDUMPED)) - goto out; - - if (!read_coredump_req(fd_coredump, &req)) - goto out; - - if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, - COREDUMP_KERNEL | COREDUMP_USERSPACE | - COREDUMP_REJECT | 
COREDUMP_WAIT)) - goto out; - - if (!send_coredump_ack(fd_coredump, &req, - COREDUMP_REJECT | COREDUMP_WAIT, - COREDUMP_ACK_SIZE_VER0 + PAGE_SIZE)) - goto out; - - if (!read_marker(fd_coredump, COREDUMP_MARK_MAXSIZE)) - goto out; - - exit_code = EXIT_SUCCESS; -out: - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_FALSE(WCOREDUMP(status)); - - ASSERT_TRUE(get_pidfd_info(pidfd, &info)); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); -} - -static int open_coredump_tmpfile(int fd_tmpfs_detached) -{ - return openat(fd_tmpfs_detached, ".", O_TMPFILE | O_RDWR | O_EXCL, 0600); -} - -#define NUM_CRASHING_COREDUMPS 5 - -TEST_F_TIMEOUT(coredump, socket_multiple_crashing_coredumps, 500) -{ - int pidfd[NUM_CRASHING_COREDUMPS], status[NUM_CRASHING_COREDUMPS]; - pid_t pid[NUM_CRASHING_COREDUMPS], pid_coredump_server; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); - - ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1; - int exit_code = EXIT_FAILURE; - struct coredump_req req = {}; - - close(ipc_sockets[0]); - fd_server = 
create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) { - fprintf(stderr, "Failed to create and listen on unix socket\n"); - goto out; - } - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) { - fprintf(stderr, "Failed to notify parent via ipc socket\n"); - goto out; - } - close(ipc_sockets[1]); - - for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) { - fprintf(stderr, "accept4 failed: %m\n"); - goto out; - } - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) { - fprintf(stderr, "get_peer_pidfd failed for fd %d: %m\n", fd_coredump); - goto out; - } - - if (!get_pidfd_info(fd_peer_pidfd, &info)) { - fprintf(stderr, "get_pidfd_info failed for fd %d\n", fd_peer_pidfd); - goto out; - } - - if (!(info.mask & PIDFD_INFO_COREDUMP)) { - fprintf(stderr, "pidfd info missing PIDFD_INFO_COREDUMP for fd %d\n", fd_peer_pidfd); - goto out; - } - if (!(info.coredump_mask & PIDFD_COREDUMPED)) { - fprintf(stderr, "pidfd info missing PIDFD_COREDUMPED for fd %d\n", fd_peer_pidfd); - goto out; - } - - if (!read_coredump_req(fd_coredump, &req)) { - fprintf(stderr, "read_coredump_req failed for fd %d\n", fd_coredump); - goto out; - } - - if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, - COREDUMP_KERNEL | COREDUMP_USERSPACE | - COREDUMP_REJECT | COREDUMP_WAIT)) { - fprintf(stderr, "check_coredump_req failed for fd %d\n", fd_coredump); - goto out; - } - - if (!send_coredump_ack(fd_coredump, &req, - COREDUMP_KERNEL | COREDUMP_WAIT, 0)) { - fprintf(stderr, "send_coredump_ack failed for fd %d\n", fd_coredump); - goto out; - } - - if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) { - fprintf(stderr, "read_marker failed for fd %d\n", fd_coredump); - goto out; - } - - fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached); - if (fd_core_file < 0) { - fprintf(stderr, "%m - open_coredump_tmpfile failed for fd %d\n", fd_coredump); - goto out; - } - - for 
(;;) { - char buffer[4096]; - ssize_t bytes_read, bytes_write; - - bytes_read = read(fd_coredump, buffer, sizeof(buffer)); - if (bytes_read < 0) { - fprintf(stderr, "read failed for fd %d: %m\n", fd_coredump); - goto out; - } - - if (bytes_read == 0) - break; - - bytes_write = write(fd_core_file, buffer, bytes_read); - if (bytes_read != bytes_write) { - fprintf(stderr, "write failed for fd %d: %m\n", fd_core_file); - goto out; - } - } - - close(fd_core_file); - close(fd_peer_pidfd); - close(fd_coredump); - fd_peer_pidfd = -1; - fd_coredump = -1; - } - - exit_code = EXIT_SUCCESS; -out: - if (fd_core_file >= 0) - close(fd_core_file); - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { - pid[i] = fork(); - ASSERT_GE(pid[i], 0); - if (pid[i] == 0) - crashing_child(); - pidfd[i] = sys_pidfd_open(pid[i], 0); - ASSERT_GE(pidfd[i], 0); - } - - for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { - waitpid(pid[i], &status[i], 0); - ASSERT_TRUE(WIFSIGNALED(status[i])); - ASSERT_TRUE(WCOREDUMP(status[i])); - } - - for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { - info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP; - ASSERT_EQ(ioctl(pidfd[i], PIDFD_GET_INFO, &info), 0); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - } - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); -} - -#define MAX_EVENTS 128 - -static void process_coredump_worker(int fd_coredump, int fd_peer_pidfd, int fd_core_file) -{ - int epfd = -1; - int exit_code = EXIT_FAILURE; - - epfd = epoll_create1(0); - if (epfd < 0) - goto out; - - struct epoll_event ev; - ev.events = EPOLLIN | 
EPOLLRDHUP | EPOLLET; - ev.data.fd = fd_coredump; - if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd_coredump, &ev) < 0) - goto out; - - for (;;) { - struct epoll_event events[1]; - int n = epoll_wait(epfd, events, 1, -1); - if (n < 0) - break; - - if (events[0].events & (EPOLLIN | EPOLLRDHUP)) { - for (;;) { - char buffer[4096]; - ssize_t bytes_read = read(fd_coredump, buffer, sizeof(buffer)); - if (bytes_read < 0) { - if (errno == EAGAIN || errno == EWOULDBLOCK) - break; - goto out; - } - if (bytes_read == 0) - goto done; - ssize_t bytes_write = write(fd_core_file, buffer, bytes_read); - if (bytes_write != bytes_read) - goto out; - } - } - } - -done: - exit_code = EXIT_SUCCESS; -out: - if (epfd >= 0) - close(epfd); - if (fd_core_file >= 0) - close(fd_core_file); - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - _exit(exit_code); -} - -TEST_F_TIMEOUT(coredump, socket_multiple_crashing_coredumps_epoll_workers, 500) -{ - int pidfd[NUM_CRASHING_COREDUMPS], status[NUM_CRASHING_COREDUMPS]; - pid_t pid[NUM_CRASHING_COREDUMPS], pid_coredump_server, worker_pids[NUM_CRASHING_COREDUMPS]; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); - ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - int fd_server = -1, exit_code = EXIT_FAILURE, n_conns = 0; - fd_server = -1; - exit_code = EXIT_FAILURE; - n_conns = 0; - close(ipc_sockets[0]); - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - close(ipc_sockets[1]); - - while (n_conns < NUM_CRASHING_COREDUMPS) { - int fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1; - struct coredump_req req = {}; - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if 
(fd_coredump < 0) { - if (errno == EAGAIN || errno == EWOULDBLOCK) - continue; - goto out; - } - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - if (!(info.mask & PIDFD_INFO_COREDUMP) || !(info.coredump_mask & PIDFD_COREDUMPED)) - goto out; - if (!read_coredump_req(fd_coredump, &req)) - goto out; - if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, - COREDUMP_KERNEL | COREDUMP_USERSPACE | - COREDUMP_REJECT | COREDUMP_WAIT)) - goto out; - if (!send_coredump_ack(fd_coredump, &req, COREDUMP_KERNEL | COREDUMP_WAIT, 0)) - goto out; - if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) - goto out; - fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached); - if (fd_core_file < 0) - goto out; - pid_t worker = fork(); - if (worker == 0) { - close(fd_server); - process_coredump_worker(fd_coredump, fd_peer_pidfd, fd_core_file); - } - worker_pids[n_conns] = worker; - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_core_file >= 0) - close(fd_core_file); - n_conns++; - } - exit_code = EXIT_SUCCESS; -out: - if (fd_server >= 0) - close(fd_server); - - // Reap all worker processes - for (int i = 0; i < n_conns; i++) { - int wstatus; - if (waitpid(worker_pids[i], &wstatus, 0) < 0) { - fprintf(stderr, "Failed to wait for worker %d: %m\n", worker_pids[i]); - } else if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) != EXIT_SUCCESS) { - fprintf(stderr, "Worker %d exited with error code %d\n", worker_pids[i], WEXITSTATUS(wstatus)); - exit_code = EXIT_FAILURE; - } - } - - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { - pid[i] = fork(); - ASSERT_GE(pid[i], 0); - if (pid[i] == 0) - crashing_child(); - pidfd[i] = 
sys_pidfd_open(pid[i], 0); - ASSERT_GE(pidfd[i], 0); - } - - for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { - ASSERT_GE(waitpid(pid[i], &status[i], 0), 0); - ASSERT_TRUE(WIFSIGNALED(status[i])); - ASSERT_TRUE(WCOREDUMP(status[i])); - } - - for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { - info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP; - ASSERT_EQ(ioctl(pidfd[i], PIDFD_GET_INFO, &info), 0); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - } - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); -} - -TEST_F(coredump, socket_invalid_paths) -{ - ASSERT_FALSE(set_core_pattern("@ /tmp/coredump.socket")); - ASSERT_FALSE(set_core_pattern("@/tmp/../coredump.socket")); - ASSERT_FALSE(set_core_pattern("@../coredump.socket")); - ASSERT_FALSE(set_core_pattern("@/tmp/coredump.socket/..")); - ASSERT_FALSE(set_core_pattern("@..")); - - ASSERT_FALSE(set_core_pattern("@@ /tmp/coredump.socket")); - ASSERT_FALSE(set_core_pattern("@@/tmp/../coredump.socket")); - ASSERT_FALSE(set_core_pattern("@@../coredump.socket")); - ASSERT_FALSE(set_core_pattern("@@/tmp/coredump.socket/..")); - ASSERT_FALSE(set_core_pattern("@@..")); - - ASSERT_FALSE(set_core_pattern("@@@/tmp/coredump.socket")); -} - TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/damon/_damon_sysfs.py b/tools/testing/selftests/damon/_damon_sysfs.py index a0e6290833fb..748778b563cd 100644 --- a/tools/testing/selftests/damon/_damon_sysfs.py +++ b/tools/testing/selftests/damon/_damon_sysfs.py @@ -475,12 +475,14 @@ class Damos: class DamonTarget: pid = None + obsolete = None # todo: Support target regions if test is made idx = None context = None - def __init__(self, pid): + def __init__(self, pid, obsolete=False): self.pid = pid + self.obsolete = obsolete def sysfs_dir(self): return os.path.join( @@ -491,8 +493,13 @@ class DamonTarget: os.path.join(self.sysfs_dir(), 'regions', 'nr_regions'), '0') if err is not None: return err - return 
write_file( + err = write_file( os.path.join(self.sysfs_dir(), 'pid_target'), self.pid) + if err is not None: + return err + return write_file( + os.path.join(self.sysfs_dir(), 'obsolete_target'), + 'Y' if self.obsolete else 'N') class IntervalsGoal: access_bp = None diff --git a/tools/testing/selftests/damon/drgn_dump_damon_status.py b/tools/testing/selftests/damon/drgn_dump_damon_status.py index 7233369a3a44..5374d18d1fa8 100755 --- a/tools/testing/selftests/damon/drgn_dump_damon_status.py +++ b/tools/testing/selftests/damon/drgn_dump_damon_status.py @@ -73,6 +73,7 @@ def target_to_dict(target): ['pid', int], ['nr_regions', int], ['regions_list', regions_to_list], + ['obsolete', bool], ]) def targets_to_list(targets): @@ -174,11 +175,11 @@ def scheme_to_dict(scheme): ['target_nid', int], ['migrate_dests', damos_migrate_dests_to_dict], ]) - filters = [] + core_filters = [] for f in list_for_each_entry( - 'struct damos_filter', scheme.filters.address_of_(), 'list'): - filters.append(damos_filter_to_dict(f)) - dict_['filters'] = filters + 'struct damos_filter', scheme.core_filters.address_of_(), 'list'): + core_filters.append(damos_filter_to_dict(f)) + dict_['core_filters'] = core_filters ops_filters = [] for f in list_for_each_entry( 'struct damos_filter', scheme.ops_filters.address_of_(), 'list'): diff --git a/tools/testing/selftests/damon/sysfs.py b/tools/testing/selftests/damon/sysfs.py index 2666c6f0f1a5..9cca71eb0325 100755 --- a/tools/testing/selftests/damon/sysfs.py +++ b/tools/testing/selftests/damon/sysfs.py @@ -132,7 +132,7 @@ def assert_scheme_committed(scheme, dump): assert_watermarks_committed(scheme.watermarks, dump['wmarks']) # TODO: test filters directory for idx, f in enumerate(scheme.core_filters.filters): - assert_filter_committed(f, dump['filters'][idx]) + assert_filter_committed(f, dump['core_filters'][idx]) for idx, f in enumerate(scheme.ops_filters.filters): assert_filter_committed(f, dump['ops_filters'][idx]) @@ -164,6 +164,16 @@ def 
assert_monitoring_attrs_committed(attrs, dump): assert_true(dump['max_nr_regions'] == attrs.max_nr_regions, 'max_nr_regions', dump) +def assert_monitoring_target_committed(target, dump): + # target.pid is the pid "number", while dump['pid'] is 'struct pid' + # pointer, and hence cannot be compared. + assert_true(dump['obsolete'] == target.obsolete, 'target obsolete', dump) + +def assert_monitoring_targets_committed(targets, dump): + assert_true(len(targets) == len(dump), 'len_targets', dump) + for idx, target in enumerate(targets): + assert_monitoring_target_committed(target, dump[idx]) + def assert_ctx_committed(ctx, dump): ops_val = { 'vaddr': 0, @@ -172,9 +182,18 @@ def assert_ctx_committed(ctx, dump): } assert_true(dump['ops']['id'] == ops_val[ctx.ops], 'ops_id', dump) assert_monitoring_attrs_committed(ctx.monitoring_attrs, dump['attrs']) + assert_monitoring_targets_committed(ctx.targets, dump['adaptive_targets']) assert_schemes_committed(ctx.schemes, dump['schemes']) -def assert_ctxs_committed(ctxs, dump): +def assert_ctxs_committed(kdamonds): + status, err = dump_damon_status_dict(kdamonds.kdamonds[0].pid) + if err is not None: + print(err) + kdamonds.stop() + exit(1) + + ctxs = kdamonds.kdamonds[0].contexts + dump = status['contexts'] assert_true(len(ctxs) == len(dump), 'ctxs length', dump) for idx, ctx in enumerate(ctxs): assert_ctx_committed(ctx, dump[idx]) @@ -191,13 +210,7 @@ def main(): print('kdamond start failed: %s' % err) exit(1) - status, err = dump_damon_status_dict(kdamonds.kdamonds[0].pid) - if err is not None: - print(err) - kdamonds.stop() - exit(1) - - assert_ctxs_committed(kdamonds.kdamonds[0].contexts, status['contexts']) + assert_ctxs_committed(kdamonds) context = _damon_sysfs.DamonCtx( monitoring_attrs=_damon_sysfs.DamonAttrs( @@ -245,12 +258,7 @@ def main(): kdamonds.kdamonds[0].contexts = [context] kdamonds.kdamonds[0].commit() - status, err = dump_damon_status_dict(kdamonds.kdamonds[0].pid) - if err is not None: - print(err) - exit(1) 
- - assert_ctxs_committed(kdamonds.kdamonds[0].contexts, status['contexts']) + assert_ctxs_committed(kdamonds) # test online commitment of minimum context. context = _damon_sysfs.DamonCtx() @@ -259,13 +267,36 @@ def main(): kdamonds.kdamonds[0].contexts = [context] kdamonds.kdamonds[0].commit() - status, err = dump_damon_status_dict(kdamonds.kdamonds[0].pid) - if err is not None: - print(err) - exit(1) + assert_ctxs_committed(kdamonds) - assert_ctxs_committed(kdamonds.kdamonds[0].contexts, status['contexts']) + kdamonds.stop() + # test obsolete_target. + proc1 = subprocess.Popen(['sh'], stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + proc2 = subprocess.Popen(['sh'], stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + proc3 = subprocess.Popen(['sh'], stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + kdamonds = _damon_sysfs.Kdamonds( + [_damon_sysfs.Kdamond( + contexts=[_damon_sysfs.DamonCtx( + ops='vaddr', + targets=[ + _damon_sysfs.DamonTarget(pid=proc1.pid), + _damon_sysfs.DamonTarget(pid=proc2.pid), + _damon_sysfs.DamonTarget(pid=proc3.pid), + ], + schemes=[_damon_sysfs.Damos()], + )])]) + err = kdamonds.start() + if err is not None: + print('kdamond start failed: %s' % err) + exit(1) + kdamonds.kdamonds[0].contexts[0].targets[1].obsolete = True + kdamonds.kdamonds[0].commit() + del kdamonds.kdamonds[0].contexts[0].targets[1] + assert_ctxs_committed(kdamonds) kdamonds.stop() if __name__ == '__main__': diff --git a/tools/testing/selftests/dma/Makefile b/tools/testing/selftests/dma/Makefile deleted file mode 100644 index cd8c5ece1cba..000000000000 --- a/tools/testing/selftests/dma/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -CFLAGS += -I../../../../usr/include/ -CFLAGS += -I../../../../include/ - -TEST_GEN_PROGS := dma_map_benchmark - -include ../lib.mk diff --git a/tools/testing/selftests/drivers/net/.gitignore b/tools/testing/selftests/drivers/net/.gitignore index 585ecb4d5dc4..3633c7a3ed65 100644 --- 
a/tools/testing/selftests/drivers/net/.gitignore +++ b/tools/testing/selftests/drivers/net/.gitignore @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only +gro napi_id_helper psp_responder diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile index 71ee69e524d7..f5c71d993750 100644 --- a/tools/testing/selftests/drivers/net/Makefile +++ b/tools/testing/selftests/drivers/net/Makefile @@ -6,10 +6,12 @@ TEST_INCLUDES := $(wildcard lib/py/*.py) \ ../../net/lib.sh \ TEST_GEN_FILES := \ + gro \ napi_id_helper \ # end of TEST_GEN_FILES TEST_PROGS := \ + gro.py \ hds.py \ napi_id.py \ napi_threaded.py \ @@ -23,6 +25,7 @@ TEST_PROGS := \ ping.py \ psp.py \ queues.py \ + ring_reconfig.py \ shaper.py \ stats.py \ xdp.py \ diff --git a/tools/testing/selftests/drivers/net/bonding/bond_macvlan_ipvlan.sh b/tools/testing/selftests/drivers/net/bonding/bond_macvlan_ipvlan.sh index c4711272fe45..559f300f965a 100755 --- a/tools/testing/selftests/drivers/net/bonding/bond_macvlan_ipvlan.sh +++ b/tools/testing/selftests/drivers/net/bonding/bond_macvlan_ipvlan.sh @@ -30,6 +30,7 @@ check_connection() local message=${3} RET=0 + sleep 0.25 ip netns exec ${ns} ping ${target} -c 4 -i 0.1 &>/dev/null check_err $? 
"ping failed" log_test "${bond_mode}/${xvlan_type}_${xvlan_mode}: ${message}" diff --git a/tools/testing/selftests/net/gro.c b/tools/testing/selftests/drivers/net/gro.c index cfc39f70635d..995b492f5bcb 100644 --- a/tools/testing/selftests/net/gro.c +++ b/tools/testing/selftests/drivers/net/gro.c @@ -57,7 +57,8 @@ #include <string.h> #include <unistd.h> -#include "../kselftest.h" +#include "../../kselftest.h" +#include "../../net/lib/ksft.h" #define DPORT 8000 #define SPORT 1500 @@ -1127,6 +1128,8 @@ static void gro_receiver(void) set_timeout(rxfd); bind_packetsocket(rxfd); + ksft_ready(); + memset(correct_payload, 0, sizeof(correct_payload)); if (strcmp(testname, "data") == 0) { diff --git a/tools/testing/selftests/drivers/net/gro.py b/tools/testing/selftests/drivers/net/gro.py new file mode 100755 index 000000000000..ba83713bf7b5 --- /dev/null +++ b/tools/testing/selftests/drivers/net/gro.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +""" +GRO (Generic Receive Offload) conformance tests. + +Validates that GRO coalescing works correctly by running the gro +binary in different configurations and checking for correct packet +coalescing behavior. + +Test cases: + - data: Data packets with same size/headers and correct seq numbers coalesce + - ack: Pure ACK packets do not coalesce + - flags: Packets with PSH, SYN, URG, RST flags do not coalesce + - tcp: Packets with incorrect checksum, non-consecutive seqno don't coalesce + - ip: Packets with different ECN, TTL, TOS, or IP options don't coalesce + - large: Packets larger than GRO_MAX_SIZE don't coalesce +""" + +import os +from lib.py import ksft_run, ksft_exit, ksft_pr +from lib.py import NetDrvEpEnv, KsftXfailEx +from lib.py import cmd, defer, bkg, ip +from lib.py import ksft_variants + + +def _resolve_dmac(cfg, ipver): + """ + Find the destination MAC address remote host should use to send packets + towards the local host. It may be a router / gateway address. 
+ """ + + attr = "dmac" + ipver + # Cache the response across test cases + if hasattr(cfg, attr): + return getattr(cfg, attr) + + route = ip(f"-{ipver} route get {cfg.addr_v[ipver]}", + json=True, host=cfg.remote)[0] + gw = route.get("gateway") + # Local L2 segment, address directly + if not gw: + setattr(cfg, attr, cfg.dev['address']) + return getattr(cfg, attr) + + # ping to make sure neighbor is resolved, + # bind to an interface, for v6 the GW is likely link local + cmd(f"ping -c1 -W0 -I{cfg.remote_ifname} {gw}", host=cfg.remote) + + neigh = ip(f"neigh get {gw} dev {cfg.remote_ifname}", + json=True, host=cfg.remote)[0] + setattr(cfg, attr, neigh['lladdr']) + return getattr(cfg, attr) + + +def _write_defer_restore(cfg, path, val, defer_undo=False): + with open(path, "r", encoding="utf-8") as fp: + orig_val = fp.read().strip() + if str(val) == orig_val: + return + with open(path, "w", encoding="utf-8") as fp: + fp.write(val) + if defer_undo: + defer(_write_defer_restore, cfg, path, orig_val) + + +def _set_mtu_restore(dev, mtu, host): + if dev['mtu'] < mtu: + ip(f"link set dev {dev['ifname']} mtu {mtu}", host=host) + defer(ip, f"link set dev {dev['ifname']} mtu {dev['mtu']}", host=host) + + +def _setup(cfg, test_name): + """ Setup hardware loopback mode for GRO testing. 
""" + + if not hasattr(cfg, "bin_remote"): + cfg.bin_local = cfg.test_dir / "gro" + cfg.bin_remote = cfg.remote.deploy(cfg.bin_local) + + # "large" test needs at least 4k MTU + if test_name == "large": + _set_mtu_restore(cfg.dev, 4096, None) + _set_mtu_restore(cfg.remote_dev, 4096, cfg.remote) + + flush_path = f"/sys/class/net/{cfg.ifname}/gro_flush_timeout" + irq_path = f"/sys/class/net/{cfg.ifname}/napi_defer_hard_irqs" + + _write_defer_restore(cfg, flush_path, "200000", defer_undo=True) + _write_defer_restore(cfg, irq_path, "10", defer_undo=True) + + try: + # Disable TSO for local tests + cfg.require_nsim() # will raise KsftXfailEx if not running on nsim + + cmd(f"ethtool -K {cfg.ifname} gro on tso off") + cmd(f"ethtool -K {cfg.remote_ifname} gro on tso off", host=cfg.remote) + except KsftXfailEx: + pass + +def _gro_variants(): + """Generator that yields all combinations of protocol and test types.""" + + for protocol in ["ipv4", "ipv6", "ipip"]: + for test_name in ["data", "ack", "flags", "tcp", "ip", "large"]: + yield protocol, test_name + + +@ksft_variants(_gro_variants()) +def test(cfg, protocol, test_name): + """Run a single GRO test with retries.""" + + ipver = "6" if protocol[-1] == "6" else "4" + cfg.require_ipver(ipver) + + _setup(cfg, test_name) + + base_cmd_args = [ + f"--{protocol}", + f"--dmac {_resolve_dmac(cfg, ipver)}", + f"--smac {cfg.remote_dev['address']}", + f"--daddr {cfg.addr_v[ipver]}", + f"--saddr {cfg.remote_addr_v[ipver]}", + f"--test {test_name}", + "--verbose" + ] + base_args = " ".join(base_cmd_args) + + # Each test is run 6 times to deflake, because given the receive timing, + # not all packets that should coalesce will be considered in the same flow + # on every try. 
+ max_retries = 6 + for attempt in range(max_retries): + rx_cmd = f"{cfg.bin_local} {base_args} --rx --iface {cfg.ifname}" + tx_cmd = f"{cfg.bin_remote} {base_args} --iface {cfg.remote_ifname}" + + fail_now = attempt >= max_retries - 1 + + with bkg(rx_cmd, ksft_ready=True, exit_wait=True, + fail=fail_now) as rx_proc: + cmd(tx_cmd, host=cfg.remote) + + if rx_proc.ret == 0: + return + + ksft_pr(rx_proc.stdout.strip().replace('\n', '\n# ')) + ksft_pr(rx_proc.stderr.strip().replace('\n', '\n# ')) + + if test_name == "large" and os.environ.get("KSFT_MACHINE_SLOW"): + ksft_pr(f"Ignoring {protocol}/{test_name} failure due to slow environment") + return + + ksft_pr(f"Attempt {attempt + 1}/{max_retries} failed, retrying...") + + +def main() -> None: + """ Ksft boiler plate main """ + + with NetDrvEpEnv(__file__) as cfg: + ksft_run(cases=[test], args=(cfg,)) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/drivers/net/hw/.gitignore b/tools/testing/selftests/drivers/net/hw/.gitignore index 6942bf575497..46540468a775 100644 --- a/tools/testing/selftests/drivers/net/hw/.gitignore +++ b/tools/testing/selftests/drivers/net/hw/.gitignore @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only iou-zcrx ncdevmem +toeplitz diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile index 8133d1a0051c..9c163ba6feee 100644 --- a/tools/testing/selftests/drivers/net/hw/Makefile +++ b/tools/testing/selftests/drivers/net/hw/Makefile @@ -1,10 +1,26 @@ # SPDX-License-Identifier: GPL-2.0+ OR MIT -TEST_GEN_FILES = iou-zcrx +# Check if io_uring supports zero-copy receive +HAS_IOURING_ZCRX := $(shell \ + echo -e '#include <liburing.h>\n' \ + 'void *func = (void *)io_uring_register_ifq;\n' \ + 'int main() {return 0;}' | \ + $(CC) -luring -x c - -o /dev/null 2>&1 && echo y) + +ifeq ($(HAS_IOURING_ZCRX),y) +COND_GEN_FILES += iou-zcrx +else +$(warning excluding iouring tests, liburing not installed or too 
old) +endif + +TEST_GEN_FILES := \ + $(COND_GEN_FILES) \ +# end of TEST_GEN_FILES TEST_PROGS = \ csum.py \ devlink_port_split.py \ + devlink_rate_tc_bw.py \ devmem.py \ ethtool.sh \ ethtool_extended_state.sh \ @@ -21,6 +37,7 @@ TEST_PROGS = \ rss_ctx.py \ rss_flow_label.py \ rss_input_xfrm.py \ + toeplitz.py \ tso.py \ xsk_reconfig.py \ # @@ -38,7 +55,10 @@ TEST_INCLUDES := \ # # YNL files, must be before "include ..lib.mk" -YNL_GEN_FILES := ncdevmem +YNL_GEN_FILES := \ + ncdevmem \ + toeplitz \ +# end of YNL_GEN_FILES TEST_GEN_FILES += $(YNL_GEN_FILES) TEST_GEN_FILES += $(patsubst %.c,%.o,$(wildcard *.bpf.c)) @@ -54,4 +74,6 @@ include ../../../net/ynl.mk include ../../../net/bpf.mk +ifeq ($(HAS_IOURING_ZCRX),y) $(OUTPUT)/iou-zcrx: LDLIBS += -luring +endif diff --git a/tools/testing/selftests/drivers/net/hw/devlink_rate_tc_bw.py b/tools/testing/selftests/drivers/net/hw/devlink_rate_tc_bw.py index ead6784d1910..4e4faa9275bb 100755 --- a/tools/testing/selftests/drivers/net/hw/devlink_rate_tc_bw.py +++ b/tools/testing/selftests/drivers/net/hw/devlink_rate_tc_bw.py @@ -21,21 +21,21 @@ Test Cases: ---------- 1. test_no_tc_mapping_bandwidth: - Verifies that without TC mapping, bandwidth is NOT distributed according to - the configured 80/20 split between TC4 and TC3 - - This test should fail if bandwidth matches the 80/20 split without TC + the configured 20/80 split between TC3 and TC4 + - This test should fail if bandwidth matches the 20/80 split without TC mapping - - Expected: Bandwidth should NOT be distributed as 80/20 + - Expected: Bandwidth should NOT be distributed as 20/80 2. 
test_tc_mapping_bandwidth: - Configures TC mapping using mqprio qdisc - Verifies that with TC mapping, bandwidth IS distributed according to the - configured 80/20 split between TC3 and TC4 - - Expected: Bandwidth should be distributed as 80/20 + configured 20/80 split between TC3 and TC4 + - Expected: Bandwidth should be distributed as 20/80 Bandwidth Distribution: ---------------------- -- TC3 (VLAN 101): Configured for 80% of total bandwidth -- TC4 (VLAN 102): Configured for 20% of total bandwidth +- TC3 (VLAN 101): Configured for 20% of total bandwidth +- TC4 (VLAN 102): Configured for 80% of total bandwidth - Total bandwidth: 1Gbps - Tolerance: +-12% @@ -64,43 +64,40 @@ from lib.py import KsftSkipEx, KsftFailEx, KsftXfailEx from lib.py import NetDrvEpEnv, DevlinkFamily from lib.py import NlError from lib.py import cmd, defer, ethtool, ip +from lib.py import Iperf3Runner class BandwidthValidator: """ - Validates bandwidth totals and per-TC shares against expected values - with a tolerance. + Validates total bandwidth and individual shares with tolerance + relative to the overall total. 
""" - def __init__(self): + def __init__(self, shares): self.tolerance_percent = 12 - self.expected_total_gbps = 1.0 - self.total_min_expected = self.min_expected(self.expected_total_gbps) - self.total_max_expected = self.max_expected(self.expected_total_gbps) - self.tc_expected_percent = { - 3: 20.0, - 4: 80.0, - } + self.expected_total = sum(shares.values()) + self.bounds = {} + + for name, exp in shares.items(): + self.bounds[name] = (self.min_expected(exp), self.max_expected(exp)) def min_expected(self, value): """Calculates the minimum acceptable value based on tolerance.""" - return value - (value * self.tolerance_percent / 100) + return value - (self.expected_total * self.tolerance_percent / 100) def max_expected(self, value): """Calculates the maximum acceptable value based on tolerance.""" - return value + (value * self.tolerance_percent / 100) - - def bound(self, expected, value): - """Returns True if value is within expected tolerance.""" - return self.min_expected(expected) <= value <= self.max_expected(expected) + return value + (self.expected_total * self.tolerance_percent / 100) - def tc_bandwidth_bound(self, value, tc_ix): + def bound(self, values): """ - Returns True if the given bandwidth value is within tolerance - for the TC's expected bandwidth. + Return True if all given values fall within tolerance. 
""" - expected = self.tc_expected_percent[tc_ix] - return self.bound(expected, value) + for name, value in values.items(): + low, high = self.bounds[name] + if not low <= value <= high: + return False + return True def setup_vf(cfg, set_tc_mapping=True): @@ -116,8 +113,8 @@ def setup_vf(cfg, set_tc_mapping=True): except Exception as exc: raise KsftSkipEx(f"Failed to enable switchdev mode on {cfg.pci}") from exc try: - cmd(f"echo 1 > /sys/class/net/{cfg.ifname}/device/sriov_numvfs") - defer(cmd, f"echo 0 > /sys/class/net/{cfg.ifname}/device/sriov_numvfs") + cmd(f"echo 1 > /sys/class/net/{cfg.ifname}/device/sriov_numvfs", shell=True) + defer(cmd, f"echo 0 > /sys/class/net/{cfg.ifname}/device/sriov_numvfs", shell=True) except Exception as exc: raise KsftSkipEx(f"Failed to enable SR-IOV on {cfg.ifname}") from exc @@ -139,8 +136,8 @@ def setup_vlans_on_vf(vf_ifc): Sets up two VLAN interfaces on the given VF, each mapped to a different TC. """ vlan_configs = [ - {"vlan_id": 101, "tc": 3, "ip": "198.51.100.2"}, - {"vlan_id": 102, "tc": 4, "ip": "198.51.100.10"}, + {"vlan_id": 101, "tc": 3, "ip": "198.51.100.1"}, + {"vlan_id": 102, "tc": 4, "ip": "198.51.100.9"}, ] for config in vlan_configs: @@ -224,13 +221,13 @@ def setup_devlink_rate(cfg): raise KsftFailEx(f"rate_set failed on VF port {port_index}") from exc -def setup_remote_server(cfg): +def setup_remote_vlans(cfg): """ - Sets up VLAN interfaces and starts iperf3 servers on the remote side. + Sets up VLAN interfaces on the remote side. 
""" remote_dev = cfg.remote_ifname vlan_ids = [101, 102] - remote_ips = ["198.51.100.1", "198.51.100.9"] + remote_ips = ["198.51.100.2", "198.51.100.10"] for vlan_id, ip_addr in zip(vlan_ids, remote_ips): vlan_dev = f"{remote_dev}.{vlan_id}" @@ -238,14 +235,13 @@ def setup_remote_server(cfg): f"type vlan id {vlan_id}", host=cfg.remote) cmd(f"ip addr add {ip_addr}/29 dev {vlan_dev}", host=cfg.remote) cmd(f"ip link set dev {vlan_dev} up", host=cfg.remote) - cmd(f"iperf3 -s -1 -B {ip_addr}",background=True, host=cfg.remote) defer(cmd, f"ip link del {vlan_dev}", host=cfg.remote) def setup_test_environment(cfg, set_tc_mapping=True): """ Sets up the complete test environment including VF creation, VLANs, - bridge configuration, devlink rate setup, and the remote server. + bridge configuration and devlink rate setup. """ vf_ifc = setup_vf(cfg, set_tc_mapping) ksft_pr(f"Created VF interface: {vf_ifc}") @@ -256,51 +252,39 @@ def setup_test_environment(cfg, set_tc_mapping=True): setup_bridge(cfg) setup_devlink_rate(cfg) - setup_remote_server(cfg) - time.sleep(2) + setup_remote_vlans(cfg) -def run_iperf_client(server_ip, local_ip, barrier, min_expected_gbps=0.1): +def measure_bandwidth(cfg, server_ip, client_ip, barrier): """ - Runs a single iperf3 client instance, binding to the given local IP. - Waits on a barrier to synchronize with other threads. + Synchronizes with peers and runs an iperf3-based bandwidth measurement + between the given endpoints. Returns average Gbps. 
""" + runner = Iperf3Runner(cfg, server_ip=server_ip, client_ip=client_ip) try: barrier.wait(timeout=10) except Exception as exc: raise KsftFailEx("iperf3 barrier wait timed") from exc - iperf_cmd = ["iperf3", "-c", server_ip, "-B", local_ip, "-J"] - result = subprocess.run(iperf_cmd, capture_output=True, text=True, - check=True) - try: - output = json.loads(result.stdout) - bits_per_second = output["end"]["sum_received"]["bits_per_second"] - gbps = bits_per_second / 1e9 - if gbps < min_expected_gbps: - ksft_pr( - f"iperf3 bandwidth too low: {gbps:.2f} Gbps " - f"(expected ≥ {min_expected_gbps} Gbps)" - ) - return None - return gbps - except json.JSONDecodeError as exc: - ksft_pr(f"Failed to parse iperf3 JSON output: {exc}") - return None + bw_gbps = runner.measure_bandwidth(reverse=True) + except Exception as exc: + raise KsftFailEx("iperf3 bandwidth measurement failed") from exc + + return bw_gbps -def run_bandwidth_test(): +def run_bandwidth_test(cfg): """ - Launches iperf3 client threads for each VLAN/TC pair and collects results. + Runs parallel bandwidth measurements for each VLAN/TC pair and collects results. 
""" - def _run_iperf_client_thread(server_ip, local_ip, results, barrier, tc_ix): - results[tc_ix] = run_iperf_client(server_ip, local_ip, barrier) + def _run_measure_bandwidth_thread(local_ip, remote_ip, results, barrier, tc_ix): + results[tc_ix] = measure_bandwidth(cfg, local_ip, remote_ip, barrier) vf_vlan_data = [ # (local_ip, remote_ip, TC) - ("198.51.100.2", "198.51.100.1", 3), - ("198.51.100.10", "198.51.100.9", 4), + ("198.51.100.1", "198.51.100.2", 3), + ("198.51.100.9", "198.51.100.10", 4), ] results = {} @@ -309,8 +293,8 @@ def run_bandwidth_test(): for local_ip, remote_ip, tc_ix in vf_vlan_data: thread = threading.Thread( - target=_run_iperf_client_thread, - args=(remote_ip, local_ip, results, start_barrier, tc_ix) + target=_run_measure_bandwidth_thread, + args=(local_ip, remote_ip, results, start_barrier, tc_ix) ) thread.start() threads.append(thread) @@ -320,10 +304,11 @@ def run_bandwidth_test(): for tc_ix, tc_bw in results.items(): if tc_bw is None: - raise KsftFailEx("iperf3 client failed; cannot evaluate bandwidth") + raise KsftFailEx("iperf3 failed; cannot evaluate bandwidth") return results + def calculate_bandwidth_percentages(results): """ Calculates the percentage of total bandwidth received by TC3 and TC4. 
@@ -364,59 +349,48 @@ def verify_total_bandwidth(bw_data, validator): """ total = bw_data['total_bw'] - if validator.bound(validator.expected_total_gbps, total): + if validator.bound({"total": total}): return - if total < validator.total_min_expected: + low, high = validator.bounds["total"] + + if total < low: raise KsftSkipEx( f"Total bandwidth {total:.2f} Gbps < minimum " - f"{validator.total_min_expected:.2f} Gbps; " - f"parent tx_max ({validator.expected_total_gbps:.1f} G) " + f"{low:.2f} Gbps; " + f"parent tx_max ({validator.expected_total:.1f} G) " f"not reached, cannot validate share" ) raise KsftFailEx( f"Total bandwidth {total:.2f} Gbps exceeds allowed ceiling " - f"{validator.total_max_expected:.2f} Gbps " - f"(VF tx_max set to {validator.expected_total_gbps:.1f} G)" + f"{high:.2f} Gbps " + f"(VF tx_max set to {validator.expected_total:.1f} G)" ) -def check_bandwidth_distribution(bw_data, validator): - """ - Checks whether the measured TC3 and TC4 bandwidth percentages - fall within their expected tolerance ranges. - - Returns: - bool: True if both TC3 and TC4 percentages are within bounds. - """ - tc3_valid = validator.tc_bandwidth_bound(bw_data['tc3_percentage'], 3) - tc4_valid = validator.tc_bandwidth_bound(bw_data['tc4_percentage'], 4) - - return tc3_valid and tc4_valid - - def run_bandwidth_distribution_test(cfg, set_tc_mapping): """ - Runs parallel iperf3 tests for both TCs and collects results. + Runs parallel bandwidth measurements for both TCs and collects results. 
""" setup_test_environment(cfg, set_tc_mapping) - bandwidths = run_bandwidth_test() + bandwidths = run_bandwidth_test(cfg) bw_data = calculate_bandwidth_percentages(bandwidths) test_name = "with TC mapping" if set_tc_mapping else "without TC mapping" print_bandwidth_results(bw_data, test_name) - verify_total_bandwidth(bw_data, cfg.bw_validator) + verify_total_bandwidth(bw_data, cfg.traffic_bw_validator) - return check_bandwidth_distribution(bw_data, cfg.bw_validator) + return cfg.tc_bw_validator.bound({"tc3": bw_data['tc3_percentage'], + "tc4": bw_data['tc4_percentage']}) def test_no_tc_mapping_bandwidth(cfg): """ - Verifies that bandwidth is not split 80/20 without traffic class mapping. + Verifies that bandwidth is not split 20/80 without traffic class mapping. """ - pass_bw_msg = "Bandwidth is NOT distributed as 80/20 without TC mapping" - fail_bw_msg = "Bandwidth matched 80/20 split without TC mapping" + pass_bw_msg = "Bandwidth is NOT distributed as 20/80 without TC mapping" + fail_bw_msg = "Bandwidth matched 20/80 split without TC mapping" is_mlx5 = "driver: mlx5" in ethtool(f"-i {cfg.ifname}").stdout if run_bandwidth_distribution_test(cfg, set_tc_mapping=False): @@ -430,13 +404,13 @@ def test_no_tc_mapping_bandwidth(cfg): def test_tc_mapping_bandwidth(cfg): """ - Verifies that bandwidth is correctly split 80/20 between TC3 and TC4 + Verifies that bandwidth is correctly split 20/80 between TC3 and TC4 when traffic class mapping is set. 
""" if run_bandwidth_distribution_test(cfg, set_tc_mapping=True): - ksft_pr("Bandwidth is distributed as 80/20 with TC mapping") + ksft_pr("Bandwidth is distributed as 20/80 with TC mapping") else: - raise KsftFailEx("Bandwidth did not match 80/20 split with TC mapping") + raise KsftFailEx("Bandwidth did not match 20/80 split with TC mapping") def main() -> None: @@ -451,9 +425,9 @@ def main() -> None: ) if not cfg.pci: raise KsftSkipEx("Could not get PCI address of the interface") - cfg.require_cmd("iperf3", local=True, remote=True) - cfg.bw_validator = BandwidthValidator() + cfg.traffic_bw_validator = BandwidthValidator({"total": 1}) + cfg.tc_bw_validator = BandwidthValidator({"tc3": 20, "tc4": 80}) cases = [test_no_tc_mapping_bandwidth, test_tc_mapping_bandwidth] diff --git a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py index fb010a48a5a1..766bfc4ad842 100644 --- a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py +++ b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py @@ -25,10 +25,10 @@ try: fd_read_timeout, ip, rand_port, wait_port_listen, wait_file from net.lib.py import KsftSkipEx, KsftFailEx, KsftXfailEx from net.lib.py import ksft_disruptive, ksft_exit, ksft_pr, ksft_run, \ - ksft_setup + ksft_setup, ksft_variants, KsftNamedVariant from net.lib.py import ksft_eq, ksft_ge, ksft_in, ksft_is, ksft_lt, \ ksft_ne, ksft_not_in, ksft_raises, ksft_true, ksft_gt, ksft_not_none - from drivers.net.lib.py import GenerateTraffic, Remote + from drivers.net.lib.py import GenerateTraffic, Remote, Iperf3Runner from drivers.net.lib.py import NetDrvEnv, NetDrvEpEnv __all__ = ["NetNS", "NetNSEnter", "NetdevSimDev", @@ -40,11 +40,12 @@ try: "wait_port_listen", "wait_file", "KsftSkipEx", "KsftFailEx", "KsftXfailEx", "ksft_disruptive", "ksft_exit", "ksft_pr", "ksft_run", - "ksft_setup", + "ksft_setup", "ksft_variants", "KsftNamedVariant", "ksft_eq", "ksft_ge", "ksft_in", "ksft_is", 
"ksft_lt", "ksft_ne", "ksft_not_in", "ksft_raises", "ksft_true", "ksft_gt", "ksft_not_none", "ksft_not_none", - "NetDrvEnv", "NetDrvEpEnv", "GenerateTraffic", "Remote"] + "NetDrvEnv", "NetDrvEpEnv", "GenerateTraffic", "Remote", + "Iperf3Runner"] except ModuleNotFoundError as e: print("Failed importing `net` library from kernel sources") print(str(e)) diff --git a/tools/testing/selftests/net/toeplitz.c b/tools/testing/selftests/drivers/net/hw/toeplitz.c index 9ba03164d73a..a4d04438c313 100644 --- a/tools/testing/selftests/net/toeplitz.c +++ b/tools/testing/selftests/drivers/net/hw/toeplitz.c @@ -52,7 +52,11 @@ #include <sys/types.h> #include <unistd.h> -#include "../kselftest.h" +#include <ynl.h> +#include "ethtool-user.h" + +#include "../../../kselftest.h" +#include "../../../net/lib/ksft.h" #define TOEPLITZ_KEY_MIN_LEN 40 #define TOEPLITZ_KEY_MAX_LEN 60 @@ -64,6 +68,7 @@ #define FOUR_TUPLE_MAX_LEN ((sizeof(struct in6_addr) * 2) + (sizeof(uint16_t) * 2)) #define RSS_MAX_CPUS (1 << 16) /* real constraint is PACKET_FANOUT_MAX */ +#define RSS_MAX_INDIR (1 << 16) #define RPS_MAX_CPUS 16UL /* must be a power of 2 */ @@ -101,6 +106,8 @@ struct ring_state { static unsigned int rx_irq_cpus[RSS_MAX_CPUS]; /* map from rxq to cpu */ static int rps_silo_to_cpu[RPS_MAX_CPUS]; static unsigned char toeplitz_key[TOEPLITZ_KEY_MAX_LEN]; +static unsigned int rss_indir_tbl[RSS_MAX_INDIR]; +static unsigned int rss_indir_tbl_size; static struct ring_state rings[RSS_MAX_CPUS]; static inline uint32_t toeplitz(const unsigned char *four_tuple, @@ -129,7 +136,12 @@ static inline uint32_t toeplitz(const unsigned char *four_tuple, /* Compare computed cpu with arrival cpu from packet_fanout_cpu */ static void verify_rss(uint32_t rx_hash, int cpu) { - int queue = rx_hash % cfg_num_queues; + int queue; + + if (rss_indir_tbl_size) + queue = rss_indir_tbl[rx_hash % rss_indir_tbl_size]; + else + queue = rx_hash % cfg_num_queues; log_verbose(" rxq %d (cpu %d)", queue, rx_irq_cpus[queue]); if 
(rx_irq_cpus[queue] != cpu) { @@ -482,6 +494,56 @@ static void parse_rps_bitmap(const char *arg) rps_silo_to_cpu[cfg_num_rps_cpus++] = i; } +static void read_rss_dev_info_ynl(void) +{ + struct ethtool_rss_get_req *req; + struct ethtool_rss_get_rsp *rsp; + struct ynl_sock *ys; + + ys = ynl_sock_create(&ynl_ethtool_family, NULL); + if (!ys) + error(1, errno, "ynl_sock_create failed"); + + req = ethtool_rss_get_req_alloc(); + if (!req) + error(1, errno, "ethtool_rss_get_req_alloc failed"); + + ethtool_rss_get_req_set_header_dev_name(req, cfg_ifname); + + rsp = ethtool_rss_get(ys, req); + if (!rsp) + error(1, ys->err.code, "YNL: %s", ys->err.msg); + + if (!rsp->_len.hkey) + error(1, 0, "RSS key not available for %s", cfg_ifname); + + if (rsp->_len.hkey < TOEPLITZ_KEY_MIN_LEN || + rsp->_len.hkey > TOEPLITZ_KEY_MAX_LEN) + error(1, 0, "RSS key length %u out of bounds [%u, %u]", + rsp->_len.hkey, TOEPLITZ_KEY_MIN_LEN, + TOEPLITZ_KEY_MAX_LEN); + + memcpy(toeplitz_key, rsp->hkey, rsp->_len.hkey); + + if (rsp->_count.indir > RSS_MAX_INDIR) + error(1, 0, "RSS indirection table too large (%u > %u)", + rsp->_count.indir, RSS_MAX_INDIR); + + /* If indir table not available we'll fallback to simple modulo math */ + if (rsp->_count.indir) { + memcpy(rss_indir_tbl, rsp->indir, + rsp->_count.indir * sizeof(rss_indir_tbl[0])); + rss_indir_tbl_size = rsp->_count.indir; + + log_verbose("RSS indirection table size: %u\n", + rss_indir_tbl_size); + } + + ethtool_rss_get_rsp_free(rsp); + ethtool_rss_get_req_free(req); + ynl_sock_destroy(ys); +} + static void parse_opts(int argc, char **argv) { static struct option long_options[] = { @@ -550,7 +612,7 @@ static void parse_opts(int argc, char **argv) } if (!have_toeplitz) - error(1, 0, "Must supply rss key ('-k')"); + read_rss_dev_info_ynl(); num_cpus = get_nprocs(); if (num_cpus > RSS_MAX_CPUS) @@ -576,6 +638,10 @@ int main(int argc, char **argv) fd_sink = setup_sink(); setup_rings(); + + /* Signal to test framework that we're ready to 
receive */ + ksft_ready(); + process_rings(); cleanup_rings(); diff --git a/tools/testing/selftests/drivers/net/hw/toeplitz.py b/tools/testing/selftests/drivers/net/hw/toeplitz.py new file mode 100755 index 000000000000..d2db5ee9e358 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/toeplitz.py @@ -0,0 +1,211 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +""" +Toeplitz Rx hashing test: + - rxhash (the hash value calculation itself); + - RSS mapping from rxhash to rx queue; + - RPS mapping from rxhash to cpu. +""" + +import glob +import os +import socket +from lib.py import ksft_run, ksft_exit, ksft_pr +from lib.py import NetDrvEpEnv, EthtoolFamily, NetdevFamily +from lib.py import cmd, bkg, rand_port, defer +from lib.py import ksft_in +from lib.py import ksft_variants, KsftNamedVariant, KsftSkipEx, KsftFailEx + +# "define" for the ID of the Toeplitz hash function +ETH_RSS_HASH_TOP = 1 + + +def _check_rps_and_rfs_not_configured(cfg): + """Verify that RPS is not already configured.""" + + for rps_file in glob.glob(f"/sys/class/net/{cfg.ifname}/queues/rx-*/rps_cpus"): + with open(rps_file, "r", encoding="utf-8") as fp: + val = fp.read().strip() + if set(val) - {"0", ","}: + raise KsftSkipEx(f"RPS already configured on {rps_file}: {val}") + + rfs_file = "/proc/sys/net/core/rps_sock_flow_entries" + with open(rfs_file, "r", encoding="utf-8") as fp: + val = fp.read().strip() + if val != "0": + raise KsftSkipEx(f"RFS already configured {rfs_file}: {val}") + + +def _get_cpu_for_irq(irq): + with open(f"/proc/irq/{irq}/smp_affinity_list", "r", + encoding="utf-8") as fp: + data = fp.read().strip() + if "," in data or "-" in data: + raise KsftFailEx(f"IRQ{irq} is not mapped to a single core: {data}") + return int(data) + + +def _get_irq_cpus(cfg): + """ + Read the list of IRQs for the device Rx queues. 
+ """ + queues = cfg.netnl.queue_get({"ifindex": cfg.ifindex}, dump=True) + napis = cfg.netnl.napi_get({"ifindex": cfg.ifindex}, dump=True) + + # Remap into ID-based dicts + napis = {n["id"]: n for n in napis} + queues = {f"{q['type']}{q['id']}": q for q in queues} + + cpus = [] + for rx in range(9999): + name = f"rx{rx}" + if name not in queues: + break + cpus.append(_get_cpu_for_irq(napis[queues[name]["napi-id"]]["irq"])) + + return cpus + + +def _get_unused_cpus(cfg, count=2): + """ + Get CPUs that are not used by Rx queues. + Returns a list of at least 'count' CPU numbers. + """ + + # Get CPUs used by Rx queues + rx_cpus = set(_get_irq_cpus(cfg)) + + # Get total number of CPUs + num_cpus = os.cpu_count() + + # Find unused CPUs + unused_cpus = [cpu for cpu in range(num_cpus) if cpu not in rx_cpus] + + if len(unused_cpus) < count: + raise KsftSkipEx(f"Need at {count} CPUs not used by Rx queues, found {len(unused_cpus)}") + + return unused_cpus[:count] + + +def _configure_rps(cfg, rps_cpus): + """Configure RPS for all Rx queues.""" + + mask = 0 + for cpu in rps_cpus: + mask |= (1 << cpu) + mask = hex(mask)[2:] + + # Set RPS bitmap for all rx queues + for rps_file in glob.glob(f"/sys/class/net/{cfg.ifname}/queues/rx-*/rps_cpus"): + with open(rps_file, "w", encoding="utf-8") as fp: + fp.write(mask) + + return mask + + +def _send_traffic(cfg, proto_flag, ipver, port): + """Send 20 packets of requested type.""" + + # Determine protocol and IP version for socat + if proto_flag == "-u": + proto = "UDP" + else: + proto = "TCP" + + baddr = f"[{cfg.addr_v['6']}]" if ipver == "6" else cfg.addr_v["4"] + + # Run socat in a loop to send traffic periodically + # Use sh -c with a loop similar to toeplitz_client.sh + socat_cmd = f""" + for i in `seq 20`; do + echo "msg $i" | socat -{ipver} -t 0.1 - {proto}:{baddr}:{port}; + sleep 0.001; + done + """ + + cmd(socat_cmd, shell=True, host=cfg.remote) + + +def _test_variants(): + for grp in ["", "rss", "rps"]: + for l4 in ["tcp", 
"udp"]: + for l3 in ["4", "6"]: + name = f"{l4}_ipv{l3}" + if grp: + name = f"{grp}_{name}" + yield KsftNamedVariant(name, "-" + l4[0], l3, grp) + + +@ksft_variants(_test_variants()) +def test(cfg, proto_flag, ipver, grp): + """Run a single toeplitz test.""" + + cfg.require_ipver(ipver) + + # Check that rxhash is enabled + ksft_in("receive-hashing: on", cmd(f"ethtool -k {cfg.ifname}").stdout) + + rss = cfg.ethnl.rss_get({"header": {"dev-index": cfg.ifindex}}) + # Make sure NIC is configured to use Toeplitz hash, and no key xfrm. + if rss.get('hfunc') != ETH_RSS_HASH_TOP or rss.get('input-xfrm'): + cfg.ethnl.rss_set({"header": {"dev-index": cfg.ifindex}, + "hfunc": ETH_RSS_HASH_TOP, + "input-xfrm": {}}) + defer(cfg.ethnl.rss_set, {"header": {"dev-index": cfg.ifindex}, + "hfunc": rss.get('hfunc'), + "input-xfrm": rss.get('input-xfrm', {}) + }) + + port = rand_port(socket.SOCK_DGRAM) + + toeplitz_path = cfg.test_dir / "toeplitz" + rx_cmd = [ + str(toeplitz_path), + "-" + ipver, + proto_flag, + "-d", str(port), + "-i", cfg.ifname, + "-T", "4000", + "-s", + "-v" + ] + + if grp: + _check_rps_and_rfs_not_configured(cfg) + if grp == "rss": + irq_cpus = ",".join([str(x) for x in _get_irq_cpus(cfg)]) + rx_cmd += ["-C", irq_cpus] + ksft_pr(f"RSS using CPUs: {irq_cpus}") + elif grp == "rps": + # Get CPUs not used by Rx queues and configure them for RPS + rps_cpus = _get_unused_cpus(cfg, count=2) + rps_mask = _configure_rps(cfg, rps_cpus) + defer(_configure_rps, cfg, []) + rx_cmd += ["-r", rps_mask] + ksft_pr(f"RPS using CPUs: {rps_cpus}, mask: {rps_mask}") + + # Run rx in background, it will exit once it has seen enough packets + with bkg(" ".join(rx_cmd), ksft_ready=True, exit_wait=True) as rx_proc: + while rx_proc.proc.poll() is None: + _send_traffic(cfg, proto_flag, ipver, port) + + # Check rx result + ksft_pr("Receiver output:") + ksft_pr(rx_proc.stdout.strip().replace('\n', '\n# ')) + if rx_proc.stderr: + ksft_pr(rx_proc.stderr.strip().replace('\n', '\n# ')) + + +def 
main() -> None: + """Ksft boilerplate main.""" + + with NetDrvEpEnv(__file__) as cfg: + cfg.ethnl = EthtoolFamily() + cfg.netnl = NetdevFamily() + ksft_run(cases=[test], args=(cfg,)) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/drivers/net/lib/py/__init__.py b/tools/testing/selftests/drivers/net/lib/py/__init__.py index b0c6300150fb..8b75faa9af6d 100644 --- a/tools/testing/selftests/drivers/net/lib/py/__init__.py +++ b/tools/testing/selftests/drivers/net/lib/py/__init__.py @@ -25,7 +25,7 @@ try: fd_read_timeout, ip, rand_port, wait_port_listen, wait_file from net.lib.py import KsftSkipEx, KsftFailEx, KsftXfailEx from net.lib.py import ksft_disruptive, ksft_exit, ksft_pr, ksft_run, \ - ksft_setup + ksft_setup, ksft_variants, KsftNamedVariant from net.lib.py import ksft_eq, ksft_ge, ksft_in, ksft_is, ksft_lt, \ ksft_ne, ksft_not_in, ksft_raises, ksft_true, ksft_gt, ksft_not_none @@ -38,16 +38,17 @@ try: "wait_port_listen", "wait_file", "KsftSkipEx", "KsftFailEx", "KsftXfailEx", "ksft_disruptive", "ksft_exit", "ksft_pr", "ksft_run", - "ksft_setup", + "ksft_setup", "ksft_variants", "KsftNamedVariant", "ksft_eq", "ksft_ge", "ksft_in", "ksft_is", "ksft_lt", "ksft_ne", "ksft_not_in", "ksft_raises", "ksft_true", "ksft_gt", "ksft_not_none", "ksft_not_none"] from .env import NetDrvEnv, NetDrvEpEnv - from .load import GenerateTraffic + from .load import GenerateTraffic, Iperf3Runner from .remote import Remote - __all__ += ["NetDrvEnv", "NetDrvEpEnv", "GenerateTraffic", "Remote"] + __all__ += ["NetDrvEnv", "NetDrvEpEnv", "GenerateTraffic", "Remote", + "Iperf3Runner"] except ModuleNotFoundError as e: print("Failed importing `net` library from kernel sources") print(str(e)) diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py index 01be3d9b9720..8b644fd84ff2 100644 --- a/tools/testing/selftests/drivers/net/lib/py/env.py +++ 
b/tools/testing/selftests/drivers/net/lib/py/env.py @@ -168,6 +168,8 @@ class NetDrvEpEnv(NetDrvEnvBase): # resolve remote interface name self.remote_ifname = self.resolve_remote_ifc() + self.remote_dev = ip("-d link show dev " + self.remote_ifname, + host=self.remote, json=True)[0] self._required_cmd = {} diff --git a/tools/testing/selftests/drivers/net/lib/py/load.py b/tools/testing/selftests/drivers/net/lib/py/load.py index c4e808407cc4..f181fa2d38fc 100644 --- a/tools/testing/selftests/drivers/net/lib/py/load.py +++ b/tools/testing/selftests/drivers/net/lib/py/load.py @@ -2,21 +2,89 @@ import re import time +import json from lib.py import ksft_pr, cmd, ip, rand_port, wait_port_listen -class GenerateTraffic: - def __init__(self, env, port=None): - env.require_cmd("iperf3", local=True, remote=True) +class Iperf3Runner: + """ + Sets up and runs iperf3 traffic. + """ + def __init__(self, env, port=None, server_ip=None, client_ip=None): + env.require_cmd("iperf3", local=True, remote=True) self.env = env - self.port = rand_port() if port is None else port - self._iperf_server = cmd(f"iperf3 -s -1 -p {self.port}", background=True) + self.server_ip = server_ip + self.client_ip = client_ip + + def _build_server(self): + cmdline = f"iperf3 -s -1 -p {self.port}" + if self.server_ip: + cmdline += f" -B {self.server_ip}" + return cmdline + + def _build_client(self, streams, duration, reverse): + host = self.env.addr if self.server_ip is None else self.server_ip + cmdline = f"iperf3 -c {host} -p {self.port} -P {streams} -t {duration} -J" + if self.client_ip: + cmdline += f" -B {self.client_ip}" + if reverse: + cmdline += " --reverse" + return cmdline + + def start_server(self): + """ + Starts an iperf3 server with optional bind IP. 
+ """ + cmdline = self._build_server() + proc = cmd(cmdline, background=True) wait_port_listen(self.port) time.sleep(0.1) - self._iperf_client = cmd(f"iperf3 -c {env.addr} -P 16 -p {self.port} -t 86400", - background=True, host=env.remote) + return proc + + def start_client(self, background=False, streams=1, duration=10, reverse=False): + """ + Starts the iperf3 client with the configured options. + """ + cmdline = self._build_client(streams, duration, reverse) + return cmd(cmdline, background=background, host=self.env.remote) + + def measure_bandwidth(self, reverse=False): + """ + Runs an iperf3 measurement and returns the average bandwidth (Gbps). + Discards the first and last few reporting intervals and uses only the + middle part of the run where throughput is typically stable. + """ + self.start_server() + result = self.start_client(duration=10, reverse=reverse) + + if result.ret != 0: + raise RuntimeError("iperf3 failed to run successfully") + try: + out = json.loads(result.stdout) + except json.JSONDecodeError as exc: + raise ValueError("Failed to parse iperf3 JSON output") from exc + + intervals = out.get("intervals", []) + samples = [i["sum"]["bits_per_second"] / 1e9 for i in intervals] + if len(samples) < 10: + raise ValueError(f"iperf3 returned too few intervals: {len(samples)}") + # Discard potentially unstable first and last 3 seconds. 
+ stable = samples[3:-3] + + avg = sum(stable) / len(stable) + + return avg + + +class GenerateTraffic: + def __init__(self, env, port=None): + self.env = env + self.runner = Iperf3Runner(env, port) + + self._iperf_server = self.runner.start_server() + self._iperf_client = self.runner.start_client(background=True, streams=16, duration=86400) # Wait for traffic to ramp up if not self._wait_pkts(pps=1000): @@ -61,7 +129,7 @@ class GenerateTraffic: def _wait_client_stopped(self, sleep=0.005, timeout=5): end = time.monotonic() + timeout - live_port_pattern = re.compile(fr":{self.port:04X} 0[^6] ") + live_port_pattern = re.compile(fr":{self.runner.port:04X} 0[^6] ") while time.monotonic() < end: data = cmd("cat /proc/net/tcp*", host=self.env.remote).stdout diff --git a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh index 87f89fd92f8c..ae8abff4be40 100644 --- a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh +++ b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh @@ -249,7 +249,7 @@ function listen_port_and_save_to() { # Just wait for 2 seconds timeout 2 ip netns exec "${NAMESPACE}" \ - socat "${SOCAT_MODE}":"${PORT}",fork "${OUTPUT}" + socat "${SOCAT_MODE}":"${PORT}",fork "${OUTPUT}" 2> /dev/null } # Only validate that the message arrived properly diff --git a/tools/testing/selftests/drivers/net/netcons_basic.sh b/tools/testing/selftests/drivers/net/netcons_basic.sh index a3446b569976..2022f3061738 100755 --- a/tools/testing/selftests/drivers/net/netcons_basic.sh +++ b/tools/testing/selftests/drivers/net/netcons_basic.sh @@ -28,8 +28,6 @@ OUTPUT_FILE="/tmp/${TARGET}" # Check for basic system dependency and exit if not found check_for_dependencies -# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) -echo "6 5" > /proc/sys/kernel/printk # Remove the namespace, interfaces and netconsole target on exit trap cleanup EXIT @@ -39,6 +37,9 @@ do for IP_VERSION in "ipv6" 
"ipv4" do echo "Running with target mode: ${FORMAT} (${IP_VERSION})" + # Set current loglevel to KERN_INFO(6), and default to + # KERN_NOTICE(5) + echo "6 5" > /proc/sys/kernel/printk # Create one namespace and two interfaces set_network "${IP_VERSION}" # Create a dynamic target for netconsole diff --git a/tools/testing/selftests/drivers/net/netcons_overflow.sh b/tools/testing/selftests/drivers/net/netcons_overflow.sh index 29bad56448a2..06089643b771 100755 --- a/tools/testing/selftests/drivers/net/netcons_overflow.sh +++ b/tools/testing/selftests/drivers/net/netcons_overflow.sh @@ -15,7 +15,7 @@ SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh # This is coming from netconsole code. Check for it in drivers/net/netconsole.c -MAX_USERDATA_ITEMS=16 +MAX_USERDATA_ITEMS=256 # Function to create userdata entries function create_userdata_max_entries() { diff --git a/tools/testing/selftests/drivers/net/netdevsim/Makefile b/tools/testing/selftests/drivers/net/netdevsim/Makefile index df10c7243511..1a228c5430f5 100644 --- a/tools/testing/selftests/drivers/net/netdevsim/Makefile +++ b/tools/testing/selftests/drivers/net/netdevsim/Makefile @@ -8,7 +8,6 @@ TEST_PROGS := \ ethtool-features.sh \ ethtool-fec.sh \ ethtool-pause.sh \ - ethtool-ring.sh \ fib.sh \ fib_notifications.sh \ hw_stats_l3.sh \ diff --git a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh index 030762b203d7..1b529ccaf050 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh @@ -3,7 +3,8 @@ lib_dir=$(dirname $0)/../../../net/forwarding -ALL_TESTS="fw_flash_test params_test regions_test reload_test \ +ALL_TESTS="fw_flash_test params_test \ + params_default_test regions_test reload_test \ netns_reload_test resource_test dev_info_test \ empty_reporter_test dummy_reporter_test rate_test" NUM_NETIFS=0 @@ -78,17 +79,28 
@@ fw_flash_test() param_get() { local name=$1 + local attr=${2:-value} + local cmode=${3:-driverinit} cmd_jq "devlink dev param show $DL_HANDLE name $name -j" \ - '.[][][].values[] | select(.cmode == "driverinit").value' + '.[][][].values[] | select(.cmode == "'"$cmode"'").'"$attr" } param_set() { local name=$1 local value=$2 + local cmode=${3:-driverinit} - devlink dev param set $DL_HANDLE name $name cmode driverinit value $value + devlink dev param set $DL_HANDLE name $name cmode $cmode value $value +} + +param_set_default() +{ + local name=$1 + local cmode=${2:-driverinit} + + devlink dev param set $DL_HANDLE name $name default cmode $cmode } check_value() @@ -97,12 +109,18 @@ check_value() local phase_name=$2 local expected_param_value=$3 local expected_debugfs_value=$4 + local cmode=${5:-driverinit} local value + local attr="value" - value=$(param_get $name) - check_err $? "Failed to get $name param value" + if [[ "$phase_name" == *"default"* ]]; then + attr="default" + fi + + value=$(param_get $name $attr $cmode) + check_err $? "Failed to get $name param $attr" [ "$value" == "$expected_param_value" ] - check_err $? "Unexpected $phase_name $name param value" + check_err $? "Unexpected $phase_name $name param $attr" value=$(<$DEBUGFS_DIR/$name) check_err $? 
"Failed to get $name debugfs value" [ "$value" == "$expected_debugfs_value" ] @@ -135,6 +153,92 @@ params_test() log_test "params test" } +value_to_debugfs() +{ + local value=$1 + + case "$value" in + true) + echo "Y" + ;; + false) + echo "N" + ;; + *) + echo "$value" + ;; + esac +} + +test_default() +{ + local param_name=$1 + local new_value=$2 + local expected_default=$3 + local cmode=${4:-driverinit} + local default_debugfs + local new_debugfs + local expected_debugfs + + default_debugfs=$(value_to_debugfs $expected_default) + new_debugfs=$(value_to_debugfs $new_value) + + expected_debugfs=$default_debugfs + check_value $param_name initial-default $expected_default $expected_debugfs $cmode + + param_set $param_name $new_value $cmode + check_err $? "Failed to set $param_name to $new_value" + + expected_debugfs=$([ "$cmode" == "runtime" ] && echo "$new_debugfs" || echo "$default_debugfs") + check_value $param_name post-set $new_value $expected_debugfs $cmode + + devlink dev reload $DL_HANDLE + check_err $? "Failed to reload device" + + expected_debugfs=$new_debugfs + check_value $param_name post-reload-new-value $new_value $expected_debugfs $cmode + + param_set_default $param_name $cmode + check_err $? "Failed to set $param_name to default" + + expected_debugfs=$([ "$cmode" == "runtime" ] && echo "$default_debugfs" || echo "$new_debugfs") + check_value $param_name post-set-default $expected_default $expected_debugfs $cmode + + devlink dev reload $DL_HANDLE + check_err $? "Failed to reload device" + + expected_debugfs=$default_debugfs + check_value $param_name post-reload-default $expected_default $expected_debugfs $cmode +} + +params_default_test() +{ + RET=0 + + if ! devlink dev param help 2>&1 | grep -q "value VALUE | default"; then + echo "SKIP: devlink cli missing default feature" + return + fi + + # Remove side effects of previous tests. Use plain param_set, because + # param_set_default is a feature under test here. 
+ param_set max_macs 32 driverinit + check_err $? "Failed to reset max_macs to default value" + param_set test1 true driverinit + check_err $? "Failed to reset test1 to default value" + param_set test2 1234 runtime + check_err $? "Failed to reset test2 to default value" + + devlink dev reload $DL_HANDLE + check_err $? "Failed to reload device for clean state" + + test_default max_macs 16 32 driverinit + test_default test1 false true driverinit + test_default test2 100 1234 runtime + + log_test "params default test" +} + check_region_size() { local name=$1 diff --git a/tools/testing/selftests/drivers/net/netdevsim/ethtool-ring.sh b/tools/testing/selftests/drivers/net/netdevsim/ethtool-ring.sh deleted file mode 100755 index c969559ffa7a..000000000000 --- a/tools/testing/selftests/drivers/net/netdevsim/ethtool-ring.sh +++ /dev/null @@ -1,85 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0-only - -source ethtool-common.sh - -function get_value { - local query="${SETTINGS_MAP[$1]}" - - echo $(ethtool -g $NSIM_NETDEV | \ - tail -n +$CURR_SETT_LINE | \ - awk -F':' -v pattern="$query:" '$0 ~ pattern {gsub(/[\t ]/, "", $2); print $2}') -} - -function update_current_settings { - for key in ${!SETTINGS_MAP[@]}; do - CURRENT_SETTINGS[$key]=$(get_value $key) - done - echo ${CURRENT_SETTINGS[@]} -} - -if ! 
ethtool -h | grep -q set-ring >/dev/null; then - echo "SKIP: No --set-ring support in ethtool" - exit 4 -fi - -NSIM_NETDEV=$(make_netdev) - -set -o pipefail - -declare -A SETTINGS_MAP=( - ["rx"]="RX" - ["rx-mini"]="RX Mini" - ["rx-jumbo"]="RX Jumbo" - ["tx"]="TX" -) - -declare -A EXPECTED_SETTINGS=( - ["rx"]="" - ["rx-mini"]="" - ["rx-jumbo"]="" - ["tx"]="" -) - -declare -A CURRENT_SETTINGS=( - ["rx"]="" - ["rx-mini"]="" - ["rx-jumbo"]="" - ["tx"]="" -) - -MAX_VALUE=$((RANDOM % $((2**32-1)))) -RING_MAX_LIST=$(ls $NSIM_DEV_DFS/ethtool/ring/) - -for ring_max_entry in $RING_MAX_LIST; do - echo $MAX_VALUE > $NSIM_DEV_DFS/ethtool/ring/$ring_max_entry -done - -CURR_SETT_LINE=$(ethtool -g $NSIM_NETDEV | grep -i -m1 -n 'Current hardware settings' | cut -f1 -d:) - -# populate the expected settings map -for key in ${!SETTINGS_MAP[@]}; do - EXPECTED_SETTINGS[$key]=$(get_value $key) -done - -# test -for key in ${!SETTINGS_MAP[@]}; do - value=$((RANDOM % $MAX_VALUE)) - - ethtool -G $NSIM_NETDEV "$key" "$value" - - EXPECTED_SETTINGS[$key]="$value" - expected=${EXPECTED_SETTINGS[@]} - current=$(update_current_settings) - - check $? 
"$current" "$expected" - set +x -done - -if [ $num_errors -eq 0 ]; then - echo "PASSED all $((num_passes)) checks" - exit 0 -else - echo "FAILED $num_errors/$((num_errors+num_passes)) checks" - exit 1 -fi diff --git a/tools/testing/selftests/drivers/net/psp.py b/tools/testing/selftests/drivers/net/psp.py index 4ae7a785ff10..06559ef49b9a 100755 --- a/tools/testing/selftests/drivers/net/psp.py +++ b/tools/testing/selftests/drivers/net/psp.py @@ -109,6 +109,10 @@ def _check_data_outq(s, exp_len, force_wait=False): time.sleep(0.01) ksft_eq(outq, exp_len) + +def _get_stat(cfg, key): + return cfg.pspnl.get_stats({'dev-id': cfg.psp_dev_id})[key] + # # Test case boiler plate # @@ -171,11 +175,16 @@ def dev_rotate(cfg): """ Test key rotation """ _init_psp_dev(cfg) + prev_rotations = _get_stat(cfg, 'key-rotations') + rot = cfg.pspnl.key_rotate({"id": cfg.psp_dev_id}) ksft_eq(rot['id'], cfg.psp_dev_id) rot = cfg.pspnl.key_rotate({"id": cfg.psp_dev_id}) ksft_eq(rot['id'], cfg.psp_dev_id) + cur_rotations = _get_stat(cfg, 'key-rotations') + ksft_eq(cur_rotations, prev_rotations + 2) + def dev_rotate_spi(cfg): """ Test key rotation and SPI check """ @@ -475,6 +484,7 @@ def data_stale_key(cfg): """ Test send on a double-rotated key """ _init_psp_dev(cfg) + prev_stale = _get_stat(cfg, 'stale-events') s = _make_psp_conn(cfg) try: rx_assoc = cfg.pspnl.rx_assoc({"version": 0, @@ -495,6 +505,9 @@ def data_stale_key(cfg): cfg.pspnl.key_rotate({"id": cfg.psp_dev_id}) cfg.pspnl.key_rotate({"id": cfg.psp_dev_id}) + cur_stale = _get_stat(cfg, 'stale-events') + ksft_gt(cur_stale, prev_stale) + s.send(b'0123456789' * 200) _check_data_outq(s, 2000, force_wait=True) finally: diff --git a/tools/testing/selftests/drivers/net/ring_reconfig.py b/tools/testing/selftests/drivers/net/ring_reconfig.py new file mode 100755 index 000000000000..f9530a8b0856 --- /dev/null +++ b/tools/testing/selftests/drivers/net/ring_reconfig.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 
+ +""" +Test channel and ring size configuration via ethtool (-L / -G). +""" + +from lib.py import ksft_run, ksft_exit, ksft_pr +from lib.py import ksft_eq +from lib.py import NetDrvEpEnv, EthtoolFamily, GenerateTraffic +from lib.py import defer, NlError + + +def channels(cfg) -> None: + """ + Twiddle channel counts in various combinations of parameters. + We're only looking for driver adhering to the requested config + if the config is accepted and crashes. + """ + ehdr = {'header':{'dev-index': cfg.ifindex}} + chans = cfg.eth.channels_get(ehdr) + + all_keys = ["rx", "tx", "combined"] + mixes = [{"combined"}, {"rx", "tx"}, {"rx", "combined"}, {"tx", "combined"}, + {"rx", "tx", "combined"},] + + # Get the set of keys that device actually supports + restore = {} + supported = set() + for key in all_keys: + if key + "-max" in chans: + supported.add(key) + restore |= {key + "-count": chans[key + "-count"]} + + defer(cfg.eth.channels_set, ehdr | restore) + + def test_config(config): + try: + cfg.eth.channels_set(ehdr | config) + get = cfg.eth.channels_get(ehdr) + for k, v in config.items(): + ksft_eq(get.get(k, 0), v) + except NlError as e: + failed.append(mix) + ksft_pr("Can't set", config, e) + else: + ksft_pr("Okay", config) + + failed = [] + for mix in mixes: + if not mix.issubset(supported): + continue + + # Set all the values in the mix to 1, other supported to 0 + config = {} + for key in all_keys: + config[key + "-count"] = 1 if key in mix else 0 + test_config(config) + + for mix in mixes: + if not mix.issubset(supported): + continue + if mix in failed: + continue + + # Set all the values in the mix to max, other supported to 0 + config = {} + for key in all_keys: + config[key + "-count"] = chans[key + '-max'] if key in mix else 0 + test_config(config) + + +def _configure_min_ring_cnt(cfg) -> None: + """ Try to configure a single Rx/Tx ring. 
""" + ehdr = {'header':{'dev-index': cfg.ifindex}} + chans = cfg.eth.channels_get(ehdr) + + all_keys = ["rx-count", "tx-count", "combined-count"] + restore = {} + config = {} + for key in all_keys: + if key in chans: + restore[key] = chans[key] + config[key] = 0 + + if chans.get('combined-count', 0) > 1: + config['combined-count'] = 1 + elif chans.get('rx-count', 0) > 1 and chans.get('tx-count', 0) > 1: + config['tx-count'] = 1 + config['rx-count'] = 1 + else: + # looks like we're already on 1 channel + return + + cfg.eth.channels_set(ehdr | config) + defer(cfg.eth.channels_set, ehdr | restore) + + +def ringparam(cfg) -> None: + """ + Tweak the ringparam configuration. Try to run some traffic over min + ring size to make sure it actually functions. + """ + ehdr = {'header':{'dev-index': cfg.ifindex}} + rings = cfg.eth.rings_get(ehdr) + + restore = {} + maxes = {} + params = set() + for key in rings.keys(): + if 'max' in key: + param = key[:-4] + maxes[param] = rings[key] + params.add(param) + restore[param] = rings[param] + + defer(cfg.eth.rings_set, ehdr | restore) + + # Speed up the reconfig by configuring just one ring + _configure_min_ring_cnt(cfg) + + # Try to reach min on all settings + for param in params: + val = rings[param] + while True: + try: + cfg.eth.rings_set({'header':{'dev-index': cfg.ifindex}, + param: val // 2}) + if val == 0: + break + val //= 2 + except NlError: + break + + get = cfg.eth.rings_get(ehdr) + ksft_eq(get[param], val) + + ksft_pr(f"Reached min for '{param}' at {val} (max {rings[param]})") + + GenerateTraffic(cfg).wait_pkts_and_stop(10000) + + # Try max across all params, if the driver supports large rings + # this may OOM so we ignore errors + try: + ksft_pr("Applying max settings") + config = {p: maxes[p] for p in params} + cfg.eth.rings_set(ehdr | config) + except NlError as e: + ksft_pr("Can't set max params", config, e) + else: + GenerateTraffic(cfg).wait_pkts_and_stop(10000) + + +def main() -> None: + """ Ksft boiler plate main 
""" + + with NetDrvEpEnv(__file__) as cfg: + cfg.eth = EthtoolFamily() + + ksft_run([channels, + ringparam], + args=(cfg, )) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/drivers/net/stats.py b/tools/testing/selftests/drivers/net/stats.py index 04d0a2a13e73..b08e4d48b15c 100755 --- a/tools/testing/selftests/drivers/net/stats.py +++ b/tools/testing/selftests/drivers/net/stats.py @@ -263,14 +263,15 @@ def procfs_downup_hammer(cfg) -> None: Reading stats via procfs only holds the RCU lock, drivers often try to sleep when reading the stats, or don't protect against races. """ - # Max out the queues, we'll flip between max and 1 + # Set a large number of queues, + # we'll flip between min(max_queues, 64) and 1 channels = ethnl.channels_get({'header': {'dev-index': cfg.ifindex}}) if channels['combined-count'] == 0: rx_type = 'rx' else: rx_type = 'combined' cur_queue_cnt = channels[f'{rx_type}-count'] - max_queue_cnt = channels[f'{rx_type}-max'] + max_queue_cnt = min(channels[f'{rx_type}-max'], 64) cmd(f"ethtool -L {cfg.ifname} {rx_type} {max_queue_cnt}") defer(cmd, f"ethtool -L {cfg.ifname} {rx_type} {cur_queue_cnt}") diff --git a/tools/testing/selftests/drivers/net/xdp.py b/tools/testing/selftests/drivers/net/xdp.py index a148004e1c36..e54df158dfe9 100755 --- a/tools/testing/selftests/drivers/net/xdp.py +++ b/tools/testing/selftests/drivers/net/xdp.py @@ -12,6 +12,7 @@ from dataclasses import dataclass from enum import Enum from lib.py import ksft_run, ksft_exit, ksft_eq, ksft_ge, ksft_ne, ksft_pr +from lib.py import KsftNamedVariant, ksft_variants from lib.py import KsftFailEx, NetDrvEpEnv from lib.py import EthtoolFamily, NetdevFamily, NlError from lib.py import bkg, cmd, rand_port, wait_port_listen @@ -672,7 +673,18 @@ def test_xdp_native_adjst_head_shrnk_data(cfg): _validate_res(res, offset_lst, pkt_sz_lst) -def _test_xdp_native_ifc_stats(cfg, act): +@ksft_variants([ + KsftNamedVariant("pass", XDPAction.PASS), + 
KsftNamedVariant("drop", XDPAction.DROP), + KsftNamedVariant("tx", XDPAction.TX), +]) +def test_xdp_native_qstats(cfg, act): + """ + Send 1000 messages. Expect XDP action specified in @act. + Make sure the packets were counted to interface level qstats + (Rx, and Tx if act is TX). + """ + cfg.require_cmd("socat") bpf_info = BPFProgInfo("xdp_prog", "xdp_native.bpf.o", "xdp", 1500) @@ -687,9 +699,12 @@ def _test_xdp_native_ifc_stats(cfg, act): "/dev/null" # Listener runs on "remote" in case of XDP_TX rx_host = cfg.remote if act == XDPAction.TX else None - # We want to spew 2000 packets quickly, bash seems to do a good enough job - tx_udp = f"exec 5<>/dev/udp/{cfg.addr}/{port}; " \ - "for i in `seq 2000`; do echo a >&5; done; exec 5>&-" + # We want to spew 1000 packets quickly, bash seems to do a good enough job + # Each reopening of the socket gives us a differenot local port (for RSS) + tx_udp = "for _ in `seq 20`; do " \ + f"exec 5<>/dev/udp/{cfg.addr}/{port}; " \ + "for i in `seq 50`; do echo a >&5; done; " \ + "exec 5>&-; done" cfg.wait_hw_stats_settle() # Qstats have more clearly defined semantics than rtnetlink. 
@@ -704,11 +719,11 @@ def _test_xdp_native_ifc_stats(cfg, act): cfg.wait_hw_stats_settle() after = cfg.netnl.qstats_get({"ifindex": cfg.ifindex}, dump=True)[0] - ksft_ge(after['rx-packets'] - before['rx-packets'], 2000) + expected_pkts = 1000 + ksft_ge(after['rx-packets'] - before['rx-packets'], expected_pkts) if act == XDPAction.TX: - ksft_ge(after['tx-packets'] - before['tx-packets'], 2000) + ksft_ge(after['tx-packets'] - before['tx-packets'], expected_pkts) - expected_pkts = 2000 stats = _get_stats(prog_info["maps"]["map_xdp_stats"]) ksft_eq(stats[XDPStats.RX.value], expected_pkts, "XDP RX stats mismatch") if act == XDPAction.TX: @@ -730,30 +745,6 @@ def _test_xdp_native_ifc_stats(cfg, act): ksft_ge(after['tx-packets'], before['tx-packets']) -def test_xdp_native_qstats_pass(cfg): - """ - Send 2000 messages, expect XDP_PASS, make sure the packets were counted - to interface level qstats (Rx). - """ - _test_xdp_native_ifc_stats(cfg, XDPAction.PASS) - - -def test_xdp_native_qstats_drop(cfg): - """ - Send 2000 messages, expect XDP_DROP, make sure the packets were counted - to interface level qstats (Rx). - """ - _test_xdp_native_ifc_stats(cfg, XDPAction.DROP) - - -def test_xdp_native_qstats_tx(cfg): - """ - Send 2000 messages, expect XDP_TX, make sure the packets were counted - to interface level qstats (Rx and Tx) - """ - _test_xdp_native_ifc_stats(cfg, XDPAction.TX) - - def main(): """ Main function to execute the XDP tests. 
@@ -778,9 +769,7 @@ def main(): test_xdp_native_adjst_tail_shrnk_data, test_xdp_native_adjst_head_grow_data, test_xdp_native_adjst_head_shrnk_data, - test_xdp_native_qstats_pass, - test_xdp_native_qstats_drop, - test_xdp_native_qstats_tx, + test_xdp_native_qstats, ], args=(cfg,)) ksft_exit() diff --git a/tools/testing/selftests/filesystems/utils.c b/tools/testing/selftests/filesystems/utils.c index c43a69dffd83..a0c64f415a7f 100644 --- a/tools/testing/selftests/filesystems/utils.c +++ b/tools/testing/selftests/filesystems/utils.c @@ -487,7 +487,7 @@ int setup_userns(void) uid_t uid = getuid(); gid_t gid = getgid(); - ret = unshare(CLONE_NEWNS|CLONE_NEWUSER|CLONE_NEWPID); + ret = unshare(CLONE_NEWNS|CLONE_NEWUSER); if (ret) { ksft_exit_fail_msg("unsharing mountns and userns: %s\n", strerror(errno)); diff --git a/tools/testing/selftests/ftrace/ftracetest b/tools/testing/selftests/ftrace/ftracetest index cce72f8b03dc..3230bd54dba8 100755 --- a/tools/testing/selftests/ftrace/ftracetest +++ b/tools/testing/selftests/ftrace/ftracetest @@ -22,6 +22,7 @@ echo " --fail-unresolved Treat UNRESOLVED as a failure" echo " -d|--debug Debug mode (trace all shell commands)" echo " -l|--logdir <dir> Save logs on the <dir>" echo " If <dir> is -, all logs output in console only" +echo " --rv Run RV selftests instead of ftrace ones" exit $1 } @@ -133,6 +134,10 @@ parse_opts() { # opts LINK_PTR= shift 2 ;; + --rv) + RV_TEST=1 + shift 1 + ;; *.tc) if [ -f "$1" ]; then OPT_TEST_CASES="$OPT_TEST_CASES `abspath $1`" @@ -152,9 +157,13 @@ parse_opts() { # opts ;; esac done - if [ ! 
-z "$OPT_TEST_CASES" ]; then + if [ -n "$OPT_TEST_CASES" ]; then TEST_CASES=$OPT_TEST_CASES fi + if [ -n "$OPT_TEST_DIR" -a -f "$OPT_TEST_DIR"/test.d/functions ]; then + TOP_DIR=$OPT_TEST_DIR + TEST_DIR=$TOP_DIR/test.d + fi } # Parameters @@ -190,10 +199,6 @@ fi TOP_DIR=`absdir $0` TEST_DIR=$TOP_DIR/test.d TEST_CASES=`find_testcases $TEST_DIR` -LOG_TOP_DIR=$TOP_DIR/logs -LOG_DATE=`date +%Y%m%d-%H%M%S` -LOG_DIR=$LOG_TOP_DIR/$LOG_DATE/ -LINK_PTR=$LOG_TOP_DIR/latest KEEP_LOG=0 KTAP=0 DEBUG=0 @@ -201,14 +206,23 @@ VERBOSE=0 UNSUPPORTED_RESULT=0 UNRESOLVED_RESULT=0 STOP_FAILURE=0 +RV_TEST=0 # Parse command-line options parse_opts $* +LOG_TOP_DIR=$TOP_DIR/logs +LOG_DATE=`date +%Y%m%d-%H%M%S` +LOG_DIR=$LOG_TOP_DIR/$LOG_DATE/ +LINK_PTR=$LOG_TOP_DIR/latest + [ $DEBUG -ne 0 ] && set -x -# Verify parameters -if [ -z "$TRACING_DIR" -o ! -d "$TRACING_DIR" ]; then - errexit "No ftrace directory found" +if [ $RV_TEST -ne 0 ]; then + TRACING_DIR=$TRACING_DIR/rv + if [ ! -d "$TRACING_DIR" ]; then + err_ret=$err_skip + errexit "rv is not configured in this kernel" + fi fi # Preparing logs @@ -419,7 +433,7 @@ trap 'SIG_RESULT=$XFAIL' $SIG_XFAIL __run_test() { # testfile # setup PID and PPID, $$ is not updated. (cd $TRACING_DIR; read PID _ < /proc/self/stat; set -e; set -x; - checkreq $1; initialize_ftrace; . $1) + checkreq $1; initialize_system; . $1) [ $? 
-ne 0 ] && kill -s $SIG_FAIL $SIG_PID } @@ -496,7 +510,7 @@ for t in $TEST_CASES; do exit 1 fi done -(cd $TRACING_DIR; finish_ftrace) # for cleanup +(cd $TRACING_DIR; finish_system) # for cleanup prlog "" prlog "# of passed: " `echo $PASSED_CASES | wc -w` diff --git a/tools/testing/selftests/ftrace/test.d/00basic/mount_options.tc b/tools/testing/selftests/ftrace/test.d/00basic/mount_options.tc index 8a7ce647a60d..318939451caf 100644 --- a/tools/testing/selftests/ftrace/test.d/00basic/mount_options.tc +++ b/tools/testing/selftests/ftrace/test.d/00basic/mount_options.tc @@ -28,7 +28,7 @@ unmount_tracefs() { local mount_point="$1" # Need to make sure the mount isn't busy so that we can umount it - (cd $mount_point; finish_ftrace;) + (cd $mount_point; finish_system;) cleanup } diff --git a/tools/testing/selftests/ftrace/test.d/00basic/trace_marker_raw.tc b/tools/testing/selftests/ftrace/test.d/00basic/trace_marker_raw.tc new file mode 100644 index 000000000000..7daf7292209e --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/00basic/trace_marker_raw.tc @@ -0,0 +1,107 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Basic tests on writing to trace_marker_raw +# requires: trace_marker_raw +# flags: instance + +is_little_endian() { + if lscpu | grep -q 'Little Endian'; then + echo 1; + else + echo 0; + fi +} + +little=`is_little_endian` + +make_str() { + id=$1 + cnt=$2 + + if [ $little -eq 1 ]; then + val=`printf "\\%03o\\%03o\\%03o\\%03o" \ + $(($id & 0xff)) \ + $((($id >> 8) & 0xff)) \ + $((($id >> 16) & 0xff)) \ + $((($id >> 24) & 0xff))` + else + val=`printf "\\%03o\\%03o\\%03o\\%03o" \ + $((($id >> 24) & 0xff)) \ + $((($id >> 16) & 0xff)) \ + $((($id >> 8) & 0xff)) \ + $(($id & 0xff))` + fi + + data=`printf -- 'X%.0s' $(seq $cnt)` + + printf "${val}${data}" +} + +write_buffer() { + id=$1 + size=$2 + + # write the string into the raw marker + make_str $id $size > trace_marker_raw +} + + +test_multiple_writes() { + + # Write a bunch of data where 
the id is the count of + # data to write + for i in `seq 1 10` `seq 101 110` `seq 1001 1010`; do + write_buffer $i $i + done + + # add a little buffer + echo stop > trace_marker + + # Check to make sure the number of entries is the id (rounded up by 4) + awk '/.*: # [0-9a-f]* / { + print; + cnt = -1; + for (i = 0; i < NF; i++) { + # The counter is after the "#" marker + if ( $i == "#" ) { + i++; + cnt = strtonum("0x" $i); + num = NF - (i + 1); + # The number of items is always rounded up by 4 + cnt2 = int((cnt + 3) / 4) * 4; + if (cnt2 != num) { + exit 1; + } + break; + } + } + } + // { if (NR > 30) { exit 0; } } ' trace_pipe; +} + + +get_buffer_data_size() { + sed -ne 's/^.*data.*size:\([0-9][0-9]*\).*/\1/p' events/header_page +} + +test_buffer() { + + # The id must be four bytes, test that 3 bytes fails a write + if echo -n abc > ./trace_marker_raw ; then + echo "Too small of write expected to fail but did not" + exit_fail + fi + + size=`get_buffer_data_size` + echo size = $size + + # Now add a little more than what it can handle + + if write_buffer 0xdeadbeef $size ; then + echo "Too big of write expected to fail but did not" + exit_fail + fi +} + +test_buffer +test_multiple_writes diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe.tc b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe.tc index 2506f464811b..47067a5e3cb0 100644 --- a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe.tc +++ b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe.tc @@ -28,25 +28,21 @@ test -d events/fprobes/myevent1 test -d events/fprobes/myevent2 echo 1 > events/fprobes/myevent1/enable -# Make sure the event is attached and is the only one +# Make sure the event is attached. 
grep -q $PLACE enabled_functions cnt=`cat enabled_functions | wc -l` -if [ $cnt -ne $((ocnt + 1)) ]; then +if [ $cnt -eq $ocnt ]; then exit_fail fi echo 1 > events/fprobes/myevent2/enable -# It should till be the only attached function -cnt=`cat enabled_functions | wc -l` -if [ $cnt -ne $((ocnt + 1)) ]; then - exit_fail -fi +cnt2=`cat enabled_functions | wc -l` echo 1 > events/fprobes/myevent3/enable # If the function is different, the attached function should be increased grep -q $PLACE2 enabled_functions cnt=`cat enabled_functions | wc -l` -if [ $cnt -ne $((ocnt + 2)) ]; then +if [ $cnt -eq $cnt2 ]; then exit_fail fi @@ -56,12 +52,6 @@ echo "-:myevent2" >> dynamic_events grep -q myevent1 dynamic_events ! grep -q myevent2 dynamic_events -# should still have 2 left -cnt=`cat enabled_functions | wc -l` -if [ $cnt -ne $((ocnt + 2)) ]; then - exit_fail -fi - echo 0 > events/fprobes/enable echo > dynamic_events diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/enable_disable_tprobe.tc b/tools/testing/selftests/ftrace/test.d/dynevent/enable_disable_tprobe.tc new file mode 100644 index 000000000000..c1f1cafa30f3 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/dynevent/enable_disable_tprobe.tc @@ -0,0 +1,40 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Generic dynamic event - enable/disable tracepoint probe events +# requires: dynamic_events "t[:[<group>/][<event>]] <tracepoint> [<args>]":README + +echo 0 > events/enable +echo > dynamic_events + +TRACEPOINT=sched_switch +ENABLEFILE=events/tracepoints/myprobe/enable + +:;: "Add tracepoint event on $TRACEPOINT" ;: + +echo "t:myprobe ${TRACEPOINT}" >> dynamic_events + +:;: "Check enable/disable to ensure it works" ;: + +echo 1 > $ENABLEFILE + +grep -q $TRACEPOINT trace + +echo 0 > $ENABLEFILE + +echo > trace + +! 
grep -q $TRACEPOINT trace + +:;: "Repeat enable/disable to ensure it works" ;: + +echo 1 > $ENABLEFILE + +grep -q $TRACEPOINT trace + +echo 0 > $ENABLEFILE + +echo > trace + +! grep -q $TRACEPOINT trace + +exit 0 diff --git a/tools/testing/selftests/ftrace/test.d/functions b/tools/testing/selftests/ftrace/test.d/functions index a1052bf460fc..e8e718139294 100644 --- a/tools/testing/selftests/ftrace/test.d/functions +++ b/tools/testing/selftests/ftrace/test.d/functions @@ -104,7 +104,7 @@ clear_dynamic_events() { # reset all current dynamic events done } -initialize_ftrace() { # Reset ftrace to initial-state +initialize_system() { # Reset ftrace to initial-state # As the initial state, ftrace will be set to nop tracer, # no events, no triggers, no filters, no function filters, # no probes, and tracing on. @@ -134,8 +134,8 @@ initialize_ftrace() { # Reset ftrace to initial-state enable_tracing } -finish_ftrace() { - initialize_ftrace +finish_system() { + initialize_system # And recover it to default. 
[ -f options/pause-on-trace ] && echo 0 > options/pause-on-trace } diff --git a/tools/testing/selftests/hid/tests/test_tablet.py b/tools/testing/selftests/hid/tests/test_tablet.py index 50d5699812bb..5b9abb616db4 100644 --- a/tools/testing/selftests/hid/tests/test_tablet.py +++ b/tools/testing/selftests/hid/tests/test_tablet.py @@ -452,6 +452,7 @@ class Pen(object): def __init__(self, x, y): self.x = x self.y = y + self.distance = -10 self.tipswitch = False self.tippressure = 15 self.azimuth = 0 @@ -473,6 +474,7 @@ class Pen(object): for i in [ "x", "y", + "distance", "tippressure", "azimuth", "width", @@ -554,6 +556,7 @@ class PenDigitizer(base.UHIDTestDevice): pen.tipswitch = False pen.tippressure = 0 pen.azimuth = 0 + pen.distance = 0 pen.inrange = False pen.width = 0 pen.height = 0 @@ -868,6 +871,29 @@ class BaseTest: state machine.""" self._test_states(state_list, scribble, allow_intermediate_states=True) + @pytest.mark.skip_if_uhdev( + lambda uhdev: "Z" not in uhdev.fields, + "Device not compatible, missing Z usage", + ) + @pytest.mark.parametrize("scribble", [True, False], ids=["scribble", "static"]) + @pytest.mark.parametrize( + "state_list", + [pytest.param(v, id=k) for k, v in PenState.legal_transitions().items()], + ) + def test_z_reported_as_distance(self, state_list, scribble): + """Verify stylus Z values are reported as ABS_DISTANCE.""" + self._test_states(state_list, scribble, allow_intermediate_states=False) + + uhdev = self.uhdev + evdev = uhdev.get_evdev() + p = Pen(0, 0) + uhdev.move_to(p, PenState.PEN_IS_OUT_OF_RANGE, None) + p = Pen(100, 200) + uhdev.move_to(p, PenState.PEN_IS_IN_RANGE, None) + p.distance = -1 + events = self.post(uhdev, p, None) + assert evdev.value[libevdev.EV_ABS.ABS_DISTANCE] == -1 + class GXTP_pen(PenDigitizer): def event(self, pen, test_button): @@ -1292,6 +1318,35 @@ class Huion_Kamvas_Pro_19_256c_006b(PenDigitizer): return rs +class Wacom_2d1f_014b(PenDigitizer): + """ + Pen that reports distance values with HID_GD_Z 
usage. + """ + def __init__( + self, + name, + rdesc_str=None, + rdesc=None, + application="Pen", + physical="Stylus", + input_info=(BusType.USB, 0x2D1F, 0x014B), + evdev_name_suffix=None, + ): + super().__init__( + name, rdesc_str, rdesc, application, physical, input_info, evdev_name_suffix + ) + + def match_evdev_rule(self, application, evdev): + # there are 2 nodes created by the device, only one matters + return evdev.name.endswith("Stylus") + + def event(self, pen, test_button): + # this device reports the distance through Z + pen.z = pen.distance + + return super().event(pen, test_button) + + ################################################################################ # # Windows 7 compatible devices @@ -1504,3 +1559,19 @@ class TestHuion_Kamvas_Pro_19_256c_006b(BaseTest.TestTablet): rdesc="05 0d 09 02 a1 01 85 0a 09 20 a1 01 09 42 09 44 09 43 09 3c 09 45 15 00 25 01 75 01 95 06 81 02 09 32 75 01 95 01 81 02 81 03 05 01 09 30 09 31 55 0d 65 33 26 ff 7f 35 00 46 00 08 75 10 95 02 81 02 05 0d 09 30 26 ff 3f 75 10 95 01 81 02 09 3d 09 3e 15 a6 25 5a 75 08 95 02 81 02 c0 c0 05 0d 09 04 a1 01 85 04 09 22 a1 02 05 0d 95 01 75 06 09 51 15 00 25 3f 81 02 09 42 25 01 75 01 95 01 81 02 75 01 95 01 81 03 05 01 75 10 55 0e 65 11 09 30 26 ff 7f 35 00 46 15 0c 81 42 09 31 26 ff 7f 46 cb 06 81 42 05 0d 09 30 26 ff 1f 75 10 95 01 81 02 c0 05 0d 09 22 a1 02 05 0d 95 01 75 06 09 51 15 00 25 3f 81 02 09 42 25 01 75 01 95 01 81 02 75 01 95 01 81 03 05 01 75 10 55 0e 65 11 09 30 26 ff 7f 35 00 46 15 0c 81 42 09 31 26 ff 7f 46 cb 06 81 42 05 0d 09 30 26 ff 1f 75 10 95 01 81 02 c0 05 0d 09 56 55 00 65 00 27 ff ff ff 7f 95 01 75 20 81 02 09 54 25 7f 95 01 75 08 81 02 75 08 95 08 81 03 85 05 09 55 25 0a 75 08 95 01 b1 02 06 00 ff 09 c5 85 06 15 00 26 ff 00 75 08 96 00 01 b1 02 c0", input_info=(BusType.USB, 0x256C, 0x006B), ) + + +################################################################################ +# +# Devices Reporting Distance +# 
+################################################################################ + + +class TestWacom_2d1f_014b(BaseTest.TestTablet): + def create_device(self): + return Wacom_2d1f_014b( + "uhid test Wacom 2d1f_014b", + rdesc="05 0d 09 02 a1 01 85 02 09 20 a1 00 09 42 09 44 09 45 09 3c 09 5a 09 32 15 00 25 01 75 01 95 06 81 02 95 02 81 03 05 01 09 30 26 88 3e 46 88 3e 65 11 55 0d 75 10 95 01 81 02 09 31 26 60 53 46 60 53 81 02 05 0d 09 30 26 ff 0f 45 00 65 00 55 00 81 02 06 00 ff 09 04 75 08 26 ff 00 46 ff 00 65 11 55 0e 81 02 05 0d 09 3d 75 10 16 d8 dc 26 28 23 36 d8 dc 46 28 23 65 14 81 02 09 3e 81 02 05 01 09 32 16 01 ff 25 00 36 01 ff 45 00 65 11 81 02 05 0d 09 56 15 00 27 ff ff 00 00 35 00 47 ff ff 00 00 66 01 10 55 0c 81 02 45 00 65 00 55 00 c0 09 00 75 08 26 ff 00 b1 12 85 03 09 00 95 12 b1 12 85 05 09 00 95 04 b1 02 85 06 09 00 95 24 b1 02 85 16 09 00 15 00 26 ff 00 95 06 b1 02 85 17 09 00 95 0c b1 02 85 19 09 00 95 01 b1 02 85 0a 09 00 75 08 95 01 15 10 26 ff 00 b1 02 85 1e 09 00 95 10 b1 02 c0 06 00 ff 09 00 a1 01 85 09 05 0d 09 20 a1 00 09 00 15 00 26 ff 00 75 08 95 10 81 02 c0 09 00 95 03 b1 12 c0 06 00 ff 09 02 a1 01 85 07 09 00 96 09 01 b1 02 85 08 09 00 95 03 81 02 09 00 b1 02 85 0e 09 00 96 0a 01 b1 02 c0 05 0d 09 02 a1 01 85 1a 09 20 a1 00 09 42 09 44 09 45 09 3c 09 5a 09 32 15 00 25 01 75 01 95 06 81 02 09 38 25 03 75 02 95 01 81 02 05 01 09 30 26 88 3e 46 88 3e 65 11 55 0d 75 10 95 01 81 02 09 31 26 60 53 46 60 53 81 02 05 0d 09 30 26 ff 0f 46 b0 0f 66 11 e1 55 02 81 02 06 00 ff 09 04 75 08 26 ff 00 46 ff 00 65 11 55 0e 81 02 05 0d 09 3d 75 10 16 d8 dc 26 28 23 36 d8 dc 46 28 23 65 14 81 02 09 3e 81 02 05 01 09 32 16 01 ff 25 00 36 01 ff 45 00 65 11 81 02 05 0d 09 56 15 00 27 ff ff 00 00 35 00 47 ff ff 00 00 66 01 10 55 0c 81 02 45 00 65 00 55 00 c0 c0 06 00 ff 09 00 a1 01 85 1b 05 0d 09 20 a1 00 09 00 26 ff 00 75 08 95 10 81 02 c0 c0", + input_info=(BusType.USB, 0x2D1F, 0x014B), + ) diff --git a/tools/testing/selftests/iommu/iommufd.c 
b/tools/testing/selftests/iommu/iommufd.c index bb4d33dde3c8..10e051b6f592 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -13,9 +13,6 @@ static unsigned long HUGEPAGE_SIZE; -#define MOCK_PAGE_SIZE (PAGE_SIZE / 2) -#define MOCK_HUGE_PAGE_SIZE (512 * MOCK_PAGE_SIZE) - static unsigned long get_huge_page_size(void) { char buf[80]; @@ -1574,6 +1571,49 @@ TEST_F(iommufd_ioas, copy_sweep) test_ioctl_destroy(dst_ioas_id); } +TEST_F(iommufd_ioas, dmabuf_simple) +{ + size_t buf_size = PAGE_SIZE*4; + __u64 iova; + int dfd; + + test_cmd_get_dmabuf(buf_size, &dfd); + test_err_ioctl_ioas_map_file(EINVAL, dfd, 0, 0, &iova); + test_err_ioctl_ioas_map_file(EINVAL, dfd, buf_size, buf_size, &iova); + test_err_ioctl_ioas_map_file(EINVAL, dfd, 0, buf_size + 1, &iova); + test_ioctl_ioas_map_file(dfd, 0, buf_size, &iova); + + close(dfd); +} + +TEST_F(iommufd_ioas, dmabuf_revoke) +{ + size_t buf_size = PAGE_SIZE*4; + __u32 hwpt_id; + __u64 iova; + __u64 iova2; + int dfd; + + test_cmd_get_dmabuf(buf_size, &dfd); + test_ioctl_ioas_map_file(dfd, 0, buf_size, &iova); + test_cmd_revoke_dmabuf(dfd, true); + + if (variant->mock_domains) + test_cmd_hwpt_alloc(self->device_id, self->ioas_id, 0, + &hwpt_id); + + test_err_ioctl_ioas_map_file(ENODEV, dfd, 0, buf_size, &iova2); + + test_cmd_revoke_dmabuf(dfd, false); + test_ioctl_ioas_map_file(dfd, 0, buf_size, &iova2); + + /* Restore the iova back */ + test_ioctl_ioas_unmap(iova, buf_size); + test_ioctl_ioas_map_fixed_file(dfd, 0, buf_size, iova); + + close(dfd); +} + FIXTURE(iommufd_mock_domain) { int fd; @@ -2058,6 +2098,12 @@ FIXTURE_VARIANT(iommufd_dirty_tracking) FIXTURE_SETUP(iommufd_dirty_tracking) { + struct iommu_option cmd = { + .size = sizeof(cmd), + .option_id = IOMMU_OPTION_HUGE_PAGES, + .op = IOMMU_OPTION_OP_SET, + .val64 = 0, + }; size_t mmap_buffer_size; unsigned long size; int mmap_flags; @@ -2066,7 +2112,7 @@ FIXTURE_SETUP(iommufd_dirty_tracking) if (variant->buffer_size < 
MOCK_PAGE_SIZE) { SKIP(return, - "Skipping buffer_size=%lu, less than MOCK_PAGE_SIZE=%lu", + "Skipping buffer_size=%lu, less than MOCK_PAGE_SIZE=%u", variant->buffer_size, MOCK_PAGE_SIZE); } @@ -2114,16 +2160,18 @@ FIXTURE_SETUP(iommufd_dirty_tracking) assert((uintptr_t)self->bitmap % PAGE_SIZE == 0); test_ioctl_ioas_alloc(&self->ioas_id); - /* Enable 1M mock IOMMU hugepages */ - if (variant->hugepages) { - test_cmd_mock_domain_flags(self->ioas_id, - MOCK_FLAGS_DEVICE_HUGE_IOVA, - &self->stdev_id, &self->hwpt_id, - &self->idev_id); - } else { - test_cmd_mock_domain(self->ioas_id, &self->stdev_id, - &self->hwpt_id, &self->idev_id); - } + + /* + * For dirty testing it is important that the page size fed into + * the iommu page tables matches the size the dirty logic + * expects, or set_dirty can touch too much stuff. + */ + cmd.object_id = self->ioas_id; + if (!variant->hugepages) + ASSERT_EQ(0, ioctl(self->fd, IOMMU_OPTION, &cmd)); + + test_cmd_mock_domain(self->ioas_id, &self->stdev_id, &self->hwpt_id, + &self->idev_id); } FIXTURE_TEARDOWN(iommufd_dirty_tracking) @@ -2248,18 +2296,23 @@ TEST_F(iommufd_dirty_tracking, device_dirty_capability) TEST_F(iommufd_dirty_tracking, get_dirty_bitmap) { uint32_t page_size = MOCK_PAGE_SIZE; + uint32_t ioas_id = self->ioas_id; uint32_t hwpt_id; - uint32_t ioas_id; if (variant->hugepages) page_size = MOCK_HUGE_PAGE_SIZE; - test_ioctl_ioas_alloc(&ioas_id); test_ioctl_ioas_map_fixed_id(ioas_id, self->buffer, variant->buffer_size, MOCK_APERTURE_START); - test_cmd_hwpt_alloc(self->idev_id, ioas_id, - IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id); + if (variant->hugepages) + test_cmd_hwpt_alloc_iommupt(self->idev_id, ioas_id, + IOMMU_HWPT_ALLOC_DIRTY_TRACKING, + MOCK_IOMMUPT_HUGE, &hwpt_id); + else + test_cmd_hwpt_alloc_iommupt(self->idev_id, ioas_id, + IOMMU_HWPT_ALLOC_DIRTY_TRACKING, + MOCK_IOMMUPT_DEFAULT, &hwpt_id); test_cmd_set_dirty_tracking(hwpt_id, true); @@ -2285,18 +2338,24 @@ TEST_F(iommufd_dirty_tracking, get_dirty_bitmap) 
TEST_F(iommufd_dirty_tracking, get_dirty_bitmap_no_clear) { uint32_t page_size = MOCK_PAGE_SIZE; + uint32_t ioas_id = self->ioas_id; uint32_t hwpt_id; - uint32_t ioas_id; if (variant->hugepages) page_size = MOCK_HUGE_PAGE_SIZE; - test_ioctl_ioas_alloc(&ioas_id); test_ioctl_ioas_map_fixed_id(ioas_id, self->buffer, variant->buffer_size, MOCK_APERTURE_START); - test_cmd_hwpt_alloc(self->idev_id, ioas_id, - IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id); + + if (variant->hugepages) + test_cmd_hwpt_alloc_iommupt(self->idev_id, ioas_id, + IOMMU_HWPT_ALLOC_DIRTY_TRACKING, + MOCK_IOMMUPT_HUGE, &hwpt_id); + else + test_cmd_hwpt_alloc_iommupt(self->idev_id, ioas_id, + IOMMU_HWPT_ALLOC_DIRTY_TRACKING, + MOCK_IOMMUPT_DEFAULT, &hwpt_id); test_cmd_set_dirty_tracking(hwpt_id, true); diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 9f472c20c190..0a0ff6f7926d 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -215,6 +215,18 @@ static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id, __u32 ft_i ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, 0, flags, \ hwpt_id, IOMMU_HWPT_DATA_NONE, NULL, \ 0)) +#define test_cmd_hwpt_alloc_iommupt(device_id, pt_id, flags, iommupt_type, \ + hwpt_id) \ + ({ \ + struct iommu_hwpt_selftest user_cfg = { \ + .pagetable_type = iommupt_type \ + }; \ + \ + ASSERT_EQ(0, _test_cmd_hwpt_alloc( \ + self->fd, device_id, pt_id, 0, flags, \ + hwpt_id, IOMMU_HWPT_DATA_SELFTEST, \ + &user_cfg, sizeof(user_cfg))); \ + }) #define test_err_hwpt_alloc(_errno, device_id, pt_id, flags, hwpt_id) \ EXPECT_ERRNO(_errno, _test_cmd_hwpt_alloc( \ self->fd, device_id, pt_id, 0, flags, \ @@ -548,6 +560,39 @@ static int _test_cmd_destroy_access_pages(int fd, unsigned int access_id, EXPECT_ERRNO(_errno, _test_cmd_destroy_access_pages( \ self->fd, access_id, access_pages_id)) +static int _test_cmd_get_dmabuf(int fd, size_t len, int 
*out_fd) +{ + struct iommu_test_cmd cmd = { + .size = sizeof(cmd), + .op = IOMMU_TEST_OP_DMABUF_GET, + .dmabuf_get = { .length = len, .open_flags = O_CLOEXEC }, + }; + + *out_fd = ioctl(fd, IOMMU_TEST_CMD, &cmd); + if (*out_fd < 0) + return -1; + return 0; +} +#define test_cmd_get_dmabuf(len, out_fd) \ + ASSERT_EQ(0, _test_cmd_get_dmabuf(self->fd, len, out_fd)) + +static int _test_cmd_revoke_dmabuf(int fd, int dmabuf_fd, bool revoked) +{ + struct iommu_test_cmd cmd = { + .size = sizeof(cmd), + .op = IOMMU_TEST_OP_DMABUF_REVOKE, + .dmabuf_revoke = { .dmabuf_fd = dmabuf_fd, .revoked = revoked }, + }; + int ret; + + ret = ioctl(fd, IOMMU_TEST_CMD, &cmd); + if (ret < 0) + return -1; + return 0; +} +#define test_cmd_revoke_dmabuf(dmabuf_fd, revoke) \ + ASSERT_EQ(0, _test_cmd_revoke_dmabuf(self->fd, dmabuf_fd, revoke)) + static int _test_ioctl_destroy(int fd, unsigned int id) { struct iommu_destroy cmd = { @@ -718,6 +763,17 @@ static int _test_ioctl_ioas_map_file(int fd, unsigned int ioas_id, int mfd, self->fd, ioas_id, mfd, start, length, iova_p, \ IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE)) +#define test_ioctl_ioas_map_fixed_file(mfd, start, length, iova) \ + ({ \ + __u64 __iova = iova; \ + ASSERT_EQ(0, _test_ioctl_ioas_map_file( \ + self->fd, self->ioas_id, mfd, start, \ + length, &__iova, \ + IOMMU_IOAS_MAP_FIXED_IOVA | \ + IOMMU_IOAS_MAP_WRITEABLE | \ + IOMMU_IOAS_MAP_READABLE)); \ + }) + static int _test_ioctl_set_temp_memory_limit(int fd, unsigned int limit) { struct iommu_test_cmd memlimit_cmd = { diff --git a/tools/testing/selftests/kselftest/runner.sh b/tools/testing/selftests/kselftest/runner.sh index 2c3c58e65a41..3a62039fa621 100644 --- a/tools/testing/selftests/kselftest/runner.sh +++ b/tools/testing/selftests/kselftest/runner.sh @@ -44,6 +44,12 @@ tap_timeout() fi } +report_failure() +{ + echo "not ok $*" + echo "$*" >> "$kselftest_failures_file" +} + run_one() { DIR="$1" @@ -105,7 +111,7 @@ run_one() echo "# $TEST_HDR_MSG" if [ ! 
-e "$TEST" ]; then echo "# Warning: file $TEST is missing!" - echo "not ok $test_num $TEST_HDR_MSG" + report_failure "$test_num $TEST_HDR_MSG" else if [ -x /usr/bin/stdbuf ]; then stdbuf="/usr/bin/stdbuf --output=L " @@ -123,7 +129,7 @@ run_one() interpreter=$(head -n 1 "$TEST" | cut -c 3-) cmd="$stdbuf $interpreter ./$BASENAME_TEST" else - echo "not ok $test_num $TEST_HDR_MSG" + report_failure "$test_num $TEST_HDR_MSG" return fi fi @@ -137,9 +143,9 @@ run_one() echo "ok $test_num $TEST_HDR_MSG # SKIP" elif [ $rc -eq $timeout_rc ]; then \ echo "#" - echo "not ok $test_num $TEST_HDR_MSG # TIMEOUT $kselftest_timeout seconds" + report_failure "$test_num $TEST_HDR_MSG # TIMEOUT $kselftest_timeout seconds" else - echo "not ok $test_num $TEST_HDR_MSG # exit=$rc" + report_failure "$test_num $TEST_HDR_MSG # exit=$rc" fi) cd - >/dev/null fi diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index d9fffe06d3ea..f2b223072b62 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -6,7 +6,7 @@ ARCH ?= $(SUBARCH) ifeq ($(ARCH),$(filter $(ARCH),arm64 s390 riscv x86 x86_64 loongarch)) # Top-level selftests allows ARCH=x86_64 :-( ifeq ($(ARCH),x86_64) - ARCH := x86 + override ARCH := x86 endif include Makefile.kvm else diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index 148d427ff24b..ba5c2b643efa 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -88,8 +88,12 @@ TEST_GEN_PROGS_x86 += x86/kvm_pv_test TEST_GEN_PROGS_x86 += x86/kvm_buslock_test TEST_GEN_PROGS_x86 += x86/monitor_mwait_test TEST_GEN_PROGS_x86 += x86/msrs_test +TEST_GEN_PROGS_x86 += x86/nested_close_kvm_test TEST_GEN_PROGS_x86 += x86/nested_emulation_test TEST_GEN_PROGS_x86 += x86/nested_exceptions_test +TEST_GEN_PROGS_x86 += x86/nested_invalid_cr3_test +TEST_GEN_PROGS_x86 += x86/nested_tsc_adjust_test +TEST_GEN_PROGS_x86 += 
x86/nested_tsc_scaling_test TEST_GEN_PROGS_x86 += x86/platform_info_test TEST_GEN_PROGS_x86 += x86/pmu_counters_test TEST_GEN_PROGS_x86 += x86/pmu_event_filter_test @@ -111,14 +115,12 @@ TEST_GEN_PROGS_x86 += x86/ucna_injection_test TEST_GEN_PROGS_x86 += x86/userspace_io_test TEST_GEN_PROGS_x86 += x86/userspace_msr_exit_test TEST_GEN_PROGS_x86 += x86/vmx_apic_access_test -TEST_GEN_PROGS_x86 += x86/vmx_close_while_nested_test TEST_GEN_PROGS_x86 += x86/vmx_dirty_log_test TEST_GEN_PROGS_x86 += x86/vmx_exception_with_invalid_guest_state TEST_GEN_PROGS_x86 += x86/vmx_msrs_test TEST_GEN_PROGS_x86 += x86/vmx_invalid_nested_guest_state +TEST_GEN_PROGS_x86 += x86/vmx_nested_la57_state_test TEST_GEN_PROGS_x86 += x86/vmx_set_nested_state_test -TEST_GEN_PROGS_x86 += x86/vmx_tsc_adjust_test -TEST_GEN_PROGS_x86 += x86/vmx_nested_tsc_scaling_test TEST_GEN_PROGS_x86 += x86/apic_bus_clock_test TEST_GEN_PROGS_x86 += x86/xapic_ipi_test TEST_GEN_PROGS_x86 += x86/xapic_state_test @@ -156,6 +158,7 @@ TEST_GEN_PROGS_EXTENDED_x86 += x86/nx_huge_pages_test TEST_GEN_PROGS_arm64 = $(TEST_GEN_PROGS_COMMON) TEST_GEN_PROGS_arm64 += arm64/aarch32_id_regs TEST_GEN_PROGS_arm64 += arm64/arch_timer_edge_cases +TEST_GEN_PROGS_arm64 += arm64/at TEST_GEN_PROGS_arm64 += arm64/debug-exceptions TEST_GEN_PROGS_arm64 += arm64/hello_el2 TEST_GEN_PROGS_arm64 += arm64/host_sve @@ -163,6 +166,7 @@ TEST_GEN_PROGS_arm64 += arm64/hypercalls TEST_GEN_PROGS_arm64 += arm64/external_aborts TEST_GEN_PROGS_arm64 += arm64/page_fault_test TEST_GEN_PROGS_arm64 += arm64/psci_test +TEST_GEN_PROGS_arm64 += arm64/sea_to_user TEST_GEN_PROGS_arm64 += arm64/set_id_regs TEST_GEN_PROGS_arm64 += arm64/smccc_filter TEST_GEN_PROGS_arm64 += arm64/vcpu_width_config @@ -194,6 +198,7 @@ TEST_GEN_PROGS_s390 += s390/debug_test TEST_GEN_PROGS_s390 += s390/cpumodel_subfuncs_test TEST_GEN_PROGS_s390 += s390/shared_zeropage_test TEST_GEN_PROGS_s390 += s390/ucontrol_test +TEST_GEN_PROGS_s390 += s390/user_operexec TEST_GEN_PROGS_s390 += rseq_test 
TEST_GEN_PROGS_riscv = $(TEST_GEN_PROGS_COMMON) @@ -210,6 +215,7 @@ TEST_GEN_PROGS_riscv += mmu_stress_test TEST_GEN_PROGS_riscv += rseq_test TEST_GEN_PROGS_riscv += steal_time +TEST_GEN_PROGS_loongarch = arch_timer TEST_GEN_PROGS_loongarch += coalesced_io_test TEST_GEN_PROGS_loongarch += demand_paging_test TEST_GEN_PROGS_loongarch += dirty_log_perf_test diff --git a/tools/testing/selftests/kvm/arm64/at.c b/tools/testing/selftests/kvm/arm64/at.c new file mode 100644 index 000000000000..c8ee6f520734 --- /dev/null +++ b/tools/testing/selftests/kvm/arm64/at.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * at - Test for KVM's AT emulation in the EL2&0 and EL1&0 translation regimes. + */ +#include "kvm_util.h" +#include "processor.h" +#include "test_util.h" +#include "ucall.h" + +#include <asm/sysreg.h> + +#define TEST_ADDR 0x80000000 + +enum { + CLEAR_ACCESS_FLAG, + TEST_ACCESS_FLAG, +}; + +static u64 *ptep_hva; + +#define copy_el2_to_el1(reg) \ + write_sysreg_s(read_sysreg_s(SYS_##reg##_EL1), SYS_##reg##_EL12) + +/* Yes, this is an ugly hack */ +#define __at(op, addr) write_sysreg_s(addr, op) + +#define test_at_insn(op, expect_fault) \ +do { \ + u64 par, fsc; \ + bool fault; \ + \ + GUEST_SYNC(CLEAR_ACCESS_FLAG); \ + \ + __at(OP_AT_##op, TEST_ADDR); \ + isb(); \ + par = read_sysreg(par_el1); \ + \ + fault = par & SYS_PAR_EL1_F; \ + fsc = FIELD_GET(SYS_PAR_EL1_FST, par); \ + \ + __GUEST_ASSERT((expect_fault) == fault, \ + "AT "#op": %sexpected fault (par: %lx)1", \ + (expect_fault) ? 
"" : "un", par); \ + if ((expect_fault)) { \ + __GUEST_ASSERT(fsc == ESR_ELx_FSC_ACCESS_L(3), \ + "AT "#op": expected access flag fault (par: %lx)", \ + par); \ + } else { \ + GUEST_ASSERT_EQ(FIELD_GET(SYS_PAR_EL1_ATTR, par), MAIR_ATTR_NORMAL); \ + GUEST_ASSERT_EQ(FIELD_GET(SYS_PAR_EL1_SH, par), PTE_SHARED >> 8); \ + GUEST_ASSERT_EQ(par & SYS_PAR_EL1_PA, TEST_ADDR); \ + GUEST_SYNC(TEST_ACCESS_FLAG); \ + } \ +} while (0) + +static void test_at(bool expect_fault) +{ + test_at_insn(S1E2R, expect_fault); + test_at_insn(S1E2W, expect_fault); + + /* Reuse the stage-1 MMU context from EL2 at EL1 */ + copy_el2_to_el1(SCTLR); + copy_el2_to_el1(MAIR); + copy_el2_to_el1(TCR); + copy_el2_to_el1(TTBR0); + copy_el2_to_el1(TTBR1); + + /* Disable stage-2 translation and enter a non-host context */ + write_sysreg(0, vtcr_el2); + write_sysreg(0, vttbr_el2); + sysreg_clear_set(hcr_el2, HCR_EL2_TGE | HCR_EL2_VM, 0); + isb(); + + test_at_insn(S1E1R, expect_fault); + test_at_insn(S1E1W, expect_fault); +} + +static void guest_code(void) +{ + sysreg_clear_set(tcr_el1, TCR_HA, 0); + isb(); + + test_at(true); + + if (!SYS_FIELD_GET(ID_AA64MMFR1_EL1, HAFDBS, read_sysreg(id_aa64mmfr1_el1))) + GUEST_DONE(); + + /* + * KVM's software PTW makes the implementation choice that the AT + * instruction sets the access flag. + */ + sysreg_clear_set(tcr_el1, 0, TCR_HA); + isb(); + test_at(false); + + GUEST_DONE(); +} + +static void handle_sync(struct kvm_vcpu *vcpu, struct ucall *uc) +{ + switch (uc->args[1]) { + case CLEAR_ACCESS_FLAG: + /* + * Delete + reinstall the memslot to invalidate stage-2 + * mappings of the stage-1 page tables, forcing KVM to + * use the 'slow' AT emulation path. + * + * This and clearing the access flag from host userspace + * ensures that the access flag cannot be set speculatively + * and is reliably cleared at the time of the AT instruction. 
+ */ + clear_bit(__ffs(PTE_AF), ptep_hva); + vm_mem_region_reload(vcpu->vm, vcpu->vm->memslots[MEM_REGION_PT]); + break; + case TEST_ACCESS_FLAG: + TEST_ASSERT(test_bit(__ffs(PTE_AF), ptep_hva), + "Expected access flag to be set (desc: %lu)", *ptep_hva); + break; + default: + TEST_FAIL("Unexpected SYNC arg: %lu", uc->args[1]); + } +} + +static void run_test(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + while (true) { + vcpu_run(vcpu); + switch (get_ucall(vcpu, &uc)) { + case UCALL_DONE: + return; + case UCALL_SYNC: + handle_sync(vcpu, &uc); + continue; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + return; + default: + TEST_FAIL("Unexpected ucall: %lu", uc.cmd); + } + } +} + +int main(void) +{ + struct kvm_vcpu_init init; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + TEST_REQUIRE(kvm_check_cap(KVM_CAP_ARM_EL2)); + + vm = vm_create(1); + + kvm_get_default_vcpu_target(vm, &init); + init.features[0] |= BIT(KVM_ARM_VCPU_HAS_EL2); + vcpu = aarch64_vcpu_add(vm, 0, &init, guest_code); + kvm_arch_vm_finalize_vcpus(vm); + + virt_map(vm, TEST_ADDR, TEST_ADDR, 1); + ptep_hva = virt_get_pte_hva_at_level(vm, TEST_ADDR, 3); + run_test(vcpu); + + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/arm64/sea_to_user.c b/tools/testing/selftests/kvm/arm64/sea_to_user.c new file mode 100644 index 000000000000..573dd790aeb8 --- /dev/null +++ b/tools/testing/selftests/kvm/arm64/sea_to_user.c @@ -0,0 +1,331 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Test KVM returns to userspace with KVM_EXIT_ARM_SEA if host APEI fails + * to handle SEA and userspace has opt-ed in KVM_CAP_ARM_SEA_TO_USER. + * + * After reaching userspace with expected arm_sea info, also test userspace + * injecting a synchronous external data abort into the guest. + * + * This test utilizes EINJ to generate a REAL synchronous external data + * abort by consuming a recoverable uncorrectable memory error. 
Therefore + * the device under test must support EINJ in both firmware and host kernel, + * including the notrigger feature. Otherwise the test will be skipped. + * The under-test platform's APEI should be unable to claim SEA. Otherwise + * the test will also be skipped. + */ + +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "guest_modes.h" + +#define PAGE_PRESENT (1ULL << 63) +#define PAGE_PHYSICAL 0x007fffffffffffffULL +#define PAGE_ADDR_MASK (~(0xfffULL)) + +/* Group ISV and ISS[23:14]. */ +#define ESR_ELx_INST_SYNDROME ((ESR_ELx_ISV) | (ESR_ELx_SAS) | \ + (ESR_ELx_SSE) | (ESR_ELx_SRT_MASK) | \ + (ESR_ELx_SF) | (ESR_ELx_AR)) + +#define EINJ_ETYPE "/sys/kernel/debug/apei/einj/error_type" +#define EINJ_ADDR "/sys/kernel/debug/apei/einj/param1" +#define EINJ_MASK "/sys/kernel/debug/apei/einj/param2" +#define EINJ_FLAGS "/sys/kernel/debug/apei/einj/flags" +#define EINJ_NOTRIGGER "/sys/kernel/debug/apei/einj/notrigger" +#define EINJ_DOIT "/sys/kernel/debug/apei/einj/error_inject" +/* Memory Uncorrectable non-fatal. */ +#define ERROR_TYPE_MEMORY_UER 0x10 +/* Memory address and mask valid (param1 and param2). */ +#define MASK_MEMORY_UER 0b10 + +/* Guest virtual address region = [2G, 3G). */ +#define START_GVA 0x80000000UL +#define VM_MEM_SIZE 0x40000000UL +/* Note: EINJ_OFFSET must < VM_MEM_SIZE. 
*/ +#define EINJ_OFFSET 0x01234badUL +#define EINJ_GVA ((START_GVA) + (EINJ_OFFSET)) + +static vm_paddr_t einj_gpa; +static void *einj_hva; +static uint64_t einj_hpa; +static bool far_invalid; + +static uint64_t translate_to_host_paddr(unsigned long vaddr) +{ + uint64_t pinfo; + int64_t offset = vaddr / getpagesize() * sizeof(pinfo); + int fd; + uint64_t page_addr; + uint64_t paddr; + + fd = open("/proc/self/pagemap", O_RDONLY); + if (fd < 0) + ksft_exit_fail_perror("Failed to open /proc/self/pagemap"); + if (pread(fd, &pinfo, sizeof(pinfo), offset) != sizeof(pinfo)) { + close(fd); + ksft_exit_fail_perror("Failed to read /proc/self/pagemap"); + } + + close(fd); + + if ((pinfo & PAGE_PRESENT) == 0) + ksft_exit_fail_perror("Page not present"); + + page_addr = (pinfo & PAGE_PHYSICAL) << MIN_PAGE_SHIFT; + paddr = page_addr + (vaddr & (getpagesize() - 1)); + return paddr; +} + +static void write_einj_entry(const char *einj_path, uint64_t val) +{ + char cmd[256] = {0}; + FILE *cmdfile = NULL; + + sprintf(cmd, "echo %#lx > %s", val, einj_path); + cmdfile = popen(cmd, "r"); + + if (pclose(cmdfile) == 0) + ksft_print_msg("echo %#lx > %s - done\n", val, einj_path); + else + ksft_exit_fail_perror("Failed to write EINJ entry"); +} + +static void inject_uer(uint64_t paddr) +{ + if (access("/sys/firmware/acpi/tables/EINJ", R_OK) == -1) + ksft_test_result_skip("EINJ table no available in firmware"); + + if (access(EINJ_ETYPE, R_OK | W_OK) == -1) + ksft_test_result_skip("EINJ module probably not loaded?"); + + write_einj_entry(EINJ_ETYPE, ERROR_TYPE_MEMORY_UER); + write_einj_entry(EINJ_FLAGS, MASK_MEMORY_UER); + write_einj_entry(EINJ_ADDR, paddr); + write_einj_entry(EINJ_MASK, ~0x0UL); + write_einj_entry(EINJ_NOTRIGGER, 1); + write_einj_entry(EINJ_DOIT, 1); +} + +/* + * When host APEI successfully claims the SEA caused by guest_code, kernel + * will send SIGBUS signal with BUS_MCEERR_AR to test thread. + * + * We set up this SIGBUS handler to skip the test for that case. 
+ */ +static void sigbus_signal_handler(int sig, siginfo_t *si, void *v) +{ + ksft_print_msg("SIGBUS (%d) received, dumping siginfo...\n", sig); + ksft_print_msg("si_signo=%d, si_errno=%d, si_code=%d, si_addr=%p\n", + si->si_signo, si->si_errno, si->si_code, si->si_addr); + if (si->si_code == BUS_MCEERR_AR) + ksft_test_result_skip("SEA is claimed by host APEI\n"); + else + ksft_test_result_fail("Exit with signal unhandled\n"); + + exit(0); +} + +static void setup_sigbus_handler(void) +{ + struct sigaction act; + + memset(&act, 0, sizeof(act)); + sigemptyset(&act.sa_mask); + act.sa_sigaction = sigbus_signal_handler; + act.sa_flags = SA_SIGINFO; + TEST_ASSERT(sigaction(SIGBUS, &act, NULL) == 0, + "Failed to setup SIGBUS handler"); +} + +static void guest_code(void) +{ + uint64_t guest_data; + + /* Consumes error will cause a SEA. */ + guest_data = *(uint64_t *)EINJ_GVA; + + GUEST_FAIL("Poison not protected by SEA: gva=%#lx, guest_data=%#lx\n", + EINJ_GVA, guest_data); +} + +static void expect_sea_handler(struct ex_regs *regs) +{ + u64 esr = read_sysreg(esr_el1); + u64 far = read_sysreg(far_el1); + bool expect_far_invalid = far_invalid; + + GUEST_PRINTF("Handling Guest SEA\n"); + GUEST_PRINTF("ESR_EL1=%#lx, FAR_EL1=%#lx\n", esr, far); + + GUEST_ASSERT_EQ(ESR_ELx_EC(esr), ESR_ELx_EC_DABT_CUR); + GUEST_ASSERT_EQ(esr & ESR_ELx_FSC_TYPE, ESR_ELx_FSC_EXTABT); + + if (expect_far_invalid) { + GUEST_ASSERT_EQ(esr & ESR_ELx_FnV, ESR_ELx_FnV); + GUEST_PRINTF("Guest observed garbage value in FAR\n"); + } else { + GUEST_ASSERT_EQ(esr & ESR_ELx_FnV, 0); + GUEST_ASSERT_EQ(far, EINJ_GVA); + } + + GUEST_DONE(); +} + +static void vcpu_inject_sea(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_events events = {}; + + events.exception.ext_dabt_pending = true; + vcpu_events_set(vcpu, &events); +} + +static void run_vm(struct kvm_vm *vm, struct kvm_vcpu *vcpu) +{ + struct ucall uc; + bool guest_done = false; + struct kvm_run *run = vcpu->run; + u64 esr; + + /* Resume the vCPU after error 
injection to consume the error. */ + vcpu_run(vcpu); + + ksft_print_msg("Dump kvm_run info about KVM_EXIT_%s\n", + exit_reason_str(run->exit_reason)); + ksft_print_msg("kvm_run.arm_sea: esr=%#llx, flags=%#llx\n", + run->arm_sea.esr, run->arm_sea.flags); + ksft_print_msg("kvm_run.arm_sea: gva=%#llx, gpa=%#llx\n", + run->arm_sea.gva, run->arm_sea.gpa); + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_ARM_SEA); + + esr = run->arm_sea.esr; + TEST_ASSERT_EQ(ESR_ELx_EC(esr), ESR_ELx_EC_DABT_LOW); + TEST_ASSERT_EQ(esr & ESR_ELx_FSC_TYPE, ESR_ELx_FSC_EXTABT); + TEST_ASSERT_EQ(ESR_ELx_ISS2(esr), 0); + TEST_ASSERT_EQ((esr & ESR_ELx_INST_SYNDROME), 0); + TEST_ASSERT_EQ(esr & ESR_ELx_VNCR, 0); + + if (!(esr & ESR_ELx_FnV)) { + ksft_print_msg("Expect gva to match given FnV bit is 0\n"); + TEST_ASSERT_EQ(run->arm_sea.gva, EINJ_GVA); + } + + if (run->arm_sea.flags & KVM_EXIT_ARM_SEA_FLAG_GPA_VALID) { + ksft_print_msg("Expect gpa to match given KVM_EXIT_ARM_SEA_FLAG_GPA_VALID is set\n"); + TEST_ASSERT_EQ(run->arm_sea.gpa, einj_gpa & PAGE_ADDR_MASK); + } + + far_invalid = esr & ESR_ELx_FnV; + + /* Inject a SEA into guest and expect handled in SEA handler. */ + vcpu_inject_sea(vcpu); + + /* Expect the guest to reach GUEST_DONE gracefully. 
*/ + do { + vcpu_run(vcpu); + switch (get_ucall(vcpu, &uc)) { + case UCALL_PRINTF: + ksft_print_msg("From guest: %s", uc.buffer); + break; + case UCALL_DONE: + ksft_print_msg("Guest done gracefully!\n"); + guest_done = 1; + break; + case UCALL_ABORT: + ksft_print_msg("Guest aborted!\n"); + guest_done = 1; + REPORT_GUEST_ASSERT(uc); + break; + default: + TEST_FAIL("Unexpected ucall: %lu\n", uc.cmd); + } + } while (!guest_done); +} + +static struct kvm_vm *vm_create_with_sea_handler(struct kvm_vcpu **vcpu) +{ + size_t backing_page_size; + size_t guest_page_size; + size_t alignment; + uint64_t num_guest_pages; + vm_paddr_t start_gpa; + enum vm_mem_backing_src_type src_type = VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB; + struct kvm_vm *vm; + + backing_page_size = get_backing_src_pagesz(src_type); + guest_page_size = vm_guest_mode_params[VM_MODE_DEFAULT].page_size; + alignment = max(backing_page_size, guest_page_size); + num_guest_pages = VM_MEM_SIZE / guest_page_size; + + vm = __vm_create_with_one_vcpu(vcpu, num_guest_pages, guest_code); + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(*vcpu); + + vm_install_sync_handler(vm, + /*vector=*/VECTOR_SYNC_CURRENT, + /*ec=*/ESR_ELx_EC_DABT_CUR, + /*handler=*/expect_sea_handler); + + start_gpa = (vm->max_gfn - num_guest_pages) * guest_page_size; + start_gpa = align_down(start_gpa, alignment); + + vm_userspace_mem_region_add( + /*vm=*/vm, + /*src_type=*/src_type, + /*guest_paddr=*/start_gpa, + /*slot=*/1, + /*npages=*/num_guest_pages, + /*flags=*/0); + + virt_map(vm, START_GVA, start_gpa, num_guest_pages); + + ksft_print_msg("Mapped %#lx pages: gva=%#lx to gpa=%#lx\n", + num_guest_pages, START_GVA, start_gpa); + return vm; +} + +static void vm_inject_memory_uer(struct kvm_vm *vm) +{ + uint64_t guest_data; + + einj_gpa = addr_gva2gpa(vm, EINJ_GVA); + einj_hva = addr_gva2hva(vm, EINJ_GVA); + + /* Populate certain data before injecting UER. 
*/ + *(uint64_t *)einj_hva = 0xBAADCAFE; + guest_data = *(uint64_t *)einj_hva; + ksft_print_msg("Before EINJect: data=%#lx\n", + guest_data); + + einj_hpa = translate_to_host_paddr((unsigned long)einj_hva); + + ksft_print_msg("EINJ_GVA=%#lx, einj_gpa=%#lx, einj_hva=%p, einj_hpa=%#lx\n", + EINJ_GVA, einj_gpa, einj_hva, einj_hpa); + + inject_uer(einj_hpa); + ksft_print_msg("Memory UER EINJected\n"); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_ARM_SEA_TO_USER)); + + setup_sigbus_handler(); + + vm = vm_create_with_sea_handler(&vcpu); + vm_enable_cap(vm, KVM_CAP_ARM_SEA_TO_USER, 0); + vm_inject_memory_uer(vm); + run_vm(vm, vcpu); + kvm_vm_free(vm); + + return 0; +} diff --git a/tools/testing/selftests/kvm/arm64/set_id_regs.c b/tools/testing/selftests/kvm/arm64/set_id_regs.c index 5e24f77868b5..c4815d365816 100644 --- a/tools/testing/selftests/kvm/arm64/set_id_regs.c +++ b/tools/testing/selftests/kvm/arm64/set_id_regs.c @@ -268,7 +268,9 @@ static void guest_code(void) /* Return a safe value to a given ftr_bits an ftr value */ uint64_t get_safe_value(const struct reg_ftr_bits *ftr_bits, uint64_t ftr) { - uint64_t ftr_max = GENMASK_ULL(ARM64_FEATURE_FIELD_BITS - 1, 0); + uint64_t ftr_max = ftr_bits->mask >> ftr_bits->shift; + + TEST_ASSERT(ftr_max > 1, "This test doesn't support single bit features"); if (ftr_bits->sign == FTR_UNSIGNED) { switch (ftr_bits->type) { @@ -320,7 +322,9 @@ uint64_t get_safe_value(const struct reg_ftr_bits *ftr_bits, uint64_t ftr) /* Return an invalid value to a given ftr_bits an ftr value */ uint64_t get_invalid_value(const struct reg_ftr_bits *ftr_bits, uint64_t ftr) { - uint64_t ftr_max = GENMASK_ULL(ARM64_FEATURE_FIELD_BITS - 1, 0); + uint64_t ftr_max = ftr_bits->mask >> ftr_bits->shift; + + TEST_ASSERT(ftr_max > 1, "This test doesn't support single bit features"); if (ftr_bits->sign == FTR_UNSIGNED) { switch (ftr_bits->type) { @@ -672,7 +676,7 @@ static void 
test_clidr(struct kvm_vcpu *vcpu) clidr = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CLIDR_EL1)); /* find the first empty level in the cache hierarchy */ - for (level = 1; level < 7; level++) { + for (level = 1; level <= 7; level++) { if (!CLIDR_CTYPE(clidr, level)) break; } diff --git a/tools/testing/selftests/kvm/arm64/vgic_irq.c b/tools/testing/selftests/kvm/arm64/vgic_irq.c index 6338f5bbdb70..2fb2c7939fe9 100644 --- a/tools/testing/selftests/kvm/arm64/vgic_irq.c +++ b/tools/testing/selftests/kvm/arm64/vgic_irq.c @@ -29,6 +29,7 @@ struct test_args { bool level_sensitive; /* 1 is level, 0 is edge */ int kvm_max_routes; /* output of KVM_CAP_IRQ_ROUTING */ bool kvm_supports_irqfd; /* output of KVM_CAP_IRQFD */ + uint32_t shared_data; }; /* @@ -205,7 +206,7 @@ static void kvm_inject_call(kvm_inject_cmd cmd, uint32_t first_intid, do { \ uint32_t _intid; \ _intid = gic_get_and_ack_irq(); \ - GUEST_ASSERT(_intid == 0 || _intid == IAR_SPURIOUS); \ + GUEST_ASSERT(_intid == IAR_SPURIOUS); \ } while (0) #define CAT_HELPER(a, b) a ## b @@ -359,8 +360,9 @@ static uint32_t wait_for_and_activate_irq(void) * interrupts for the whole test. */ static void test_inject_preemption(struct test_args *args, - uint32_t first_intid, int num, - kvm_inject_cmd cmd) + uint32_t first_intid, int num, + const unsigned long *exclude, + kvm_inject_cmd cmd) { uint32_t intid, prio, step = KVM_PRIO_STEPS; int i; @@ -379,6 +381,10 @@ static void test_inject_preemption(struct test_args *args, for (i = 0; i < num; i++) { uint32_t tmp; intid = i + first_intid; + + if (exclude && test_bit(i, exclude)) + continue; + KVM_INJECT(cmd, intid); /* Each successive IRQ will preempt the previous one. */ tmp = wait_for_and_activate_irq(); @@ -390,15 +396,33 @@ static void test_inject_preemption(struct test_args *args, /* finish handling the IRQs starting with the highest priority one. 
*/ for (i = 0; i < num; i++) { intid = num - i - 1 + first_intid; + + if (exclude && test_bit(intid - first_intid, exclude)) + continue; + gic_set_eoi(intid); - if (args->eoi_split) - gic_set_dir(intid); + } + + if (args->eoi_split) { + for (i = 0; i < num; i++) { + intid = i + first_intid; + + if (exclude && test_bit(i, exclude)) + continue; + + if (args->eoi_split) + gic_set_dir(intid); + } } local_irq_enable(); - for (i = 0; i < num; i++) + for (i = 0; i < num; i++) { + if (exclude && test_bit(i, exclude)) + continue; + GUEST_ASSERT(!gic_irq_get_active(i + first_intid)); + } GUEST_ASSERT_EQ(gic_read_ap1r0(), 0); GUEST_ASSERT_IAR_EMPTY(); @@ -436,33 +460,32 @@ static void test_injection_failure(struct test_args *args, static void test_preemption(struct test_args *args, struct kvm_inject_desc *f) { - /* - * Test up to 4 levels of preemption. The reason is that KVM doesn't - * currently implement the ability to have more than the number-of-LRs - * number of concurrently active IRQs. The number of LRs implemented is - * IMPLEMENTATION DEFINED, however, it seems that most implement 4. - */ + /* Timer PPIs cannot be injected from userspace */ + static const unsigned long ppi_exclude = (BIT(27 - MIN_PPI) | + BIT(30 - MIN_PPI) | + BIT(28 - MIN_PPI) | + BIT(26 - MIN_PPI)); + if (f->sgi) - test_inject_preemption(args, MIN_SGI, 4, f->cmd); + test_inject_preemption(args, MIN_SGI, 16, NULL, f->cmd); if (f->ppi) - test_inject_preemption(args, MIN_PPI, 4, f->cmd); + test_inject_preemption(args, MIN_PPI, 16, &ppi_exclude, f->cmd); if (f->spi) - test_inject_preemption(args, MIN_SPI, 4, f->cmd); + test_inject_preemption(args, MIN_SPI, 31, NULL, f->cmd); } static void test_restore_active(struct test_args *args, struct kvm_inject_desc *f) { - /* Test up to 4 active IRQs. Same reason as in test_preemption. 
*/ if (f->sgi) - guest_restore_active(args, MIN_SGI, 4, f->cmd); + guest_restore_active(args, MIN_SGI, 16, f->cmd); if (f->ppi) - guest_restore_active(args, MIN_PPI, 4, f->cmd); + guest_restore_active(args, MIN_PPI, 16, f->cmd); if (f->spi) - guest_restore_active(args, MIN_SPI, 4, f->cmd); + guest_restore_active(args, MIN_SPI, 31, f->cmd); } static void guest_code(struct test_args *args) @@ -473,12 +496,12 @@ static void guest_code(struct test_args *args) gic_init(GIC_V3, 1); - for (i = 0; i < nr_irqs; i++) - gic_irq_enable(i); - for (i = MIN_SPI; i < nr_irqs; i++) gic_irq_set_config(i, !level_sensitive); + for (i = 0; i < nr_irqs; i++) + gic_irq_enable(i); + gic_set_eoi_split(args->eoi_split); reset_priorities(args); @@ -636,7 +659,7 @@ static void kvm_routing_and_irqfd_check(struct kvm_vm *vm, } for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) - close(fd[f]); + kvm_close(fd[f]); } /* handles the valid case: intid=0xffffffff num=1 */ @@ -779,6 +802,221 @@ done: kvm_vm_free(vm); } +static void guest_code_asym_dir(struct test_args *args, int cpuid) +{ + gic_init(GIC_V3, 2); + + gic_set_eoi_split(1); + gic_set_priority_mask(CPU_PRIO_MASK); + + if (cpuid == 0) { + uint32_t intid; + + local_irq_disable(); + + gic_set_priority(MIN_PPI, IRQ_DEFAULT_PRIO); + gic_irq_enable(MIN_SPI); + gic_irq_set_pending(MIN_SPI); + + intid = wait_for_and_activate_irq(); + GUEST_ASSERT_EQ(intid, MIN_SPI); + + gic_set_eoi(intid); + isb(); + + WRITE_ONCE(args->shared_data, MIN_SPI); + dsb(ishst); + + do { + dsb(ishld); + } while (READ_ONCE(args->shared_data) == MIN_SPI); + GUEST_ASSERT(!gic_irq_get_active(MIN_SPI)); + } else { + do { + dsb(ishld); + } while (READ_ONCE(args->shared_data) != MIN_SPI); + + gic_set_dir(MIN_SPI); + isb(); + + WRITE_ONCE(args->shared_data, 0); + dsb(ishst); + } + + GUEST_DONE(); +} + +static void guest_code_group_en(struct test_args *args, int cpuid) +{ + uint32_t intid; + + gic_init(GIC_V3, 2); + + gic_set_eoi_split(0); + 
gic_set_priority_mask(CPU_PRIO_MASK); + /* SGI0 is G0, which is disabled */ + gic_irq_set_group(0, 0); + + /* Configure all SGIs with decreasing priority */ + for (intid = 0; intid < MIN_PPI; intid++) { + gic_set_priority(intid, (intid + 1) * 8); + gic_irq_enable(intid); + gic_irq_set_pending(intid); + } + + /* Ack and EOI all G1 interrupts */ + for (int i = 1; i < MIN_PPI; i++) { + intid = wait_for_and_activate_irq(); + + GUEST_ASSERT(intid < MIN_PPI); + gic_set_eoi(intid); + isb(); + } + + /* + * Check that SGI0 is still pending, inactive, and that we cannot + * ack anything. + */ + GUEST_ASSERT(gic_irq_get_pending(0)); + GUEST_ASSERT(!gic_irq_get_active(0)); + GUEST_ASSERT_IAR_EMPTY(); + GUEST_ASSERT(read_sysreg_s(SYS_ICC_IAR0_EL1) == IAR_SPURIOUS); + + /* Open the G0 gates, and verify we can ack SGI0 */ + write_sysreg_s(1, SYS_ICC_IGRPEN0_EL1); + isb(); + + do { + intid = read_sysreg_s(SYS_ICC_IAR0_EL1); + } while (intid == IAR_SPURIOUS); + + GUEST_ASSERT(intid == 0); + GUEST_DONE(); +} + +static void guest_code_timer_spi(struct test_args *args, int cpuid) +{ + uint32_t intid; + u64 val; + + gic_init(GIC_V3, 2); + + gic_set_eoi_split(1); + gic_set_priority_mask(CPU_PRIO_MASK); + + /* Add a pending SPI so that KVM starts trapping DIR */ + gic_set_priority(MIN_SPI + cpuid, IRQ_DEFAULT_PRIO); + gic_irq_set_pending(MIN_SPI + cpuid); + + /* Configure the timer with a higher priority, make it pending */ + gic_set_priority(27, IRQ_DEFAULT_PRIO - 8); + + isb(); + val = read_sysreg(cntvct_el0); + write_sysreg(val, cntv_cval_el0); + write_sysreg(1, cntv_ctl_el0); + isb(); + + GUEST_ASSERT(gic_irq_get_pending(27)); + + /* Enable both interrupts */ + gic_irq_enable(MIN_SPI + cpuid); + gic_irq_enable(27); + + /* The timer must fire */ + intid = wait_for_and_activate_irq(); + GUEST_ASSERT(intid == 27); + + /* Check that we can deassert it */ + write_sysreg(0, cntv_ctl_el0); + isb(); + + GUEST_ASSERT(!gic_irq_get_pending(27)); + + /* + * Priority drop, deactivation -- we 
expect that the host + * deactivation will have been effective + */ + gic_set_eoi(27); + gic_set_dir(27); + + GUEST_ASSERT(!gic_irq_get_active(27)); + + /* Do it one more time */ + isb(); + val = read_sysreg(cntvct_el0); + write_sysreg(val, cntv_cval_el0); + write_sysreg(1, cntv_ctl_el0); + isb(); + + GUEST_ASSERT(gic_irq_get_pending(27)); + + /* The timer must fire again */ + intid = wait_for_and_activate_irq(); + GUEST_ASSERT(intid == 27); + + GUEST_DONE(); +} + +static void *test_vcpu_run(void *arg) +{ + struct kvm_vcpu *vcpu = arg; + struct ucall uc; + + while (1) { + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + return NULL; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } + + return NULL; +} + +static void test_vgic_two_cpus(void *gcode) +{ + pthread_t thr[2]; + struct kvm_vcpu *vcpus[2]; + struct test_args args = {}; + struct kvm_vm *vm; + vm_vaddr_t args_gva; + int gic_fd, ret; + + vm = vm_create_with_vcpus(2, gcode, vcpus); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vcpus[0]); + vcpu_init_descriptor_tables(vcpus[1]); + + /* Setup the guest args page (so it gets the args). 
*/ + args_gva = vm_vaddr_alloc_page(vm); + memcpy(addr_gva2hva(vm, args_gva), &args, sizeof(args)); + vcpu_args_set(vcpus[0], 2, args_gva, 0); + vcpu_args_set(vcpus[1], 2, args_gva, 1); + + gic_fd = vgic_v3_setup(vm, 2, 64); + + ret = pthread_create(&thr[0], NULL, test_vcpu_run, vcpus[0]); + if (ret) + TEST_FAIL("Can't create thread for vcpu 0 (%d)\n", ret); + ret = pthread_create(&thr[1], NULL, test_vcpu_run, vcpus[1]); + if (ret) + TEST_FAIL("Can't create thread for vcpu 1 (%d)\n", ret); + + pthread_join(thr[0], NULL); + pthread_join(thr[1], NULL); + + close(gic_fd); + kvm_vm_free(vm); +} + static void help(const char *name) { printf( @@ -835,6 +1073,9 @@ int main(int argc, char **argv) test_vgic(nr_irqs, false /* level */, true /* eoi_split */); test_vgic(nr_irqs, true /* level */, false /* eoi_split */); test_vgic(nr_irqs, true /* level */, true /* eoi_split */); + test_vgic_two_cpus(guest_code_asym_dir); + test_vgic_two_cpus(guest_code_group_en); + test_vgic_two_cpus(guest_code_timer_spi); } else { test_vgic(nr_irqs, level_sensitive, eoi_split); } diff --git a/tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c b/tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c index 687d04463983..e857a605f577 100644 --- a/tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c +++ b/tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c @@ -118,6 +118,10 @@ static void guest_setup_gic(void) guest_setup_its_mappings(); guest_invalidate_all_rdists(); + + /* SYNC to ensure ITS setup is complete */ + for (cpuid = 0; cpuid < test_data.nr_cpus; cpuid++) + its_send_sync_cmd(test_data.cmdq_base_va, cpuid); } static void guest_code(size_t nr_lpis) diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index e7d9aeb418d3..618c937f3c90 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -19,6 +19,7 @@ #include <sys/stat.h> #include "kvm_util.h" +#include "numaif.h" #include 
"test_util.h" #include "ucall_common.h" @@ -75,6 +76,101 @@ static void test_mmap_supported(int fd, size_t total_size) kvm_munmap(mem, total_size); } +static void test_mbind(int fd, size_t total_size) +{ + const unsigned long nodemask_0 = 1; /* nid: 0 */ + unsigned long nodemask = 0; + unsigned long maxnode = 8; + int policy; + char *mem; + int ret; + + if (!is_multi_numa_node_system()) + return; + + mem = kvm_mmap(total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd); + + /* Test MPOL_INTERLEAVE policy */ + kvm_mbind(mem, page_size * 2, MPOL_INTERLEAVE, &nodemask_0, maxnode, 0); + kvm_get_mempolicy(&policy, &nodemask, maxnode, mem, MPOL_F_ADDR); + TEST_ASSERT(policy == MPOL_INTERLEAVE && nodemask == nodemask_0, + "Wanted MPOL_INTERLEAVE (%u) and nodemask 0x%lx, got %u and 0x%lx", + MPOL_INTERLEAVE, nodemask_0, policy, nodemask); + + /* Test basic MPOL_BIND policy */ + kvm_mbind(mem + page_size * 2, page_size * 2, MPOL_BIND, &nodemask_0, maxnode, 0); + kvm_get_mempolicy(&policy, &nodemask, maxnode, mem + page_size * 2, MPOL_F_ADDR); + TEST_ASSERT(policy == MPOL_BIND && nodemask == nodemask_0, + "Wanted MPOL_BIND (%u) and nodemask 0x%lx, got %u and 0x%lx", + MPOL_BIND, nodemask_0, policy, nodemask); + + /* Test MPOL_DEFAULT policy */ + kvm_mbind(mem, total_size, MPOL_DEFAULT, NULL, 0, 0); + kvm_get_mempolicy(&policy, &nodemask, maxnode, mem, MPOL_F_ADDR); + TEST_ASSERT(policy == MPOL_DEFAULT && !nodemask, + "Wanted MPOL_DEFAULT (%u) and nodemask 0x0, got %u and 0x%lx", + MPOL_DEFAULT, policy, nodemask); + + /* Test with invalid policy */ + ret = mbind(mem, page_size, 999, &nodemask_0, maxnode, 0); + TEST_ASSERT(ret == -1 && errno == EINVAL, + "mbind with invalid policy should fail with EINVAL"); + + kvm_munmap(mem, total_size); +} + +static void test_numa_allocation(int fd, size_t total_size) +{ + unsigned long node0_mask = 1; /* Node 0 */ + unsigned long node1_mask = 2; /* Node 1 */ + unsigned long maxnode = 8; + void *pages[4]; + int status[4]; + char *mem; + int i; 
+ + if (!is_multi_numa_node_system()) + return; + + mem = kvm_mmap(total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd); + + for (i = 0; i < 4; i++) + pages[i] = (char *)mem + page_size * i; + + /* Set NUMA policy after allocation */ + memset(mem, 0xaa, page_size); + kvm_mbind(pages[0], page_size, MPOL_BIND, &node0_mask, maxnode, 0); + kvm_fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, page_size); + + /* Set NUMA policy before allocation */ + kvm_mbind(pages[0], page_size * 2, MPOL_BIND, &node1_mask, maxnode, 0); + kvm_mbind(pages[2], page_size * 2, MPOL_BIND, &node0_mask, maxnode, 0); + memset(mem, 0xaa, total_size); + + /* Validate if pages are allocated on specified NUMA nodes */ + kvm_move_pages(0, 4, pages, NULL, status, 0); + TEST_ASSERT(status[0] == 1, "Expected page 0 on node 1, got it on node %d", status[0]); + TEST_ASSERT(status[1] == 1, "Expected page 1 on node 1, got it on node %d", status[1]); + TEST_ASSERT(status[2] == 0, "Expected page 2 on node 0, got it on node %d", status[2]); + TEST_ASSERT(status[3] == 0, "Expected page 3 on node 0, got it on node %d", status[3]); + + /* Punch hole for all pages */ + kvm_fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, total_size); + + /* Change NUMA policy nodes and reallocate */ + kvm_mbind(pages[0], page_size * 2, MPOL_BIND, &node0_mask, maxnode, 0); + kvm_mbind(pages[2], page_size * 2, MPOL_BIND, &node1_mask, maxnode, 0); + memset(mem, 0xaa, total_size); + + kvm_move_pages(0, 4, pages, NULL, status, 0); + TEST_ASSERT(status[0] == 0, "Expected page 0 on node 0, got it on node %d", status[0]); + TEST_ASSERT(status[1] == 0, "Expected page 1 on node 0, got it on node %d", status[1]); + TEST_ASSERT(status[2] == 1, "Expected page 2 on node 1, got it on node %d", status[2]); + TEST_ASSERT(status[3] == 1, "Expected page 3 on node 1, got it on node %d", status[3]); + + kvm_munmap(mem, total_size); +} + static void test_fault_sigbus(int fd, size_t accessible_size, size_t map_size) { const char 
val = 0xaa; @@ -273,11 +369,13 @@ static void __test_guest_memfd(struct kvm_vm *vm, uint64_t flags) if (flags & GUEST_MEMFD_FLAG_INIT_SHARED) { gmem_test(mmap_supported, vm, flags); gmem_test(fault_overflow, vm, flags); + gmem_test(numa_allocation, vm, flags); } else { gmem_test(fault_private, vm, flags); } gmem_test(mmap_cow, vm, flags); + gmem_test(mbind, vm, flags); } else { gmem_test(mmap_not_supported, vm, flags); } diff --git a/tools/testing/selftests/kvm/include/arm64/gic.h b/tools/testing/selftests/kvm/include/arm64/gic.h index baeb3c859389..cc7a7f34ed37 100644 --- a/tools/testing/selftests/kvm/include/arm64/gic.h +++ b/tools/testing/selftests/kvm/include/arm64/gic.h @@ -57,6 +57,7 @@ void gic_irq_set_pending(unsigned int intid); void gic_irq_clear_pending(unsigned int intid); bool gic_irq_get_pending(unsigned int intid); void gic_irq_set_config(unsigned int intid, bool is_edge); +void gic_irq_set_group(unsigned int intid, bool group); void gic_rdist_enable_lpis(vm_paddr_t cfg_table, size_t cfg_table_size, vm_paddr_t pend_table); diff --git a/tools/testing/selftests/kvm/include/arm64/gic_v3_its.h b/tools/testing/selftests/kvm/include/arm64/gic_v3_its.h index 3722ed9c8f96..58feef3eb386 100644 --- a/tools/testing/selftests/kvm/include/arm64/gic_v3_its.h +++ b/tools/testing/selftests/kvm/include/arm64/gic_v3_its.h @@ -15,5 +15,6 @@ void its_send_mapc_cmd(void *cmdq_base, u32 vcpu_id, u32 collection_id, bool val void its_send_mapti_cmd(void *cmdq_base, u32 device_id, u32 event_id, u32 collection_id, u32 intid); void its_send_invall_cmd(void *cmdq_base, u32 collection_id); +void its_send_sync_cmd(void *cmdq_base, u32 vcpu_id); #endif // __SELFTESTS_GIC_V3_ITS_H__ diff --git a/tools/testing/selftests/kvm/include/kvm_syscalls.h b/tools/testing/selftests/kvm/include/kvm_syscalls.h new file mode 100644 index 000000000000..d4e613162bba --- /dev/null +++ b/tools/testing/selftests/kvm/include/kvm_syscalls.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ 
+#ifndef SELFTEST_KVM_SYSCALLS_H +#define SELFTEST_KVM_SYSCALLS_H + +#include <sys/syscall.h> + +#define MAP_ARGS0(m,...) +#define MAP_ARGS1(m,t,a,...) m(t,a) +#define MAP_ARGS2(m,t,a,...) m(t,a), MAP_ARGS1(m,__VA_ARGS__) +#define MAP_ARGS3(m,t,a,...) m(t,a), MAP_ARGS2(m,__VA_ARGS__) +#define MAP_ARGS4(m,t,a,...) m(t,a), MAP_ARGS3(m,__VA_ARGS__) +#define MAP_ARGS5(m,t,a,...) m(t,a), MAP_ARGS4(m,__VA_ARGS__) +#define MAP_ARGS6(m,t,a,...) m(t,a), MAP_ARGS5(m,__VA_ARGS__) +#define MAP_ARGS(n,...) MAP_ARGS##n(__VA_ARGS__) + +#define __DECLARE_ARGS(t, a) t a +#define __UNPACK_ARGS(t, a) a + +#define DECLARE_ARGS(nr_args, args...) MAP_ARGS(nr_args, __DECLARE_ARGS, args) +#define UNPACK_ARGS(nr_args, args...) MAP_ARGS(nr_args, __UNPACK_ARGS, args) + +#define __KVM_SYSCALL_ERROR(_name, _ret) \ + "%s failed, rc: %i errno: %i (%s)", (_name), (_ret), errno, strerror(errno) + +/* Define a kvm_<syscall>() API to assert success. */ +#define __KVM_SYSCALL_DEFINE(name, nr_args, args...) \ +static inline void kvm_##name(DECLARE_ARGS(nr_args, args)) \ +{ \ + int r; \ + \ + r = name(UNPACK_ARGS(nr_args, args)); \ + TEST_ASSERT(!r, __KVM_SYSCALL_ERROR(#name, r)); \ +} + +/* + * Macro to define syscall APIs, either because KVM selftests doesn't link to + * the standard library, e.g. libnuma, or because there is no library that yet + * provides the syscall. These + */ +#define KVM_SYSCALL_DEFINE(name, nr_args, args...) \ +static inline long name(DECLARE_ARGS(nr_args, args)) \ +{ \ + return syscall(__NR_##name, UNPACK_ARGS(nr_args, args)); \ +} \ +__KVM_SYSCALL_DEFINE(name, nr_args, args) + +/* + * Special case mmap(), as KVM selftest rarely/never specific an address, + * rarely specify an offset, and because the unique return code requires + * special handling anyways. 
+ */ +static inline void *__kvm_mmap(size_t size, int prot, int flags, int fd, + off_t offset) +{ + void *mem; + + mem = mmap(NULL, size, prot, flags, fd, offset); + TEST_ASSERT(mem != MAP_FAILED, __KVM_SYSCALL_ERROR("mmap()", + (int)(unsigned long)MAP_FAILED)); + return mem; +} + +static inline void *kvm_mmap(size_t size, int prot, int flags, int fd) +{ + return __kvm_mmap(size, prot, flags, fd, 0); +} + +static inline int kvm_dup(int fd) +{ + int new_fd = dup(fd); + + TEST_ASSERT(new_fd >= 0, __KVM_SYSCALL_ERROR("dup()", new_fd)); + return new_fd; +} + +__KVM_SYSCALL_DEFINE(munmap, 2, void *, mem, size_t, size); +__KVM_SYSCALL_DEFINE(close, 1, int, fd); +__KVM_SYSCALL_DEFINE(fallocate, 4, int, fd, int, mode, loff_t, offset, loff_t, len); +__KVM_SYSCALL_DEFINE(ftruncate, 2, unsigned int, fd, off_t, length); + +#endif /* SELFTEST_KVM_SYSCALLS_H */ diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index d3f3e455c031..81f4355ff28a 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -23,6 +23,7 @@ #include <pthread.h> +#include "kvm_syscalls.h" #include "kvm_util_arch.h" #include "kvm_util_types.h" #include "sparsebit.h" @@ -177,7 +178,7 @@ enum vm_guest_mode { VM_MODE_P40V48_4K, VM_MODE_P40V48_16K, VM_MODE_P40V48_64K, - VM_MODE_PXXV48_4K, /* For 48bits VA but ANY bits PA */ + VM_MODE_PXXVYY_4K, /* For 48-bit or 57-bit VA, depending on host support */ VM_MODE_P47V64_4K, VM_MODE_P44V64_4K, VM_MODE_P36V48_4K, @@ -219,7 +220,7 @@ extern enum vm_guest_mode vm_mode_default; #elif defined(__x86_64__) -#define VM_MODE_DEFAULT VM_MODE_PXXV48_4K +#define VM_MODE_DEFAULT VM_MODE_PXXVYY_4K #define MIN_PAGE_SHIFT 12U #define ptes_per_page(page_size) ((page_size) / 8) @@ -283,34 +284,6 @@ static inline bool kvm_has_cap(long cap) return kvm_check_cap(cap); } -#define __KVM_SYSCALL_ERROR(_name, _ret) \ - "%s failed, rc: %i errno: %i (%s)", (_name), (_ret), 
errno, strerror(errno) - -static inline void *__kvm_mmap(size_t size, int prot, int flags, int fd, - off_t offset) -{ - void *mem; - - mem = mmap(NULL, size, prot, flags, fd, offset); - TEST_ASSERT(mem != MAP_FAILED, __KVM_SYSCALL_ERROR("mmap()", - (int)(unsigned long)MAP_FAILED)); - - return mem; -} - -static inline void *kvm_mmap(size_t size, int prot, int flags, int fd) -{ - return __kvm_mmap(size, prot, flags, fd, 0); -} - -static inline void kvm_munmap(void *mem, size_t size) -{ - int ret; - - ret = munmap(mem, size); - TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret)); -} - /* * Use the "inner", double-underscore macro when reporting errors from within * other macros so that the name of ioctl() and not its literal numeric value @@ -700,12 +673,12 @@ int __vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flag uint32_t guest_memfd, uint64_t guest_memfd_offset); void vm_userspace_mem_region_add(struct kvm_vm *vm, - enum vm_mem_backing_src_type src_type, - uint64_t guest_paddr, uint32_t slot, uint64_t npages, - uint32_t flags); + enum vm_mem_backing_src_type src_type, + uint64_t gpa, uint32_t slot, uint64_t npages, + uint32_t flags); void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, - uint64_t guest_paddr, uint32_t slot, uint64_t npages, - uint32_t flags, int guest_memfd_fd, uint64_t guest_memfd_offset); + uint64_t gpa, uint32_t slot, uint64_t npages, uint32_t flags, + int guest_memfd_fd, uint64_t guest_memfd_offset); #ifndef vm_arch_has_protected_memory static inline bool vm_arch_has_protected_memory(struct kvm_vm *vm) @@ -715,6 +688,7 @@ static inline bool vm_arch_has_protected_memory(struct kvm_vm *vm) #endif void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); +void vm_mem_region_reload(struct kvm_vm *vm, uint32_t slot); void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa); void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot); struct kvm_vcpu 
*__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id); @@ -1230,6 +1204,7 @@ void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr); static inline void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) { virt_arch_pg_map(vm, vaddr, paddr); + sparsebit_set(vm->vpages_mapped, vaddr >> vm->page_shift); } diff --git a/tools/testing/selftests/kvm/include/loongarch/arch_timer.h b/tools/testing/selftests/kvm/include/loongarch/arch_timer.h new file mode 100644 index 000000000000..2ed106b32c81 --- /dev/null +++ b/tools/testing/selftests/kvm/include/loongarch/arch_timer.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * LoongArch Constant Timer specific interface + */ +#ifndef SELFTEST_KVM_ARCH_TIMER_H +#define SELFTEST_KVM_ARCH_TIMER_H + +#include "processor.h" + +/* LoongArch timer frequency is constant 100MHZ */ +#define TIMER_FREQ (100UL << 20) +#define msec_to_cycles(msec) (TIMER_FREQ * (unsigned long)(msec) / 1000) +#define usec_to_cycles(usec) (TIMER_FREQ * (unsigned long)(usec) / 1000000) +#define cycles_to_usec(cycles) ((unsigned long)(cycles) * 1000000 / TIMER_FREQ) + +static inline unsigned long timer_get_cycles(void) +{ + unsigned long val = 0; + + __asm__ __volatile__( + "rdtime.d %0, $zero\n\t" + : "=r"(val) + : + ); + + return val; +} + +static inline unsigned long timer_get_cfg(void) +{ + return csr_read(LOONGARCH_CSR_TCFG); +} + +static inline unsigned long timer_get_val(void) +{ + return csr_read(LOONGARCH_CSR_TVAL); +} + +static inline void disable_timer(void) +{ + csr_write(0, LOONGARCH_CSR_TCFG); +} + +static inline void timer_irq_enable(void) +{ + unsigned long val; + + val = csr_read(LOONGARCH_CSR_ECFG); + val |= ECFGF_TIMER; + csr_write(val, LOONGARCH_CSR_ECFG); +} + +static inline void timer_irq_disable(void) +{ + unsigned long val; + + val = csr_read(LOONGARCH_CSR_ECFG); + val &= ~ECFGF_TIMER; + csr_write(val, LOONGARCH_CSR_ECFG); +} + +static inline void timer_set_next_cmp_ms(unsigned int msec, bool 
period) +{ + unsigned long val; + + val = msec_to_cycles(msec) & CSR_TCFG_VAL; + val |= CSR_TCFG_EN; + if (period) + val |= CSR_TCFG_PERIOD; + csr_write(val, LOONGARCH_CSR_TCFG); +} + +static inline void __delay(uint64_t cycles) +{ + uint64_t start = timer_get_cycles(); + + while ((timer_get_cycles() - start) < cycles) + cpu_relax(); +} + +static inline void udelay(unsigned long usec) +{ + __delay(usec_to_cycles(usec)); +} +#endif /* SELFTEST_KVM_ARCH_TIMER_H */ diff --git a/tools/testing/selftests/kvm/include/loongarch/processor.h b/tools/testing/selftests/kvm/include/loongarch/processor.h index 6427a3275e6a..76840ddda57d 100644 --- a/tools/testing/selftests/kvm/include/loongarch/processor.h +++ b/tools/testing/selftests/kvm/include/loongarch/processor.h @@ -83,7 +83,14 @@ #define LOONGARCH_CSR_PRMD 0x1 #define LOONGARCH_CSR_EUEN 0x2 #define LOONGARCH_CSR_ECFG 0x4 +#define ECFGB_TIMER 11 +#define ECFGF_TIMER (BIT_ULL(ECFGB_TIMER)) #define LOONGARCH_CSR_ESTAT 0x5 /* Exception status */ +#define CSR_ESTAT_EXC_SHIFT 16 +#define CSR_ESTAT_EXC_WIDTH 6 +#define CSR_ESTAT_EXC (0x3f << CSR_ESTAT_EXC_SHIFT) +#define EXCCODE_INT 0 /* Interrupt */ +#define INT_TI 11 /* Timer interrupt*/ #define LOONGARCH_CSR_ERA 0x6 /* ERA */ #define LOONGARCH_CSR_BADV 0x7 /* Bad virtual address */ #define LOONGARCH_CSR_EENTRY 0xc @@ -106,6 +113,14 @@ #define LOONGARCH_CSR_KS1 0x31 #define LOONGARCH_CSR_TMID 0x40 #define LOONGARCH_CSR_TCFG 0x41 +#define CSR_TCFG_VAL (BIT_ULL(48) - BIT_ULL(2)) +#define CSR_TCFG_PERIOD_SHIFT 1 +#define CSR_TCFG_PERIOD (0x1UL << CSR_TCFG_PERIOD_SHIFT) +#define CSR_TCFG_EN (0x1UL) +#define LOONGARCH_CSR_TVAL 0x42 +#define LOONGARCH_CSR_TINTCLR 0x44 /* Timer interrupt clear */ +#define CSR_TINTCLR_TI_SHIFT 0 +#define CSR_TINTCLR_TI (1 << CSR_TINTCLR_TI_SHIFT) /* TLB refill exception entry */ #define LOONGARCH_CSR_TLBRENTRY 0x88 #define LOONGARCH_CSR_TLBRSAVE 0x8b @@ -113,6 +128,28 @@ #define CSR_TLBREHI_PS_SHIFT 0 #define CSR_TLBREHI_PS (0x3fUL << 
CSR_TLBREHI_PS_SHIFT) +#define csr_read(csr) \ +({ \ + register unsigned long __v; \ + __asm__ __volatile__( \ + "csrrd %[val], %[reg]\n\t" \ + : [val] "=r" (__v) \ + : [reg] "i" (csr) \ + : "memory"); \ + __v; \ +}) + +#define csr_write(v, csr) \ +({ \ + register unsigned long __v = v; \ + __asm__ __volatile__ ( \ + "csrwr %[val], %[reg]\n\t" \ + : [val] "+r" (__v) \ + : [reg] "i" (csr) \ + : "memory"); \ + __v; \ +}) + #define EXREGS_GPRS (32) #ifndef __ASSEMBLER__ @@ -124,18 +161,60 @@ struct ex_regs { unsigned long pc; unsigned long estat; unsigned long badv; + unsigned long prmd; }; #define PC_OFFSET_EXREGS offsetof(struct ex_regs, pc) #define ESTAT_OFFSET_EXREGS offsetof(struct ex_regs, estat) #define BADV_OFFSET_EXREGS offsetof(struct ex_regs, badv) +#define PRMD_OFFSET_EXREGS offsetof(struct ex_regs, prmd) #define EXREGS_SIZE sizeof(struct ex_regs) +#define VECTOR_NUM 64 + +typedef void(*handler_fn)(struct ex_regs *); + +struct handlers { + handler_fn exception_handlers[VECTOR_NUM]; +}; + +void vm_init_descriptor_tables(struct kvm_vm *vm); +void vm_install_exception_handler(struct kvm_vm *vm, int vector, handler_fn handler); + +static inline void cpu_relax(void) +{ + asm volatile("nop" ::: "memory"); +} + +static inline void local_irq_enable(void) +{ + unsigned int flags = CSR_CRMD_IE; + register unsigned int mask asm("$t0") = CSR_CRMD_IE; + + __asm__ __volatile__( + "csrxchg %[val], %[mask], %[reg]\n\t" + : [val] "+r" (flags) + : [mask] "r" (mask), [reg] "i" (LOONGARCH_CSR_CRMD) + : "memory"); +} + +static inline void local_irq_disable(void) +{ + unsigned int flags = 0; + register unsigned int mask asm("$t0") = CSR_CRMD_IE; + + __asm__ __volatile__( + "csrxchg %[val], %[mask], %[reg]\n\t" + : [val] "+r" (flags) + : [mask] "r" (mask), [reg] "i" (LOONGARCH_CSR_CRMD) + : "memory"); +} #else #define PC_OFFSET_EXREGS ((EXREGS_GPRS + 0) * 8) #define ESTAT_OFFSET_EXREGS ((EXREGS_GPRS + 1) * 8) #define BADV_OFFSET_EXREGS ((EXREGS_GPRS + 2) * 8) -#define 
EXREGS_SIZE ((EXREGS_GPRS + 3) * 8) +#define PRMD_OFFSET_EXREGS ((EXREGS_GPRS + 3) * 8) +#define EXREGS_SIZE ((EXREGS_GPRS + 4) * 8) #endif #endif /* SELFTEST_KVM_PROCESSOR_H */ diff --git a/tools/testing/selftests/kvm/include/numaif.h b/tools/testing/selftests/kvm/include/numaif.h index b020547403fd..29572a6d789c 100644 --- a/tools/testing/selftests/kvm/include/numaif.h +++ b/tools/testing/selftests/kvm/include/numaif.h @@ -1,55 +1,83 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* - * tools/testing/selftests/kvm/include/numaif.h - * - * Copyright (C) 2020, Google LLC. - * - * This work is licensed under the terms of the GNU GPL, version 2. - * - * Header file that provides access to NUMA API functions not explicitly - * exported to user space. - */ +/* Copyright (C) 2020, Google LLC. */ #ifndef SELFTEST_KVM_NUMAIF_H #define SELFTEST_KVM_NUMAIF_H -#define __NR_get_mempolicy 239 -#define __NR_migrate_pages 256 +#include <dirent.h> -/* System calls */ -long get_mempolicy(int *policy, const unsigned long *nmask, - unsigned long maxnode, void *addr, int flags) +#include <linux/mempolicy.h> + +#include "kvm_syscalls.h" + +KVM_SYSCALL_DEFINE(get_mempolicy, 5, int *, policy, const unsigned long *, nmask, + unsigned long, maxnode, void *, addr, int, flags); + +KVM_SYSCALL_DEFINE(set_mempolicy, 3, int, mode, const unsigned long *, nmask, + unsigned long, maxnode); + +KVM_SYSCALL_DEFINE(set_mempolicy_home_node, 4, unsigned long, start, + unsigned long, len, unsigned long, home_node, + unsigned long, flags); + +KVM_SYSCALL_DEFINE(migrate_pages, 4, int, pid, unsigned long, maxnode, + const unsigned long *, frommask, const unsigned long *, tomask); + +KVM_SYSCALL_DEFINE(move_pages, 6, int, pid, unsigned long, count, void *, pages, + const int *, nodes, int *, status, int, flags); + +KVM_SYSCALL_DEFINE(mbind, 6, void *, addr, unsigned long, size, int, mode, + const unsigned long *, nodemask, unsigned long, maxnode, + unsigned int, flags); + +static inline int 
get_max_numa_node(void) { - return syscall(__NR_get_mempolicy, policy, nmask, - maxnode, addr, flags); + struct dirent *de; + int max_node = 0; + DIR *d; + + /* + * Assume there's a single node if the kernel doesn't support NUMA, + * or if no nodes are found. + */ + d = opendir("/sys/devices/system/node"); + if (!d) + return 0; + + while ((de = readdir(d)) != NULL) { + int node_id; + char *endptr; + + if (strncmp(de->d_name, "node", 4) != 0) + continue; + + node_id = strtol(de->d_name + 4, &endptr, 10); + if (*endptr != '\0') + continue; + + if (node_id > max_node) + max_node = node_id; + } + closedir(d); + + return max_node; } -long migrate_pages(int pid, unsigned long maxnode, - const unsigned long *frommask, - const unsigned long *tomask) +static bool is_numa_available(void) { - return syscall(__NR_migrate_pages, pid, maxnode, frommask, tomask); + /* + * Probe for NUMA by doing a dummy get_mempolicy(). If the syscall + * fails with ENOSYS, then the kernel was built without NUMA support. + * if the syscall fails with EPERM, then the process/user lacks the + * necessary capabilities (CAP_SYS_NICE). + */ + return !get_mempolicy(NULL, NULL, 0, NULL, 0) || + (errno != ENOSYS && errno != EPERM); } -/* Policies */ -#define MPOL_DEFAULT 0 -#define MPOL_PREFERRED 1 -#define MPOL_BIND 2 -#define MPOL_INTERLEAVE 3 - -#define MPOL_MAX MPOL_INTERLEAVE - -/* Flags for get_mem_policy */ -#define MPOL_F_NODE (1<<0) /* return next il node or node of address */ - /* Warning: MPOL_F_NODE is unsupported and - * subject to change. Don't use. 
- */ -#define MPOL_F_ADDR (1<<1) /* look up vma using address */ -#define MPOL_F_MEMS_ALLOWED (1<<2) /* query nodes allowed in cpuset */ - -/* Flags for mbind */ -#define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */ -#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform to mapping */ -#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to mapping */ +static inline bool is_multi_numa_node_system(void) +{ + return is_numa_available() && get_max_numa_node() >= 1; +} #endif /* SELFTEST_KVM_NUMAIF_H */ diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 51cd84b9ca66..57d62a425109 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -1441,7 +1441,7 @@ enum pg_level { PG_LEVEL_2M, PG_LEVEL_1G, PG_LEVEL_512G, - PG_LEVEL_NUM + PG_LEVEL_256T }; #define PG_LEVEL_SHIFT(_level) ((_level - 1) * 9 + 12) diff --git a/tools/testing/selftests/kvm/include/x86/vmx.h b/tools/testing/selftests/kvm/include/x86/vmx.h index edb3c391b982..96e2b4c630a9 100644 --- a/tools/testing/selftests/kvm/include/x86/vmx.h +++ b/tools/testing/selftests/kvm/include/x86/vmx.h @@ -568,8 +568,7 @@ void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t addr, uint64_t size); bool kvm_cpu_has_ept(void); -void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm, - uint32_t eptp_memslot); +void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm); void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm); #endif /* SELFTEST_KVM_VMX_H */ diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c index f02355c3c4c2..b7dbde9c0843 100644 --- a/tools/testing/selftests/kvm/kvm_binary_stats_test.c +++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c 
@@ -239,14 +239,14 @@ int main(int argc, char *argv[]) * single stats file works and doesn't cause explosions. */ vm_stats_fds = vm_get_stats_fd(vms[i]); - stats_test(dup(vm_stats_fds)); + stats_test(kvm_dup(vm_stats_fds)); /* Verify userspace can instantiate multiple stats files. */ stats_test(vm_get_stats_fd(vms[i])); for (j = 0; j < max_vcpu; ++j) { vcpu_stats_fds[j] = vcpu_get_stats_fd(vcpus[i * max_vcpu + j]); - stats_test(dup(vcpu_stats_fds[j])); + stats_test(kvm_dup(vcpu_stats_fds[j])); stats_test(vcpu_get_stats_fd(vcpus[i * max_vcpu + j])); } diff --git a/tools/testing/selftests/kvm/lib/arm64/gic.c b/tools/testing/selftests/kvm/lib/arm64/gic.c index 7abbf8866512..b023868fe0b8 100644 --- a/tools/testing/selftests/kvm/lib/arm64/gic.c +++ b/tools/testing/selftests/kvm/lib/arm64/gic.c @@ -155,3 +155,9 @@ void gic_irq_set_config(unsigned int intid, bool is_edge) GUEST_ASSERT(gic_common_ops); gic_common_ops->gic_irq_set_config(intid, is_edge); } + +void gic_irq_set_group(unsigned int intid, bool group) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_irq_set_group(intid, group); +} diff --git a/tools/testing/selftests/kvm/lib/arm64/gic_private.h b/tools/testing/selftests/kvm/lib/arm64/gic_private.h index d24e9ecc96c6..b6a7e30c3eb1 100644 --- a/tools/testing/selftests/kvm/lib/arm64/gic_private.h +++ b/tools/testing/selftests/kvm/lib/arm64/gic_private.h @@ -25,6 +25,7 @@ struct gic_common_ops { void (*gic_irq_clear_pending)(uint32_t intid); bool (*gic_irq_get_pending)(uint32_t intid); void (*gic_irq_set_config)(uint32_t intid, bool is_edge); + void (*gic_irq_set_group)(uint32_t intid, bool group); }; extern const struct gic_common_ops gicv3_ops; diff --git a/tools/testing/selftests/kvm/lib/arm64/gic_v3.c b/tools/testing/selftests/kvm/lib/arm64/gic_v3.c index 66d05506f78b..50754a27f493 100644 --- a/tools/testing/selftests/kvm/lib/arm64/gic_v3.c +++ b/tools/testing/selftests/kvm/lib/arm64/gic_v3.c @@ -293,17 +293,36 @@ static void gicv3_enable_redist(volatile 
void *redist_base) } } +static void gicv3_set_group(uint32_t intid, bool grp) +{ + uint32_t cpu_or_dist; + uint32_t val; + + cpu_or_dist = (get_intid_range(intid) == SPI_RANGE) ? DIST_BIT : guest_get_vcpuid(); + val = gicv3_reg_readl(cpu_or_dist, GICD_IGROUPR + (intid / 32) * 4); + if (grp) + val |= BIT(intid % 32); + else + val &= ~BIT(intid % 32); + gicv3_reg_writel(cpu_or_dist, GICD_IGROUPR + (intid / 32) * 4, val); +} + static void gicv3_cpu_init(unsigned int cpu) { volatile void *sgi_base; unsigned int i; volatile void *redist_base_cpu; + u64 typer; GUEST_ASSERT(cpu < gicv3_data.nr_cpus); redist_base_cpu = gicr_base_cpu(cpu); sgi_base = sgi_base_from_redist(redist_base_cpu); + /* Verify assumption that GICR_TYPER.Processor_number == cpu */ + typer = readq_relaxed(redist_base_cpu + GICR_TYPER); + GUEST_ASSERT_EQ(GICR_TYPER_CPU_NUMBER(typer), cpu); + gicv3_enable_redist(redist_base_cpu); /* @@ -328,6 +347,8 @@ static void gicv3_cpu_init(unsigned int cpu) /* Set a default priority threshold */ write_sysreg_s(ICC_PMR_DEF_PRIO, SYS_ICC_PMR_EL1); + /* Disable Group-0 interrupts */ + write_sysreg_s(ICC_IGRPEN0_EL1_MASK, SYS_ICC_IGRPEN1_EL1); /* Enable non-secure Group-1 interrupts */ write_sysreg_s(ICC_IGRPEN1_EL1_MASK, SYS_ICC_IGRPEN1_EL1); } @@ -400,6 +421,7 @@ const struct gic_common_ops gicv3_ops = { .gic_irq_clear_pending = gicv3_irq_clear_pending, .gic_irq_get_pending = gicv3_irq_get_pending, .gic_irq_set_config = gicv3_irq_set_config, + .gic_irq_set_group = gicv3_set_group, }; void gic_rdist_enable_lpis(vm_paddr_t cfg_table, size_t cfg_table_size, diff --git a/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c b/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c index 0e2f8ed90f30..7f9fdcf42ae6 100644 --- a/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c +++ b/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c @@ -253,3 +253,13 @@ void its_send_invall_cmd(void *cmdq_base, u32 collection_id) its_send_cmd(cmdq_base, &cmd); } + +void its_send_sync_cmd(void 
*cmdq_base, u32 vcpu_id) +{ + struct its_cmd_block cmd = {}; + + its_encode_cmd(&cmd, GITS_CMD_SYNC); + its_encode_target(&cmd, procnum_to_rdbase(vcpu_id)); + + its_send_cmd(cmdq_base, &cmd); +} diff --git a/tools/testing/selftests/kvm/lib/arm64/processor.c b/tools/testing/selftests/kvm/lib/arm64/processor.c index 54f6d17c78f7..d46e4b13b92c 100644 --- a/tools/testing/selftests/kvm/lib/arm64/processor.c +++ b/tools/testing/selftests/kvm/lib/arm64/processor.c @@ -324,7 +324,7 @@ void aarch64_vcpu_setup(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init) /* Configure base granule size */ switch (vm->mode) { - case VM_MODE_PXXV48_4K: + case VM_MODE_PXXVYY_4K: TEST_FAIL("AArch64 does not support 4K sized pages " "with ANY-bit physical address ranges"); case VM_MODE_P52V48_64K: diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 1a93d6361671..8279b6ced8d2 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -201,7 +201,7 @@ const char *vm_guest_mode_string(uint32_t i) [VM_MODE_P40V48_4K] = "PA-bits:40, VA-bits:48, 4K pages", [VM_MODE_P40V48_16K] = "PA-bits:40, VA-bits:48, 16K pages", [VM_MODE_P40V48_64K] = "PA-bits:40, VA-bits:48, 64K pages", - [VM_MODE_PXXV48_4K] = "PA-bits:ANY, VA-bits:48, 4K pages", + [VM_MODE_PXXVYY_4K] = "PA-bits:ANY, VA-bits:48 or 57, 4K pages", [VM_MODE_P47V64_4K] = "PA-bits:47, VA-bits:64, 4K pages", [VM_MODE_P44V64_4K] = "PA-bits:44, VA-bits:64, 4K pages", [VM_MODE_P36V48_4K] = "PA-bits:36, VA-bits:48, 4K pages", @@ -228,7 +228,7 @@ const struct vm_guest_mode_params vm_guest_mode_params[] = { [VM_MODE_P40V48_4K] = { 40, 48, 0x1000, 12 }, [VM_MODE_P40V48_16K] = { 40, 48, 0x4000, 14 }, [VM_MODE_P40V48_64K] = { 40, 48, 0x10000, 16 }, - [VM_MODE_PXXV48_4K] = { 0, 0, 0x1000, 12 }, + [VM_MODE_PXXVYY_4K] = { 0, 0, 0x1000, 12 }, [VM_MODE_P47V64_4K] = { 47, 64, 0x1000, 12 }, [VM_MODE_P44V64_4K] = { 44, 64, 0x1000, 12 }, [VM_MODE_P36V48_4K] = { 36, 
48, 0x1000, 12 }, @@ -310,24 +310,26 @@ struct kvm_vm *____vm_create(struct vm_shape shape) case VM_MODE_P36V47_16K: vm->pgtable_levels = 3; break; - case VM_MODE_PXXV48_4K: + case VM_MODE_PXXVYY_4K: #ifdef __x86_64__ kvm_get_cpu_address_width(&vm->pa_bits, &vm->va_bits); kvm_init_vm_address_properties(vm); - /* - * Ignore KVM support for 5-level paging (vm->va_bits == 57), - * it doesn't take effect unless a CR4.LA57 is set, which it - * isn't for this mode (48-bit virtual address space). - */ - TEST_ASSERT(vm->va_bits == 48 || vm->va_bits == 57, - "Linear address width (%d bits) not supported", - vm->va_bits); + pr_debug("Guest physical address width detected: %d\n", vm->pa_bits); - vm->pgtable_levels = 4; - vm->va_bits = 48; + pr_debug("Guest virtual address width detected: %d\n", + vm->va_bits); + + if (vm->va_bits == 57) { + vm->pgtable_levels = 5; + } else { + TEST_ASSERT(vm->va_bits == 48, + "Unexpected guest virtual address width: %d", + vm->va_bits); + vm->pgtable_levels = 4; + } #else - TEST_FAIL("VM_MODE_PXXV48_4K not supported on non-x86 platforms"); + TEST_FAIL("VM_MODE_PXXVYY_4K not supported on non-x86 platforms"); #endif break; case VM_MODE_P47V64_4K: @@ -704,8 +706,6 @@ userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end) static void kvm_stats_release(struct kvm_binary_stats *stats) { - int ret; - if (stats->fd < 0) return; @@ -714,8 +714,7 @@ static void kvm_stats_release(struct kvm_binary_stats *stats) stats->desc = NULL; } - ret = close(stats->fd); - TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); + kvm_close(stats->fd); stats->fd = -1; } @@ -738,8 +737,6 @@ __weak void vcpu_arch_free(struct kvm_vcpu *vcpu) */ static void vm_vcpu_rm(struct kvm_vm *vm, struct kvm_vcpu *vcpu) { - int ret; - if (vcpu->dirty_gfns) { kvm_munmap(vcpu->dirty_gfns, vm->dirty_ring_size); vcpu->dirty_gfns = NULL; @@ -747,9 +744,7 @@ static void vm_vcpu_rm(struct kvm_vm *vm, struct kvm_vcpu *vcpu) kvm_munmap(vcpu->run, vcpu_mmap_sz()); - ret = 
close(vcpu->fd); - TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); - + kvm_close(vcpu->fd); kvm_stats_release(&vcpu->stats); list_del(&vcpu->list); @@ -761,16 +756,12 @@ static void vm_vcpu_rm(struct kvm_vm *vm, struct kvm_vcpu *vcpu) void kvm_vm_release(struct kvm_vm *vmp) { struct kvm_vcpu *vcpu, *tmp; - int ret; list_for_each_entry_safe(vcpu, tmp, &vmp->vcpus, list) vm_vcpu_rm(vmp, vcpu); - ret = close(vmp->fd); - TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); - - ret = close(vmp->kvm_fd); - TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); + kvm_close(vmp->fd); + kvm_close(vmp->kvm_fd); /* Free cached stats metadata and close FD */ kvm_stats_release(&vmp->stats); @@ -828,7 +819,7 @@ void kvm_vm_free(struct kvm_vm *vmp) int kvm_memfd_alloc(size_t size, bool hugepages) { int memfd_flags = MFD_CLOEXEC; - int fd, r; + int fd; if (hugepages) memfd_flags |= MFD_HUGETLB; @@ -836,11 +827,8 @@ int kvm_memfd_alloc(size_t size, bool hugepages) fd = memfd_create("kvm_selftest", memfd_flags); TEST_ASSERT(fd != -1, __KVM_SYSCALL_ERROR("memfd_create()", fd)); - r = ftruncate(fd, size); - TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("ftruncate()", r)); - - r = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, size); - TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r)); + kvm_ftruncate(fd, size); + kvm_fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, size); return fd; } @@ -957,8 +945,8 @@ void vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags /* FIXME: This thing needs to be ripped apart and rewritten. 
*/ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, - uint64_t guest_paddr, uint32_t slot, uint64_t npages, - uint32_t flags, int guest_memfd, uint64_t guest_memfd_offset) + uint64_t gpa, uint32_t slot, uint64_t npages, uint32_t flags, + int guest_memfd, uint64_t guest_memfd_offset) { int ret; struct userspace_mem_region *region; @@ -972,30 +960,29 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, "Number of guest pages is not compatible with the host. " "Try npages=%d", vm_adjust_num_guest_pages(vm->mode, npages)); - TEST_ASSERT((guest_paddr % vm->page_size) == 0, "Guest physical " + TEST_ASSERT((gpa % vm->page_size) == 0, "Guest physical " "address not on a page boundary.\n" - " guest_paddr: 0x%lx vm->page_size: 0x%x", - guest_paddr, vm->page_size); - TEST_ASSERT((((guest_paddr >> vm->page_shift) + npages) - 1) + " gpa: 0x%lx vm->page_size: 0x%x", + gpa, vm->page_size); + TEST_ASSERT((((gpa >> vm->page_shift) + npages) - 1) <= vm->max_gfn, "Physical range beyond maximum " "supported physical address,\n" - " guest_paddr: 0x%lx npages: 0x%lx\n" + " gpa: 0x%lx npages: 0x%lx\n" " vm->max_gfn: 0x%lx vm->page_size: 0x%x", - guest_paddr, npages, vm->max_gfn, vm->page_size); + gpa, npages, vm->max_gfn, vm->page_size); /* * Confirm a mem region with an overlapping address doesn't * already exist. 
*/ region = (struct userspace_mem_region *) userspace_mem_region_find( - vm, guest_paddr, (guest_paddr + npages * vm->page_size) - 1); + vm, gpa, (gpa + npages * vm->page_size) - 1); if (region != NULL) TEST_FAIL("overlapping userspace_mem_region already " "exists\n" - " requested guest_paddr: 0x%lx npages: 0x%lx " - "page_size: 0x%x\n" - " existing guest_paddr: 0x%lx size: 0x%lx", - guest_paddr, npages, vm->page_size, + " requested gpa: 0x%lx npages: 0x%lx page_size: 0x%x\n" + " existing gpa: 0x%lx size: 0x%lx", + gpa, npages, vm->page_size, (uint64_t) region->region.guest_phys_addr, (uint64_t) region->region.memory_size); @@ -1009,8 +996,7 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, "already exists.\n" " requested slot: %u paddr: 0x%lx npages: 0x%lx\n" " existing slot: %u paddr: 0x%lx size: 0x%lx", - slot, guest_paddr, npages, - region->region.slot, + slot, gpa, npages, region->region.slot, (uint64_t) region->region.guest_phys_addr, (uint64_t) region->region.memory_size); } @@ -1036,7 +1022,7 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, if (src_type == VM_MEM_SRC_ANONYMOUS_THP) alignment = max(backing_src_pagesz, alignment); - TEST_ASSERT_EQ(guest_paddr, align_up(guest_paddr, backing_src_pagesz)); + TEST_ASSERT_EQ(gpa, align_up(gpa, backing_src_pagesz)); /* Add enough memory to align up if necessary */ if (alignment > 1) @@ -1084,8 +1070,7 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, * needing to track if the fd is owned by the framework * or by the caller. 
*/ - guest_memfd = dup(guest_memfd); - TEST_ASSERT(guest_memfd >= 0, __KVM_SYSCALL_ERROR("dup()", guest_memfd)); + guest_memfd = kvm_dup(guest_memfd); } region->region.guest_memfd = guest_memfd; @@ -1097,20 +1082,18 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, region->unused_phy_pages = sparsebit_alloc(); if (vm_arch_has_protected_memory(vm)) region->protected_phy_pages = sparsebit_alloc(); - sparsebit_set_num(region->unused_phy_pages, - guest_paddr >> vm->page_shift, npages); + sparsebit_set_num(region->unused_phy_pages, gpa >> vm->page_shift, npages); region->region.slot = slot; region->region.flags = flags; - region->region.guest_phys_addr = guest_paddr; + region->region.guest_phys_addr = gpa; region->region.memory_size = npages * vm->page_size; region->region.userspace_addr = (uintptr_t) region->host_mem; ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, ®ion->region); TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n" " rc: %i errno: %i\n" " slot: %u flags: 0x%x\n" - " guest_phys_addr: 0x%lx size: 0x%lx guest_memfd: %d", - ret, errno, slot, flags, - guest_paddr, (uint64_t) region->region.memory_size, + " guest_phys_addr: 0x%lx size: 0x%llx guest_memfd: %d", + ret, errno, slot, flags, gpa, region->region.memory_size, region->region.guest_memfd); /* Add to quick lookup data structures */ @@ -1132,10 +1115,10 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, void vm_userspace_mem_region_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, - uint64_t guest_paddr, uint32_t slot, - uint64_t npages, uint32_t flags) + uint64_t gpa, uint32_t slot, uint64_t npages, + uint32_t flags) { - vm_mem_add(vm, src_type, guest_paddr, slot, npages, flags, -1, 0); + vm_mem_add(vm, src_type, gpa, slot, npages, flags, -1, 0); } /* @@ -1201,6 +1184,16 @@ void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags) ret, errno, slot, flags); } +void vm_mem_region_reload(struct kvm_vm 
*vm, uint32_t slot) +{ + struct userspace_mem_region *region = memslot2region(vm, slot); + struct kvm_userspace_memory_region2 tmp = region->region; + + tmp.memory_size = 0; + vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &tmp); + vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, ®ion->region); +} + /* * VM Memory Region Move * @@ -1456,8 +1449,6 @@ static vm_vaddr_t ____vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, pages--, vaddr += vm->page_size, paddr += vm->page_size) { virt_pg_map(vm, vaddr, paddr); - - sparsebit_set(vm->vpages_mapped, vaddr >> vm->page_shift); } return vaddr_start; @@ -1571,7 +1562,6 @@ void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, while (npages--) { virt_pg_map(vm, vaddr, paddr); - sparsebit_set(vm->vpages_mapped, vaddr >> vm->page_shift); vaddr += page_size; paddr += page_size; @@ -2025,6 +2015,7 @@ static struct exit_reason { KVM_EXIT_STRING(NOTIFY), KVM_EXIT_STRING(LOONGARCH_IOCSR), KVM_EXIT_STRING(MEMORY_FAULT), + KVM_EXIT_STRING(ARM_SEA), }; /* @@ -2305,11 +2296,35 @@ __weak void kvm_selftest_arch_init(void) { } +static void report_unexpected_signal(int signum) +{ +#define KVM_CASE_SIGNUM(sig) \ + case sig: TEST_FAIL("Unexpected " #sig " (%d)\n", signum) + + switch (signum) { + KVM_CASE_SIGNUM(SIGBUS); + KVM_CASE_SIGNUM(SIGSEGV); + KVM_CASE_SIGNUM(SIGILL); + KVM_CASE_SIGNUM(SIGFPE); + default: + TEST_FAIL("Unexpected signal %d\n", signum); + } +} + void __attribute((constructor)) kvm_selftest_init(void) { + struct sigaction sig_sa = { + .sa_handler = report_unexpected_signal, + }; + /* Tell stdout not to buffer its content. 
*/ setbuf(stdout, NULL); + sigaction(SIGBUS, &sig_sa, NULL); + sigaction(SIGSEGV, &sig_sa, NULL); + sigaction(SIGILL, &sig_sa, NULL); + sigaction(SIGFPE, &sig_sa, NULL); + guest_random_seed = last_guest_seed = random(); pr_info("Random seed: 0x%x\n", guest_random_seed); diff --git a/tools/testing/selftests/kvm/lib/loongarch/exception.S b/tools/testing/selftests/kvm/lib/loongarch/exception.S index 88bfa505c6f5..3f1e4b67c5ae 100644 --- a/tools/testing/selftests/kvm/lib/loongarch/exception.S +++ b/tools/testing/selftests/kvm/lib/loongarch/exception.S @@ -51,9 +51,15 @@ handle_exception: st.d t0, sp, ESTAT_OFFSET_EXREGS csrrd t0, LOONGARCH_CSR_BADV st.d t0, sp, BADV_OFFSET_EXREGS + csrrd t0, LOONGARCH_CSR_PRMD + st.d t0, sp, PRMD_OFFSET_EXREGS or a0, sp, zero bl route_exception + ld.d t0, sp, PC_OFFSET_EXREGS + csrwr t0, LOONGARCH_CSR_ERA + ld.d t0, sp, PRMD_OFFSET_EXREGS + csrwr t0, LOONGARCH_CSR_PRMD restore_gprs sp csrrd sp, LOONGARCH_CSR_KS0 ertn diff --git a/tools/testing/selftests/kvm/lib/loongarch/processor.c b/tools/testing/selftests/kvm/lib/loongarch/processor.c index 0ac1abcb71cb..07c103369ddb 100644 --- a/tools/testing/selftests/kvm/lib/loongarch/processor.c +++ b/tools/testing/selftests/kvm/lib/loongarch/processor.c @@ -3,6 +3,7 @@ #include <assert.h> #include <linux/compiler.h> +#include <asm/kvm.h> #include "kvm_util.h" #include "processor.h" #include "ucall_common.h" @@ -11,6 +12,7 @@ #define LOONGARCH_GUEST_STACK_VADDR_MIN 0x200000 static vm_paddr_t invalid_pgtable[4]; +static vm_vaddr_t exception_handlers; static uint64_t virt_pte_index(struct kvm_vm *vm, vm_vaddr_t gva, int level) { @@ -183,7 +185,14 @@ void assert_on_unhandled_exception(struct kvm_vcpu *vcpu) void route_exception(struct ex_regs *regs) { + int vector; unsigned long pc, estat, badv; + struct handlers *handlers; + + handlers = (struct handlers *)exception_handlers; + vector = (regs->estat & CSR_ESTAT_EXC) >> CSR_ESTAT_EXC_SHIFT; + if (handlers && handlers->exception_handlers[vector]) + 
return handlers->exception_handlers[vector](regs); pc = regs->pc; badv = regs->badv; @@ -192,6 +201,32 @@ void route_exception(struct ex_regs *regs) while (1) ; } +void vm_init_descriptor_tables(struct kvm_vm *vm) +{ + void *addr; + + vm->handlers = __vm_vaddr_alloc(vm, sizeof(struct handlers), + LOONGARCH_GUEST_STACK_VADDR_MIN, MEM_REGION_DATA); + + addr = addr_gva2hva(vm, vm->handlers); + memset(addr, 0, vm->page_size); + exception_handlers = vm->handlers; + sync_global_to_guest(vm, exception_handlers); +} + +void vm_install_exception_handler(struct kvm_vm *vm, int vector, handler_fn handler) +{ + struct handlers *handlers = addr_gva2hva(vm, vm->handlers); + + assert(vector < VECTOR_NUM); + handlers->exception_handlers[vector] = handler; +} + +uint32_t guest_get_vcpuid(void) +{ + return csr_read(LOONGARCH_CSR_CPUID); +} + void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...) { int i; @@ -211,6 +246,11 @@ void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...) vcpu_regs_set(vcpu, ®s); } +static void loongarch_set_reg(struct kvm_vcpu *vcpu, uint64_t id, uint64_t val) +{ + __vcpu_set_reg(vcpu, id, val); +} + static void loongarch_get_csr(struct kvm_vcpu *vcpu, uint64_t id, void *addr) { uint64_t csrid; @@ -242,8 +282,8 @@ static void loongarch_vcpu_setup(struct kvm_vcpu *vcpu) TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode); } - /* user mode and page enable mode */ - val = PLV_USER | CSR_CRMD_PG; + /* kernel mode and page enable mode */ + val = PLV_KERN | CSR_CRMD_PG; loongarch_set_csr(vcpu, LOONGARCH_CSR_CRMD, val); loongarch_set_csr(vcpu, LOONGARCH_CSR_PRMD, val); loongarch_set_csr(vcpu, LOONGARCH_CSR_EUEN, 1); @@ -251,7 +291,10 @@ static void loongarch_vcpu_setup(struct kvm_vcpu *vcpu) loongarch_set_csr(vcpu, LOONGARCH_CSR_TCFG, 0); loongarch_set_csr(vcpu, LOONGARCH_CSR_ASID, 1); + /* time count start from 0 */ val = 0; + loongarch_set_reg(vcpu, KVM_REG_LOONGARCH_COUNTER, val); + width = vm->page_shift - 3; switch (vm->pgtable_levels) { 
diff --git a/tools/testing/selftests/kvm/lib/x86/memstress.c b/tools/testing/selftests/kvm/lib/x86/memstress.c index 7f5d62a65c68..0b1f288ad556 100644 --- a/tools/testing/selftests/kvm/lib/x86/memstress.c +++ b/tools/testing/selftests/kvm/lib/x86/memstress.c @@ -63,7 +63,7 @@ void memstress_setup_ept(struct vmx_pages *vmx, struct kvm_vm *vm) { uint64_t start, end; - prepare_eptp(vmx, vm, 0); + prepare_eptp(vmx, vm); /* * Identity map the first 4G and the test region with 1G pages so that diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index b418502c5ecc..36104d27f3d9 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -158,10 +158,10 @@ bool kvm_is_tdp_enabled(void) void virt_arch_pgd_alloc(struct kvm_vm *vm) { - TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " - "unknown or unsupported guest mode, mode: 0x%x", vm->mode); + TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, + "Unknown or unsupported guest mode: 0x%x", vm->mode); - /* If needed, create page map l4 table. */ + /* If needed, create the top-level page table. 
*/ if (!vm->pgd_created) { vm->pgd = vm_alloc_page_table(vm); vm->pgd_created = true; @@ -218,11 +218,11 @@ static uint64_t *virt_create_upper_pte(struct kvm_vm *vm, void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level) { const uint64_t pg_size = PG_LEVEL_SIZE(level); - uint64_t *pml4e, *pdpe, *pde; - uint64_t *pte; + uint64_t *pte = &vm->pgd; + int current_level; - TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, - "Unknown or unsupported guest mode, mode: 0x%x", vm->mode); + TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, + "Unknown or unsupported guest mode: 0x%x", vm->mode); TEST_ASSERT((vaddr % pg_size) == 0, "Virtual address not aligned,\n" @@ -243,20 +243,17 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level) * Allocate upper level page tables, if not already present. Return * early if a hugepage was created. */ - pml4e = virt_create_upper_pte(vm, &vm->pgd, vaddr, paddr, PG_LEVEL_512G, level); - if (*pml4e & PTE_LARGE_MASK) - return; - - pdpe = virt_create_upper_pte(vm, pml4e, vaddr, paddr, PG_LEVEL_1G, level); - if (*pdpe & PTE_LARGE_MASK) - return; - - pde = virt_create_upper_pte(vm, pdpe, vaddr, paddr, PG_LEVEL_2M, level); - if (*pde & PTE_LARGE_MASK) - return; + for (current_level = vm->pgtable_levels; + current_level > PG_LEVEL_4K; + current_level--) { + pte = virt_create_upper_pte(vm, pte, vaddr, paddr, + current_level, level); + if (*pte & PTE_LARGE_MASK) + return; + } /* Fill in page table entry. 
*/ - pte = virt_get_pte(vm, pde, vaddr, PG_LEVEL_4K); + pte = virt_get_pte(vm, pte, vaddr, PG_LEVEL_4K); TEST_ASSERT(!(*pte & PTE_PRESENT_MASK), "PTE already present for 4k page at vaddr: 0x%lx", vaddr); *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK); @@ -289,6 +286,8 @@ void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, for (i = 0; i < nr_pages; i++) { __virt_pg_map(vm, vaddr, paddr, level); + sparsebit_set_num(vm->vpages_mapped, vaddr >> vm->page_shift, + nr_bytes / PAGE_SIZE); vaddr += pg_size; paddr += pg_size; @@ -310,40 +309,38 @@ static bool vm_is_target_pte(uint64_t *pte, int *level, int current_level) uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, int *level) { - uint64_t *pml4e, *pdpe, *pde; + int va_width = 12 + (vm->pgtable_levels) * 9; + uint64_t *pte = &vm->pgd; + int current_level; TEST_ASSERT(!vm->arch.is_pt_protected, "Walking page tables of protected guests is impossible"); - TEST_ASSERT(*level >= PG_LEVEL_NONE && *level < PG_LEVEL_NUM, + TEST_ASSERT(*level >= PG_LEVEL_NONE && *level <= vm->pgtable_levels, "Invalid PG_LEVEL_* '%d'", *level); - TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " - "unknown or unsupported guest mode, mode: 0x%x", vm->mode); + TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, + "Unknown or unsupported guest mode: 0x%x", vm->mode); TEST_ASSERT(sparsebit_is_set(vm->vpages_valid, (vaddr >> vm->page_shift)), "Invalid virtual address, vaddr: 0x%lx", vaddr); /* - * Based on the mode check above there are 48 bits in the vaddr, so - * shift 16 to sign extend the last bit (bit-47), + * Check that the vaddr is a sign-extended va_width value. */ - TEST_ASSERT(vaddr == (((int64_t)vaddr << 16) >> 16), - "Canonical check failed. 
The virtual address is invalid."); - - pml4e = virt_get_pte(vm, &vm->pgd, vaddr, PG_LEVEL_512G); - if (vm_is_target_pte(pml4e, level, PG_LEVEL_512G)) - return pml4e; - - pdpe = virt_get_pte(vm, pml4e, vaddr, PG_LEVEL_1G); - if (vm_is_target_pte(pdpe, level, PG_LEVEL_1G)) - return pdpe; - - pde = virt_get_pte(vm, pdpe, vaddr, PG_LEVEL_2M); - if (vm_is_target_pte(pde, level, PG_LEVEL_2M)) - return pde; + TEST_ASSERT(vaddr == + (((int64_t)vaddr << (64 - va_width) >> (64 - va_width))), + "Canonical check failed. The virtual address is invalid."); + + for (current_level = vm->pgtable_levels; + current_level > PG_LEVEL_4K; + current_level--) { + pte = virt_get_pte(vm, pte, vaddr, current_level); + if (vm_is_target_pte(pte, level, current_level)) + return pte; + } - return virt_get_pte(vm, pde, vaddr, PG_LEVEL_4K); + return virt_get_pte(vm, pte, vaddr, PG_LEVEL_4K); } uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr) @@ -526,7 +523,8 @@ static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu) { struct kvm_sregs sregs; - TEST_ASSERT_EQ(vm->mode, VM_MODE_PXXV48_4K); + TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, + "Unknown or unsupported guest mode: 0x%x", vm->mode); /* Set mode specific system register values. 
*/ vcpu_sregs_get(vcpu, &sregs); @@ -540,6 +538,8 @@ static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu) sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR; if (kvm_cpu_has(X86_FEATURE_XSAVE)) sregs.cr4 |= X86_CR4_OSXSAVE; + if (vm->pgtable_levels == 5) + sregs.cr4 |= X86_CR4_LA57; sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX); kvm_seg_set_unusable(&sregs.ldt); diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c index d4d1208dd023..29b082a58daa 100644 --- a/tools/testing/selftests/kvm/lib/x86/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86/vmx.c @@ -401,11 +401,11 @@ void __nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, struct eptPageTableEntry *pt = vmx->eptp_hva, *pte; uint16_t index; - TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " - "unknown or unsupported guest mode, mode: 0x%x", vm->mode); + TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, + "Unknown or unsupported guest mode: 0x%x", vm->mode); TEST_ASSERT((nested_paddr >> 48) == 0, - "Nested physical address 0x%lx requires 5-level paging", + "Nested physical address 0x%lx is > 48-bits and requires 5-level EPT", nested_paddr); TEST_ASSERT((nested_paddr % page_size) == 0, "Nested physical address not on page boundary,\n" @@ -534,8 +534,7 @@ bool kvm_cpu_has_ept(void) return ctrl & SECONDARY_EXEC_ENABLE_EPT; } -void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm, - uint32_t eptp_memslot) +void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm) { TEST_ASSERT(kvm_cpu_has_ept(), "KVM doesn't support nested EPT"); diff --git a/tools/testing/selftests/kvm/loongarch/arch_timer.c b/tools/testing/selftests/kvm/loongarch/arch_timer.c new file mode 100644 index 000000000000..355ecac30954 --- /dev/null +++ b/tools/testing/selftests/kvm/loongarch/arch_timer.c @@ -0,0 +1,200 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * The test validates periodic/one-shot constant timer IRQ using + * CSR.TCFG and CSR.TVAL registers. 
+ */ +#include "arch_timer.h" +#include "kvm_util.h" +#include "processor.h" +#include "timer_test.h" +#include "ucall_common.h" + +static void do_idle(void) +{ + unsigned int intid; + unsigned long estat; + + __asm__ __volatile__("idle 0" : : : "memory"); + + estat = csr_read(LOONGARCH_CSR_ESTAT); + intid = !!(estat & BIT(INT_TI)); + + /* Make sure pending timer IRQ arrived */ + GUEST_ASSERT_EQ(intid, 1); + csr_write(CSR_TINTCLR_TI, LOONGARCH_CSR_TINTCLR); +} + +static void guest_irq_handler(struct ex_regs *regs) +{ + unsigned int intid; + uint32_t cpu = guest_get_vcpuid(); + uint64_t xcnt, val, cfg, xcnt_diff_us; + struct test_vcpu_shared_data *shared_data = &vcpu_shared_data[cpu]; + + intid = !!(regs->estat & BIT(INT_TI)); + + /* Make sure we are dealing with the correct timer IRQ */ + GUEST_ASSERT_EQ(intid, 1); + + cfg = timer_get_cfg(); + if (cfg & CSR_TCFG_PERIOD) { + WRITE_ONCE(shared_data->nr_iter, shared_data->nr_iter - 1); + if (shared_data->nr_iter == 0) + disable_timer(); + csr_write(CSR_TINTCLR_TI, LOONGARCH_CSR_TINTCLR); + return; + } + + /* + * On real machine, value of LOONGARCH_CSR_TVAL is BIT_ULL(48) - 1 + * On virtual machine, its value counts down from BIT_ULL(48) - 1 + */ + val = timer_get_val(); + xcnt = timer_get_cycles(); + xcnt_diff_us = cycles_to_usec(xcnt - shared_data->xcnt); + + /* Basic 'timer condition met' check */ + __GUEST_ASSERT(val > cfg, + "val = 0x%lx, cfg = 0x%lx, xcnt_diff_us = 0x%lx", + val, cfg, xcnt_diff_us); + + csr_write(CSR_TINTCLR_TI, LOONGARCH_CSR_TINTCLR); + WRITE_ONCE(shared_data->nr_iter, shared_data->nr_iter + 1); +} + +static void guest_test_period_timer(uint32_t cpu) +{ + uint32_t irq_iter, config_iter; + uint64_t us; + struct test_vcpu_shared_data *shared_data = &vcpu_shared_data[cpu]; + + shared_data->nr_iter = test_args.nr_iter; + shared_data->xcnt = timer_get_cycles(); + us = msecs_to_usecs(test_args.timer_period_ms) + test_args.timer_err_margin_us; + timer_set_next_cmp_ms(test_args.timer_period_ms, true); + 
+ for (config_iter = 0; config_iter < test_args.nr_iter; config_iter++) { + /* Setup a timeout for the interrupt to arrive */ + udelay(us); + } + + irq_iter = READ_ONCE(shared_data->nr_iter); + __GUEST_ASSERT(irq_iter == 0, + "irq_iter = 0x%x.\n" + " Guest period timer interrupt was not triggered within the specified\n" + " interval, try to increase the error margin by [-e] option.\n", + irq_iter); +} + +static void guest_test_oneshot_timer(uint32_t cpu) +{ + uint32_t irq_iter, config_iter; + uint64_t us; + struct test_vcpu_shared_data *shared_data = &vcpu_shared_data[cpu]; + + shared_data->nr_iter = 0; + shared_data->guest_stage = 0; + us = msecs_to_usecs(test_args.timer_period_ms) + test_args.timer_err_margin_us; + for (config_iter = 0; config_iter < test_args.nr_iter; config_iter++) { + shared_data->xcnt = timer_get_cycles(); + + /* Setup the next interrupt */ + timer_set_next_cmp_ms(test_args.timer_period_ms, false); + /* Setup a timeout for the interrupt to arrive */ + udelay(us); + + irq_iter = READ_ONCE(shared_data->nr_iter); + __GUEST_ASSERT(config_iter + 1 == irq_iter, + "config_iter + 1 = 0x%x, irq_iter = 0x%x.\n" + " Guest timer interrupt was not triggered within the specified\n" + " interval, try to increase the error margin by [-e] option.\n", + config_iter + 1, irq_iter); + } +} + +static void guest_test_emulate_timer(uint32_t cpu) +{ + uint32_t config_iter; + uint64_t xcnt_diff_us, us; + struct test_vcpu_shared_data *shared_data = &vcpu_shared_data[cpu]; + + local_irq_disable(); + shared_data->nr_iter = 0; + us = msecs_to_usecs(test_args.timer_period_ms); + for (config_iter = 0; config_iter < test_args.nr_iter; config_iter++) { + shared_data->xcnt = timer_get_cycles(); + + /* Setup the next interrupt */ + timer_set_next_cmp_ms(test_args.timer_period_ms, false); + do_idle(); + + xcnt_diff_us = cycles_to_usec(timer_get_cycles() - shared_data->xcnt); + __GUEST_ASSERT(xcnt_diff_us >= us, + "xcnt_diff_us = 0x%lx, us = 0x%lx.\n", + xcnt_diff_us, us); + } + 
local_irq_enable(); +} + +static void guest_time_count_test(uint32_t cpu) +{ + uint32_t config_iter; + unsigned long start, end, prev, us; + + /* Assuming that test case starts to run in 1 second */ + start = timer_get_cycles(); + us = msec_to_cycles(1000); + __GUEST_ASSERT(start <= us, + "start = 0x%lx, us = 0x%lx.\n", + start, us); + + us = msec_to_cycles(test_args.timer_period_ms); + for (config_iter = 0; config_iter < test_args.nr_iter; config_iter++) { + start = timer_get_cycles(); + end = start + us; + /* test time count growing up always */ + while (start < end) { + prev = start; + start = timer_get_cycles(); + __GUEST_ASSERT(prev <= start, + "prev = 0x%lx, start = 0x%lx.\n", + prev, start); + } + } +} + +static void guest_code(void) +{ + uint32_t cpu = guest_get_vcpuid(); + + /* must run at first */ + guest_time_count_test(cpu); + + timer_irq_enable(); + local_irq_enable(); + guest_test_period_timer(cpu); + guest_test_oneshot_timer(cpu); + guest_test_emulate_timer(cpu); + + GUEST_DONE(); +} + +struct kvm_vm *test_vm_create(void) +{ + struct kvm_vm *vm; + int nr_vcpus = test_args.nr_vcpus; + + vm = vm_create_with_vcpus(nr_vcpus, guest_code, vcpus); + vm_init_descriptor_tables(vm); + vm_install_exception_handler(vm, EXCCODE_INT, guest_irq_handler); + + /* Make all the test's cmdline args visible to the guest */ + sync_global_to_guest(vm, test_args); + + return vm; +} + +void test_vm_cleanup(struct kvm_vm *vm) +{ + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/mmu_stress_test.c b/tools/testing/selftests/kvm/mmu_stress_test.c index 37b7e6524533..51c070556f3e 100644 --- a/tools/testing/selftests/kvm/mmu_stress_test.c +++ b/tools/testing/selftests/kvm/mmu_stress_test.c @@ -263,8 +263,10 @@ static void calc_default_nr_vcpus(void) TEST_ASSERT(!r, "sched_getaffinity failed, errno = %d (%s)", errno, strerror(errno)); - nr_vcpus = CPU_COUNT(&possible_mask) * 3/4; + nr_vcpus = CPU_COUNT(&possible_mask); TEST_ASSERT(nr_vcpus > 0, "Uh, no CPUs?"); + if 
(nr_vcpus >= 2) + nr_vcpus = nr_vcpus * 3/4; } int main(int argc, char *argv[]) @@ -360,11 +362,9 @@ int main(int argc, char *argv[]) #ifdef __x86_64__ /* Identity map memory in the guest using 1gb pages. */ - for (i = 0; i < slot_size; i += SZ_1G) - __virt_pg_map(vm, gpa + i, gpa + i, PG_LEVEL_1G); + virt_map_level(vm, gpa, gpa, slot_size, PG_LEVEL_1G); #else - for (i = 0; i < slot_size; i += vm->page_size) - virt_pg_map(vm, gpa + i, gpa + i); + virt_map(vm, gpa, gpa, slot_size >> vm->page_shift); #endif } diff --git a/tools/testing/selftests/kvm/pre_fault_memory_test.c b/tools/testing/selftests/kvm/pre_fault_memory_test.c index f04768c1d2e4..93e603d91311 100644 --- a/tools/testing/selftests/kvm/pre_fault_memory_test.c +++ b/tools/testing/selftests/kvm/pre_fault_memory_test.c @@ -17,13 +17,13 @@ #define TEST_NPAGES (TEST_SIZE / PAGE_SIZE) #define TEST_SLOT 10 -static void guest_code(uint64_t base_gpa) +static void guest_code(uint64_t base_gva) { volatile uint64_t val __used; int i; for (i = 0; i < TEST_NPAGES; i++) { - uint64_t *src = (uint64_t *)(base_gpa + i * PAGE_SIZE); + uint64_t *src = (uint64_t *)(base_gva + i * PAGE_SIZE); val = *src; } @@ -161,6 +161,7 @@ static void pre_fault_memory(struct kvm_vcpu *vcpu, u64 base_gpa, u64 offset, static void __test_pre_fault_memory(unsigned long vm_type, bool private) { + uint64_t gpa, gva, alignment, guest_page_size; const struct vm_shape shape = { .mode = VM_MODE_DEFAULT, .type = vm_type, @@ -170,35 +171,30 @@ static void __test_pre_fault_memory(unsigned long vm_type, bool private) struct kvm_vm *vm; struct ucall uc; - uint64_t guest_test_phys_mem; - uint64_t guest_test_virt_mem; - uint64_t alignment, guest_page_size; - vm = vm_create_shape_with_one_vcpu(shape, &vcpu, guest_code); alignment = guest_page_size = vm_guest_mode_params[VM_MODE_DEFAULT].page_size; - guest_test_phys_mem = (vm->max_gfn - TEST_NPAGES) * guest_page_size; + gpa = (vm->max_gfn - TEST_NPAGES) * guest_page_size; #ifdef __s390x__ alignment = 
max(0x100000UL, guest_page_size); #else alignment = SZ_2M; #endif - guest_test_phys_mem = align_down(guest_test_phys_mem, alignment); - guest_test_virt_mem = guest_test_phys_mem & ((1ULL << (vm->va_bits - 1)) - 1); + gpa = align_down(gpa, alignment); + gva = gpa & ((1ULL << (vm->va_bits - 1)) - 1); - vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, - guest_test_phys_mem, TEST_SLOT, TEST_NPAGES, - private ? KVM_MEM_GUEST_MEMFD : 0); - virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, TEST_NPAGES); + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, gpa, TEST_SLOT, + TEST_NPAGES, private ? KVM_MEM_GUEST_MEMFD : 0); + virt_map(vm, gva, gpa, TEST_NPAGES); if (private) - vm_mem_set_private(vm, guest_test_phys_mem, TEST_SIZE); + vm_mem_set_private(vm, gpa, TEST_SIZE); - pre_fault_memory(vcpu, guest_test_phys_mem, 0, SZ_2M, 0, private); - pre_fault_memory(vcpu, guest_test_phys_mem, SZ_2M, PAGE_SIZE * 2, PAGE_SIZE, private); - pre_fault_memory(vcpu, guest_test_phys_mem, TEST_SIZE, PAGE_SIZE, PAGE_SIZE, private); + pre_fault_memory(vcpu, gpa, 0, SZ_2M, 0, private); + pre_fault_memory(vcpu, gpa, SZ_2M, PAGE_SIZE * 2, PAGE_SIZE, private); + pre_fault_memory(vcpu, gpa, TEST_SIZE, PAGE_SIZE, PAGE_SIZE, private); - vcpu_args_set(vcpu, 1, guest_test_virt_mem); + vcpu_args_set(vcpu, 1, gva); vcpu_run(vcpu); run = vcpu->run; diff --git a/tools/testing/selftests/kvm/riscv/get-reg-list.c b/tools/testing/selftests/kvm/riscv/get-reg-list.c index 705ab3d7778b..cb54a56990a0 100644 --- a/tools/testing/selftests/kvm/riscv/get-reg-list.c +++ b/tools/testing/selftests/kvm/riscv/get-reg-list.c @@ -133,6 +133,7 @@ bool filter_reg(__u64 reg) case KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_SUSP: case KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_STA: case KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_FWFT: + case KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_MPXY: case 
KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_EXPERIMENTAL: case KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_VENDOR: return true; @@ -639,6 +640,7 @@ static const char *sbi_ext_single_id_to_str(__u64 reg_off) KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_SUSP), KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_STA), KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_FWFT), + KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_MPXY), KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_EXPERIMENTAL), KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_VENDOR), }; @@ -1142,6 +1144,7 @@ KVM_SBI_EXT_SUBLIST_CONFIG(sta, STA); KVM_SBI_EXT_SIMPLE_CONFIG(pmu, PMU); KVM_SBI_EXT_SIMPLE_CONFIG(dbcn, DBCN); KVM_SBI_EXT_SIMPLE_CONFIG(susp, SUSP); +KVM_SBI_EXT_SIMPLE_CONFIG(mpxy, MPXY); KVM_SBI_EXT_SUBLIST_CONFIG(fwft, FWFT); KVM_ISA_EXT_SUBLIST_CONFIG(aia, AIA); @@ -1222,6 +1225,7 @@ struct vcpu_reg_list *vcpu_configs[] = { &config_sbi_pmu, &config_sbi_dbcn, &config_sbi_susp, + &config_sbi_mpxy, &config_sbi_fwft, &config_aia, &config_fp_f, diff --git a/tools/testing/selftests/kvm/s390/user_operexec.c b/tools/testing/selftests/kvm/s390/user_operexec.c new file mode 100644 index 000000000000..714906c1d12a --- /dev/null +++ b/tools/testing/selftests/kvm/s390/user_operexec.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Test operation exception forwarding. + * + * Copyright IBM Corp. 
2025 + * + * Authors: + * Janosch Frank <frankja@linux.ibm.com> + */ +#include "kselftest.h" +#include "kvm_util.h" +#include "test_util.h" +#include "sie.h" + +#include <linux/kvm.h> + +static void guest_code_instr0(void) +{ + asm(".word 0x0000"); +} + +static void test_user_instr0(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int rc; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code_instr0); + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_INSTR0, 0); + TEST_ASSERT_EQ(0, rc); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_OPEREXC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0); + + kvm_vm_free(vm); +} + +static void guest_code_user_operexec(void) +{ + asm(".word 0x0807"); +} + +static void test_user_operexec(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int rc; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code_user_operexec); + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_OPEREXEC, 0); + TEST_ASSERT_EQ(0, rc); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_OPEREXC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0x0807); + + kvm_vm_free(vm); + + /* + * Since user_operexec is the superset it can be used for the + * 0 instruction. 
+ */ + vm = vm_create_with_one_vcpu(&vcpu, guest_code_instr0); + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_OPEREXEC, 0); + TEST_ASSERT_EQ(0, rc); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_OPEREXC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0); + + kvm_vm_free(vm); +} + +/* combine user_instr0 and user_operexec */ +static void test_user_operexec_combined(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int rc; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code_user_operexec); + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_INSTR0, 0); + TEST_ASSERT_EQ(0, rc); + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_OPEREXEC, 0); + TEST_ASSERT_EQ(0, rc); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_OPEREXC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0x0807); + + kvm_vm_free(vm); + + /* Reverse enablement order */ + vm = vm_create_with_one_vcpu(&vcpu, guest_code_user_operexec); + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_OPEREXEC, 0); + TEST_ASSERT_EQ(0, rc); + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_INSTR0, 0); + TEST_ASSERT_EQ(0, rc); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_OPEREXC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0x0807); + + kvm_vm_free(vm); +} + +/* + * Run all tests above. + * + * Enablement after VCPU has been added is automatically tested since + * we enable the capability after VCPU creation. 
+ */ +static struct testdef { + const char *name; + void (*test)(void); +} testlist[] = { + { "instr0", test_user_instr0 }, + { "operexec", test_user_operexec }, + { "operexec_combined", test_user_operexec_combined}, +}; + +int main(int argc, char *argv[]) +{ + int idx; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_S390_USER_INSTR0)); + + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(testlist)); + for (idx = 0; idx < ARRAY_SIZE(testlist); idx++) { + testlist[idx].test(); + ksft_test_result_pass("%s\n", testlist[idx].name); + } + ksft_finished(); +} diff --git a/tools/testing/selftests/kvm/x86/hyperv_features.c b/tools/testing/selftests/kvm/x86/hyperv_features.c index 99d327084172..130b9ce7e5dd 100644 --- a/tools/testing/selftests/kvm/x86/hyperv_features.c +++ b/tools/testing/selftests/kvm/x86/hyperv_features.c @@ -94,7 +94,7 @@ static void guest_hcall(vm_vaddr_t pgs_gpa, struct hcall_data *hcall) if (!(hcall->control & HV_HYPERCALL_FAST_BIT)) { input = pgs_gpa; - output = pgs_gpa + 4096; + output = pgs_gpa + PAGE_SIZE; } else { input = output = 0; } diff --git a/tools/testing/selftests/kvm/x86/hyperv_ipi.c b/tools/testing/selftests/kvm/x86/hyperv_ipi.c index 2b5b4bc6ef7e..ca61836c4e32 100644 --- a/tools/testing/selftests/kvm/x86/hyperv_ipi.c +++ b/tools/testing/selftests/kvm/x86/hyperv_ipi.c @@ -102,7 +102,7 @@ static void sender_guest_code(void *hcall_page, vm_vaddr_t pgs_gpa) /* 'Slow' HvCallSendSyntheticClusterIpi to RECEIVER_VCPU_ID_1 */ ipi->vector = IPI_VECTOR; ipi->cpu_mask = 1 << RECEIVER_VCPU_ID_1; - hyperv_hypercall(HVCALL_SEND_IPI, pgs_gpa, pgs_gpa + 4096); + hyperv_hypercall(HVCALL_SEND_IPI, pgs_gpa, pgs_gpa + PAGE_SIZE); nop_loop(); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ipis_expected[1]); @@ -116,13 +116,13 @@ static void sender_guest_code(void *hcall_page, vm_vaddr_t pgs_gpa) GUEST_SYNC(stage++); /* 'Slow' HvCallSendSyntheticClusterIpiEx to RECEIVER_VCPU_ID_1 */ - memset(hcall_page, 
0, 4096); + memset(hcall_page, 0, PAGE_SIZE); ipi_ex->vector = IPI_VECTOR; ipi_ex->vp_set.format = HV_GENERIC_SET_SPARSE_4K; ipi_ex->vp_set.valid_bank_mask = 1 << 0; ipi_ex->vp_set.bank_contents[0] = BIT(RECEIVER_VCPU_ID_1); hyperv_hypercall(HVCALL_SEND_IPI_EX | (1 << HV_HYPERCALL_VARHEAD_OFFSET), - pgs_gpa, pgs_gpa + 4096); + pgs_gpa, pgs_gpa + PAGE_SIZE); nop_loop(); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ipis_expected[1]); @@ -138,13 +138,13 @@ static void sender_guest_code(void *hcall_page, vm_vaddr_t pgs_gpa) GUEST_SYNC(stage++); /* 'Slow' HvCallSendSyntheticClusterIpiEx to RECEIVER_VCPU_ID_2 */ - memset(hcall_page, 0, 4096); + memset(hcall_page, 0, PAGE_SIZE); ipi_ex->vector = IPI_VECTOR; ipi_ex->vp_set.format = HV_GENERIC_SET_SPARSE_4K; ipi_ex->vp_set.valid_bank_mask = 1 << 1; ipi_ex->vp_set.bank_contents[0] = BIT(RECEIVER_VCPU_ID_2 - 64); hyperv_hypercall(HVCALL_SEND_IPI_EX | (1 << HV_HYPERCALL_VARHEAD_OFFSET), - pgs_gpa, pgs_gpa + 4096); + pgs_gpa, pgs_gpa + PAGE_SIZE); nop_loop(); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ipis_expected[0]); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]); @@ -160,14 +160,14 @@ static void sender_guest_code(void *hcall_page, vm_vaddr_t pgs_gpa) GUEST_SYNC(stage++); /* 'Slow' HvCallSendSyntheticClusterIpiEx to both RECEIVER_VCPU_ID_{1,2} */ - memset(hcall_page, 0, 4096); + memset(hcall_page, 0, PAGE_SIZE); ipi_ex->vector = IPI_VECTOR; ipi_ex->vp_set.format = HV_GENERIC_SET_SPARSE_4K; ipi_ex->vp_set.valid_bank_mask = 1 << 1 | 1; ipi_ex->vp_set.bank_contents[0] = BIT(RECEIVER_VCPU_ID_1); ipi_ex->vp_set.bank_contents[1] = BIT(RECEIVER_VCPU_ID_2 - 64); hyperv_hypercall(HVCALL_SEND_IPI_EX | (2 << HV_HYPERCALL_VARHEAD_OFFSET), - pgs_gpa, pgs_gpa + 4096); + pgs_gpa, pgs_gpa + PAGE_SIZE); nop_loop(); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]); @@ 
-183,10 +183,10 @@ static void sender_guest_code(void *hcall_page, vm_vaddr_t pgs_gpa) GUEST_SYNC(stage++); /* 'Slow' HvCallSendSyntheticClusterIpiEx to HV_GENERIC_SET_ALL */ - memset(hcall_page, 0, 4096); + memset(hcall_page, 0, PAGE_SIZE); ipi_ex->vector = IPI_VECTOR; ipi_ex->vp_set.format = HV_GENERIC_SET_ALL; - hyperv_hypercall(HVCALL_SEND_IPI_EX, pgs_gpa, pgs_gpa + 4096); + hyperv_hypercall(HVCALL_SEND_IPI_EX, pgs_gpa, pgs_gpa + PAGE_SIZE); nop_loop(); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]); diff --git a/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c b/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c index 077cd0ec3040..a3b7ce155981 100644 --- a/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c +++ b/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c @@ -621,7 +621,7 @@ int main(int argc, char *argv[]) for (i = 0; i < NTEST_PAGES; i++) { pte = vm_get_page_table_entry(vm, data->test_pages + i * PAGE_SIZE); gpa = addr_hva2gpa(vm, pte); - __virt_pg_map(vm, gva + PAGE_SIZE * i, gpa & PAGE_MASK, PG_LEVEL_4K); + virt_pg_map(vm, gva + PAGE_SIZE * i, gpa & PAGE_MASK); data->test_pages_pte[i] = gva + (gpa & ~PAGE_MASK); } diff --git a/tools/testing/selftests/kvm/x86/vmx_close_while_nested_test.c b/tools/testing/selftests/kvm/x86/nested_close_kvm_test.c index dad988351493..f001cb836bfa 100644 --- a/tools/testing/selftests/kvm/x86/vmx_close_while_nested_test.c +++ b/tools/testing/selftests/kvm/x86/nested_close_kvm_test.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * vmx_close_while_nested - * * Copyright (C) 2019, Red Hat, Inc. 
* * Verify that nothing bad happens if a KVM user exits with open @@ -12,6 +10,7 @@ #include "kvm_util.h" #include "processor.h" #include "vmx.h" +#include "svm_util.h" #include <string.h> #include <sys/ioctl.h> @@ -22,6 +21,8 @@ enum { PORT_L0_EXIT = 0x2000, }; +#define L2_GUEST_STACK_SIZE 64 + static void l2_guest_code(void) { /* Exit to L0 */ @@ -29,9 +30,8 @@ static void l2_guest_code(void) : : [port] "d" (PORT_L0_EXIT) : "rax"); } -static void l1_guest_code(struct vmx_pages *vmx_pages) +static void l1_vmx_code(struct vmx_pages *vmx_pages) { -#define L2_GUEST_STACK_SIZE 64 unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); @@ -45,19 +45,43 @@ static void l1_guest_code(struct vmx_pages *vmx_pages) GUEST_ASSERT(0); } +static void l1_svm_code(struct svm_test_data *svm) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + + /* Prepare the VMCB for L2 execution. */ + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT(0); +} + +static void l1_guest_code(void *data) +{ + if (this_cpu_has(X86_FEATURE_VMX)) + l1_vmx_code(data); + else + l1_svm_code(data); +} + int main(int argc, char *argv[]) { - vm_vaddr_t vmx_pages_gva; + vm_vaddr_t guest_gva; struct kvm_vcpu *vcpu; struct kvm_vm *vm; - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX) || + kvm_cpu_has(X86_FEATURE_SVM)); vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); - /* Allocate VMX pages and shared descriptors (vmx_pages). 
*/ - vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vcpu, 1, vmx_pages_gva); + if (kvm_cpu_has(X86_FEATURE_VMX)) + vcpu_alloc_vmx(vm, &guest_gva); + else + vcpu_alloc_svm(vm, &guest_gva); + + vcpu_args_set(vcpu, 1, guest_gva); for (;;) { volatile struct kvm_run *run = vcpu->run; diff --git a/tools/testing/selftests/kvm/x86/nested_invalid_cr3_test.c b/tools/testing/selftests/kvm/x86/nested_invalid_cr3_test.c new file mode 100644 index 000000000000..a6b6da9cf7fe --- /dev/null +++ b/tools/testing/selftests/kvm/x86/nested_invalid_cr3_test.c @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025, Google LLC. + * + * This test verifies that L1 fails to enter L2 with an invalid CR3, and + * succeeds otherwise. + */ +#include "kvm_util.h" +#include "vmx.h" +#include "svm_util.h" +#include "kselftest.h" + + +#define L2_GUEST_STACK_SIZE 64 + +static void l2_guest_code(void) +{ + vmcall(); +} + +static void l1_svm_code(struct svm_test_data *svm) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + uintptr_t save_cr3; + + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* Try to run L2 with invalid CR3 and make sure it fails */ + save_cr3 = svm->vmcb->save.cr3; + svm->vmcb->save.cr3 = -1ull; + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT(svm->vmcb->control.exit_code == SVM_EXIT_ERR); + + /* Now restore CR3 and make sure L2 runs successfully */ + svm->vmcb->save.cr3 = save_cr3; + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT(svm->vmcb->control.exit_code == SVM_EXIT_VMMCALL); + + GUEST_DONE(); +} + +static void l1_vmx_code(struct vmx_pages *vmx_pages) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + uintptr_t save_cr3; + + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* Try to run L2 with invalid CR3 and make sure it fails */ + save_cr3 = 
vmreadz(GUEST_CR3); + vmwrite(GUEST_CR3, -1ull); + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == + (EXIT_REASON_FAILED_VMENTRY | EXIT_REASON_INVALID_STATE)); + + /* Now restore CR3 and make sure L2 runs successfully */ + vmwrite(GUEST_CR3, save_cr3); + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + + GUEST_DONE(); +} + +static void l1_guest_code(void *data) +{ + if (this_cpu_has(X86_FEATURE_VMX)) + l1_vmx_code(data); + else + l1_svm_code(data); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + vm_vaddr_t guest_gva = 0; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX) || + kvm_cpu_has(X86_FEATURE_SVM)); + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + + if (kvm_cpu_has(X86_FEATURE_VMX)) + vcpu_alloc_vmx(vm, &guest_gva); + else + vcpu_alloc_svm(vm, &guest_gva); + + vcpu_args_set(vcpu, 1, guest_gva); + + for (;;) { + struct ucall uc; + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + case UCALL_SYNC: + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } + +done: + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/vmx_tsc_adjust_test.c b/tools/testing/selftests/kvm/x86/nested_tsc_adjust_test.c index 2ceb5c78c442..2839f650e5c9 100644 --- a/tools/testing/selftests/kvm/x86/vmx_tsc_adjust_test.c +++ b/tools/testing/selftests/kvm/x86/nested_tsc_adjust_test.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * vmx_tsc_adjust_test - * * Copyright (C) 2018, Google LLC. 
* * IA32_TSC_ADJUST test @@ -22,6 +20,7 @@ #include "kvm_util.h" #include "processor.h" #include "vmx.h" +#include "svm_util.h" #include <string.h> #include <sys/ioctl.h> @@ -35,6 +34,8 @@ #define TSC_ADJUST_VALUE (1ll << 32) #define TSC_OFFSET_VALUE -(1ll << 48) +#define L2_GUEST_STACK_SIZE 64 + enum { PORT_ABORT = 0x1000, PORT_REPORT, @@ -72,42 +73,47 @@ static void l2_guest_code(void) __asm__ __volatile__("vmcall"); } -static void l1_guest_code(struct vmx_pages *vmx_pages) +static void l1_guest_code(void *data) { -#define L2_GUEST_STACK_SIZE 64 unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - uint32_t control; - uintptr_t save_cr3; + /* Set TSC from L1 and make sure TSC_ADJUST is updated correctly */ GUEST_ASSERT(rdtsc() < TSC_ADJUST_VALUE); wrmsr(MSR_IA32_TSC, rdtsc() - TSC_ADJUST_VALUE); check_ia32_tsc_adjust(-1 * TSC_ADJUST_VALUE); - GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); - GUEST_ASSERT(load_vmcs(vmx_pages)); - - /* Prepare the VMCS for L2 execution. */ - prepare_vmcs(vmx_pages, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); - control = vmreadz(CPU_BASED_VM_EXEC_CONTROL); - control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETTING; - vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); - vmwrite(TSC_OFFSET, TSC_OFFSET_VALUE); - - /* Jump into L2. First, test failure to load guest CR3. */ - save_cr3 = vmreadz(GUEST_CR3); - vmwrite(GUEST_CR3, -1ull); - GUEST_ASSERT(!vmlaunch()); - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == - (EXIT_REASON_FAILED_VMENTRY | EXIT_REASON_INVALID_STATE)); - check_ia32_tsc_adjust(-1 * TSC_ADJUST_VALUE); - vmwrite(GUEST_CR3, save_cr3); - - GUEST_ASSERT(!vmlaunch()); - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + /* + * Run L2 with TSC_OFFSET. L2 will write to TSC, and L1 is not + * intercepting the write so it should update L1's TSC_ADJUST. 
+ */ + if (this_cpu_has(X86_FEATURE_VMX)) { + struct vmx_pages *vmx_pages = data; + uint32_t control; + + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + control = vmreadz(CPU_BASED_VM_EXEC_CONTROL); + control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETTING; + vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); + vmwrite(TSC_OFFSET, TSC_OFFSET_VALUE); + + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + } else { + struct svm_test_data *svm = data; + + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + svm->vmcb->control.tsc_offset = TSC_OFFSET_VALUE; + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT(svm->vmcb->control.exit_code == SVM_EXIT_VMMCALL); + } check_ia32_tsc_adjust(-2 * TSC_ADJUST_VALUE); - GUEST_DONE(); } @@ -119,16 +125,19 @@ static void report(int64_t val) int main(int argc, char *argv[]) { - vm_vaddr_t vmx_pages_gva; + vm_vaddr_t nested_gva; struct kvm_vcpu *vcpu; - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX) || + kvm_cpu_has(X86_FEATURE_SVM)); - vm = vm_create_with_one_vcpu(&vcpu, (void *) l1_guest_code); + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + if (kvm_cpu_has(X86_FEATURE_VMX)) + vcpu_alloc_vmx(vm, &nested_gva); + else + vcpu_alloc_svm(vm, &nested_gva); - /* Allocate VMX pages and shared descriptors (vmx_pages). 
*/ - vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vcpu, 1, vmx_pages_gva); + vcpu_args_set(vcpu, 1, nested_gva); for (;;) { struct ucall uc; diff --git a/tools/testing/selftests/kvm/x86/vmx_nested_tsc_scaling_test.c b/tools/testing/selftests/kvm/x86/nested_tsc_scaling_test.c index 1759fa5cb3f2..4260c9e4f489 100644 --- a/tools/testing/selftests/kvm/x86/vmx_nested_tsc_scaling_test.c +++ b/tools/testing/selftests/kvm/x86/nested_tsc_scaling_test.c @@ -13,6 +13,7 @@ #include "kvm_util.h" #include "vmx.h" +#include "svm_util.h" #include "kselftest.h" /* L2 is scaled up (from L1's perspective) by this factor */ @@ -79,7 +80,30 @@ static void l2_guest_code(void) __asm__ __volatile__("vmcall"); } -static void l1_guest_code(struct vmx_pages *vmx_pages) +static void l1_svm_code(struct svm_test_data *svm) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + + /* check that L1's frequency looks alright before launching L2 */ + check_tsc_freq(UCHECK_L1); + + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* enable TSC scaling for L2 */ + wrmsr(MSR_AMD64_TSC_RATIO, L2_SCALE_FACTOR << 32); + + /* launch L2 */ + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT(svm->vmcb->control.exit_code == SVM_EXIT_VMMCALL); + + /* check that L1's frequency still looks good */ + check_tsc_freq(UCHECK_L1); + + GUEST_DONE(); +} + +static void l1_vmx_code(struct vmx_pages *vmx_pages) { unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; uint32_t control; @@ -116,11 +140,19 @@ static void l1_guest_code(struct vmx_pages *vmx_pages) GUEST_DONE(); } +static void l1_guest_code(void *data) +{ + if (this_cpu_has(X86_FEATURE_VMX)) + l1_vmx_code(data); + else + l1_svm_code(data); +} + int main(int argc, char *argv[]) { struct kvm_vcpu *vcpu; struct kvm_vm *vm; - vm_vaddr_t vmx_pages_gva; + vm_vaddr_t guest_gva = 0; uint64_t tsc_start, tsc_end; uint64_t tsc_khz; @@ -129,7 +161,8 @@ int main(int argc, char *argv[]) uint64_t l1_tsc_freq = 0; uint64_t l2_tsc_freq 
= 0; - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX) || + kvm_cpu_has(X86_FEATURE_SVM)); TEST_REQUIRE(kvm_has_cap(KVM_CAP_TSC_CONTROL)); TEST_REQUIRE(sys_clocksource_is_based_on_tsc()); @@ -152,8 +185,13 @@ int main(int argc, char *argv[]) printf("real TSC frequency is around: %"PRIu64"\n", l0_tsc_freq); vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); - vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vcpu, 1, vmx_pages_gva); + + if (kvm_cpu_has(X86_FEATURE_VMX)) + vcpu_alloc_vmx(vm, &guest_gva); + else + vcpu_alloc_svm(vm, &guest_gva); + + vcpu_args_set(vcpu, 1, guest_gva); tsc_khz = __vcpu_ioctl(vcpu, KVM_GET_TSC_KHZ, NULL); TEST_ASSERT(tsc_khz != -1, "vcpu ioctl KVM_GET_TSC_KHZ failed"); diff --git a/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c index 82a8d88b5338..1969f4ab9b28 100644 --- a/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c +++ b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c @@ -380,7 +380,7 @@ static void test_mem_conversions(enum vm_mem_backing_src_type src_type, uint32_t struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; pthread_t threads[KVM_MAX_VCPUS]; struct kvm_vm *vm; - int memfd, i, r; + int memfd, i; const struct vm_shape shape = { .mode = VM_MODE_DEFAULT, @@ -428,11 +428,8 @@ static void test_mem_conversions(enum vm_mem_backing_src_type src_type, uint32_t * should prevent the VM from being fully destroyed until the last * reference to the guest_memfd is also put. 
*/ - r = fallocate(memfd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, memfd_size); - TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r)); - - r = fallocate(memfd, FALLOC_FL_KEEP_SIZE, 0, memfd_size); - TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r)); + kvm_fallocate(memfd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, memfd_size); + kvm_fallocate(memfd, FALLOC_FL_KEEP_SIZE, 0, memfd_size); close(memfd); } diff --git a/tools/testing/selftests/kvm/x86/sev_smoke_test.c b/tools/testing/selftests/kvm/x86/sev_smoke_test.c index 77256c89bb8d..86ad1c7d068f 100644 --- a/tools/testing/selftests/kvm/x86/sev_smoke_test.c +++ b/tools/testing/selftests/kvm/x86/sev_smoke_test.c @@ -104,7 +104,7 @@ static void test_sync_vmsa(uint32_t type, uint64_t policy) vm_sev_launch(vm, policy, NULL); /* This page is shared, so make it decrypted. */ - memset(hva, 0, 4096); + memset(hva, 0, PAGE_SIZE); vcpu_run(vcpu); diff --git a/tools/testing/selftests/kvm/x86/state_test.c b/tools/testing/selftests/kvm/x86/state_test.c index 141b7fc0c965..f2c7a1c297e3 100644 --- a/tools/testing/selftests/kvm/x86/state_test.c +++ b/tools/testing/selftests/kvm/x86/state_test.c @@ -141,7 +141,7 @@ static void __attribute__((__flatten__)) guest_code(void *arg) if (this_cpu_has(X86_FEATURE_XSAVE)) { uint64_t supported_xcr0 = this_cpu_supported_xcr0(); - uint8_t buffer[4096]; + uint8_t buffer[PAGE_SIZE]; memset(buffer, 0xcc, sizeof(buffer)); diff --git a/tools/testing/selftests/kvm/x86/userspace_io_test.c b/tools/testing/selftests/kvm/x86/userspace_io_test.c index 9481cbcf284f..be7d72f3c029 100644 --- a/tools/testing/selftests/kvm/x86/userspace_io_test.c +++ b/tools/testing/selftests/kvm/x86/userspace_io_test.c @@ -85,7 +85,7 @@ int main(int argc, char *argv[]) regs.rcx = 1; if (regs.rcx == 3) regs.rcx = 8192; - memset((void *)run + run->io.data_offset, 0xaa, 4096); + memset((void *)run + run->io.data_offset, 0xaa, PAGE_SIZE); vcpu_regs_set(vcpu, ®s); } diff --git 
a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c index fa512d033205..98cb6bdab3e6 100644 --- a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c @@ -120,17 +120,17 @@ static void test_vmx_dirty_log(bool enable_ept) * GPAs as the EPT enabled case. */ if (enable_ept) { - prepare_eptp(vmx, vm, 0); + prepare_eptp(vmx, vm); nested_map_memslot(vmx, vm, 0); - nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, 4096); - nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, 4096); + nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE); + nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE); } bmap = bitmap_zalloc(TEST_MEM_PAGES); host_test_mem = addr_gpa2hva(vm, GUEST_TEST_MEM); while (!done) { - memset(host_test_mem, 0xaa, TEST_MEM_PAGES * 4096); + memset(host_test_mem, 0xaa, TEST_MEM_PAGES * PAGE_SIZE); vcpu_run(vcpu); TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); @@ -153,9 +153,9 @@ static void test_vmx_dirty_log(bool enable_ept) } TEST_ASSERT(!test_bit(1, bmap), "Page 1 incorrectly reported dirty"); - TEST_ASSERT(host_test_mem[4096 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 1 written by guest"); + TEST_ASSERT(host_test_mem[PAGE_SIZE / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 1 written by guest"); TEST_ASSERT(!test_bit(2, bmap), "Page 2 incorrectly reported dirty"); - TEST_ASSERT(host_test_mem[8192 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 2 written by guest"); + TEST_ASSERT(host_test_mem[PAGE_SIZE*2 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 2 written by guest"); break; case UCALL_DONE: done = true; diff --git a/tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c b/tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c new file mode 100644 index 000000000000..cf1d2d1f2a8f --- /dev/null +++ b/tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 
(C) 2025, Google LLC. + * + * Test KVM's ability to save and restore nested state when the L1 guest + * is using 5-level paging and the L2 guest is using 4-level paging. + * + * This test would have failed prior to commit 9245fd6b8531 ("KVM: x86: + * model canonical checks more precisely"). + */ +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +#define LA57_GS_BASE 0xff2bc0311fb00000ull + +static void l2_guest_code(void) +{ + /* + * Sync with L0 to trigger save/restore. After + * resuming, execute VMCALL to exit back to L1. + */ + GUEST_SYNC(1); + vmcall(); +} + +static void l1_guest_code(struct vmx_pages *vmx_pages) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + u64 guest_cr4; + vm_paddr_t pml5_pa, pml4_pa; + u64 *pml5; + u64 exit_reason; + + /* Set GS_BASE to a value that is only canonical with LA57. */ + wrmsr(MSR_GS_BASE, LA57_GS_BASE); + GUEST_ASSERT(rdmsr(MSR_GS_BASE) == LA57_GS_BASE); + + GUEST_ASSERT(vmx_pages->vmcs_gpa); + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* + * Set up L2 with a 4-level page table by pointing its CR3 to + * L1's first PML4 table and clearing CR4.LA57. This creates + * the CR4.LA57 mismatch that exercises the bug. 
+ */ + pml5_pa = get_cr3() & PHYSICAL_PAGE_MASK; + pml5 = (u64 *)pml5_pa; + pml4_pa = pml5[0] & PHYSICAL_PAGE_MASK; + vmwrite(GUEST_CR3, pml4_pa); + + guest_cr4 = vmreadz(GUEST_CR4); + guest_cr4 &= ~X86_CR4_LA57; + vmwrite(GUEST_CR4, guest_cr4); + + GUEST_ASSERT(!vmlaunch()); + + exit_reason = vmreadz(VM_EXIT_REASON); + GUEST_ASSERT(exit_reason == EXIT_REASON_VMCALL); +} + +void guest_code(struct vmx_pages *vmx_pages) +{ + l1_guest_code(vmx_pages); + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t vmx_pages_gva = 0; + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + struct kvm_x86_state *state; + struct ucall uc; + int stage; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_LA57)); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE)); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + /* + * L1 needs to read its own PML5 table to set up L2. Identity map + * the PML5 table to facilitate this. + */ + virt_map(vm, vm->pgd, vm->pgd, 1); + + vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_args_set(vcpu, 1, vmx_pages_gva); + + for (stage = 1;; stage++) { + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + case UCALL_SYNC: + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + TEST_ASSERT(uc.args[1] == stage, + "Expected stage %d, got stage %lu", stage, (ulong)uc.args[1]); + if (stage == 1) { + pr_info("L2 is active; performing save/restore.\n"); + state = vcpu_save_state(vcpu); + + kvm_vm_release(vm); + + /* Restore state in a new VM. 
*/ + vcpu = vm_recreate_with_one_vcpu(vm); + vcpu_load_state(vcpu, state); + kvm_x86_state_cleanup(state); + } + } + +done: + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/xapic_ipi_test.c b/tools/testing/selftests/kvm/x86/xapic_ipi_test.c index 35cb9de54a82..ae4a4b6c05ca 100644 --- a/tools/testing/selftests/kvm/x86/xapic_ipi_test.c +++ b/tools/testing/selftests/kvm/x86/xapic_ipi_test.c @@ -256,7 +256,7 @@ void do_migrations(struct test_data_page *data, int run_secs, int delay_usecs, int nodes = 0; time_t start_time, last_update, now; time_t interval_secs = 1; - int i, r; + int i; int from, to; unsigned long bit; uint64_t hlt_count; @@ -267,9 +267,8 @@ void do_migrations(struct test_data_page *data, int run_secs, int delay_usecs, delay_usecs); /* Get set of first 64 numa nodes available */ - r = get_mempolicy(NULL, &nodemask, sizeof(nodemask) * 8, + kvm_get_mempolicy(NULL, &nodemask, sizeof(nodemask) * 8, 0, MPOL_F_MEMS_ALLOWED); - TEST_ASSERT(r == 0, "get_mempolicy failed errno=%d", errno); fprintf(stderr, "Numa nodes found amongst first %lu possible nodes " "(each 1-bit indicates node is present): %#lx\n", diff --git a/tools/testing/selftests/landlock/Makefile b/tools/testing/selftests/landlock/Makefile index a3f449914bf9..044b83bde16e 100644 --- a/tools/testing/selftests/landlock/Makefile +++ b/tools/testing/selftests/landlock/Makefile @@ -4,7 +4,7 @@ CFLAGS += -Wall -O2 $(KHDR_INCLUDES) -LOCAL_HDRS += common.h +LOCAL_HDRS += $(wildcard *.h) src_test := $(wildcard *_test.c) diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index fa0f18ec62c4..eee814e09dd7 100644 --- a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -2267,6 +2267,22 @@ static int test_exchange(const char *const oldpath, const char *const newpath) return 0; } +static int test_renameat(int olddirfd, const char *oldpath, int newdirfd, + const char *newpath) +{ + if 
(renameat2(olddirfd, oldpath, newdirfd, newpath, 0)) + return errno; + return 0; +} + +static int test_exchangeat(int olddirfd, const char *oldpath, int newdirfd, + const char *newpath) +{ + if (renameat2(olddirfd, oldpath, newdirfd, newpath, RENAME_EXCHANGE)) + return errno; + return 0; +} + TEST_F_FORK(layout1, rename_file) { const struct rule rules[] = { @@ -4561,6 +4577,18 @@ TEST_F_FORK(ioctl, handle_file_access_file) FIXTURE(layout1_bind) {}; /* clang-format on */ +static const char bind_dir_s1d3[] = TMP_DIR "/s2d1/s2d2/s1d3"; +static const char bind_file1_s1d3[] = TMP_DIR "/s2d1/s2d2/s1d3/f1"; + +/* Move targets for disconnected path tests. */ +static const char dir_s4d1[] = TMP_DIR "/s4d1"; +static const char file1_s4d1[] = TMP_DIR "/s4d1/f1"; +static const char file2_s4d1[] = TMP_DIR "/s4d1/f2"; +static const char dir_s4d2[] = TMP_DIR "/s4d1/s4d2"; +static const char file1_s4d2[] = TMP_DIR "/s4d1/s4d2/f1"; +static const char file1_name[] = "f1"; +static const char file2_name[] = "f2"; + FIXTURE_SETUP(layout1_bind) { prepare_layout(_metadata); @@ -4576,14 +4604,14 @@ FIXTURE_TEARDOWN_PARENT(layout1_bind) { /* umount(dir_s2d2)) is handled by namespace lifetime. 
*/ + remove_path(file1_s4d1); + remove_path(file2_s4d1); + remove_layout1(_metadata); cleanup_layout(_metadata); } -static const char bind_dir_s1d3[] = TMP_DIR "/s2d1/s2d2/s1d3"; -static const char bind_file1_s1d3[] = TMP_DIR "/s2d1/s2d2/s1d3/f1"; - /* * layout1_bind hierarchy: * @@ -4594,20 +4622,25 @@ static const char bind_file1_s1d3[] = TMP_DIR "/s2d1/s2d2/s1d3/f1"; * │ └── s1d2 * │ ├── f1 * │ ├── f2 - * │ └── s1d3 + * │ └── s1d3 [disconnected by path_disconnected] * │ ├── f1 * │ └── f2 * ├── s2d1 * │ ├── f1 - * │ └── s2d2 + * │ └── s2d2 [bind mount from s1d2] * │ ├── f1 * │ ├── f2 * │ └── s1d3 * │ ├── f1 * │ └── f2 - * └── s3d1 - * └── s3d2 - * └── s3d3 + * ├── s3d1 + * │ └── s3d2 + * │ └── s3d3 + * └── s4d1 [renamed from s1d3 by path_disconnected] + * ├── f1 + * ├── f2 + * └── s4d2 + * └── f1 */ TEST_F_FORK(layout1_bind, no_restriction) @@ -4806,6 +4839,1431 @@ TEST_F_FORK(layout1_bind, reparent_cross_mount) ASSERT_EQ(0, rename(bind_file1_s1d3, file1_s2d2)); } +/* + * Make sure access to file through a disconnected path works as expected. + * This test moves s1d3 to s4d1. + */ +TEST_F_FORK(layout1_bind, path_disconnected) +{ + const struct rule layer1_allow_all[] = { + { + .path = TMP_DIR, + .access = ACCESS_ALL, + }, + {}, + }; + const struct rule layer2_allow_just_f1[] = { + { + .path = file1_s1d3, + .access = LANDLOCK_ACCESS_FS_READ_FILE, + }, + {}, + }; + const struct rule layer3_only_s1d2[] = { + { + .path = dir_s1d2, + .access = LANDLOCK_ACCESS_FS_READ_FILE, + }, + {}, + }; + + /* Landlock should not deny access just because it is disconnected. */ + int ruleset_fd_l1 = + create_ruleset(_metadata, ACCESS_ALL, layer1_allow_all); + + /* Creates the new ruleset now before we move the dir containing the file. 
*/ + int ruleset_fd_l2 = + create_ruleset(_metadata, ACCESS_RW, layer2_allow_just_f1); + int ruleset_fd_l3 = + create_ruleset(_metadata, ACCESS_RW, layer3_only_s1d2); + int bind_s1d3_fd; + + ASSERT_LE(0, ruleset_fd_l1); + ASSERT_LE(0, ruleset_fd_l2); + ASSERT_LE(0, ruleset_fd_l3); + + enforce_ruleset(_metadata, ruleset_fd_l1); + EXPECT_EQ(0, close(ruleset_fd_l1)); + + bind_s1d3_fd = open(bind_dir_s1d3, O_PATH | O_CLOEXEC); + ASSERT_LE(0, bind_s1d3_fd); + + /* Tests access is possible before we move. */ + EXPECT_EQ(0, test_open_rel(bind_s1d3_fd, file1_name, O_RDONLY)); + EXPECT_EQ(0, test_open_rel(bind_s1d3_fd, file2_name, O_RDONLY)); + EXPECT_EQ(0, test_open_rel(bind_s1d3_fd, "..", O_RDONLY | O_DIRECTORY)); + + /* Makes it disconnected. */ + ASSERT_EQ(0, rename(dir_s1d3, dir_s4d1)) + { + TH_LOG("Failed to rename %s to %s: %s", dir_s1d3, dir_s4d1, + strerror(errno)); + } + + /* Tests that access is still possible. */ + EXPECT_EQ(0, test_open_rel(bind_s1d3_fd, file1_name, O_RDONLY)); + EXPECT_EQ(0, test_open_rel(bind_s1d3_fd, file2_name, O_RDONLY)); + + /* + * Tests that ".." is not possible (not because of Landlock, but just + * because it's disconnected). + */ + EXPECT_EQ(ENOENT, + test_open_rel(bind_s1d3_fd, "..", O_RDONLY | O_DIRECTORY)); + + /* This should still work with a narrower rule. */ + enforce_ruleset(_metadata, ruleset_fd_l2); + EXPECT_EQ(0, close(ruleset_fd_l2)); + + EXPECT_EQ(0, test_open(file1_s4d1, O_RDONLY)); + /* + * Accessing a file through a disconnected file descriptor can still be + * allowed by a rule tied to this file, even if it is no longer visible in + * its mount point. 
+ */ + EXPECT_EQ(0, test_open_rel(bind_s1d3_fd, file1_name, O_RDONLY)); + EXPECT_EQ(EACCES, test_open_rel(bind_s1d3_fd, file2_name, O_RDONLY)); + + enforce_ruleset(_metadata, ruleset_fd_l3); + EXPECT_EQ(0, close(ruleset_fd_l3)); + + EXPECT_EQ(EACCES, test_open(file1_s4d1, O_RDONLY)); + /* + * Accessing a file through a disconnected file descriptor can still be + * allowed by a rule tied to the original mount point, even if it is no + * longer visible in its mount point. + */ + EXPECT_EQ(0, test_open_rel(bind_s1d3_fd, file1_name, O_RDONLY)); + EXPECT_EQ(EACCES, test_open_rel(bind_s1d3_fd, file2_name, O_RDONLY)); +} + +/* + * Test that renameat with disconnected paths works under Landlock. This test + * moves s1d3 to s4d2, so that we can have a rule allowing refers on the move + * target's immediate parent. + */ +TEST_F_FORK(layout1_bind, path_disconnected_rename) +{ + const struct rule layer1[] = { + { + .path = dir_s1d2, + .access = LANDLOCK_ACCESS_FS_REFER | + LANDLOCK_ACCESS_FS_MAKE_DIR | + LANDLOCK_ACCESS_FS_REMOVE_DIR | + LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_REMOVE_FILE | + LANDLOCK_ACCESS_FS_READ_FILE, + }, + { + .path = dir_s4d1, + .access = LANDLOCK_ACCESS_FS_REFER | + LANDLOCK_ACCESS_FS_MAKE_DIR | + LANDLOCK_ACCESS_FS_REMOVE_DIR | + LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_REMOVE_FILE | + LANDLOCK_ACCESS_FS_READ_FILE, + }, + {} + }; + + /* This layer only handles LANDLOCK_ACCESS_FS_READ_FILE. 
*/ + const struct rule layer2_only_s1d2[] = { + { + .path = dir_s1d2, + .access = LANDLOCK_ACCESS_FS_READ_FILE, + }, + {}, + }; + int ruleset_fd_l1, ruleset_fd_l2; + pid_t child_pid; + int bind_s1d3_fd, status; + + ASSERT_EQ(0, mkdir(dir_s4d1, 0755)) + { + TH_LOG("Failed to create %s: %s", dir_s4d1, strerror(errno)); + } + ruleset_fd_l1 = create_ruleset(_metadata, ACCESS_ALL, layer1); + ruleset_fd_l2 = create_ruleset(_metadata, LANDLOCK_ACCESS_FS_READ_FILE, + layer2_only_s1d2); + ASSERT_LE(0, ruleset_fd_l1); + ASSERT_LE(0, ruleset_fd_l2); + + enforce_ruleset(_metadata, ruleset_fd_l1); + EXPECT_EQ(0, close(ruleset_fd_l1)); + + bind_s1d3_fd = open(bind_dir_s1d3, O_PATH | O_CLOEXEC); + ASSERT_LE(0, bind_s1d3_fd); + EXPECT_EQ(0, test_open_rel(bind_s1d3_fd, file1_name, O_RDONLY)); + + /* Tests ENOENT priority over EACCES for disconnected directory. */ + EXPECT_EQ(EACCES, test_open_rel(bind_s1d3_fd, "..", O_DIRECTORY)); + ASSERT_EQ(0, rename(dir_s1d3, dir_s4d2)) + { + TH_LOG("Failed to rename %s to %s: %s", dir_s1d3, dir_s4d2, + strerror(errno)); + } + EXPECT_EQ(ENOENT, test_open_rel(bind_s1d3_fd, "..", O_DIRECTORY)); + + /* + * The file is no longer under s1d2 but we should still be able to access it + * with layer 2 because its mount point is evaluated as the first valid + * directory because it was initially a parent. Do a fork to test this so + * we don't prevent ourselves from renaming it back later. + */ + child_pid = fork(); + ASSERT_LE(0, child_pid); + if (child_pid == 0) { + enforce_ruleset(_metadata, ruleset_fd_l2); + EXPECT_EQ(0, close(ruleset_fd_l2)); + EXPECT_EQ(0, test_open_rel(bind_s1d3_fd, file1_name, O_RDONLY)); + EXPECT_EQ(EACCES, test_open(file1_s4d2, O_RDONLY)); + + /* + * Tests that access widening checks indeed prevents us from renaming it + * back. + */ + EXPECT_EQ(-1, rename(dir_s4d2, dir_s1d3)); + EXPECT_EQ(EXDEV, errno); + + /* + * Including through the now disconnected fd (but it should return + * EXDEV). 
+ */ + EXPECT_EQ(-1, renameat(bind_s1d3_fd, file1_name, AT_FDCWD, + file1_s2d2)); + EXPECT_EQ(EXDEV, errno); + _exit(_metadata->exit_code); + return; + } + + EXPECT_EQ(child_pid, waitpid(child_pid, &status, 0)); + EXPECT_EQ(1, WIFEXITED(status)); + EXPECT_EQ(EXIT_SUCCESS, WEXITSTATUS(status)); + + ASSERT_EQ(0, rename(dir_s4d2, dir_s1d3)) + { + TH_LOG("Failed to rename %s back to %s: %s", dir_s4d1, dir_s1d3, + strerror(errno)); + } + + /* Now checks that we can access it under l2. */ + child_pid = fork(); + ASSERT_LE(0, child_pid); + if (child_pid == 0) { + enforce_ruleset(_metadata, ruleset_fd_l2); + EXPECT_EQ(0, close(ruleset_fd_l2)); + EXPECT_EQ(0, test_open_rel(bind_s1d3_fd, file1_name, O_RDONLY)); + EXPECT_EQ(0, test_open(file1_s1d3, O_RDONLY)); + _exit(_metadata->exit_code); + return; + } + + EXPECT_EQ(child_pid, waitpid(child_pid, &status, 0)); + EXPECT_EQ(1, WIFEXITED(status)); + EXPECT_EQ(EXIT_SUCCESS, WEXITSTATUS(status)); + + /* + * Also test that we can rename via a disconnected path. We move the + * dir back to the disconnected place first, then we rename file1 to + * file2 through our dir fd. + */ + ASSERT_EQ(0, rename(dir_s1d3, dir_s4d2)) + { + TH_LOG("Failed to rename %s to %s: %s", dir_s1d3, dir_s4d2, + strerror(errno)); + } + ASSERT_EQ(0, + renameat(bind_s1d3_fd, file1_name, bind_s1d3_fd, file2_name)) + { + TH_LOG("Failed to rename %s to %s within disconnected %s: %s", + file1_name, file2_name, bind_dir_s1d3, strerror(errno)); + } + EXPECT_EQ(0, test_open_rel(bind_s1d3_fd, file2_name, O_RDONLY)); + ASSERT_EQ(0, renameat(bind_s1d3_fd, file2_name, AT_FDCWD, file1_s2d2)) + { + TH_LOG("Failed to rename %s to %s through disconnected %s: %s", + file2_name, file1_s2d2, bind_dir_s1d3, strerror(errno)); + } + EXPECT_EQ(0, test_open(file1_s2d2, O_RDONLY)); + EXPECT_EQ(0, test_open(file1_s1d2, O_RDONLY)); + + /* Move it back using the disconnected path as the target. 
*/ + ASSERT_EQ(0, renameat(AT_FDCWD, file1_s2d2, bind_s1d3_fd, file1_name)) + { + TH_LOG("Failed to rename %s to %s through disconnected %s: %s", + file1_s1d2, file1_name, bind_dir_s1d3, strerror(errno)); + } + + /* Now make it connected again. */ + ASSERT_EQ(0, rename(dir_s4d2, dir_s1d3)) + { + TH_LOG("Failed to rename %s back to %s: %s", dir_s4d2, dir_s1d3, + strerror(errno)); + } + + /* Checks again that we can access it under l2. */ + enforce_ruleset(_metadata, ruleset_fd_l2); + EXPECT_EQ(0, close(ruleset_fd_l2)); + EXPECT_EQ(0, test_open_rel(bind_s1d3_fd, file1_name, O_RDONLY)); + EXPECT_EQ(0, test_open(file1_s1d3, O_RDONLY)); +} + +/* + * Test that linkat(2) with disconnected paths works under Landlock. This + * test moves s1d3 to s4d1. + */ +TEST_F_FORK(layout1_bind, path_disconnected_link) +{ + /* Ruleset to be applied after renaming s1d3 to s4d1. */ + const struct rule layer1[] = { + { + .path = dir_s4d1, + .access = LANDLOCK_ACCESS_FS_REFER | + LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_REMOVE_FILE, + }, + { + .path = dir_s2d2, + .access = LANDLOCK_ACCESS_FS_REFER | + LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_REMOVE_FILE, + }, + {} + }; + int ruleset_fd, bind_s1d3_fd; + + /* Removes unneeded files created by layout1, otherwise it will EEXIST. */ + ASSERT_EQ(0, unlink(file1_s1d2)); + ASSERT_EQ(0, unlink(file2_s1d3)); + + bind_s1d3_fd = open(bind_dir_s1d3, O_PATH | O_CLOEXEC); + ASSERT_LE(0, bind_s1d3_fd); + EXPECT_EQ(0, test_open_rel(bind_s1d3_fd, file1_name, O_RDONLY)); + + /* Disconnects bind_s1d3_fd. */ + ASSERT_EQ(0, rename(dir_s1d3, dir_s4d1)) + { + TH_LOG("Failed to rename %s to %s: %s", dir_s1d3, dir_s4d1, + strerror(errno)); + } + + /* Need this later to test different parent link. 
*/ + ASSERT_EQ(0, mkdir(dir_s4d2, 0755)) + { + TH_LOG("Failed to create %s: %s", dir_s4d2, strerror(errno)); + } + + ruleset_fd = create_ruleset(_metadata, ACCESS_ALL, layer1); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + EXPECT_EQ(0, close(ruleset_fd)); + + /* From disconnected to connected. */ + ASSERT_EQ(0, linkat(bind_s1d3_fd, file1_name, AT_FDCWD, file1_s2d2, 0)) + { + TH_LOG("Failed to link %s to %s via disconnected %s: %s", + file1_name, file1_s2d2, bind_dir_s1d3, strerror(errno)); + } + + /* Tests that we can access via the new link... */ + EXPECT_EQ(0, test_open(file1_s2d2, O_RDONLY)) + { + TH_LOG("Failed to open newly linked %s: %s", file1_s2d2, + strerror(errno)); + } + + /* ...as well as the old one. */ + EXPECT_EQ(0, test_open(file1_s4d1, O_RDONLY)) + { + TH_LOG("Failed to open original %s: %s", file1_s4d1, + strerror(errno)); + } + + /* From connected to disconnected. */ + ASSERT_EQ(0, unlink(file1_s4d1)); + ASSERT_EQ(0, linkat(AT_FDCWD, file1_s2d2, bind_s1d3_fd, file2_name, 0)) + { + TH_LOG("Failed to link %s to %s via disconnected %s: %s", + file1_s2d2, file2_name, bind_dir_s1d3, strerror(errno)); + } + EXPECT_EQ(0, test_open(file2_s4d1, O_RDONLY)); + ASSERT_EQ(0, unlink(file1_s2d2)); + + /* From disconnected to disconnected (same parent). */ + ASSERT_EQ(0, + linkat(bind_s1d3_fd, file2_name, bind_s1d3_fd, file1_name, 0)) + { + TH_LOG("Failed to link %s to %s within disconnected %s: %s", + file2_name, file1_name, bind_dir_s1d3, strerror(errno)); + } + EXPECT_EQ(0, test_open(file1_s4d1, O_RDONLY)) + { + TH_LOG("Failed to open newly linked %s: %s", file1_s4d1, + strerror(errno)); + } + EXPECT_EQ(0, test_open_rel(bind_s1d3_fd, file1_name, O_RDONLY)) + { + TH_LOG("Failed to open %s through newly created link under disconnected path: %s", + file1_name, strerror(errno)); + } + ASSERT_EQ(0, unlink(file2_s4d1)); + + /* From disconnected to disconnected (different parent). 
*/ + ASSERT_EQ(0, + linkat(bind_s1d3_fd, file1_name, bind_s1d3_fd, "s4d2/f1", 0)) + { + TH_LOG("Failed to link %s to %s within disconnected %s: %s", + file1_name, "s4d2/f1", bind_dir_s1d3, strerror(errno)); + } + EXPECT_EQ(0, test_open(file1_s4d2, O_RDONLY)) + { + TH_LOG("Failed to open %s after link: %s", file1_s4d2, + strerror(errno)); + } + EXPECT_EQ(0, test_open_rel(bind_s1d3_fd, "s4d2/f1", O_RDONLY)) + { + TH_LOG("Failed to open %s through disconnected path after link: %s", + "s4d2/f1", strerror(errno)); + } +} + +/* + * layout4_disconnected_leafs with bind mount and renames: + * + * tmp + * ├── s1d1 + * │ └── s1d2 [source of the bind mount] + * │ ├── s1d31 + * │ │ └── s1d41 [now renamed beneath s3d1] + * │ │ ├── f1 + * │ │ └── f2 + * │ └── s1d32 + * │ └── s1d42 [now renamed beneath s4d1] + * │ ├── f3 + * │ └── f4 + * ├── s2d1 + * │ └── s2d2 [bind mount of s1d2] + * │ ├── s1d31 + * │ │ └── s1d41 [opened FD, now renamed beneath s3d1] + * │ │ ├── f1 + * │ │ └── f2 + * │ └── s1d32 + * │ └── s1d42 [opened FD, now renamed beneath s4d1] + * │ ├── f3 + * │ └── f4 + * ├── s3d1 + * │ └── s1d41 [renamed here] + * │ ├── f1 + * │ └── f2 + * └── s4d1 + * └── s1d42 [renamed here] + * ├── f3 + * └── f4 + */ +/* clang-format off */ +FIXTURE(layout4_disconnected_leafs) { + int s2d2_fd; +}; +/* clang-format on */ + +FIXTURE_SETUP(layout4_disconnected_leafs) +{ + prepare_layout(_metadata); + + create_file(_metadata, TMP_DIR "/s1d1/s1d2/s1d31/s1d41/f1"); + create_file(_metadata, TMP_DIR "/s1d1/s1d2/s1d31/s1d41/f2"); + create_file(_metadata, TMP_DIR "/s1d1/s1d2/s1d32/s1d42/f3"); + create_file(_metadata, TMP_DIR "/s1d1/s1d2/s1d32/s1d42/f4"); + create_directory(_metadata, TMP_DIR "/s2d1/s2d2"); + create_directory(_metadata, TMP_DIR "/s3d1"); + create_directory(_metadata, TMP_DIR "/s4d1"); + + self->s2d2_fd = + open(TMP_DIR "/s2d1/s2d2", O_DIRECTORY | O_PATH | O_CLOEXEC); + ASSERT_LE(0, self->s2d2_fd); + + set_cap(_metadata, CAP_SYS_ADMIN); + ASSERT_EQ(0, mount(TMP_DIR "/s1d1/s1d2", 
TMP_DIR "/s2d1/s2d2", NULL, + MS_BIND, NULL)); + clear_cap(_metadata, CAP_SYS_ADMIN); +} + +FIXTURE_TEARDOWN_PARENT(layout4_disconnected_leafs) +{ + /* umount(TMP_DIR "/s2d1") is handled by namespace lifetime. */ + + /* Removes files after renames. */ + remove_path(TMP_DIR "/s3d1/s1d41/f1"); + remove_path(TMP_DIR "/s3d1/s1d41/f2"); + remove_path(TMP_DIR "/s4d1/s1d42/f1"); + remove_path(TMP_DIR "/s4d1/s1d42/f3"); + remove_path(TMP_DIR "/s4d1/s1d42/f4"); + remove_path(TMP_DIR "/s4d1/s1d42/f5"); + + cleanup_layout(_metadata); +} + +FIXTURE_VARIANT(layout4_disconnected_leafs) +{ + /* + * Parent of the bind mount source. It should always be ignored when + * testing against files under the s1d41 or s1d42 disconnected directories. + */ + const __u64 allowed_s1d1; + /* + * Source of bind mount (to s2d2). It should always be enforced when + * testing against files under the s1d41 or s1d42 disconnected directories. + */ + const __u64 allowed_s1d2; + /* + * Original parent of s1d41. It should always be ignored when testing + * against files under the s1d41 disconnected directory. + */ + const __u64 allowed_s1d31; + /* + * Original parent of s1d42. It should always be ignored when testing + * against files under the s1d42 disconnected directory. + */ + const __u64 allowed_s1d32; + /* + * Opened and disconnected source directory. It should always be enforced + * when testing against files under the s1d41 disconnected directory. + */ + const __u64 allowed_s1d41; + /* + * Opened and disconnected source directory. It should always be enforced + * when testing against files under the s1d42 disconnected directory. + */ + const __u64 allowed_s1d42; + /* + * File in the s1d41 disconnected directory. It should always be enforced + * when testing against itself under the s1d41 disconnected directory. + */ + const __u64 allowed_f1; + /* + * File in the s1d41 disconnected directory. It should always be enforced + * when testing against itself under the s1d41 disconnected directory. 
+ */ + const __u64 allowed_f2; + /* + * File in the s1d42 disconnected directory. It should always be enforced + * when testing against itself under the s1d42 disconnected directory. + */ + const __u64 allowed_f3; + /* + * Parent of the bind mount destination. It should always be enforced when + * testing against files under the s1d41 or s1d42 disconnected directories. + */ + const __u64 allowed_s2d1; + /* + * Directory covered by the bind mount. It should always be ignored when + * testing against files under the s1d41 or s1d42 disconnected directories. + */ + const __u64 allowed_s2d2; + /* + * New parent of the renamed s1d41. It should always be ignored when + * testing against files under the s1d41 disconnected directory. + */ + const __u64 allowed_s3d1; + /* + * New parent of the renamed s1d42. It should always be ignored when + * testing against files under the s1d42 disconnected directory. + */ + const __u64 allowed_s4d1; + + /* Expected result of the call to open([fd:s1d41]/f1, O_RDONLY). */ + const int expected_read_result; + /* Expected result of the call to renameat([fd:s1d41]/f1, [fd:s1d42]/f1). */ + const int expected_rename_result; + /* + * Expected result of the call to renameat([fd:s1d41]/f2, [fd:s1d42]/f3, + * RENAME_EXCHANGE). + */ + const int expected_exchange_result; + /* Expected result of the call to renameat([fd:s1d42]/f4, [fd:s1d42]/f5). 
*/ + const int expected_same_dir_rename_result; +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout4_disconnected_leafs, s1d1_mount_src_parent) { + /* clang-format on */ + .allowed_s1d1 = LANDLOCK_ACCESS_FS_REFER | + LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_EXECUTE | + LANDLOCK_ACCESS_FS_MAKE_REG, + .expected_read_result = EACCES, + .expected_same_dir_rename_result = EACCES, + .expected_rename_result = EACCES, + .expected_exchange_result = EACCES, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout4_disconnected_leafs, s1d2_mount_src_refer) { + /* clang-format on */ + .allowed_s1d2 = LANDLOCK_ACCESS_FS_REFER | LANDLOCK_ACCESS_FS_READ_FILE, + .expected_read_result = 0, + .expected_same_dir_rename_result = EACCES, + .expected_rename_result = EACCES, + .expected_exchange_result = EACCES, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout4_disconnected_leafs, s1d2_mount_src_create) { + /* clang-format on */ + .allowed_s1d2 = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_MAKE_REG, + .expected_read_result = 0, + .expected_same_dir_rename_result = 0, + .expected_rename_result = EXDEV, + .expected_exchange_result = EXDEV, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout4_disconnected_leafs, s1d2_mount_src_rename) { + /* clang-format on */ + .allowed_s1d2 = LANDLOCK_ACCESS_FS_REFER | LANDLOCK_ACCESS_FS_MAKE_REG, + .expected_read_result = EACCES, + .expected_same_dir_rename_result = 0, + .expected_rename_result = 0, + .expected_exchange_result = 0, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout4_disconnected_leafs, s1d31_s1d32_old_parent) { + /* clang-format on */ + .allowed_s1d31 = LANDLOCK_ACCESS_FS_REFER | + LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_EXECUTE | + LANDLOCK_ACCESS_FS_MAKE_REG, + .allowed_s1d32 = LANDLOCK_ACCESS_FS_REFER | + LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_EXECUTE | + LANDLOCK_ACCESS_FS_MAKE_REG, + .expected_read_result = EACCES, + .expected_same_dir_rename_result = EACCES, + 
.expected_rename_result = EACCES, + .expected_exchange_result = EACCES, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout4_disconnected_leafs, s1d41_s1d42_disconnected_refer) { + /* clang-format on */ + .allowed_s1d41 = LANDLOCK_ACCESS_FS_REFER | + LANDLOCK_ACCESS_FS_READ_FILE, + .allowed_s1d42 = LANDLOCK_ACCESS_FS_REFER | + LANDLOCK_ACCESS_FS_READ_FILE, + .expected_read_result = 0, + .expected_same_dir_rename_result = EACCES, + .expected_rename_result = EACCES, + .expected_exchange_result = EACCES, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout4_disconnected_leafs, s1d41_s1d42_disconnected_create) { + /* clang-format on */ + .allowed_s1d41 = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_MAKE_REG, + .allowed_s1d42 = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_MAKE_REG, + .expected_read_result = 0, + .expected_same_dir_rename_result = 0, + .expected_rename_result = EXDEV, + .expected_exchange_result = EXDEV, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout4_disconnected_leafs, s1d41_s1d42_disconnected_rename_even) { + /* clang-format on */ + .allowed_s1d41 = LANDLOCK_ACCESS_FS_REFER | LANDLOCK_ACCESS_FS_MAKE_REG, + .allowed_s1d42 = LANDLOCK_ACCESS_FS_REFER | LANDLOCK_ACCESS_FS_MAKE_REG, + .expected_read_result = EACCES, + .expected_same_dir_rename_result = 0, + .expected_rename_result = 0, + .expected_exchange_result = 0, +}; + +/* The destination directory has more access right. */ +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout4_disconnected_leafs, s1d41_s1d42_disconnected_rename_more) { + /* clang-format on */ + .allowed_s1d41 = LANDLOCK_ACCESS_FS_REFER | LANDLOCK_ACCESS_FS_MAKE_REG, + .allowed_s1d42 = LANDLOCK_ACCESS_FS_REFER | + LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_EXECUTE, + .expected_read_result = EACCES, + .expected_same_dir_rename_result = 0, + /* Access denied. */ + .expected_rename_result = EXDEV, + .expected_exchange_result = EXDEV, +}; + +/* The destination directory has less access right. 
*/ +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout4_disconnected_leafs, s1d41_s1d42_disconnected_rename_less) { + /* clang-format on */ + .allowed_s1d41 = LANDLOCK_ACCESS_FS_REFER | + LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_EXECUTE, + .allowed_s1d42 = LANDLOCK_ACCESS_FS_REFER | LANDLOCK_ACCESS_FS_MAKE_REG, + .expected_read_result = EACCES, + .expected_same_dir_rename_result = 0, + /* Access allowed. */ + .expected_rename_result = 0, + .expected_exchange_result = EXDEV, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout4_disconnected_leafs, s2d1_mount_dst_parent_create) { + /* clang-format on */ + .allowed_s2d1 = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_MAKE_REG, + .expected_read_result = 0, + .expected_same_dir_rename_result = 0, + .expected_rename_result = EXDEV, + .expected_exchange_result = EXDEV, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout4_disconnected_leafs, s2d1_mount_dst_parent_refer) { + /* clang-format on */ + .allowed_s2d1 = LANDLOCK_ACCESS_FS_REFER | LANDLOCK_ACCESS_FS_READ_FILE, + .expected_read_result = 0, + .expected_same_dir_rename_result = EACCES, + .expected_rename_result = EACCES, + .expected_exchange_result = EACCES, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout4_disconnected_leafs, s2d1_mount_dst_parent_mini) { + /* clang-format on */ + .allowed_s2d1 = LANDLOCK_ACCESS_FS_REFER | + LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_MAKE_REG, + .expected_read_result = 0, + .expected_same_dir_rename_result = 0, + .expected_rename_result = 0, + .expected_exchange_result = 0, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout4_disconnected_leafs, s2d2_covered_by_mount) { + /* clang-format on */ + .allowed_s2d2 = LANDLOCK_ACCESS_FS_REFER | + LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_EXECUTE | + LANDLOCK_ACCESS_FS_MAKE_REG, + .expected_read_result = EACCES, + .expected_same_dir_rename_result = EACCES, + .expected_rename_result = EACCES, + .expected_exchange_result = EACCES, 
+}; + +/* Tests collect_domain_accesses(). */ +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout4_disconnected_leafs, s3d1_s4d1_new_parent_refer) { + /* clang-format on */ + .allowed_s3d1 = LANDLOCK_ACCESS_FS_REFER | LANDLOCK_ACCESS_FS_READ_FILE, + .allowed_s4d1 = LANDLOCK_ACCESS_FS_REFER | LANDLOCK_ACCESS_FS_READ_FILE, + .expected_read_result = 0, + .expected_same_dir_rename_result = EACCES, + .expected_rename_result = EACCES, + .expected_exchange_result = EACCES, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout4_disconnected_leafs, s3d1_s4d1_new_parent_create) { + /* clang-format on */ + .allowed_s3d1 = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_MAKE_REG, + .allowed_s4d1 = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_MAKE_REG, + .expected_read_result = 0, + .expected_same_dir_rename_result = 0, + .expected_rename_result = EXDEV, + .expected_exchange_result = EXDEV, +}; + +FIXTURE_VARIANT_ADD(layout4_disconnected_leafs, + s3d1_s4d1_disconnected_rename_even){ + /* clang-format on */ + .allowed_s3d1 = LANDLOCK_ACCESS_FS_REFER | LANDLOCK_ACCESS_FS_MAKE_REG, + .allowed_s4d1 = LANDLOCK_ACCESS_FS_REFER | LANDLOCK_ACCESS_FS_MAKE_REG, + .expected_read_result = EACCES, + .expected_same_dir_rename_result = 0, + .expected_rename_result = 0, + .expected_exchange_result = 0, +}; + +/* The destination directory has more access right. */ +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout4_disconnected_leafs, s3d1_s4d1_disconnected_rename_more) { + /* clang-format on */ + .allowed_s3d1 = LANDLOCK_ACCESS_FS_REFER | LANDLOCK_ACCESS_FS_MAKE_REG, + .allowed_s4d1 = LANDLOCK_ACCESS_FS_REFER | LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_EXECUTE, + .expected_read_result = EACCES, + .expected_same_dir_rename_result = 0, + /* Access denied. */ + .expected_rename_result = EXDEV, + .expected_exchange_result = EXDEV, +}; + +/* The destination directory has less access right. 
*/ +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout4_disconnected_leafs, s3d1_s4d1_disconnected_rename_less) { + /* clang-format on */ + .allowed_s3d1 = LANDLOCK_ACCESS_FS_REFER | LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_EXECUTE, + .allowed_s4d1 = LANDLOCK_ACCESS_FS_REFER | LANDLOCK_ACCESS_FS_MAKE_REG, + .expected_read_result = EACCES, + .expected_same_dir_rename_result = 0, + /* Access allowed. */ + .expected_rename_result = 0, + .expected_exchange_result = EXDEV, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout4_disconnected_leafs, f1_f2_f3) { + /* clang-format on */ + .allowed_f1 = LANDLOCK_ACCESS_FS_READ_FILE, + .allowed_f2 = LANDLOCK_ACCESS_FS_READ_FILE, + .allowed_f3 = LANDLOCK_ACCESS_FS_READ_FILE, + .expected_read_result = 0, + .expected_same_dir_rename_result = EACCES, + .expected_rename_result = EACCES, + .expected_exchange_result = EACCES, +}; + +TEST_F_FORK(layout4_disconnected_leafs, read_rename_exchange) +{ + const __u64 handled_access = + LANDLOCK_ACCESS_FS_REFER | LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_EXECUTE | LANDLOCK_ACCESS_FS_MAKE_REG; + const struct rule rules[] = { + { + .path = TMP_DIR "/s1d1", + .access = variant->allowed_s1d1, + }, + { + .path = TMP_DIR "/s1d1/s1d2", + .access = variant->allowed_s1d2, + }, + { + .path = TMP_DIR "/s1d1/s1d2/s1d31", + .access = variant->allowed_s1d31, + }, + { + .path = TMP_DIR "/s1d1/s1d2/s1d32", + .access = variant->allowed_s1d32, + }, + { + .path = TMP_DIR "/s1d1/s1d2/s1d31/s1d41", + .access = variant->allowed_s1d41, + }, + { + .path = TMP_DIR "/s1d1/s1d2/s1d32/s1d42", + .access = variant->allowed_s1d42, + }, + { + .path = TMP_DIR "/s1d1/s1d2/s1d31/s1d41/f1", + .access = variant->allowed_f1, + }, + { + .path = TMP_DIR "/s1d1/s1d2/s1d31/s1d41/f2", + .access = variant->allowed_f2, + }, + { + .path = TMP_DIR "/s1d1/s1d2/s1d32/s1d42/f3", + .access = variant->allowed_f3, + }, + { + .path = TMP_DIR "/s2d1", + .access = variant->allowed_s2d1, + }, + /* s2d2_fd */ + { + .path = 
TMP_DIR "/s3d1", + .access = variant->allowed_s3d1, + }, + { + .path = TMP_DIR "/s4d1", + .access = variant->allowed_s4d1, + }, + {}, + }; + int ruleset_fd, s1d41_bind_fd, s1d42_bind_fd; + + ruleset_fd = create_ruleset(_metadata, handled_access, rules); + ASSERT_LE(0, ruleset_fd); + + /* Adds rule for the covered directory. */ + if (variant->allowed_s2d2) { + ASSERT_EQ(0, landlock_add_rule( + ruleset_fd, LANDLOCK_RULE_PATH_BENEATH, + &(struct landlock_path_beneath_attr){ + .parent_fd = self->s2d2_fd, + .allowed_access = + variant->allowed_s2d2, + }, + 0)); + } + EXPECT_EQ(0, close(self->s2d2_fd)); + + s1d41_bind_fd = open(TMP_DIR "/s2d1/s2d2/s1d31/s1d41", + O_DIRECTORY | O_PATH | O_CLOEXEC); + ASSERT_LE(0, s1d41_bind_fd); + s1d42_bind_fd = open(TMP_DIR "/s2d1/s2d2/s1d32/s1d42", + O_DIRECTORY | O_PATH | O_CLOEXEC); + ASSERT_LE(0, s1d42_bind_fd); + + /* Disconnects and checks source and destination directories. */ + EXPECT_EQ(0, test_open_rel(s1d41_bind_fd, "..", O_DIRECTORY)); + EXPECT_EQ(0, test_open_rel(s1d42_bind_fd, "..", O_DIRECTORY)); + /* Renames to make it accessible through s3d1/s1d41 */ + ASSERT_EQ(0, test_renameat(AT_FDCWD, TMP_DIR "/s1d1/s1d2/s1d31/s1d41", + AT_FDCWD, TMP_DIR "/s3d1/s1d41")); + /* Renames to make it accessible through s4d1/s1d42 */ + ASSERT_EQ(0, test_renameat(AT_FDCWD, TMP_DIR "/s1d1/s1d2/s1d32/s1d42", + AT_FDCWD, TMP_DIR "/s4d1/s1d42")); + EXPECT_EQ(ENOENT, test_open_rel(s1d41_bind_fd, "..", O_DIRECTORY)); + EXPECT_EQ(ENOENT, test_open_rel(s1d42_bind_fd, "..", O_DIRECTORY)); + + enforce_ruleset(_metadata, ruleset_fd); + EXPECT_EQ(0, close(ruleset_fd)); + + EXPECT_EQ(variant->expected_read_result, + test_open_rel(s1d41_bind_fd, "f1", O_RDONLY)); + + EXPECT_EQ(variant->expected_rename_result, + test_renameat(s1d41_bind_fd, "f1", s1d42_bind_fd, "f1")); + EXPECT_EQ(variant->expected_exchange_result, + test_exchangeat(s1d41_bind_fd, "f2", s1d42_bind_fd, "f3")); + + EXPECT_EQ(variant->expected_same_dir_rename_result, + 
test_renameat(s1d42_bind_fd, "f4", s1d42_bind_fd, "f5")); +} + +/* + * layout5_disconnected_branch before rename: + * + * tmp + * ├── s1d1 + * │ └── s1d2 [source of the first bind mount] + * │ └── s1d3 + * │ ├── s1d41 + * │ │ ├── f1 + * │ │ └── f2 + * │ └── s1d42 + * │ ├── f3 + * │ └── f4 + * ├── s2d1 + * │ └── s2d2 [source of the second bind mount] + * │ └── s2d3 + * │ └── s2d4 [first s1d2 bind mount] + * │ └── s1d3 + * │ ├── s1d41 + * │ │ ├── f1 + * │ │ └── f2 + * │ └── s1d42 + * │ ├── f3 + * │ └── f4 + * ├── s3d1 + * │ └── s3d2 [second s2d2 bind mount] + * │ └── s2d3 + * │ └── s2d4 [first s1d2 bind mount] + * │ └── s1d3 + * │ ├── s1d41 + * │ │ ├── f1 + * │ │ └── f2 + * │ └── s1d42 + * │ ├── f3 + * │ └── f4 + * └── s4d1 + * + * After rename: + * + * tmp + * ├── s1d1 + * │ └── s1d2 [source of the first bind mount] + * │ └── s1d3 + * │ ├── s1d41 + * │ │ ├── f1 + * │ │ └── f2 + * │ └── s1d42 + * │ ├── f3 + * │ └── f4 + * ├── s2d1 + * │ └── s2d2 [source of the second bind mount] + * ├── s3d1 + * │ └── s3d2 [second s2d2 bind mount] + * └── s4d1 + * └── s2d3 [renamed here] + * └── s2d4 [first s1d2 bind mount] + * └── s1d3 + * ├── s1d41 + * │ ├── f1 + * │ └── f2 + * └── s1d42 + * ├── f3 + * └── f4 + * + * Decision path for access from the s3d1/s3d2/s2d3/s2d4/s1d3 file descriptor: + * 1. first bind mount: s1d3 -> s1d2 + * 2. second bind mount: s2d3 + * 3. tmp mount: s4d1 -> tmp [disconnected branch] + * 4. second bind mount: s2d2 + * 5. tmp mount: s3d1 -> tmp + * 6. parent mounts: [...] -> / + * + * The s4d1 directory is evaluated even if it is not in the s2d2 mount. 
+ */ + +/* clang-format off */ +FIXTURE(layout5_disconnected_branch) { + int s2d4_fd, s3d2_fd; +}; +/* clang-format on */ + +FIXTURE_SETUP(layout5_disconnected_branch) +{ + prepare_layout(_metadata); + + create_file(_metadata, TMP_DIR "/s1d1/s1d2/s1d3/s1d41/f1"); + create_file(_metadata, TMP_DIR "/s1d1/s1d2/s1d3/s1d41/f2"); + create_file(_metadata, TMP_DIR "/s1d1/s1d2/s1d3/s1d42/f3"); + create_file(_metadata, TMP_DIR "/s1d1/s1d2/s1d3/s1d42/f4"); + create_directory(_metadata, TMP_DIR "/s2d1/s2d2/s2d3/s2d4"); + create_directory(_metadata, TMP_DIR "/s3d1/s3d2"); + create_directory(_metadata, TMP_DIR "/s4d1"); + + self->s2d4_fd = open(TMP_DIR "/s2d1/s2d2/s2d3/s2d4", + O_DIRECTORY | O_PATH | O_CLOEXEC); + ASSERT_LE(0, self->s2d4_fd); + + self->s3d2_fd = + open(TMP_DIR "/s3d1/s3d2", O_DIRECTORY | O_PATH | O_CLOEXEC); + ASSERT_LE(0, self->s3d2_fd); + + set_cap(_metadata, CAP_SYS_ADMIN); + ASSERT_EQ(0, mount(TMP_DIR "/s1d1/s1d2", TMP_DIR "/s2d1/s2d2/s2d3/s2d4", + NULL, MS_BIND, NULL)); + ASSERT_EQ(0, mount(TMP_DIR "/s2d1/s2d2", TMP_DIR "/s3d1/s3d2", NULL, + MS_BIND | MS_REC, NULL)); + clear_cap(_metadata, CAP_SYS_ADMIN); +} + +FIXTURE_TEARDOWN_PARENT(layout5_disconnected_branch) +{ + /* Bind mounts are handled by namespace lifetime. */ + + /* Removes files after renames. */ + remove_path(TMP_DIR "/s1d1/s1d2/s1d3/s1d41/f1"); + remove_path(TMP_DIR "/s1d1/s1d2/s1d3/s1d41/f2"); + remove_path(TMP_DIR "/s1d1/s1d2/s1d3/s1d42/f1"); + remove_path(TMP_DIR "/s1d1/s1d2/s1d3/s1d42/f3"); + remove_path(TMP_DIR "/s1d1/s1d2/s1d3/s1d42/f4"); + remove_path(TMP_DIR "/s1d1/s1d2/s1d3/s1d42/f5"); + + cleanup_layout(_metadata); +} + +FIXTURE_VARIANT(layout5_disconnected_branch) +{ + /* + * Parent of all files. It should always be enforced when testing against + * files under the s1d41 or s1d42 disconnected directories. + */ + const __u64 allowed_base; + /* + * Parent of the first bind mount source. 
It should always be ignored when + * testing against files under the s1d41 or s1d42 disconnected directories. + */ + const __u64 allowed_s1d1; + const __u64 allowed_s1d2; + const __u64 allowed_s1d3; + const __u64 allowed_s2d1; + const __u64 allowed_s2d2; + const __u64 allowed_s2d3; + const __u64 allowed_s2d4; + const __u64 allowed_s3d1; + const __u64 allowed_s3d2; + const __u64 allowed_s4d1; + + /* Expected result of the call to open([fd:s1d3]/s1d41/f1, O_RDONLY). */ + const int expected_read_result; + /* + * Expected result of the call to renameat([fd:s1d3]/s1d41/f1, + * [fd:s1d3]/s1d42/f1). + */ + const int expected_rename_result; + /* + * Expected result of the call to renameat([fd:s1d3]/s1d41/f2, + * [fd:s1d3]/s1d42/f3, RENAME_EXCHANGE). + */ + const int expected_exchange_result; + /* + * Expected result of the call to renameat([fd:s1d3]/s1d42/f4, + * [fd:s1d3]/s1d42/f5). + */ + const int expected_same_dir_rename_result; +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout5_disconnected_branch, s1d1_mount1_src_parent) { + /* clang-format on */ + .allowed_s1d1 = LANDLOCK_ACCESS_FS_REFER | + LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_EXECUTE | + LANDLOCK_ACCESS_FS_MAKE_REG, + .expected_read_result = EACCES, + .expected_same_dir_rename_result = EACCES, + .expected_rename_result = EACCES, + .expected_exchange_result = EACCES, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout5_disconnected_branch, s1d2_mount1_src_refer) { + /* clang-format on */ + .allowed_s1d2 = LANDLOCK_ACCESS_FS_REFER | LANDLOCK_ACCESS_FS_READ_FILE, + .expected_read_result = 0, + .expected_same_dir_rename_result = EACCES, + .expected_rename_result = EACCES, + .expected_exchange_result = EACCES, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout5_disconnected_branch, s1d2_mount1_src_create) { + /* clang-format on */ + .allowed_s1d2 = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_MAKE_REG, + .expected_read_result = 0, + .expected_same_dir_rename_result = 0, + 
.expected_rename_result = EXDEV, + .expected_exchange_result = EXDEV, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout5_disconnected_branch, s1d2_mount1_src_rename) { + /* clang-format on */ + .allowed_s1d2 = LANDLOCK_ACCESS_FS_REFER | LANDLOCK_ACCESS_FS_MAKE_REG, + .expected_read_result = EACCES, + .expected_same_dir_rename_result = 0, + .expected_rename_result = 0, + .expected_exchange_result = 0, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout5_disconnected_branch, s1d3_fd_refer) { + /* clang-format on */ + .allowed_s1d3 = LANDLOCK_ACCESS_FS_REFER | LANDLOCK_ACCESS_FS_READ_FILE, + .expected_read_result = 0, + .expected_same_dir_rename_result = EACCES, + .expected_rename_result = EACCES, + .expected_exchange_result = EACCES, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout5_disconnected_branch, s1d3_fd_create) { + /* clang-format on */ + .allowed_s1d3 = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_MAKE_REG, + .expected_read_result = 0, + .expected_same_dir_rename_result = 0, + .expected_rename_result = EXDEV, + .expected_exchange_result = EXDEV, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout5_disconnected_branch, s1d3_fd_rename) { + /* clang-format on */ + .allowed_s1d3 = LANDLOCK_ACCESS_FS_REFER | LANDLOCK_ACCESS_FS_MAKE_REG, + .expected_read_result = EACCES, + .expected_same_dir_rename_result = 0, + .expected_rename_result = 0, + .expected_exchange_result = 0, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout5_disconnected_branch, s1d3_fd_full) { + /* clang-format on */ + .allowed_s1d3 = LANDLOCK_ACCESS_FS_REFER | + LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_EXECUTE | + LANDLOCK_ACCESS_FS_MAKE_REG, + .expected_read_result = 0, + .expected_same_dir_rename_result = 0, + .expected_rename_result = 0, + .expected_exchange_result = 0, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout5_disconnected_branch, s2d1_mount2_src_parent) { + /* clang-format on */ + .allowed_s2d1 = LANDLOCK_ACCESS_FS_REFER 
| + LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_EXECUTE | + LANDLOCK_ACCESS_FS_MAKE_REG, + .expected_read_result = EACCES, + .expected_same_dir_rename_result = EACCES, + .expected_rename_result = EACCES, + .expected_exchange_result = EACCES, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout5_disconnected_branch, s2d2_mount2_src_refer) { + /* clang-format on */ + .allowed_s2d2 = LANDLOCK_ACCESS_FS_REFER | LANDLOCK_ACCESS_FS_READ_FILE, + .expected_read_result = 0, + .expected_same_dir_rename_result = EACCES, + .expected_rename_result = EACCES, + .expected_exchange_result = EACCES, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout5_disconnected_branch, s2d2_mount2_src_create) { + /* clang-format on */ + .allowed_s2d2 = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_MAKE_REG, + .expected_read_result = 0, + .expected_same_dir_rename_result = 0, + .expected_rename_result = EXDEV, + .expected_exchange_result = EXDEV, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout5_disconnected_branch, s2d2_mount2_src_rename) { + /* clang-format on */ + .allowed_s2d2 = LANDLOCK_ACCESS_FS_REFER | LANDLOCK_ACCESS_FS_MAKE_REG, + .expected_read_result = EACCES, + .expected_same_dir_rename_result = 0, + .expected_rename_result = 0, + .expected_exchange_result = 0, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout5_disconnected_branch, s2d3_mount1_dst_parent_refer) { + /* clang-format on */ + .allowed_s2d3 = LANDLOCK_ACCESS_FS_REFER | LANDLOCK_ACCESS_FS_READ_FILE, + .expected_read_result = 0, + .expected_same_dir_rename_result = EACCES, + .expected_rename_result = EACCES, + .expected_exchange_result = EACCES, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout5_disconnected_branch, s2d3_mount1_dst_parent_create) { + /* clang-format on */ + .allowed_s2d3 = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_MAKE_REG, + .expected_read_result = 0, + .expected_same_dir_rename_result = 0, + .expected_rename_result = EXDEV, + 
.expected_exchange_result = EXDEV, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout5_disconnected_branch, s2d3_mount1_dst_parent_rename) { + /* clang-format on */ + .allowed_s2d3 = LANDLOCK_ACCESS_FS_REFER | LANDLOCK_ACCESS_FS_MAKE_REG, + .expected_read_result = EACCES, + .expected_same_dir_rename_result = 0, + .expected_rename_result = 0, + .expected_exchange_result = 0, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout5_disconnected_branch, s2d4_mount1_dst) { + /* clang-format on */ + .allowed_s2d4 = LANDLOCK_ACCESS_FS_REFER | + LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_EXECUTE | + LANDLOCK_ACCESS_FS_MAKE_REG, + .expected_read_result = EACCES, + .expected_same_dir_rename_result = EACCES, + .expected_rename_result = EACCES, + .expected_exchange_result = EACCES, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout5_disconnected_branch, s3d1_mount2_dst_parent_refer) { + /* clang-format on */ + .allowed_s3d1 = LANDLOCK_ACCESS_FS_REFER | LANDLOCK_ACCESS_FS_READ_FILE, + .expected_read_result = 0, + .expected_same_dir_rename_result = EACCES, + .expected_rename_result = EACCES, + .expected_exchange_result = EACCES, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout5_disconnected_branch, s3d1_mount2_dst_parent_create) { + /* clang-format on */ + .allowed_s3d1 = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_MAKE_REG, + .expected_read_result = 0, + .expected_same_dir_rename_result = 0, + .expected_rename_result = EXDEV, + .expected_exchange_result = EXDEV, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout5_disconnected_branch, s3d1_mount2_dst_parent_rename) { + /* clang-format on */ + .allowed_s3d1 = LANDLOCK_ACCESS_FS_REFER | LANDLOCK_ACCESS_FS_MAKE_REG, + .expected_read_result = EACCES, + .expected_same_dir_rename_result = 0, + .expected_rename_result = 0, + .expected_exchange_result = 0, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout5_disconnected_branch, s3d2_mount1_dst) { + /* clang-format on */ + 
.allowed_s3d2 = LANDLOCK_ACCESS_FS_REFER | + LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_EXECUTE | + LANDLOCK_ACCESS_FS_MAKE_REG, + .expected_read_result = EACCES, + .expected_same_dir_rename_result = EACCES, + .expected_rename_result = EACCES, + .expected_exchange_result = EACCES, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout5_disconnected_branch, s4d1_rename_parent_refer) { + /* clang-format on */ + .allowed_s4d1 = LANDLOCK_ACCESS_FS_REFER | LANDLOCK_ACCESS_FS_READ_FILE, + .expected_read_result = 0, + .expected_same_dir_rename_result = EACCES, + .expected_rename_result = EACCES, + .expected_exchange_result = EACCES, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout5_disconnected_branch, s4d1_rename_parent_create) { + /* clang-format on */ + .allowed_s4d1 = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_MAKE_REG, + .expected_read_result = 0, + .expected_same_dir_rename_result = 0, + .expected_rename_result = EXDEV, + .expected_exchange_result = EXDEV, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(layout5_disconnected_branch, s4d1_rename_parent_rename) { + /* clang-format on */ + .allowed_s4d1 = LANDLOCK_ACCESS_FS_REFER | LANDLOCK_ACCESS_FS_MAKE_REG, + .expected_read_result = EACCES, + .expected_same_dir_rename_result = 0, + .expected_rename_result = 0, + .expected_exchange_result = 0, +}; + +TEST_F_FORK(layout5_disconnected_branch, read_rename_exchange) +{ + const __u64 handled_access = + LANDLOCK_ACCESS_FS_REFER | LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_EXECUTE | LANDLOCK_ACCESS_FS_MAKE_REG; + const struct rule rules[] = { + { + .path = TMP_DIR "/s1d1", + .access = variant->allowed_s1d1, + }, + { + .path = TMP_DIR "/s1d1/s1d2", + .access = variant->allowed_s1d2, + }, + { + .path = TMP_DIR "/s1d1/s1d2/s1d3", + .access = variant->allowed_s1d3, + }, + { + .path = TMP_DIR "/s2d1", + .access = variant->allowed_s2d1, + }, + { + .path = TMP_DIR "/s2d1/s2d2", + .access = variant->allowed_s2d2, + }, + { + .path = TMP_DIR 
"/s2d1/s2d2/s2d3", + .access = variant->allowed_s2d3, + }, + /* s2d4_fd */ + { + .path = TMP_DIR "/s3d1", + .access = variant->allowed_s3d1, + }, + /* s3d2_fd */ + { + .path = TMP_DIR "/s4d1", + .access = variant->allowed_s4d1, + }, + {}, + }; + int ruleset_fd, s1d3_bind_fd; + + ruleset_fd = create_ruleset(_metadata, handled_access, rules); + ASSERT_LE(0, ruleset_fd); + + /* Adds rules for the covered directories. */ + if (variant->allowed_s2d4) { + ASSERT_EQ(0, landlock_add_rule( + ruleset_fd, LANDLOCK_RULE_PATH_BENEATH, + &(struct landlock_path_beneath_attr){ + .parent_fd = self->s2d4_fd, + .allowed_access = + variant->allowed_s2d4, + }, + 0)); + } + EXPECT_EQ(0, close(self->s2d4_fd)); + + if (variant->allowed_s3d2) { + ASSERT_EQ(0, landlock_add_rule( + ruleset_fd, LANDLOCK_RULE_PATH_BENEATH, + &(struct landlock_path_beneath_attr){ + .parent_fd = self->s3d2_fd, + .allowed_access = + variant->allowed_s3d2, + }, + 0)); + } + EXPECT_EQ(0, close(self->s3d2_fd)); + + s1d3_bind_fd = open(TMP_DIR "/s3d1/s3d2/s2d3/s2d4/s1d3", + O_DIRECTORY | O_PATH | O_CLOEXEC); + ASSERT_LE(0, s1d3_bind_fd); + + /* Disconnects and checks source and destination directories. 
*/ + EXPECT_EQ(0, test_open_rel(s1d3_bind_fd, "..", O_DIRECTORY)); + EXPECT_EQ(0, test_open_rel(s1d3_bind_fd, "../..", O_DIRECTORY)); + /* Renames to make it accessible through s3d1/s1d41 */ + ASSERT_EQ(0, test_renameat(AT_FDCWD, TMP_DIR "/s2d1/s2d2/s2d3", + AT_FDCWD, TMP_DIR "/s4d1/s2d3")); + EXPECT_EQ(0, test_open_rel(s1d3_bind_fd, "..", O_DIRECTORY)); + EXPECT_EQ(ENOENT, test_open_rel(s1d3_bind_fd, "../..", O_DIRECTORY)); + + enforce_ruleset(_metadata, ruleset_fd); + EXPECT_EQ(0, close(ruleset_fd)); + + EXPECT_EQ(variant->expected_read_result, + test_open_rel(s1d3_bind_fd, "s1d41/f1", O_RDONLY)); + + EXPECT_EQ(variant->expected_rename_result, + test_renameat(s1d3_bind_fd, "s1d41/f1", s1d3_bind_fd, + "s1d42/f1")); + EXPECT_EQ(variant->expected_exchange_result, + test_exchangeat(s1d3_bind_fd, "s1d41/f2", s1d3_bind_fd, + "s1d42/f3")); + + EXPECT_EQ(variant->expected_same_dir_rename_result, + test_renameat(s1d3_bind_fd, "s1d42/f4", s1d3_bind_fd, + "s1d42/f5")); +} + #define LOWER_BASE TMP_DIR "/lower" #define LOWER_DATA LOWER_BASE "/data" static const char lower_fl1[] = LOWER_DATA "/fl1"; diff --git a/tools/testing/selftests/livepatch/functions.sh b/tools/testing/selftests/livepatch/functions.sh index 46991a029f7c..8ec0cb64ad94 100644 --- a/tools/testing/selftests/livepatch/functions.sh +++ b/tools/testing/selftests/livepatch/functions.sh @@ -10,7 +10,11 @@ SYSFS_KERNEL_DIR="/sys/kernel" SYSFS_KLP_DIR="$SYSFS_KERNEL_DIR/livepatch" SYSFS_DEBUG_DIR="$SYSFS_KERNEL_DIR/debug" SYSFS_KPROBES_DIR="$SYSFS_DEBUG_DIR/kprobes" -SYSFS_TRACING_DIR="$SYSFS_DEBUG_DIR/tracing" +if [[ -e /sys/kernel/tracing/trace ]]; then + SYSFS_TRACING_DIR="$SYSFS_KERNEL_DIR/tracing" +else + SYSFS_TRACING_DIR="$SYSFS_DEBUG_DIR/tracing" +fi # Kselftest framework requirement - SKIP code is 4 ksft_skip=4 diff --git a/tools/testing/selftests/mm/guard-regions.c b/tools/testing/selftests/mm/guard-regions.c index 8dd81c0a4a5a..795bf3f39f44 100644 --- a/tools/testing/selftests/mm/guard-regions.c +++ 
b/tools/testing/selftests/mm/guard-regions.c @@ -94,6 +94,7 @@ static void *mmap_(FIXTURE_DATA(guard_regions) * self, case ANON_BACKED: flags |= MAP_PRIVATE | MAP_ANON; fd = -1; + offset = 0; break; case SHMEM_BACKED: case LOCAL_FILE_BACKED: @@ -260,6 +261,54 @@ static bool is_buf_eq(char *buf, size_t size, char chr) return true; } +/* + * Some file systems have issues with merging due to changing merge-sensitive + * parameters in the .mmap callback, and prior to .mmap_prepare being + * implemented everywhere this will now result in an unexpected failure to + * merge (e.g. - overlayfs). + * + * Perform a simple test to see if the local file system suffers from this, if + * it does then we can skip test logic that assumes local file system merging is + * sane. + */ +static bool local_fs_has_sane_mmap(FIXTURE_DATA(guard_regions) * self, + const FIXTURE_VARIANT(guard_regions) * variant) +{ + const unsigned long page_size = self->page_size; + char *ptr, *ptr2; + struct procmap_fd procmap; + + if (variant->backing != LOCAL_FILE_BACKED) + return true; + + /* Map 10 pages. */ + ptr = mmap_(self, variant, NULL, 10 * page_size, PROT_READ | PROT_WRITE, 0, 0); + if (ptr == MAP_FAILED) + return false; + /* Unmap the middle. */ + munmap(&ptr[5 * page_size], page_size); + + /* Map again. */ + ptr2 = mmap_(self, variant, &ptr[5 * page_size], page_size, PROT_READ | PROT_WRITE, + MAP_FIXED, 5 * page_size); + + if (ptr2 == MAP_FAILED) + return false; + + /* Now make sure they all merged. 
*/ + if (open_self_procmap(&procmap) != 0) + return false; + if (!find_vma_procmap(&procmap, ptr)) + return false; + if (procmap.query.vma_start != (unsigned long)ptr) + return false; + if (procmap.query.vma_end != (unsigned long)ptr + 10 * page_size) + return false; + close_procmap(&procmap); + + return true; +} + FIXTURE_SETUP(guard_regions) { self->page_size = (unsigned long)sysconf(_SC_PAGESIZE); @@ -2138,4 +2187,140 @@ TEST_F(guard_regions, pagemap_scan) ASSERT_EQ(munmap(ptr, 10 * page_size), 0); } +TEST_F(guard_regions, collapse) +{ + const unsigned long page_size = self->page_size; + const unsigned long size = 2 * HPAGE_SIZE; + const unsigned long num_pages = size / page_size; + char *ptr; + int i; + + /* Need file to be correct size for tests for non-anon. */ + if (variant->backing != ANON_BACKED) + ASSERT_EQ(ftruncate(self->fd, size), 0); + + /* + * We must close and re-open local-file backed as read-only for + * CONFIG_READ_ONLY_THP_FOR_FS to work. + */ + if (variant->backing == LOCAL_FILE_BACKED) { + ASSERT_EQ(close(self->fd), 0); + + self->fd = open(self->path, O_RDONLY); + ASSERT_GE(self->fd, 0); + } + + ptr = mmap_(self, variant, NULL, size, PROT_READ, 0, 0); + ASSERT_NE(ptr, MAP_FAILED); + + /* Prevent being faulted-in as huge. */ + ASSERT_EQ(madvise(ptr, size, MADV_NOHUGEPAGE), 0); + /* Fault in. */ + ASSERT_EQ(madvise(ptr, size, MADV_POPULATE_READ), 0); + + /* Install guard regions in ever other page. */ + for (i = 0; i < num_pages; i += 2) { + char *ptr_page = &ptr[i * page_size]; + + ASSERT_EQ(madvise(ptr_page, page_size, MADV_GUARD_INSTALL), 0); + /* Accesses should now fail. */ + ASSERT_FALSE(try_read_buf(ptr_page)); + } + + /* Allow huge page throughout region. */ + ASSERT_EQ(madvise(ptr, size, MADV_HUGEPAGE), 0); + + /* + * Now collapse the entire region. This should fail in all cases. 
+ * + * The madvise() call will also fail if CONFIG_READ_ONLY_THP_FOR_FS is + * not set for the local file case, but we can't differentiate whether + * this occurred or if the collapse was rightly rejected. + */ + EXPECT_NE(madvise(ptr, size, MADV_COLLAPSE), 0); + + /* + * If we introduce a bug that causes the collapse to succeed, gather + * data on whether guard regions are at least preserved. The test will + * fail at this point in any case. + */ + for (i = 0; i < num_pages; i += 2) { + char *ptr_page = &ptr[i * page_size]; + + /* Accesses should still fail. */ + ASSERT_FALSE(try_read_buf(ptr_page)); + } +} + +TEST_F(guard_regions, smaps) +{ + const unsigned long page_size = self->page_size; + struct procmap_fd procmap; + char *ptr, *ptr2; + int i; + + /* Map a region. */ + ptr = mmap_(self, variant, NULL, 10 * page_size, PROT_READ | PROT_WRITE, 0, 0); + ASSERT_NE(ptr, MAP_FAILED); + + /* We shouldn't yet see a guard flag. */ + ASSERT_FALSE(check_vmflag_guard(ptr)); + + /* Install a single guard region. */ + ASSERT_EQ(madvise(ptr, page_size, MADV_GUARD_INSTALL), 0); + + /* Now we should see a guard flag. */ + ASSERT_TRUE(check_vmflag_guard(ptr)); + + /* + * Removing the guard region should not change things because we simply + * cannot accurately track whether a given VMA has had all of its guard + * regions removed. + */ + ASSERT_EQ(madvise(ptr, page_size, MADV_GUARD_REMOVE), 0); + ASSERT_TRUE(check_vmflag_guard(ptr)); + + /* Install guard regions throughout. */ + for (i = 0; i < 10; i++) { + ASSERT_EQ(madvise(&ptr[i * page_size], page_size, MADV_GUARD_INSTALL), 0); + /* We should always see the guard region flag. */ + ASSERT_TRUE(check_vmflag_guard(ptr)); + } + + /* Split into two VMAs. */ + ASSERT_EQ(munmap(&ptr[4 * page_size], page_size), 0); + + /* Both VMAs should have the guard flag set. 
*/ + ASSERT_TRUE(check_vmflag_guard(ptr)); + ASSERT_TRUE(check_vmflag_guard(&ptr[5 * page_size])); + + /* + * If the local file system is unable to merge VMAs due to having + * unusual characteristics, there is no point in asserting merge + * behaviour. + */ + if (!local_fs_has_sane_mmap(self, variant)) { + TH_LOG("local filesystem does not support sane merging skipping merge test"); + return; + } + + /* Map a fresh VMA between the two split VMAs. */ + ptr2 = mmap_(self, variant, &ptr[4 * page_size], page_size, + PROT_READ | PROT_WRITE, MAP_FIXED, 4 * page_size); + ASSERT_NE(ptr2, MAP_FAILED); + + /* + * Check the procmap to ensure that this VMA merged with the adjacent + * two. The guard region flag is 'sticky' so should not preclude + * merging. + */ + ASSERT_EQ(open_self_procmap(&procmap), 0); + ASSERT_TRUE(find_vma_procmap(&procmap, ptr)); + ASSERT_EQ(procmap.query.vma_start, (unsigned long)ptr); + ASSERT_EQ(procmap.query.vma_end, (unsigned long)ptr + 10 * page_size); + ASSERT_EQ(close_procmap(&procmap), 0); + /* And, of course, this VMA should have the guard flag set. */ + ASSERT_TRUE(check_vmflag_guard(ptr)); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/mm/gup_test.c b/tools/testing/selftests/mm/gup_test.c index 8900b840c17a..40c1538a17b4 100644 --- a/tools/testing/selftests/mm/gup_test.c +++ b/tools/testing/selftests/mm/gup_test.c @@ -17,9 +17,8 @@ #define MB (1UL << 20) -/* Just the flags we need, copied from mm.h: */ +/* Just the flags we need, copied from the kernel internals. 
*/ #define FOLL_WRITE 0x01 /* check pte is writable */ -#define FOLL_TOUCH 0x02 /* mark page accessed */ #define GUP_TEST_FILE "/sys/kernel/debug/gup_test" @@ -93,7 +92,7 @@ int main(int argc, char **argv) { struct gup_test gup = { 0 }; int filed, i, opt, nr_pages = 1, thp = -1, write = 1, nthreads = 1, ret; - int flags = MAP_PRIVATE, touch = 0; + int flags = MAP_PRIVATE; char *file = "/dev/zero"; pthread_t *tid; char *p; @@ -170,10 +169,6 @@ int main(int argc, char **argv) case 'H': flags |= (MAP_HUGETLB | MAP_ANONYMOUS); break; - case 'z': - /* fault pages in gup, do not fault in userland */ - touch = 1; - break; default: ksft_exit_fail_msg("Wrong argument\n"); } @@ -244,18 +239,9 @@ int main(int argc, char **argv) else if (thp == 0) madvise(p, size, MADV_NOHUGEPAGE); - /* - * FOLL_TOUCH, in gup_test, is used as an either/or case: either - * fault pages in from the kernel via FOLL_TOUCH, or fault them - * in here, from user space. This allows comparison of performance - * between those two cases. - */ - if (touch) { - gup.gup_flags |= FOLL_TOUCH; - } else { - for (; (unsigned long)p < gup.addr + size; p += psize()) - p[0] = 0; - } + /* Fault them in here, from user space. 
*/ + for (; (unsigned long)p < gup.addr + size; p += psize()) + p[0] = 0; tid = malloc(sizeof(pthread_t) * nthreads); assert(tid); diff --git a/tools/testing/selftests/mm/hmm-tests.c b/tools/testing/selftests/mm/hmm-tests.c index 15aadaf24a66..5a1525f72daa 100644 --- a/tools/testing/selftests/mm/hmm-tests.c +++ b/tools/testing/selftests/mm/hmm-tests.c @@ -25,6 +25,7 @@ #include <sys/stat.h> #include <sys/mman.h> #include <sys/ioctl.h> +#include <sys/time.h> /* @@ -50,6 +51,8 @@ enum { HMM_COHERENCE_DEVICE_TWO, }; +#define ONEKB (1 << 10) +#define ONEMEG (1 << 20) #define TWOMEG (1 << 21) #define HMM_BUFFER_SIZE (1024 << 12) #define HMM_PATH_MAX 64 @@ -207,8 +210,10 @@ static void hmm_buffer_free(struct hmm_buffer *buffer) if (buffer == NULL) return; - if (buffer->ptr) + if (buffer->ptr) { munmap(buffer->ptr, buffer->size); + buffer->ptr = NULL; + } free(buffer->mirror); free(buffer); } @@ -525,6 +530,8 @@ TEST_F(hmm, anon_write_prot) /* * Check that a device writing an anonymous private mapping * will copy-on-write if a child process inherits the mapping. + * + * Also verifies after fork() memory the device can be read by child. */ TEST_F(hmm, anon_write_child) { @@ -532,72 +539,101 @@ TEST_F(hmm, anon_write_child) unsigned long npages; unsigned long size; unsigned long i; + void *old_ptr; + void *map; int *ptr; pid_t pid; int child_fd; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer->ptr so we can tell if it is written. 
*/ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Initialize data that the device will write to buffer->ptr. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ptr[i] = -i; + int ret, use_thp, migrate; + + for (migrate = 0; migrate < 2; ++migrate) { + for (use_thp = 0; use_thp < 2; ++use_thp) { + npages = ALIGN(use_thp ? TWOMEG : HMM_BUFFER_SIZE, + self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size * 2; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size * 2, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + old_ptr = buffer->ptr; + if (use_thp) { + map = (void *)ALIGN((uintptr_t)buffer->ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + buffer->ptr = map; + } + + /* Initialize buffer->ptr so we can tell if it is written. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = -i; + + if (migrate) { + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + } + + pid = fork(); + if (pid == -1) + ASSERT_EQ(pid, 0); + if (pid != 0) { + waitpid(pid, &ret, 0); + ASSERT_EQ(WIFEXITED(ret), 1); + + /* Check that the parent's buffer did not change. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); + continue; + } + + /* Check that we see the parent's values. 
*/ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + if (!migrate) { + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + } + + /* The child process needs its own mirror to its own mm. */ + child_fd = hmm_open(0); + ASSERT_GE(child_fd, 0); + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); - pid = fork(); - if (pid == -1) - ASSERT_EQ(pid, 0); - if (pid != 0) { - waitpid(pid, &ret, 0); - ASSERT_EQ(WIFEXITED(ret), 1); + /* Check what the device wrote. */ + if (!migrate) { + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + } - /* Check that the parent's buffer did not change. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - return; + close(child_fd); + exit(0); + } } - - /* Check that we see the parent's values. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], -i); - - /* The child process needs its own mirror to its own mm. */ - child_fd = hmm_open(0); - ASSERT_GE(child_fd, 0); - - /* Simulate a device writing system memory. */ - ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device wrote. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], -i); - - close(child_fd); - exit(0); } /* @@ -2055,4 +2091,765 @@ TEST_F(hmm, hmm_cow_in_device) hmm_buffer_free(buffer); } + +/* + * Migrate private anonymous huge empty page. 
+ */ +TEST_F(hmm, migrate_anon_huge_empty) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + void *old_ptr; + void *map; + int *ptr; + int ret; + + size = TWOMEG; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = 2 * size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + memset(buffer->mirror, 0xFF, size); + + buffer->ptr = mmap(NULL, 2 * size, + PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + npages = size >> self->page_shift; + map = (void *)ALIGN((uintptr_t)buffer->ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + old_ptr = buffer->ptr; + buffer->ptr = map; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], 0); + + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); +} + +/* + * Migrate private anonymous huge zero page. + */ +TEST_F(hmm, migrate_anon_huge_zero) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + void *old_ptr; + void *map; + int *ptr; + int ret; + int val; + + size = TWOMEG; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = 2 * size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + memset(buffer->mirror, 0xFF, size); + + buffer->ptr = mmap(NULL, 2 * size, + PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + npages = size >> self->page_shift; + map = (void *)ALIGN((uintptr_t)buffer->ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + old_ptr = buffer->ptr; + buffer->ptr = map; + + /* Initialize a read-only zero huge page. 
*/ + val = *(int *)buffer->ptr; + ASSERT_EQ(val, 0); + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], 0); + + /* Fault pages back to system memory and check them. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) { + ASSERT_EQ(ptr[i], 0); + /* If it asserts once, it probably will 500,000 times */ + if (ptr[i] != 0) + break; + } + + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); +} + +/* + * Migrate private anonymous huge page and free. + */ +TEST_F(hmm, migrate_anon_huge_free) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + void *old_ptr; + void *map; + int *ptr; + int ret; + + size = TWOMEG; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = 2 * size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + memset(buffer->mirror, 0xFF, size); + + buffer->ptr = mmap(NULL, 2 * size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + npages = size >> self->page_shift; + map = (void *)ALIGN((uintptr_t)buffer->ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + old_ptr = buffer->ptr; + buffer->ptr = map; + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Try freeing it. 
*/ + ret = madvise(map, size, MADV_FREE); + ASSERT_EQ(ret, 0); + + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); +} + +/* + * Migrate private anonymous huge page and fault back to sysmem. + */ +TEST_F(hmm, migrate_anon_huge_fault) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + void *old_ptr; + void *map; + int *ptr; + int ret; + + size = TWOMEG; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = 2 * size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + memset(buffer->mirror, 0xFF, size); + + buffer->ptr = mmap(NULL, 2 * size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + npages = size >> self->page_shift; + map = (void *)ALIGN((uintptr_t)buffer->ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + old_ptr = buffer->ptr; + buffer->ptr = map; + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Fault pages back to system memory and check them. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); +} + +/* + * Migrate memory and fault back to sysmem after partially unmapping. 
+ */ +TEST_F(hmm, migrate_partial_unmap_fault) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size = TWOMEG; + unsigned long i; + void *old_ptr; + void *map; + int *ptr; + int ret, j, use_thp; + int offsets[] = { 0, 512 * ONEKB, ONEMEG }; + + for (use_thp = 0; use_thp < 2; ++use_thp) { + for (j = 0; j < ARRAY_SIZE(offsets); ++j) { + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = 2 * size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + memset(buffer->mirror, 0xFF, size); + + buffer->ptr = mmap(NULL, 2 * size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + npages = size >> self->page_shift; + map = (void *)ALIGN((uintptr_t)buffer->ptr, size); + if (use_thp) + ret = madvise(map, size, MADV_HUGEPAGE); + else + ret = madvise(map, size, MADV_NOHUGEPAGE); + ASSERT_EQ(ret, 0); + old_ptr = buffer->ptr; + buffer->ptr = map; + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + munmap(buffer->ptr + offsets[j], ONEMEG); + + /* Fault pages back to system memory and check them. 
*/ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + if (i * sizeof(int) < offsets[j] || + i * sizeof(int) >= offsets[j] + ONEMEG) + ASSERT_EQ(ptr[i], i); + + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); + } + } +} + +TEST_F(hmm, migrate_remap_fault) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size = TWOMEG; + unsigned long i; + void *old_ptr, *new_ptr = NULL; + void *map; + int *ptr; + int ret, j, use_thp, dont_unmap, before; + int offsets[] = { 0, 512 * ONEKB, ONEMEG }; + + for (before = 0; before < 2; ++before) { + for (dont_unmap = 0; dont_unmap < 2; ++dont_unmap) { + for (use_thp = 0; use_thp < 2; ++use_thp) { + for (j = 0; j < ARRAY_SIZE(offsets); ++j) { + int flags = MREMAP_MAYMOVE | MREMAP_FIXED; + + if (dont_unmap) + flags |= MREMAP_DONTUNMAP; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = 8 * size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + memset(buffer->mirror, 0xFF, size); + + buffer->ptr = mmap(NULL, buffer->size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + npages = size >> self->page_shift; + map = (void *)ALIGN((uintptr_t)buffer->ptr, size); + if (use_thp) + ret = madvise(map, size, MADV_HUGEPAGE); + else + ret = madvise(map, size, MADV_NOHUGEPAGE); + ASSERT_EQ(ret, 0); + old_ptr = buffer->ptr; + munmap(map + size, size * 2); + buffer->ptr = map; + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; + i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + if (before) { + new_ptr = mremap((void *)map, size, size, flags, + map + size + offsets[j]); + ASSERT_NE(new_ptr, MAP_FAILED); + buffer->ptr = new_ptr; + } + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. 
*/ + for (i = 0, ptr = buffer->mirror; + i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + if (!before) { + new_ptr = mremap((void *)map, size, size, flags, + map + size + offsets[j]); + ASSERT_NE(new_ptr, MAP_FAILED); + buffer->ptr = new_ptr; + } + + /* Fault pages back to system memory and check them. */ + for (i = 0, ptr = buffer->ptr; + i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + munmap(new_ptr, size); + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); + } + } + } + } +} + +/* + * Migrate private anonymous huge page with allocation errors. + */ +TEST_F(hmm, migrate_anon_huge_err) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + void *old_ptr; + void *map; + int *ptr; + int ret; + + size = TWOMEG; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = 2 * size; + buffer->mirror = malloc(2 * size); + ASSERT_NE(buffer->mirror, NULL); + memset(buffer->mirror, 0xFF, 2 * size); + + old_ptr = mmap(NULL, 2 * size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0); + ASSERT_NE(old_ptr, MAP_FAILED); + + npages = size >> self->page_shift; + map = (void *)ALIGN((uintptr_t)old_ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + buffer->ptr = map; + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device but force a THP allocation error. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_FLAGS, buffer, + HMM_DMIRROR_FLAG_FAIL_ALLOC); + ASSERT_EQ(ret, 0); + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) { + ASSERT_EQ(ptr[i], i); + if (ptr[i] != i) + break; + } + + /* Try faulting back a single (PAGE_SIZE) page. 
*/ + ptr = buffer->ptr; + ASSERT_EQ(ptr[2048], 2048); + + /* unmap and remap the region to reset things. */ + ret = munmap(old_ptr, 2 * size); + ASSERT_EQ(ret, 0); + old_ptr = mmap(NULL, 2 * size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0); + ASSERT_NE(old_ptr, MAP_FAILED); + map = (void *)ALIGN((uintptr_t)old_ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + buffer->ptr = map; + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate THP to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* + * Force an allocation error when faulting back a THP resident in the + * device. + */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_FLAGS, buffer, + HMM_DMIRROR_FLAG_FAIL_ALLOC); + ASSERT_EQ(ret, 0); + + ret = hmm_migrate_dev_to_sys(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ptr = buffer->ptr; + ASSERT_EQ(ptr[2048], 2048); + + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); +} + +/* + * Migrate private anonymous huge zero page with allocation errors. + */ +TEST_F(hmm, migrate_anon_huge_zero_err) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + void *old_ptr; + void *map; + int *ptr; + int ret; + + size = TWOMEG; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = 2 * size; + buffer->mirror = malloc(2 * size); + ASSERT_NE(buffer->mirror, NULL); + memset(buffer->mirror, 0xFF, 2 * size); + + old_ptr = mmap(NULL, 2 * size, PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0); + ASSERT_NE(old_ptr, MAP_FAILED); + + npages = size >> self->page_shift; + map = (void *)ALIGN((uintptr_t)old_ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + buffer->ptr = map; + + /* Migrate memory to device but force a THP allocation error. 
*/ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_FLAGS, buffer, + HMM_DMIRROR_FLAG_FAIL_ALLOC); + ASSERT_EQ(ret, 0); + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], 0); + + /* Try faulting back a single (PAGE_SIZE) page. */ + ptr = buffer->ptr; + ASSERT_EQ(ptr[2048], 0); + + /* unmap and remap the region to reset things. */ + ret = munmap(old_ptr, 2 * size); + ASSERT_EQ(ret, 0); + old_ptr = mmap(NULL, 2 * size, PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0); + ASSERT_NE(old_ptr, MAP_FAILED); + map = (void *)ALIGN((uintptr_t)old_ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + buffer->ptr = map; + + /* Initialize buffer in system memory (zero THP page). */ + ret = ptr[0]; + ASSERT_EQ(ret, 0); + + /* Migrate memory to device but force a THP allocation error. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_FLAGS, buffer, + HMM_DMIRROR_FLAG_FAIL_ALLOC); + ASSERT_EQ(ret, 0); + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Fault the device memory back and check it. 
*/ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], 0); + + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); +} + +struct benchmark_results { + double sys_to_dev_time; + double dev_to_sys_time; + double throughput_s2d; + double throughput_d2s; +}; + +static double get_time_ms(void) +{ + struct timeval tv; + + gettimeofday(&tv, NULL); + return (tv.tv_sec * 1000.0) + (tv.tv_usec / 1000.0); +} + +static inline struct hmm_buffer *hmm_buffer_alloc(unsigned long size) +{ + struct hmm_buffer *buffer; + + buffer = malloc(sizeof(*buffer)); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + memset(buffer->mirror, 0xFF, size); + return buffer; +} + +static void print_benchmark_results(const char *test_name, size_t buffer_size, + struct benchmark_results *thp, + struct benchmark_results *regular) +{ + double s2d_improvement = ((regular->sys_to_dev_time - thp->sys_to_dev_time) / + regular->sys_to_dev_time) * 100.0; + double d2s_improvement = ((regular->dev_to_sys_time - thp->dev_to_sys_time) / + regular->dev_to_sys_time) * 100.0; + double throughput_s2d_improvement = ((thp->throughput_s2d - regular->throughput_s2d) / + regular->throughput_s2d) * 100.0; + double throughput_d2s_improvement = ((thp->throughput_d2s - regular->throughput_d2s) / + regular->throughput_d2s) * 100.0; + + printf("\n=== %s (%.1f MB) ===\n", test_name, buffer_size / (1024.0 * 1024.0)); + printf(" | With THP | Without THP | Improvement\n"); + printf("---------------------------------------------------------------------\n"); + printf("Sys->Dev Migration | %.3f ms | %.3f ms | %.1f%%\n", + thp->sys_to_dev_time, regular->sys_to_dev_time, s2d_improvement); + printf("Dev->Sys Migration | %.3f ms | %.3f ms | %.1f%%\n", + thp->dev_to_sys_time, regular->dev_to_sys_time, d2s_improvement); + printf("S->D Throughput | %.2f GB/s | %.2f GB/s | %.1f%%\n", + thp->throughput_s2d, regular->throughput_s2d, throughput_s2d_improvement); + printf("D->S Throughput | 
%.2f GB/s | %.2f GB/s | %.1f%%\n", + thp->throughput_d2s, regular->throughput_d2s, throughput_d2s_improvement); +} + +/* + * Run a single migration benchmark + * fd: file descriptor for hmm device + * use_thp: whether to use THP + * buffer_size: size of buffer to allocate + * iterations: number of iterations + * results: where to store results + */ +static inline int run_migration_benchmark(int fd, int use_thp, size_t buffer_size, + int iterations, struct benchmark_results *results) +{ + struct hmm_buffer *buffer; + unsigned long npages = buffer_size / sysconf(_SC_PAGESIZE); + double start, end; + double s2d_total = 0, d2s_total = 0; + int ret, i; + int *ptr; + + buffer = hmm_buffer_alloc(buffer_size); + + /* Map memory */ + buffer->ptr = mmap(NULL, buffer_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (!buffer->ptr) + return -1; + + /* Apply THP hint if requested */ + if (use_thp) + ret = madvise(buffer->ptr, buffer_size, MADV_HUGEPAGE); + else + ret = madvise(buffer->ptr, buffer_size, MADV_NOHUGEPAGE); + + if (ret) + return ret; + + /* Initialize memory to make sure pages are allocated */ + ptr = (int *)buffer->ptr; + for (i = 0; i < buffer_size / sizeof(int); i++) + ptr[i] = i & 0xFF; + + /* Warmup iteration */ + ret = hmm_migrate_sys_to_dev(fd, buffer, npages); + if (ret) + return ret; + + ret = hmm_migrate_dev_to_sys(fd, buffer, npages); + if (ret) + return ret; + + /* Benchmark iterations */ + for (i = 0; i < iterations; i++) { + /* System to device migration */ + start = get_time_ms(); + + ret = hmm_migrate_sys_to_dev(fd, buffer, npages); + if (ret) + return ret; + + end = get_time_ms(); + s2d_total += (end - start); + + /* Device to system migration */ + start = get_time_ms(); + + ret = hmm_migrate_dev_to_sys(fd, buffer, npages); + if (ret) + return ret; + + end = get_time_ms(); + d2s_total += (end - start); + } + + /* Calculate average times and throughput */ + results->sys_to_dev_time = s2d_total / iterations; + 
results->dev_to_sys_time = d2s_total / iterations; + results->throughput_s2d = (buffer_size / (1024.0 * 1024.0 * 1024.0)) / + (results->sys_to_dev_time / 1000.0); + results->throughput_d2s = (buffer_size / (1024.0 * 1024.0 * 1024.0)) / + (results->dev_to_sys_time / 1000.0); + + /* Cleanup */ + hmm_buffer_free(buffer); + return 0; +} + +/* + * Benchmark THP migration with different buffer sizes + */ +TEST_F_TIMEOUT(hmm, benchmark_thp_migration, 120) +{ + struct benchmark_results thp_results, regular_results; + size_t thp_size = 2 * 1024 * 1024; /* 2MB - typical THP size */ + int iterations = 5; + + printf("\nHMM THP Migration Benchmark\n"); + printf("---------------------------\n"); + printf("System page size: %ld bytes\n", sysconf(_SC_PAGESIZE)); + + /* Test different buffer sizes */ + size_t test_sizes[] = { + thp_size / 4, /* 512KB - smaller than THP */ + thp_size / 2, /* 1MB - half THP */ + thp_size, /* 2MB - single THP */ + thp_size * 2, /* 4MB - two THPs */ + thp_size * 4, /* 8MB - four THPs */ + thp_size * 8, /* 16MB - eight THPs */ + thp_size * 128, /* 256MB - one twenty eight THPs */ + }; + + static const char *const test_names[] = { + "Small Buffer (512KB)", + "Half THP Size (1MB)", + "Single THP Size (2MB)", + "Two THP Size (4MB)", + "Four THP Size (8MB)", + "Eight THP Size (16MB)", + "One twenty eight THP Size (256MB)" + }; + + int num_tests = ARRAY_SIZE(test_sizes); + + /* Run all tests */ + for (int i = 0; i < num_tests; i++) { + /* Test with THP */ + ASSERT_EQ(run_migration_benchmark(self->fd, 1, test_sizes[i], + iterations, &thp_results), 0); + + /* Test without THP */ + ASSERT_EQ(run_migration_benchmark(self->fd, 0, test_sizes[i], + iterations, ®ular_results), 0); + + /* Print results */ + print_benchmark_results(test_names[i], test_sizes[i], + &thp_results, ®ular_results); + } +} TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c index ac136f04b8d6..95afa5cfc062 
100644 --- a/tools/testing/selftests/mm/ksm_functional_tests.c +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -38,6 +38,8 @@ enum ksm_merge_mode { }; static int mem_fd; +static int pages_to_scan_fd; +static int sleep_millisecs_fd; static int pagemap_fd; static size_t pagesize; @@ -493,6 +495,46 @@ static void test_prctl_fork(void) ksft_test_result_pass("PR_SET_MEMORY_MERGE value is inherited\n"); } +static int start_ksmd_and_set_frequency(char *pages_to_scan, char *sleep_ms) +{ + int ksm_fd; + + ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR); + if (ksm_fd < 0) + return -errno; + + if (write(ksm_fd, "1", 1) != 1) + return -errno; + + if (write(pages_to_scan_fd, pages_to_scan, strlen(pages_to_scan)) <= 0) + return -errno; + + if (write(sleep_millisecs_fd, sleep_ms, strlen(sleep_ms)) <= 0) + return -errno; + + return 0; +} + +static int stop_ksmd_and_restore_frequency(void) +{ + int ksm_fd; + + ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR); + if (ksm_fd < 0) + return -errno; + + if (write(ksm_fd, "2", 1) != 1) + return -errno; + + if (write(pages_to_scan_fd, "100", 3) <= 0) + return -errno; + + if (write(sleep_millisecs_fd, "20", 2) <= 0) + return -errno; + + return 0; +} + static void test_prctl_fork_exec(void) { int ret, status; @@ -500,6 +542,9 @@ static void test_prctl_fork_exec(void) ksft_print_msg("[RUN] %s\n", __func__); + if (start_ksmd_and_set_frequency("2000", "0")) + ksft_test_result_fail("set ksmd's scanning frequency failed\n"); + ret = prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0); if (ret < 0 && errno == EINVAL) { ksft_test_result_skip("PR_SET_MEMORY_MERGE not supported\n"); @@ -542,6 +587,11 @@ static void test_prctl_fork_exec(void) return; } + if (stop_ksmd_and_restore_frequency()) { + ksft_test_result_fail("restore ksmd frequency failed\n"); + return; + } + ksft_test_result_pass("PR_SET_MEMORY_MERGE value is inherited\n"); } @@ -656,6 +706,13 @@ static void init_global_file_handles(void) ksft_exit_skip("open(\"/proc/self/pagemap\") 
failed\n"); if (ksm_get_self_merging_pages() < 0) ksft_exit_skip("accessing \"/proc/self/ksm_merging_pages\") failed\n"); + + pages_to_scan_fd = open("/sys/kernel/mm/ksm/pages_to_scan", O_RDWR); + if (pages_to_scan_fd < 0) + ksft_exit_fail_msg("opening /sys/kernel/mm/ksm/pages_to_scan failed\n"); + sleep_millisecs_fd = open("/sys/kernel/mm/ksm/sleep_millisecs", O_RDWR); + if (sleep_millisecs_fd < 0) + ksft_exit_fail_msg("opening /sys/kernel/mm/ksm/sleep_millisecs failed\n"); } int main(int argc, char **argv) diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c index bf2863b102e3..5f073504e0b1 100644 --- a/tools/testing/selftests/mm/mremap_test.c +++ b/tools/testing/selftests/mm/mremap_test.c @@ -994,7 +994,7 @@ static void mremap_move_multi_invalid_vmas(FILE *maps_fp, unsigned long page_siz static long long remap_region(struct config c, unsigned int threshold_mb, char *rand_addr) { - void *addr, *src_addr, *dest_addr, *dest_preamble_addr = NULL; + void *addr, *tmp_addr, *src_addr, *dest_addr, *dest_preamble_addr = NULL; unsigned long long t, d; struct timespec t_start = {0, 0}, t_end = {0, 0}; long long start_ns, end_ns, align_mask, ret, offset; @@ -1032,7 +1032,8 @@ static long long remap_region(struct config c, unsigned int threshold_mb, /* Don't destroy existing mappings unless expected to overlap */ while (!is_remap_region_valid(addr, c.region_size) && !c.overlapping) { /* Check for unsigned overflow */ - if (addr + c.dest_alignment < addr) { + tmp_addr = addr + c.dest_alignment; + if (tmp_addr < addr) { ksft_print_msg("Couldn't find a valid region to remap to\n"); ret = -1; goto clean_up_src; diff --git a/tools/testing/selftests/mm/soft-dirty.c b/tools/testing/selftests/mm/soft-dirty.c index 4ee4db3750c1..c3a9585de98c 100644 --- a/tools/testing/selftests/mm/soft-dirty.c +++ b/tools/testing/selftests/mm/soft-dirty.c @@ -184,6 +184,130 @@ static void test_mprotect(int pagemap_fd, int pagesize, bool anon) 
close(test_fd); } +static void test_merge(int pagemap_fd, int pagesize) +{ + char *reserved, *map, *map2; + + /* + * Reserve space for tests: + * + * ---padding to --- + * | avoid adj. | + * v merge v + * |---|---|---|---|---| + * | | 1 | 2 | 3 | | + * |---|---|---|---|---| + */ + reserved = mmap(NULL, 5 * pagesize, PROT_NONE, + MAP_ANON | MAP_PRIVATE, -1, 0); + if (reserved == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + munmap(reserved, 4 * pagesize); + + /* + * Establish initial VMA: + * + * S/D + * |---|---|---|---|---| + * | | 1 | | | | + * |---|---|---|---|---| + */ + map = mmap(&reserved[pagesize], pagesize, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0); + if (map == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + + /* This will clear VM_SOFTDIRTY too. */ + clear_softdirty(); + + /* + * Now place a new mapping which will be marked VM_SOFTDIRTY. Away from + * map: + * + * - S/D + * |---|---|---|---|---| + * | | 1 | | 2 | | + * |---|---|---|---|---| + */ + map2 = mmap(&reserved[3 * pagesize], pagesize, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0); + if (map2 == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + + /* + * Now remap it immediately adjacent to map, if the merge correctly + * propagates VM_SOFTDIRTY, we should then observe the VMA as a whole + * being marked soft-dirty: + * + * merge + * S/D + * |---|-------|---|---| + * | | 1 | | | + * |---|-------|---|---| + */ + map2 = mremap(map2, pagesize, pagesize, MREMAP_FIXED | MREMAP_MAYMOVE, + &reserved[2 * pagesize]); + if (map2 == MAP_FAILED) + ksft_exit_fail_msg("mremap failed\n"); + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1, + "Test %s-anon soft-dirty after remap merge 1st pg\n", + __func__); + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map2) == 1, + "Test %s-anon soft-dirty after remap merge 2nd pg\n", + __func__); + + munmap(map, 2 * pagesize); + + /* + * Now establish another VMA: + * + * S/D + * 
|---|---|---|---|---| + * | | 1 | | | | + * |---|---|---|---|---| + */ + map = mmap(&reserved[pagesize], pagesize, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0); + if (map == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + + /* Clear VM_SOFTDIRTY... */ + clear_softdirty(); + /* ...and establish incompatible adjacent VMA: + * + * - S/D + * |---|---|---|---|---| + * | | 1 | 2 | | | + * |---|---|---|---|---| + */ + map2 = mmap(&reserved[2 * pagesize], pagesize, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0); + if (map2 == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + + /* + * Now mprotect() VMA 1 so it's compatible with 2 and therefore merges: + * + * merge + * S/D + * |---|-------|---|---| + * | | 1 | | | + * |---|-------|---|---| + */ + if (mprotect(map, pagesize, PROT_READ | PROT_WRITE | PROT_EXEC)) + ksft_exit_fail_msg("mprotect failed\n"); + + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1, + "Test %s-anon soft-dirty after mprotect merge 1st pg\n", + __func__); + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map2) == 1, + "Test %s-anon soft-dirty after mprotect merge 2nd pg\n", + __func__); + + munmap(map, 2 * pagesize); +} + static void test_mprotect_anon(int pagemap_fd, int pagesize) { test_mprotect(pagemap_fd, pagesize, true); @@ -204,7 +328,7 @@ int main(int argc, char **argv) if (!softdirty_supported()) ksft_exit_skip("soft-dirty is not support\n"); - ksft_set_plan(15); + ksft_set_plan(19); pagemap_fd = open(PAGEMAP_FILE_PATH, O_RDONLY); if (pagemap_fd < 0) ksft_exit_fail_msg("Failed to open %s\n", PAGEMAP_FILE_PATH); @@ -216,6 +340,7 @@ int main(int argc, char **argv) test_hugepage(pagemap_fd, pagesize); test_mprotect_anon(pagemap_fd, pagesize); test_mprotect_file(pagemap_fd, pagesize); + test_merge(pagemap_fd, pagesize); close(pagemap_fd); diff --git a/tools/testing/selftests/mm/uffd-common.c b/tools/testing/selftests/mm/uffd-common.c index 994fe8c03923..edd02328f77b 
100644 --- a/tools/testing/selftests/mm/uffd-common.c +++ b/tools/testing/selftests/mm/uffd-common.c @@ -10,7 +10,6 @@ uffd_test_ops_t *uffd_test_ops; uffd_test_case_ops_t *uffd_test_case_ops; -#define BASE_PMD_ADDR ((void *)(1UL << 30)) /* pthread_mutex_t starts at page offset 0 */ pthread_mutex_t *area_mutex(char *area, unsigned long nr, uffd_global_test_opts_t *gopts) @@ -142,30 +141,37 @@ static int shmem_allocate_area(uffd_global_test_opts_t *gopts, void **alloc_area unsigned long offset = is_src ? 0 : bytes; char *p = NULL, *p_alias = NULL; int mem_fd = uffd_mem_fd_create(bytes * 2, false); + size_t region_size = bytes * 2 + hpage_size; - /* TODO: clean this up. Use a static addr is ugly */ - p = BASE_PMD_ADDR; - if (!is_src) - /* src map + alias + interleaved hpages */ - p += 2 * (bytes + hpage_size); + void *reserve = mmap(NULL, region_size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, + -1, 0); + if (reserve == MAP_FAILED) { + close(mem_fd); + return -errno; + } + + p = reserve; p_alias = p; p_alias += bytes; p_alias += hpage_size; /* Prevent src/dst VMA merge */ - *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, + *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, mem_fd, offset); if (*alloc_area == MAP_FAILED) { *alloc_area = NULL; + munmap(reserve, region_size); + close(mem_fd); return -errno; } if (*alloc_area != p) err("mmap of memfd failed at %p", p); - area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, + area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, mem_fd, offset); if (area_alias == MAP_FAILED) { - munmap(*alloc_area, bytes); *alloc_area = NULL; + munmap(reserve, region_size); + close(mem_fd); return -errno; } if (area_alias != p_alias) diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c index b51c89e1cd1a..700fbaa18d44 100644 --- a/tools/testing/selftests/mm/uffd-stress.c +++ 
b/tools/testing/selftests/mm/uffd-stress.c @@ -241,7 +241,7 @@ static int stress(struct uffd_args *args) return 1; for (cpu = 0; cpu < gopts->nr_parallel; cpu++) { - char c; + char c = '\0'; if (bounces & BOUNCE_POLL) { if (write(gopts->pipefd[cpu*2+1], &c, 1) != 1) err("pipefd write error"); diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c index f917b4c4c943..f4807242c5b2 100644 --- a/tools/testing/selftests/mm/uffd-unit-tests.c +++ b/tools/testing/selftests/mm/uffd-unit-tests.c @@ -543,7 +543,7 @@ static void uffd_minor_test_common(uffd_global_test_opts_t *gopts, bool test_col { unsigned long p; pthread_t uffd_mon; - char c; + char c = '\0'; struct uffd_args args = { 0 }; args.gopts = gopts; @@ -759,7 +759,7 @@ static void uffd_sigbus_test_common(uffd_global_test_opts_t *gopts, bool wp) pthread_t uffd_mon; pid_t pid; int err; - char c; + char c = '\0'; struct uffd_args args = { 0 }; args.gopts = gopts; @@ -819,7 +819,7 @@ static void uffd_events_test_common(uffd_global_test_opts_t *gopts, bool wp) pthread_t uffd_mon; pid_t pid; int err; - char c; + char c = '\0'; struct uffd_args args = { 0 }; args.gopts = gopts; @@ -1125,7 +1125,7 @@ uffd_move_test_common(uffd_global_test_opts_t *gopts, { unsigned long nr; pthread_t uffd_mon; - char c; + char c = '\0'; unsigned long long count; struct uffd_args args = { 0 }; char *orig_area_src = NULL, *orig_area_dst = NULL; diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c index e33cda301dad..605cb58ea5c3 100644 --- a/tools/testing/selftests/mm/vm_util.c +++ b/tools/testing/selftests/mm/vm_util.c @@ -449,6 +449,11 @@ bool check_vmflag_pfnmap(void *addr) return check_vmflag(addr, "pf"); } +bool check_vmflag_guard(void *addr) +{ + return check_vmflag(addr, "gu"); +} + bool softdirty_supported(void) { char *addr; diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index 26c30fdc0241..a8abdf414d46 
100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -98,6 +98,7 @@ int uffd_register_with_ioctls(int uffd, void *addr, uint64_t len, unsigned long get_free_hugepages(void); bool check_vmflag_io(void *addr); bool check_vmflag_pfnmap(void *addr); +bool check_vmflag_guard(void *addr); int open_procmap(pid_t pid, struct procmap_fd *procmap_out); int query_procmap(struct procmap_fd *procmap); bool find_vma_procmap(struct procmap_fd *procmap, void *address); diff --git a/tools/testing/selftests/namespaces/.gitignore b/tools/testing/selftests/namespaces/.gitignore index ccfb40837a73..0989e80da457 100644 --- a/tools/testing/selftests/namespaces/.gitignore +++ b/tools/testing/selftests/namespaces/.gitignore @@ -1,3 +1,12 @@ nsid_test file_handle_test init_ino_test +ns_active_ref_test +listns_test +listns_permissions_test +listns_efault_test +siocgskns_test +cred_change_test +stress_test +listns_pagination_bug +regression_pidfd_setns_test diff --git a/tools/testing/selftests/namespaces/Makefile b/tools/testing/selftests/namespaces/Makefile index 5fe4b3dc07d3..fbb821652c17 100644 --- a/tools/testing/selftests/namespaces/Makefile +++ b/tools/testing/selftests/namespaces/Makefile @@ -1,7 +1,29 @@ # SPDX-License-Identifier: GPL-2.0-only CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES) +LDLIBS += -lcap -TEST_GEN_PROGS := nsid_test file_handle_test init_ino_test +TEST_GEN_PROGS := nsid_test \ + file_handle_test \ + init_ino_test \ + ns_active_ref_test \ + listns_test \ + listns_permissions_test \ + listns_efault_test \ + siocgskns_test \ + cred_change_test \ + stress_test \ + listns_pagination_bug \ + regression_pidfd_setns_test include ../lib.mk +$(OUTPUT)/ns_active_ref_test: ../filesystems/utils.c +$(OUTPUT)/listns_test: ../filesystems/utils.c +$(OUTPUT)/listns_permissions_test: ../filesystems/utils.c +$(OUTPUT)/listns_efault_test: ../filesystems/utils.c +$(OUTPUT)/siocgskns_test: ../filesystems/utils.c 
+$(OUTPUT)/cred_change_test: ../filesystems/utils.c +$(OUTPUT)/stress_test: ../filesystems/utils.c +$(OUTPUT)/listns_pagination_bug: ../filesystems/utils.c +$(OUTPUT)/regression_pidfd_setns_test: ../filesystems/utils.c + diff --git a/tools/testing/selftests/namespaces/cred_change_test.c b/tools/testing/selftests/namespaces/cred_change_test.c new file mode 100644 index 000000000000..7b4f5ad3f725 --- /dev/null +++ b/tools/testing/selftests/namespaces/cred_change_test.c @@ -0,0 +1,814 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/capability.h> +#include <sys/ioctl.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> +#include <linux/nsfs.h> +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "wrappers.h" + +/* + * Test credential changes and their impact on namespace active references. + */ + +/* + * Test setuid() in a user namespace properly swaps active references. + * Create a user namespace with multiple UIDs mapped, then setuid() between them. + * Verify that the user namespace remains active throughout. 
+ */ +TEST(setuid_preserves_active_refs) +{ + pid_t pid; + int status; + __u64 userns_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + ssize_t ret; + int i; + bool found = false; + int pipefd[2]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + int fd, userns_fd; + __u64 child_userns_id; + uid_t orig_uid = getuid(); + int setuid_count; + + close(pipefd[0]); + + /* Create new user namespace with multiple UIDs mapped (0-9) */ + userns_fd = get_userns_fd(0, orig_uid, 10); + if (userns_fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + close(pipefd[1]); + exit(1); + } + close(userns_fd); + + /* Get user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Send namespace ID to parent */ + write(pipefd[1], &child_userns_id, sizeof(child_userns_id)); + + /* + * Perform multiple setuid() calls. + * Each setuid() triggers commit_creds() which should properly + * swap active references via switch_cred_namespaces(). 
+ */ + for (setuid_count = 0; setuid_count < 50; setuid_count++) { + uid_t target_uid = (setuid_count % 10); + if (setuid(target_uid) < 0) { + if (errno != EPERM) { + close(pipefd[1]); + exit(1); + } + } + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) { + close(pipefd[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get namespace ID from child"); + } + close(pipefd[0]); + + TH_LOG("Child user namespace ID: %llu", (unsigned long long)userns_id); + + /* Verify namespace is active while child is running */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret, 0); + } + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == userns_id) { + found = true; + break; + } + } + ASSERT_TRUE(found); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Verify namespace becomes inactive after child exits */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + ASSERT_GE(ret, 0); + + found = false; + for (i = 0; i < ret; i++) { + if (ns_ids[i] == userns_id) { + found = true; + break; + } + } + + ASSERT_FALSE(found); + TH_LOG("setuid() correctly preserved active references (no leak)"); +} + +/* + * Test setgid() in a user namespace properly handles active references. 
+ */ +TEST(setgid_preserves_active_refs) +{ + pid_t pid; + int status; + __u64 userns_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + ssize_t ret; + int i; + bool found = false; + int pipefd[2]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + int fd, userns_fd; + __u64 child_userns_id; + uid_t orig_uid = getuid(); + int setgid_count; + + close(pipefd[0]); + + /* Create new user namespace with multiple GIDs mapped */ + userns_fd = get_userns_fd(0, orig_uid, 10); + if (userns_fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + close(pipefd[1]); + exit(1); + } + close(userns_fd); + + /* Get user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + write(pipefd[1], &child_userns_id, sizeof(child_userns_id)); + + /* Perform multiple setgid() calls */ + for (setgid_count = 0; setgid_count < 50; setgid_count++) { + gid_t target_gid = (setgid_count % 10); + if (setgid(target_gid) < 0) { + if (errno != EPERM) { + close(pipefd[1]); + exit(1); + } + } + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) { + close(pipefd[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get namespace ID from child"); + } + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Verify namespace becomes inactive */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret, 
0); + } + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == userns_id) { + found = true; + break; + } + } + + ASSERT_FALSE(found); + TH_LOG("setgid() correctly preserved active references (no leak)"); +} + +/* + * Test setresuid() which changes real, effective, and saved UIDs. + * This should properly swap active references via commit_creds(). + */ +TEST(setresuid_preserves_active_refs) +{ + pid_t pid; + int status; + __u64 userns_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + ssize_t ret; + int i; + bool found = false; + int pipefd[2]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + int fd, userns_fd; + __u64 child_userns_id; + uid_t orig_uid = getuid(); + int setres_count; + + close(pipefd[0]); + + /* Create new user namespace */ + userns_fd = get_userns_fd(0, orig_uid, 10); + if (userns_fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + close(pipefd[1]); + exit(1); + } + close(userns_fd); + + /* Get user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + write(pipefd[1], &child_userns_id, sizeof(child_userns_id)); + + /* Perform multiple setresuid() calls */ + for (setres_count = 0; setres_count < 30; setres_count++) { + uid_t uid1 = (setres_count % 5); + uid_t uid2 = ((setres_count + 1) % 5); + uid_t uid3 = ((setres_count + 2) % 5); + + if (setresuid(uid1, uid2, uid3) < 0) { + if (errno != EPERM) { + close(pipefd[1]); + exit(1); + } + } + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) { + close(pipefd[0]); + kill(pid, 
SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get namespace ID from child"); + } + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Verify namespace becomes inactive */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret, 0); + } + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == userns_id) { + found = true; + break; + } + } + + ASSERT_FALSE(found); + TH_LOG("setresuid() correctly preserved active references (no leak)"); +} + +/* + * Test credential changes across multiple user namespaces. + * Create nested user namespaces and verify active reference tracking. + */ +TEST(cred_change_nested_userns) +{ + pid_t pid; + int status; + __u64 parent_userns_id, child_userns_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + ssize_t ret; + int i; + bool found_parent = false, found_child = false; + int pipefd[2]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + int fd, userns_fd; + __u64 parent_id, child_id; + uid_t orig_uid = getuid(); + + close(pipefd[0]); + + /* Create first user namespace */ + userns_fd = get_userns_fd(0, orig_uid, 1); + if (userns_fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + close(pipefd[1]); + exit(1); + } + close(userns_fd); + + /* Get first namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &parent_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Create nested user namespace */ + userns_fd = get_userns_fd(0, 0, 1); + if (userns_fd < 0) { + close(pipefd[1]); + exit(1); + } + + if 
(setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + close(pipefd[1]); + exit(1); + } + close(userns_fd); + + /* Get nested namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Send both IDs to parent */ + write(pipefd[1], &parent_id, sizeof(parent_id)); + write(pipefd[1], &child_id, sizeof(child_id)); + + /* Perform some credential changes in nested namespace */ + setuid(0); + setgid(0); + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + /* Read both namespace IDs */ + if (read(pipefd[0], &parent_userns_id, sizeof(parent_userns_id)) != sizeof(parent_userns_id)) { + close(pipefd[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get parent namespace ID"); + } + + if (read(pipefd[0], &child_userns_id, sizeof(child_userns_id)) != sizeof(child_userns_id)) { + close(pipefd[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get child namespace ID"); + } + close(pipefd[0]); + + TH_LOG("Parent userns: %llu, Child userns: %llu", + (unsigned long long)parent_userns_id, + (unsigned long long)child_userns_id); + + /* Verify both namespaces are active */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret, 0); + } + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == parent_userns_id) + found_parent = true; + if (ns_ids[i] == child_userns_id) + found_child = true; + } + + ASSERT_TRUE(found_parent); + ASSERT_TRUE(found_child); + + /* Wait for child */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Verify both namespaces become inactive */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + 
ASSERT_GE(ret, 0); + + found_parent = false; + found_child = false; + for (i = 0; i < ret; i++) { + if (ns_ids[i] == parent_userns_id) + found_parent = true; + if (ns_ids[i] == child_userns_id) + found_child = true; + } + + ASSERT_FALSE(found_parent); + ASSERT_FALSE(found_child); + TH_LOG("Nested user namespace credential changes preserved active refs (no leak)"); +} + +/* + * Test rapid credential changes don't cause refcount imbalances. + * This stress-tests the switch_cred_namespaces() logic. + */ +TEST(rapid_cred_changes_no_leak) +{ + pid_t pid; + int status; + __u64 userns_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + ssize_t ret; + int i; + bool found = false; + int pipefd[2]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + int fd, userns_fd; + __u64 child_userns_id; + uid_t orig_uid = getuid(); + int change_count; + + close(pipefd[0]); + + /* Create new user namespace with wider range of UIDs/GIDs */ + userns_fd = get_userns_fd(0, orig_uid, 100); + if (userns_fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + close(pipefd[1]); + exit(1); + } + close(userns_fd); + + /* Get user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + write(pipefd[1], &child_userns_id, sizeof(child_userns_id)); + + /* + * Perform many rapid credential changes. + * Mix setuid, setgid, setreuid, setregid, setresuid, setresgid. 
+ */ + for (change_count = 0; change_count < 200; change_count++) { + switch (change_count % 6) { + case 0: + setuid(change_count % 50); + break; + case 1: + setgid(change_count % 50); + break; + case 2: + setreuid(change_count % 50, (change_count + 1) % 50); + break; + case 3: + setregid(change_count % 50, (change_count + 1) % 50); + break; + case 4: + setresuid(change_count % 50, (change_count + 1) % 50, (change_count + 2) % 50); + break; + case 5: + setresgid(change_count % 50, (change_count + 1) % 50, (change_count + 2) % 50); + break; + } + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) { + close(pipefd[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get namespace ID from child"); + } + close(pipefd[0]); + + TH_LOG("Testing with user namespace ID: %llu", (unsigned long long)userns_id); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Verify namespace becomes inactive (no leaked active refs) */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret, 0); + } + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == userns_id) { + found = true; + break; + } + } + + ASSERT_FALSE(found); + TH_LOG("200 rapid credential changes completed with no active ref leak"); +} + +/* + * Test setfsuid/setfsgid which change filesystem UID/GID. + * These also trigger credential changes but may have different code paths. 
+ */ +TEST(setfsuid_preserves_active_refs) +{ + pid_t pid; + int status; + __u64 userns_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + ssize_t ret; + int i; + bool found = false; + int pipefd[2]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + int fd, userns_fd; + __u64 child_userns_id; + uid_t orig_uid = getuid(); + int change_count; + + close(pipefd[0]); + + /* Create new user namespace */ + userns_fd = get_userns_fd(0, orig_uid, 10); + if (userns_fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + close(pipefd[1]); + exit(1); + } + close(userns_fd); + + /* Get user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + write(pipefd[1], &child_userns_id, sizeof(child_userns_id)); + + /* Perform multiple setfsuid/setfsgid calls */ + for (change_count = 0; change_count < 50; change_count++) { + setfsuid(change_count % 10); + setfsgid(change_count % 10); + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) { + close(pipefd[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get namespace ID from child"); + } + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Verify namespace becomes inactive */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret, 0); + } + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == userns_id) { + found = true; + 
break; + } + } + + ASSERT_FALSE(found); + TH_LOG("setfsuid/setfsgid correctly preserved active references (no leak)"); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/listns_efault_test.c b/tools/testing/selftests/namespaces/listns_efault_test.c new file mode 100644 index 000000000000..c7ed4023d7a8 --- /dev/null +++ b/tools/testing/selftests/namespaces/listns_efault_test.c @@ -0,0 +1,530 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <sched.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <linux/nsfs.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/mount.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "../pidfd/pidfd.h" +#include "wrappers.h" + +/* + * Test listns() error handling with invalid buffer addresses. + * + * When the buffer pointer is invalid (e.g., crossing page boundaries + * into unmapped memory), listns() returns EINVAL. + * + * This test also creates mount namespaces that get destroyed during + * iteration, testing that namespace cleanup happens outside the RCU + * read lock. 
+ */ +TEST(listns_partial_fault_with_ns_cleanup) +{ + void *map; + __u64 *ns_ids; + ssize_t ret; + long page_size; + pid_t pid, iter_pid; + int pidfds[5]; + int sv[5][2]; + int iter_pidfd; + int i, status; + char c; + + page_size = sysconf(_SC_PAGESIZE); + ASSERT_GT(page_size, 0); + + /* + * Map two pages: + * - First page: readable and writable + * - Second page: will be unmapped to trigger EFAULT + */ + map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(map, MAP_FAILED); + + /* Unmap the second page */ + ret = munmap((char *)map + page_size, page_size); + ASSERT_EQ(ret, 0); + + /* + * Position the buffer pointer so there's room for exactly one u64 + * before the page boundary. The second u64 would fall into the + * unmapped page. + */ + ns_ids = ((__u64 *)((char *)map + page_size)) - 1; + + /* + * Create a separate process to run listns() in a loop concurrently + * with namespace creation and destruction. + */ + iter_pid = create_child(&iter_pidfd, 0); + ASSERT_NE(iter_pid, -1); + + if (iter_pid == 0) { + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, /* All types */ + .spare2 = 0, + .user_ns_id = 0, /* Global listing */ + }; + int iter_ret; + + /* + * Loop calling listns() until killed. + * The kernel should: + * 1. Successfully write the first namespace ID (within valid page) + * 2. Fail with EFAULT when trying to write the second ID (unmapped page) + * 3. Handle concurrent namespace destruction without deadlock + */ + while (1) { + iter_ret = sys_listns(&req, ns_ids, 2, 0); + + if (iter_ret == -1 && errno == ENOSYS) + _exit(PIDFD_SKIP); + } + } + + /* Small delay to let iterator start looping */ + usleep(50000); + + /* + * Create several child processes, each in its own mount namespace. + * These will be destroyed while the iterator is running listns(). 
+ */ + for (i = 0; i < 5; i++) { + /* Create socketpair for synchronization */ + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); + + pid = create_child(&pidfds[i], CLONE_NEWNS); + ASSERT_NE(pid, -1); + + if (pid == 0) { + close(sv[i][0]); /* Close parent end */ + + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) + _exit(1); + + /* Child: create a couple of tmpfs mounts */ + if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) + _exit(1); + if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) + _exit(1); + + if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) + _exit(1); + if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) + _exit(1); + + /* Signal parent that setup is complete */ + if (write_nointr(sv[i][1], "R", 1) != 1) + _exit(1); + + /* Wait for parent to signal us to exit */ + if (read_nointr(sv[i][1], &c, 1) != 1) + _exit(1); + + close(sv[i][1]); + _exit(0); + } + + close(sv[i][1]); /* Close child end */ + } + + /* Wait for all children to finish setup */ + for (i = 0; i < 5; i++) { + ret = read_nointr(sv[i][0], &c, 1); + ASSERT_EQ(ret, 1); + ASSERT_EQ(c, 'R'); + } + + /* + * Signal children to exit. This will destroy their mount namespaces + * while listns() is iterating the namespace tree. + * This tests that cleanup happens outside the RCU read lock. + */ + for (i = 0; i < 5; i++) + write_nointr(sv[i][0], "X", 1); + + /* Wait for all mount namespace children to exit and cleanup */ + for (i = 0; i < 5; i++) { + waitpid(-1, NULL, 0); + close(sv[i][0]); + close(pidfds[i]); + } + + /* Kill iterator and wait for it */ + sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); + ret = waitpid(iter_pid, &status, 0); + ASSERT_EQ(ret, iter_pid); + close(iter_pidfd); + + /* Should have been killed */ + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(WTERMSIG(status), SIGKILL); + + /* Clean up */ + munmap(map, page_size); +} + +/* + * Test listns() error handling when the entire buffer is invalid. 
+ * This is a sanity check that basic invalid pointer detection works. + */ +TEST(listns_complete_fault) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 *ns_ids; + ssize_t ret; + + /* Use a clearly invalid pointer */ + ns_ids = (__u64 *)0xdeadbeef; + + ret = sys_listns(&req, ns_ids, 10, 0); + + if (ret == -1 && errno == ENOSYS) + SKIP(return, "listns() not supported"); + + /* Should fail with EFAULT */ + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EFAULT); +} + +/* + * Test listns() error handling when the buffer is NULL. + */ +TEST(listns_null_buffer) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, + }; + ssize_t ret; + + /* NULL buffer with non-zero count should fail */ + ret = sys_listns(&req, NULL, 10, 0); + + if (ret == -1 && errno == ENOSYS) + SKIP(return, "listns() not supported"); + + /* Should fail with EFAULT */ + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EFAULT); +} + +/* + * Test listns() with a buffer that becomes invalid mid-iteration + * (after several successful writes), combined with mount namespace + * destruction to test RCU cleanup logic. + */ +TEST(listns_late_fault_with_ns_cleanup) +{ + void *map; + __u64 *ns_ids; + ssize_t ret; + long page_size; + pid_t pid, iter_pid; + int pidfds[10]; + int sv[10][2]; + int iter_pidfd; + int i, status; + char c; + + page_size = sysconf(_SC_PAGESIZE); + ASSERT_GT(page_size, 0); + + /* Map two pages */ + map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(map, MAP_FAILED); + + /* Unmap the second page */ + ret = munmap((char *)map + page_size, page_size); + ASSERT_EQ(ret, 0); + + /* + * Position buffer so we can write several u64s successfully + * before hitting the page boundary. 
+ */ + ns_ids = ((__u64 *)((char *)map + page_size)) - 5; + + /* + * Create a separate process to run listns() concurrently. + */ + iter_pid = create_child(&iter_pidfd, 0); + ASSERT_NE(iter_pid, -1); + + if (iter_pid == 0) { + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, + }; + int iter_ret; + + /* + * Loop calling listns() until killed. + * Request 10 namespace IDs while namespaces are being destroyed. + * This tests: + * 1. EFAULT handling when buffer becomes invalid + * 2. Namespace cleanup outside RCU read lock during iteration + */ + while (1) { + iter_ret = sys_listns(&req, ns_ids, 10, 0); + + if (iter_ret == -1 && errno == ENOSYS) + _exit(PIDFD_SKIP); + } + } + + /* Small delay to let iterator start looping */ + usleep(50000); + + /* + * Create more children with mount namespaces to increase the + * likelihood that namespace cleanup happens during iteration. + */ + for (i = 0; i < 10; i++) { + /* Create socketpair for synchronization */ + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); + + pid = create_child(&pidfds[i], CLONE_NEWNS); + ASSERT_NE(pid, -1); + + if (pid == 0) { + close(sv[i][0]); /* Close parent end */ + + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) + _exit(1); + + /* Child: create tmpfs mounts */ + if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) + _exit(1); + if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) + _exit(1); + + if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) + _exit(1); + if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) + _exit(1); + + /* Signal parent that setup is complete */ + if (write_nointr(sv[i][1], "R", 1) != 1) + _exit(1); + + /* Wait for parent to signal us to exit */ + if (read_nointr(sv[i][1], &c, 1) != 1) + _exit(1); + + close(sv[i][1]); + _exit(0); + } + + close(sv[i][1]); /* Close child end */ + } + + /* Wait for all children to finish setup */ + for (i = 0; i < 10; i++) 
{ + ret = read_nointr(sv[i][0], &c, 1); + ASSERT_EQ(ret, 1); + ASSERT_EQ(c, 'R'); + } + + /* Kill half the children */ + for (i = 0; i < 5; i++) + write_nointr(sv[i][0], "X", 1); + + /* Small delay to let some exit */ + usleep(10000); + + /* Kill remaining children */ + for (i = 5; i < 10; i++) + write_nointr(sv[i][0], "X", 1); + + /* Wait for all children and cleanup */ + for (i = 0; i < 10; i++) { + waitpid(-1, NULL, 0); + close(sv[i][0]); + close(pidfds[i]); + } + + /* Kill iterator and wait for it */ + sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); + ret = waitpid(iter_pid, &status, 0); + ASSERT_EQ(ret, iter_pid); + close(iter_pidfd); + + /* Should have been killed */ + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(WTERMSIG(status), SIGKILL); + + /* Clean up */ + munmap(map, page_size); +} + +/* + * Test specifically focused on mount namespace cleanup during EFAULT. + * Filter for mount namespaces only. + */ +TEST(listns_mnt_ns_cleanup_on_fault) +{ + void *map; + __u64 *ns_ids; + ssize_t ret; + long page_size; + pid_t pid, iter_pid; + int pidfds[8]; + int sv[8][2]; + int iter_pidfd; + int i, status; + char c; + + page_size = sysconf(_SC_PAGESIZE); + ASSERT_GT(page_size, 0); + + /* Set up partial fault buffer */ + map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(map, MAP_FAILED); + + ret = munmap((char *)map + page_size, page_size); + ASSERT_EQ(ret, 0); + + /* Position for 3 successful writes, then fault */ + ns_ids = ((__u64 *)((char *)map + page_size)) - 3; + + /* + * Create a separate process to run listns() concurrently. + */ + iter_pid = create_child(&iter_pidfd, 0); + ASSERT_NE(iter_pid, -1); + + if (iter_pid == 0) { + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNS, /* Only mount namespaces */ + .spare2 = 0, + .user_ns_id = 0, + }; + int iter_ret; + + /* + * Loop calling listns() until killed. 
+ * Call listns() to race with namespace destruction. + */ + while (1) { + iter_ret = sys_listns(&req, ns_ids, 10, 0); + + if (iter_ret == -1 && errno == ENOSYS) + _exit(PIDFD_SKIP); + } + } + + /* Small delay to let iterator start looping */ + usleep(50000); + + /* Create children with mount namespaces */ + for (i = 0; i < 8; i++) { + /* Create socketpair for synchronization */ + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); + + pid = create_child(&pidfds[i], CLONE_NEWNS); + ASSERT_NE(pid, -1); + + if (pid == 0) { + close(sv[i][0]); /* Close parent end */ + + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) + _exit(1); + + /* Do some mount operations to make cleanup more interesting */ + if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) + _exit(1); + if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) + _exit(1); + + if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) + _exit(1); + if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) + _exit(1); + + /* Signal parent that setup is complete */ + if (write_nointr(sv[i][1], "R", 1) != 1) + _exit(1); + + /* Wait for parent to signal us to exit */ + if (read_nointr(sv[i][1], &c, 1) != 1) + _exit(1); + + close(sv[i][1]); + _exit(0); + } + + close(sv[i][1]); /* Close child end */ + } + + /* Wait for all children to finish setup */ + for (i = 0; i < 8; i++) { + ret = read_nointr(sv[i][0], &c, 1); + ASSERT_EQ(ret, 1); + ASSERT_EQ(c, 'R'); + } + + /* Kill children to trigger namespace destruction during iteration */ + for (i = 0; i < 8; i++) + write_nointr(sv[i][0], "X", 1); + + /* Wait for children and cleanup */ + for (i = 0; i < 8; i++) { + waitpid(-1, NULL, 0); + close(sv[i][0]); + close(pidfds[i]); + } + + /* Kill iterator and wait for it */ + sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); + ret = waitpid(iter_pid, &status, 0); + ASSERT_EQ(ret, iter_pid); + close(iter_pidfd); + + /* Should have been killed */ + ASSERT_TRUE(WIFSIGNALED(status)); + 
ASSERT_EQ(WTERMSIG(status), SIGKILL); + + munmap(map, page_size); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/listns_pagination_bug.c b/tools/testing/selftests/namespaces/listns_pagination_bug.c new file mode 100644 index 000000000000..da7d33f96397 --- /dev/null +++ b/tools/testing/selftests/namespaces/listns_pagination_bug.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/socket.h> +#include <sys/wait.h> +#include <unistd.h> +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "wrappers.h" + +/* + * Minimal test case to reproduce KASAN out-of-bounds in listns pagination. + * + * The bug occurs when: + * 1. Filtering by a specific namespace type (e.g., CLONE_NEWUSER) + * 2. Using pagination (req.ns_id != 0) + * 3. The lookup_ns_id_at() call in do_listns() passes ns_type=0 instead of + * the filtered type, causing it to search the unified tree and potentially + * return a namespace of the wrong type. 
+ */ +TEST(pagination_with_type_filter) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, /* Filter by user namespace */ + .spare2 = 0, + .user_ns_id = 0, + }; + pid_t pids[10]; + int num_children = 10; + int i; + int sv[2]; + __u64 first_batch[3]; + ssize_t ret; + + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); + + /* Create children with user namespaces */ + for (i = 0; i < num_children; i++) { + pids[i] = fork(); + ASSERT_GE(pids[i], 0); + + if (pids[i] == 0) { + char c; + close(sv[0]); + + if (setup_userns() < 0) { + close(sv[1]); + exit(1); + } + + /* Signal parent we're ready */ + if (write(sv[1], &c, 1) != 1) { + close(sv[1]); + exit(1); + } + + /* Wait for parent signal to exit */ + if (read(sv[1], &c, 1) != 1) { + close(sv[1]); + exit(1); + } + + close(sv[1]); + exit(0); + } + } + + close(sv[1]); + + /* Wait for all children to signal ready */ + for (i = 0; i < num_children; i++) { + char c; + if (read(sv[0], &c, 1) != 1) { + close(sv[0]); + for (int j = 0; j < num_children; j++) + kill(pids[j], SIGKILL); + for (int j = 0; j < num_children; j++) + waitpid(pids[j], NULL, 0); + ASSERT_TRUE(false); + } + } + + /* First batch - this should work */ + ret = sys_listns(&req, first_batch, 3, 0); + if (ret < 0) { + if (errno == ENOSYS) { + close(sv[0]); + for (i = 0; i < num_children; i++) + kill(pids[i], SIGKILL); + for (i = 0; i < num_children; i++) + waitpid(pids[i], NULL, 0); + SKIP(return, "listns() not supported"); + } + ASSERT_GE(ret, 0); + } + + TH_LOG("First batch returned %zd entries", ret); + + if (ret == 3) { + __u64 second_batch[3]; + + /* Second batch - pagination triggers the bug */ + req.ns_id = first_batch[2]; /* Continue from last ID */ + ret = sys_listns(&req, second_batch, 3, 0); + + TH_LOG("Second batch returned %zd entries", ret); + ASSERT_GE(ret, 0); + } + + /* Signal all children to exit */ + for (i = 0; i < num_children; i++) { + char c = 'X'; + if (write(sv[0], &c, 1) != 
1) { + close(sv[0]); + for (int j = i; j < num_children; j++) + kill(pids[j], SIGKILL); + for (int j = 0; j < num_children; j++) + waitpid(pids[j], NULL, 0); + ASSERT_TRUE(false); + } + } + + close(sv[0]); + + /* Cleanup */ + for (i = 0; i < num_children; i++) { + int status; + waitpid(pids[i], &status, 0); + } +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/listns_permissions_test.c b/tools/testing/selftests/namespaces/listns_permissions_test.c new file mode 100644 index 000000000000..82d818751a5f --- /dev/null +++ b/tools/testing/selftests/namespaces/listns_permissions_test.c @@ -0,0 +1,759 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <linux/nsfs.h> +#include <sys/capability.h> +#include <sys/ioctl.h> +#include <sys/prctl.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "wrappers.h" + +/* + * Test that unprivileged users can only see namespaces they're currently in. + * Create a namespace, drop privileges, verify we can only see our own namespaces. 
+ */ +TEST(listns_unprivileged_current_only) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[100]; + ssize_t ret; + int pipefd[2]; + pid_t pid; + int status; + bool found_ours; + int unexpected_count; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int fd; + __u64 our_netns_id; + bool found_ours; + int unexpected_count; + + close(pipefd[0]); + + /* Create user namespace to be unprivileged */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + /* Create a network namespace */ + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + /* Get our network namespace ID */ + fd = open("/proc/self/ns/net", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &our_netns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Now we're unprivileged - list all network namespaces */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + close(pipefd[1]); + exit(1); + } + + /* We should only see our own network namespace */ + found_ours = false; + unexpected_count = 0; + + for (ssize_t i = 0; i < ret; i++) { + if (ns_ids[i] == our_netns_id) { + found_ours = true; + } else { + /* This is either init_net (which we can see) or unexpected */ + unexpected_count++; + } + } + + /* Send results to parent */ + write(pipefd[1], &found_ours, sizeof(found_ours)); + write(pipefd[1], &unexpected_count, sizeof(unexpected_count)); + close(pipefd[1]); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + + found_ours = false; + unexpected_count = 0; + read(pipefd[0], &found_ours, sizeof(found_ours)); + read(pipefd[0], &unexpected_count, sizeof(unexpected_count)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Child should have seen its own 
namespace */ + ASSERT_TRUE(found_ours); + + TH_LOG("Unprivileged child saw its own namespace, plus %d others (likely init_net)", + unexpected_count); +} + +/* + * Test that users with CAP_SYS_ADMIN in a user namespace can see + * all namespaces owned by that user namespace. + */ +TEST(listns_cap_sys_admin_in_userns) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, /* All types */ + .spare2 = 0, + .user_ns_id = 0, /* Will be set to our created user namespace */ + }; + __u64 ns_ids[100]; + int pipefd[2]; + pid_t pid; + int status; + bool success; + ssize_t count; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int fd; + __u64 userns_id; + ssize_t ret; + int min_expected; + bool success; + + close(pipefd[0]); + + /* Create user namespace - we'll have CAP_SYS_ADMIN in it */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + /* Get the user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Create several namespaces owned by this user namespace */ + unshare(CLONE_NEWNET); + unshare(CLONE_NEWUTS); + unshare(CLONE_NEWIPC); + + /* List namespaces owned by our user namespace */ + req.user_ns_id = userns_id; + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + close(pipefd[1]); + exit(1); + } + + /* + * We have CAP_SYS_ADMIN in this user namespace, + * so we should see all namespaces owned by it. + * That includes: net, uts, ipc, and the user namespace itself. 
+ */ + min_expected = 4; + success = (ret >= min_expected); + + write(pipefd[1], &success, sizeof(success)); + write(pipefd[1], &ret, sizeof(ret)); + close(pipefd[1]); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + + success = false; + count = 0; + read(pipefd[0], &success, sizeof(success)); + read(pipefd[0], &count, sizeof(count)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + ASSERT_TRUE(success); + TH_LOG("User with CAP_SYS_ADMIN saw %zd namespaces owned by their user namespace", + count); +} + +/* + * Test that users cannot see namespaces from unrelated user namespaces. + * Create two sibling user namespaces, verify they can't see each other's + * owned namespaces. + */ +TEST(listns_cannot_see_sibling_userns_namespaces) +{ + int pipefd[2]; + pid_t pid1, pid2; + int status; + __u64 netns_a_id; + int pipefd2[2]; + bool found_sibling_netns; + + ASSERT_EQ(pipe(pipefd), 0); + + /* Fork first child - creates user namespace A */ + pid1 = fork(); + ASSERT_GE(pid1, 0); + + if (pid1 == 0) { + int fd; + __u64 netns_a_id; + char buf; + + close(pipefd[0]); + + /* Create user namespace A */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + /* Create network namespace owned by user namespace A */ + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + /* Get network namespace ID */ + fd = open("/proc/self/ns/net", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &netns_a_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Send namespace ID to parent */ + write(pipefd[1], &netns_a_id, sizeof(netns_a_id)); + + /* Keep alive for sibling to check */ + read(pipefd[1], &buf, 1); + close(pipefd[1]); + exit(0); + } + + /* Parent reads namespace A ID */ + close(pipefd[1]); + netns_a_id = 0; + read(pipefd[0], &netns_a_id, sizeof(netns_a_id)); + + TH_LOG("User namespace A created network namespace 
with ID %llu", + (unsigned long long)netns_a_id); + + /* Fork second child - creates user namespace B */ + ASSERT_EQ(pipe(pipefd2), 0); + + pid2 = fork(); + ASSERT_GE(pid2, 0); + + if (pid2 == 0) { + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[100]; + ssize_t ret; + bool found_sibling_netns; + + close(pipefd[0]); + close(pipefd2[0]); + + /* Create user namespace B (sibling to A) */ + if (setup_userns() < 0) { + close(pipefd2[1]); + exit(1); + } + + /* Try to list all network namespaces */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + + found_sibling_netns = false; + if (ret > 0) { + for (ssize_t i = 0; i < ret; i++) { + if (ns_ids[i] == netns_a_id) { + found_sibling_netns = true; + break; + } + } + } + + /* We should NOT see the sibling's network namespace */ + write(pipefd2[1], &found_sibling_netns, sizeof(found_sibling_netns)); + close(pipefd2[1]); + exit(0); + } + + /* Parent reads result from second child */ + close(pipefd2[1]); + found_sibling_netns = false; + read(pipefd2[0], &found_sibling_netns, sizeof(found_sibling_netns)); + close(pipefd2[0]); + + /* Signal first child to exit */ + close(pipefd[0]); + + /* Wait for both children */ + waitpid(pid2, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + + waitpid(pid1, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + + /* Second child should NOT have seen first child's namespace */ + ASSERT_FALSE(found_sibling_netns); + TH_LOG("User namespace B correctly could not see sibling namespace A's network namespace"); +} + +/* + * Test permission checking with LISTNS_CURRENT_USER. + * Verify that listing with LISTNS_CURRENT_USER respects permissions. 
+ */ +TEST(listns_current_user_permissions) +{ + int pipefd[2]; + pid_t pid; + int status; + bool success; + ssize_t count; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = LISTNS_CURRENT_USER, + }; + __u64 ns_ids[100]; + ssize_t ret; + bool success; + + close(pipefd[0]); + + /* Create user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + /* Create some namespaces owned by this user namespace */ + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + if (unshare(CLONE_NEWUTS) < 0) { + close(pipefd[1]); + exit(1); + } + + /* List with LISTNS_CURRENT_USER - should see our owned namespaces */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + + success = (ret >= 3); /* At least user, net, uts */ + write(pipefd[1], &success, sizeof(success)); + write(pipefd[1], &ret, sizeof(ret)); + close(pipefd[1]); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + + success = false; + count = 0; + read(pipefd[0], &success, sizeof(success)); + read(pipefd[0], &count, sizeof(count)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + ASSERT_TRUE(success); + TH_LOG("LISTNS_CURRENT_USER returned %zd namespaces", count); +} + +/* + * Test that CAP_SYS_ADMIN in parent user namespace allows seeing + * child user namespace's owned namespaces. 
+ */ +TEST(listns_parent_userns_cap_sys_admin) +{ + int pipefd[2]; + pid_t pid; + int status; + bool found_child_userns; + ssize_t count; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int fd; + __u64 parent_userns_id; + __u64 child_userns_id; + struct ns_id_req req; + __u64 ns_ids[100]; + ssize_t ret; + bool found_child_userns; + + close(pipefd[0]); + + /* Create parent user namespace - we have CAP_SYS_ADMIN in it */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + /* Get parent user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &parent_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Create child user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + /* Get child user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Create namespaces owned by child user namespace */ + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + /* List namespaces owned by parent user namespace */ + req.size = sizeof(req); + req.spare = 0; + req.ns_id = 0; + req.ns_type = 0; + req.spare2 = 0; + req.user_ns_id = parent_userns_id; + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + + /* Should see child user namespace in the list */ + found_child_userns = false; + if (ret > 0) { + for (ssize_t i = 0; i < ret; i++) { + if (ns_ids[i] == child_userns_id) { + found_child_userns = true; + break; + } + } + } + + write(pipefd[1], &found_child_userns, sizeof(found_child_userns)); + write(pipefd[1], &ret, sizeof(ret)); + close(pipefd[1]); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + + found_child_userns = false; + count = 0; + read(pipefd[0], 
&found_child_userns, sizeof(found_child_userns)); + read(pipefd[0], &count, sizeof(count)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + ASSERT_TRUE(found_child_userns); + TH_LOG("Process with CAP_SYS_ADMIN in parent user namespace saw child user namespace (total: %zd)", + count); +} + +/* + * Test that we can see user namespaces we have CAP_SYS_ADMIN inside of. + * This is different from seeing namespaces owned by a user namespace. + */ +TEST(listns_cap_sys_admin_inside_userns) +{ + int pipefd[2]; + pid_t pid; + int status; + bool found_ours; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int fd; + __u64 our_userns_id; + struct ns_id_req req; + __u64 ns_ids[100]; + ssize_t ret; + bool found_ours; + + close(pipefd[0]); + + /* Create user namespace - we have CAP_SYS_ADMIN inside it */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + /* Get our user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &our_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* List all user namespaces globally */ + req.size = sizeof(req); + req.spare = 0; + req.ns_id = 0; + req.ns_type = CLONE_NEWUSER; + req.spare2 = 0; + req.user_ns_id = 0; + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + + /* We should be able to see our own user namespace */ + found_ours = false; + if (ret > 0) { + for (ssize_t i = 0; i < ret; i++) { + if (ns_ids[i] == our_userns_id) { + found_ours = true; + break; + } + } + } + + write(pipefd[1], &found_ours, sizeof(found_ours)); + close(pipefd[1]); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + + found_ours = false; + read(pipefd[0], &found_ours, sizeof(found_ours)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); 
+ + ASSERT_TRUE(found_ours); + TH_LOG("Process can see user namespace it has CAP_SYS_ADMIN inside of"); +} + +/* + * Test that dropping CAP_SYS_ADMIN restricts what we can see. + */ +TEST(listns_drop_cap_sys_admin) +{ + cap_t caps; + cap_value_t cap_list[1] = { CAP_SYS_ADMIN }; + + /* This test needs to start with CAP_SYS_ADMIN */ + caps = cap_get_proc(); + if (!caps) { + SKIP(return, "Cannot get capabilities"); + } + + cap_flag_value_t cap_val; + if (cap_get_flag(caps, CAP_SYS_ADMIN, CAP_EFFECTIVE, &cap_val) < 0) { + cap_free(caps); + SKIP(return, "Cannot check CAP_SYS_ADMIN"); + } + + if (cap_val != CAP_SET) { + cap_free(caps); + SKIP(return, "Test needs CAP_SYS_ADMIN to start"); + } + cap_free(caps); + + int pipefd[2]; + pid_t pid; + int status; + bool correct; + ssize_t count_before, count_after; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET, + .spare2 = 0, + .user_ns_id = LISTNS_CURRENT_USER, + }; + __u64 ns_ids_before[100]; + ssize_t count_before; + __u64 ns_ids_after[100]; + ssize_t count_after; + bool correct; + + close(pipefd[0]); + + /* Create user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + /* Count namespaces with CAP_SYS_ADMIN */ + count_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + + /* Drop CAP_SYS_ADMIN */ + caps = cap_get_proc(); + if (caps) { + cap_set_flag(caps, CAP_EFFECTIVE, 1, cap_list, CAP_CLEAR); + cap_set_flag(caps, CAP_PERMITTED, 1, cap_list, CAP_CLEAR); + cap_set_proc(caps); + cap_free(caps); + } + + /* Ensure we can't regain the capability */ + prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + + /* Count namespaces without CAP_SYS_ADMIN */ + count_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + + /* Without CAP_SYS_ADMIN, we should see same or fewer namespaces */ + correct = (count_after <= count_before); + + 
write(pipefd[1], &correct, sizeof(correct)); + write(pipefd[1], &count_before, sizeof(count_before)); + write(pipefd[1], &count_after, sizeof(count_after)); + close(pipefd[1]); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + + correct = false; + count_before = 0; + count_after = 0; + read(pipefd[0], &correct, sizeof(correct)); + read(pipefd[0], &count_before, sizeof(count_before)); + read(pipefd[0], &count_after, sizeof(count_after)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + ASSERT_TRUE(correct); + TH_LOG("With CAP_SYS_ADMIN: %zd namespaces, without: %zd namespaces", + count_before, count_after); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/listns_test.c b/tools/testing/selftests/namespaces/listns_test.c new file mode 100644 index 000000000000..8a95789d6a87 --- /dev/null +++ b/tools/testing/selftests/namespaces/listns_test.c @@ -0,0 +1,679 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <linux/nsfs.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "wrappers.h" + +/* + * Test basic listns() functionality with the unified namespace tree. + * List all active namespaces globally. 
+ */ +TEST(listns_basic_unified) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, /* All types */ + .spare2 = 0, + .user_ns_id = 0, /* Global listing */ + }; + __u64 ns_ids[100]; + ssize_t ret; + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(false); + } + + /* Should find at least the initial namespaces */ + ASSERT_GT(ret, 0); + TH_LOG("Found %zd active namespaces", ret); + + /* Verify all returned IDs are non-zero */ + for (ssize_t i = 0; i < ret; i++) { + ASSERT_NE(ns_ids[i], 0); + TH_LOG(" [%zd] ns_id: %llu", i, (unsigned long long)ns_ids[i]); + } +} + +/* + * Test listns() with type filtering. + * List only network namespaces. + */ +TEST(listns_filter_by_type) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET, /* Only network namespaces */ + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[100]; + ssize_t ret; + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(false); + } + ASSERT_GE(ret, 0); + + /* Should find at least init_net */ + ASSERT_GT(ret, 0); + TH_LOG("Found %zd active network namespaces", ret); + + /* Verify we can open each namespace and it's actually a network namespace */ + for (ssize_t i = 0; i < ret && i < 5; i++) { + struct nsfs_file_handle nsfh = { + .ns_id = ns_ids[i], + .ns_type = CLONE_NEWNET, + .ns_inum = 0, + }; + struct file_handle *fh; + int fd; + + fh = (struct file_handle *)malloc(sizeof(*fh) + sizeof(nsfh)); + ASSERT_NE(fh, NULL); + fh->handle_bytes = sizeof(nsfh); + fh->handle_type = 0; + memcpy(fh->f_handle, &nsfh, sizeof(nsfh)); + + fd = open_by_handle_at(-10003, fh, O_RDONLY); + 
free(fh); + + if (fd >= 0) { + int ns_type; + /* Verify it's a network namespace via ioctl */ + ns_type = ioctl(fd, NS_GET_NSTYPE); + if (ns_type >= 0) { + ASSERT_EQ(ns_type, CLONE_NEWNET); + } + close(fd); + } + } +} + +/* + * Test listns() pagination. + * List namespaces in batches. + */ +TEST(listns_pagination) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 batch1[2], batch2[2]; + ssize_t ret1, ret2; + + /* Get first batch */ + ret1 = sys_listns(&req, batch1, ARRAY_SIZE(batch1), 0); + if (ret1 < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(false); + } + ASSERT_GE(ret1, 0); + + if (ret1 == 0) + SKIP(return, "No namespaces found"); + + TH_LOG("First batch: %zd namespaces", ret1); + + /* Get second batch using last ID from first batch */ + if (ret1 == ARRAY_SIZE(batch1)) { + req.ns_id = batch1[ret1 - 1]; + ret2 = sys_listns(&req, batch2, ARRAY_SIZE(batch2), 0); + ASSERT_GE(ret2, 0); + + TH_LOG("Second batch: %zd namespaces (after ns_id=%llu)", + ret2, (unsigned long long)req.ns_id); + + /* If we got more results, verify IDs are monotonically increasing */ + if (ret2 > 0) { + ASSERT_GT(batch2[0], batch1[ret1 - 1]); + TH_LOG("Pagination working: %llu > %llu", + (unsigned long long)batch2[0], + (unsigned long long)batch1[ret1 - 1]); + } + } else { + TH_LOG("All namespaces fit in first batch"); + } +} + +/* + * Test listns() with LISTNS_CURRENT_USER. + * List namespaces owned by current user namespace. 
+ */ +TEST(listns_current_user) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = LISTNS_CURRENT_USER, + }; + __u64 ns_ids[100]; + ssize_t ret; + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(false); + } + ASSERT_GE(ret, 0); + + /* Should find at least the initial namespaces if we're in init_user_ns */ + TH_LOG("Found %zd namespaces owned by current user namespace", ret); + + for (ssize_t i = 0; i < ret; i++) + TH_LOG(" [%zd] ns_id: %llu", i, (unsigned long long)ns_ids[i]); +} + +/* + * Test that listns() only returns active namespaces. + * Create a namespace, let it become inactive, verify it's not listed. + */ +TEST(listns_only_active) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids_before[100], ns_ids_after[100]; + ssize_t ret_before, ret_after; + int pipefd[2]; + pid_t pid; + __u64 new_ns_id = 0; + int status; + + /* Get initial list */ + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + if (ret_before < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(false); + } + ASSERT_GE(ret_before, 0); + + TH_LOG("Before: %zd active network namespaces", ret_before); + + /* Create a new namespace in a child process and get its ID */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int fd; + __u64 ns_id; + + close(pipefd[0]); + + /* Create new network namespace */ + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + /* Get its ID */ + fd = open("/proc/self/ns/net", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + 
+ if (ioctl(fd, NS_GET_ID, &ns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Send ID to parent */ + write(pipefd[1], &ns_id, sizeof(ns_id)); + close(pipefd[1]); + + /* Keep namespace active briefly */ + usleep(100000); + exit(0); + } + + /* Parent reads the new namespace ID */ + { + int bytes; + + close(pipefd[1]); + bytes = read(pipefd[0], &new_ns_id, sizeof(new_ns_id)); + close(pipefd[0]); + + if (bytes == sizeof(new_ns_id)) { + __u64 ns_ids_during[100]; + int ret_during; + + TH_LOG("Child created namespace with ID %llu", (unsigned long long)new_ns_id); + + /* List namespaces while child is still alive - should see new one */ + ret_during = sys_listns(&req, ns_ids_during, ARRAY_SIZE(ns_ids_during), 0); + ASSERT_GE(ret_during, 0); + TH_LOG("During: %d active network namespaces", ret_during); + + /* Should have more namespaces than before */ + ASSERT_GE(ret_during, ret_before); + } + } + + /* Wait for child to exit */ + waitpid(pid, &status, 0); + + /* Give time for namespace to become inactive */ + usleep(100000); + + /* List namespaces after child exits - should not see new one */ + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + ASSERT_GE(ret_after, 0); + TH_LOG("After: %zd active network namespaces", ret_after); + + /* Verify the new namespace ID is not in the after list */ + if (new_ns_id != 0) { + bool found = false; + + for (ssize_t i = 0; i < ret_after; i++) { + if (ns_ids_after[i] == new_ns_id) { + found = true; + break; + } + } + ASSERT_FALSE(found); + } +} + +/* + * Test listns() with specific user namespace ID. + * Create a user namespace and list namespaces it owns. 
+ */ +TEST(listns_specific_userns) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, /* Will be filled with created userns ID */ + }; + __u64 ns_ids[100]; + int sv[2]; + pid_t pid; + int status; + __u64 user_ns_id = 0; + int bytes; + ssize_t ret; + + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int fd; + __u64 ns_id; + char buf; + + close(sv[0]); + + /* Create new user namespace */ + if (setup_userns() < 0) { + close(sv[1]); + exit(1); + } + + /* Get user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(sv[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &ns_id) < 0) { + close(fd); + close(sv[1]); + exit(1); + } + close(fd); + + /* Send ID to parent */ + if (write(sv[1], &ns_id, sizeof(ns_id)) != sizeof(ns_id)) { + close(sv[1]); + exit(1); + } + + /* Create some namespaces owned by this user namespace */ + unshare(CLONE_NEWNET); + unshare(CLONE_NEWUTS); + + /* Wait for parent signal */ + if (read(sv[1], &buf, 1) != 1) { + close(sv[1]); + exit(1); + } + close(sv[1]); + exit(0); + } + + /* Parent */ + close(sv[1]); + bytes = read(sv[0], &user_ns_id, sizeof(user_ns_id)); + + if (bytes != sizeof(user_ns_id)) { + close(sv[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get user namespace ID from child"); + } + + TH_LOG("Child created user namespace with ID %llu", (unsigned long long)user_ns_id); + + /* List namespaces owned by this user namespace */ + req.user_ns_id = user_ns_id; + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + + if (ret < 0) { + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); + close(sv[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + if (errno == ENOSYS) { + SKIP(return, "listns() not supported"); + } + ASSERT_GE(ret, 0); + } + + TH_LOG("Found %zd namespaces owned by user namespace %llu", ret, + (unsigned 
long long)user_ns_id); + + /* Should find at least the network and UTS namespaces we created */ + if (ret > 0) { + for (ssize_t i = 0; i < ret && i < 10; i++) + TH_LOG(" [%zd] ns_id: %llu", i, (unsigned long long)ns_ids[i]); + } + + /* Signal child to exit */ + if (write(sv[0], "X", 1) != 1) { + close(sv[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + ASSERT_TRUE(false); + } + close(sv[0]); + waitpid(pid, &status, 0); +} + +/* + * Test listns() with multiple namespace types filter. + */ +TEST(listns_multiple_types) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET | CLONE_NEWUTS, /* Network and UTS */ + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[100]; + ssize_t ret; + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(false); + } + ASSERT_GE(ret, 0); + + TH_LOG("Found %zd active network/UTS namespaces", ret); + + for (ssize_t i = 0; i < ret; i++) + TH_LOG(" [%zd] ns_id: %llu", i, (unsigned long long)ns_ids[i]); +} + +/* + * Test that hierarchical active reference propagation keeps parent + * user namespaces visible in listns(). 
+ */ +TEST(listns_hierarchical_visibility) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 parent_ns_id = 0, child_ns_id = 0; + int sv[2]; + pid_t pid; + int status; + int bytes; + __u64 ns_ids[100]; + ssize_t ret; + bool found_parent, found_child; + + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int fd; + char buf; + + close(sv[0]); + + /* Create parent user namespace */ + if (setup_userns() < 0) { + close(sv[1]); + exit(1); + } + + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(sv[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &parent_ns_id) < 0) { + close(fd); + close(sv[1]); + exit(1); + } + close(fd); + + /* Create child user namespace */ + if (setup_userns() < 0) { + close(sv[1]); + exit(1); + } + + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(sv[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_ns_id) < 0) { + close(fd); + close(sv[1]); + exit(1); + } + close(fd); + + /* Send both IDs to parent */ + if (write(sv[1], &parent_ns_id, sizeof(parent_ns_id)) != sizeof(parent_ns_id)) { + close(sv[1]); + exit(1); + } + if (write(sv[1], &child_ns_id, sizeof(child_ns_id)) != sizeof(child_ns_id)) { + close(sv[1]); + exit(1); + } + + /* Wait for parent signal */ + if (read(sv[1], &buf, 1) != 1) { + close(sv[1]); + exit(1); + } + close(sv[1]); + exit(0); + } + + /* Parent */ + close(sv[1]); + + /* Read both namespace IDs */ + bytes = read(sv[0], &parent_ns_id, sizeof(parent_ns_id)); + bytes += read(sv[0], &child_ns_id, sizeof(child_ns_id)); + + if (bytes != (int)(2 * sizeof(__u64))) { + close(sv[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get namespace IDs from child"); + } + + TH_LOG("Parent user namespace ID: %llu", (unsigned long long)parent_ns_id); + TH_LOG("Child user namespace ID: %llu", (unsigned long 
long)child_ns_id); + + /* List all user namespaces */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + + if (ret < 0 && errno == ENOSYS) { + close(sv[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "listns() not supported"); + } + + ASSERT_GE(ret, 0); + TH_LOG("Found %zd active user namespaces", ret); + + /* Both parent and child should be visible (active due to child process) */ + found_parent = false; + found_child = false; + for (ssize_t i = 0; i < ret; i++) { + if (ns_ids[i] == parent_ns_id) + found_parent = true; + if (ns_ids[i] == child_ns_id) + found_child = true; + } + + TH_LOG("Parent namespace %s, child namespace %s", + found_parent ? "found" : "NOT FOUND", + found_child ? "found" : "NOT FOUND"); + + ASSERT_TRUE(found_child); + /* With hierarchical propagation, parent should also be active */ + ASSERT_TRUE(found_parent); + + /* Signal child to exit */ + if (write(sv[0], "X", 1) != 1) { + close(sv[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + ASSERT_TRUE(false); + } + close(sv[0]); + waitpid(pid, &status, 0); +} + +/* + * Test error cases for listns(). 
+ */ +TEST(listns_error_cases) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[10]; + int ret; + + /* Test with invalid flags */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0xFFFF); + if (errno == ENOSYS) { + /* listns() not supported, skip this check */ + } else { + ASSERT_LT(ret, 0); + ASSERT_EQ(errno, EINVAL); + } + + /* Test with NULL ns_ids array */ + ret = sys_listns(&req, NULL, 10, 0); + ASSERT_LT(ret, 0); + + /* Test with invalid spare field */ + req.spare = 1; + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (errno == ENOSYS) { + /* listns() not supported, skip this check */ + } else { + ASSERT_LT(ret, 0); + ASSERT_EQ(errno, EINVAL); + } + req.spare = 0; + + /* Test with huge nr_ns_ids */ + ret = sys_listns(&req, ns_ids, 2000000, 0); + if (errno == ENOSYS) { + /* listns() not supported, skip this check */ + } else { + ASSERT_LT(ret, 0); + } +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/ns_active_ref_test.c b/tools/testing/selftests/namespaces/ns_active_ref_test.c new file mode 100644 index 000000000000..093268f0efaa --- /dev/null +++ b/tools/testing/selftests/namespaces/ns_active_ref_test.c @@ -0,0 +1,2672 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <linux/nsfs.h> +#include <sys/mount.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <sys/syscall.h> +#include <unistd.h> +#include <pthread.h> +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "wrappers.h" + +#ifndef FD_NSFS_ROOT +#define FD_NSFS_ROOT -10003 /* Root of the nsfs filesystem */ +#endif + +#ifndef FILEID_NSFS +#define FILEID_NSFS 0xf1 +#endif + +/* + * Test that initial namespaces can be 
reopened via file handle.
 * Initial namespaces should have active ref count of 1 from boot.
 */
TEST(init_ns_always_active)
{
	struct file_handle *fh;
	struct stat before, after;
	int mount_id;
	int nsfd, fd_via_handle;
	int ret;

	fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
	ASSERT_NE(fh, NULL);

	/* Open initial network namespace */
	nsfd = open("/proc/1/ns/net", O_RDONLY);
	ASSERT_GE(nsfd, 0);

	/* Encode the init netns as a file handle. */
	fh->handle_bytes = MAX_HANDLE_SZ;
	ret = name_to_handle_at(nsfd, "", fh, &mount_id, AT_EMPTY_PATH);
	if (ret < 0 && errno == EOPNOTSUPP) {
		SKIP(free(fh); close(nsfd);
		     return, "nsfs doesn't support file handles");
	}
	ASSERT_EQ(ret, 0);

	/* Drop our reference before re-opening via the handle. */
	close(nsfd);

	/* Try to reopen via file handle - should succeed since init ns is always active */
	fd_via_handle = open_by_handle_at(FD_NSFS_ROOT, fh, O_RDONLY);
	if (fd_via_handle < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) {
		SKIP(free(fh);
		     return, "open_by_handle_at with FD_NSFS_ROOT not supported");
	}
	ASSERT_GE(fd_via_handle, 0);

	/* Verify we opened the same namespace */
	nsfd = open("/proc/1/ns/net", O_RDONLY);
	ASSERT_GE(nsfd, 0);
	ASSERT_EQ(fstat(nsfd, &before), 0);
	ASSERT_EQ(fstat(fd_via_handle, &after), 0);
	ASSERT_EQ(before.st_ino, after.st_ino);

	close(nsfd);
	close(fd_via_handle);
	free(fh);
}

/*
 * Test namespace lifecycle: create a namespace in a child process,
 * get a file handle while it's active, then try to reopen after
 * the process exits (namespace becomes inactive).
+ */ +TEST(ns_inactive_after_exit) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int pipefd[2]; + pid_t pid; + int status; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + + /* Create pipe for passing file handle from child */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new network namespace */ + ret = unshare(CLONE_NEWNET); + if (ret < 0) { + close(pipefd[1]); + exit(1); + } + + /* Open our new namespace */ + fd = open("/proc/self/ns/net", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + /* Get file handle for the namespace */ + handle = (struct file_handle *)buf; + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH); + close(fd); + + if (ret < 0) { + close(pipefd[1]); + exit(1); + } + + /* Send handle to parent */ + write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes); + close(pipefd[1]); + + /* Exit - namespace should become inactive */ + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + /* Read file handle from child */ + ret = read(pipefd[0], buf, sizeof(buf)); + close(pipefd[0]); + + /* Wait for child to exit */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + ASSERT_GT(ret, 0); + handle = (struct file_handle *)buf; + + /* Try to reopen namespace - should fail with ENOENT since it's inactive */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(fd, 0); + /* Should fail with ENOENT (namespace inactive) or ESTALE */ + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); +} + +/* + * Test that a namespace remains active while a process is using it, + * even after the creating process exits. 
+ */ +TEST(ns_active_with_multiple_processes) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int pipefd[2]; + int syncpipe[2]; + pid_t pid1, pid2; + int status; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + char sync_byte; + + /* Create pipes for communication */ + ASSERT_EQ(pipe(pipefd), 0); + ASSERT_EQ(pipe(syncpipe), 0); + + pid1 = fork(); + ASSERT_GE(pid1, 0); + + if (pid1 == 0) { + /* First child - creates namespace */ + close(pipefd[0]); + close(syncpipe[1]); + + /* Create new network namespace */ + ret = unshare(CLONE_NEWNET); + if (ret < 0) { + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + + /* Open and get handle */ + fd = open("/proc/self/ns/net", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + + handle = (struct file_handle *)buf; + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH); + close(fd); + + if (ret < 0) { + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + + /* Send handle to parent */ + write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes); + close(pipefd[1]); + + /* Wait for signal before exiting */ + read(syncpipe[0], &sync_byte, 1); + close(syncpipe[0]); + exit(0); + } + + /* Parent reads handle */ + close(pipefd[1]); + ret = read(pipefd[0], buf, sizeof(buf)); + close(pipefd[0]); + ASSERT_GT(ret, 0); + + handle = (struct file_handle *)buf; + + /* Create second child that will keep namespace active */ + pid2 = fork(); + ASSERT_GE(pid2, 0); + + if (pid2 == 0) { + /* Second child - reopens the namespace */ + close(syncpipe[0]); + close(syncpipe[1]); + + /* Open the namespace via handle */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0) { + exit(1); + } + + /* Join the namespace */ + ret = setns(fd, CLONE_NEWNET); + close(fd); + if (ret < 0) { + exit(1); + } + + /* Sleep to keep namespace active */ + sleep(1); + exit(0); + } + + /* Let second child enter the namespace */ + 
usleep(100000); /* 100ms */ + + /* Signal first child to exit */ + close(syncpipe[0]); + sync_byte = 'X'; + write(syncpipe[1], &sync_byte, 1); + close(syncpipe[1]); + + /* Wait for first child */ + waitpid(pid1, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + + /* Namespace should still be active because second child is using it */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_GE(fd, 0); + close(fd); + + /* Wait for second child */ + waitpid(pid2, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); +} + +/* + * Test user namespace active ref tracking via credential lifecycle + */ +TEST(userns_active_ref_lifecycle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int pipefd[2]; + pid_t pid; + int status; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new user namespace */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + close(pipefd[1]); + exit(1); + } + + /* Set up uid/gid mappings */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd >= 0 && gid_map_fd >= 0 && setgroups_fd >= 0) { + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + } + + /* Get file handle */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + handle = (struct file_handle *)buf; + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH); + close(fd); + + if (ret < 0) { + close(pipefd[1]); + exit(1); + 
} + + /* Send handle to parent */ + write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes); + close(pipefd[1]); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + ret = read(pipefd[0], buf, sizeof(buf)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + ASSERT_GT(ret, 0); + handle = (struct file_handle *)buf; + + /* Namespace should be inactive after all tasks exit */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(fd, 0); + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); +} + +/* + * Test PID namespace active ref tracking + */ +TEST(pidns_active_ref_lifecycle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int pipefd[2]; + pid_t pid; + int status; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new PID namespace */ + ret = unshare(CLONE_NEWPID); + if (ret < 0) { + close(pipefd[1]); + exit(1); + } + + /* Fork to actually enter the PID namespace */ + pid_t child = fork(); + if (child < 0) { + close(pipefd[1]); + exit(1); + } + + if (child == 0) { + /* Grandchild - in new PID namespace */ + fd = open("/proc/self/ns/pid", O_RDONLY); + if (fd < 0) { + exit(1); + } + + handle = (struct file_handle *)buf; + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH); + close(fd); + + if (ret < 0) { + exit(1); + } + + /* Send handle to grandparent */ + write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes); + close(pipefd[1]); + exit(0); + } + + /* Wait for grandchild */ + waitpid(child, NULL, 0); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + ret = read(pipefd[0], buf, sizeof(buf)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + ASSERT_GT(ret, 0); + handle = 
(struct file_handle *)buf; + + /* Namespace should be inactive after all processes exit */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(fd, 0); + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); +} + +/* + * Test that an open file descriptor keeps a namespace active. + * Even after the creating process exits, the namespace should remain + * active as long as an fd is held open. + */ +TEST(ns_fd_keeps_active) +{ + struct file_handle *handle; + int mount_id; + int ret; + int nsfd; + int pipe_child_ready[2]; + int pipe_parent_ready[2]; + pid_t pid; + int status; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + char sync_byte; + char proc_path[64]; + + ASSERT_EQ(pipe(pipe_child_ready), 0); + ASSERT_EQ(pipe(pipe_parent_ready), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipe_child_ready[0]); + close(pipe_parent_ready[1]); + + TH_LOG("Child: creating new network namespace"); + + /* Create new network namespace */ + ret = unshare(CLONE_NEWNET); + if (ret < 0) { + TH_LOG("Child: unshare(CLONE_NEWNET) failed: %s", strerror(errno)); + close(pipe_child_ready[1]); + close(pipe_parent_ready[0]); + exit(1); + } + + TH_LOG("Child: network namespace created successfully"); + + /* Get file handle for the namespace */ + nsfd = open("/proc/self/ns/net", O_RDONLY); + if (nsfd < 0) { + TH_LOG("Child: failed to open /proc/self/ns/net: %s", strerror(errno)); + close(pipe_child_ready[1]); + close(pipe_parent_ready[0]); + exit(1); + } + + TH_LOG("Child: opened namespace fd %d", nsfd); + + handle = (struct file_handle *)buf; + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(nsfd, "", handle, &mount_id, AT_EMPTY_PATH); + close(nsfd); + + if (ret < 0) { + TH_LOG("Child: name_to_handle_at failed: %s", strerror(errno)); + close(pipe_child_ready[1]); + close(pipe_parent_ready[0]); + exit(1); + } + + TH_LOG("Child: got file handle (bytes=%u)", handle->handle_bytes); + + /* Send file handle to parent */ + ret = 
write(pipe_child_ready[1], buf, sizeof(*handle) + handle->handle_bytes); + TH_LOG("Child: sent %d bytes of file handle to parent", ret); + close(pipe_child_ready[1]); + + /* Wait for parent to open the fd */ + TH_LOG("Child: waiting for parent to open fd"); + ret = read(pipe_parent_ready[0], &sync_byte, 1); + close(pipe_parent_ready[0]); + + TH_LOG("Child: parent signaled (read %d bytes), exiting now", ret); + /* Exit - namespace should stay active because parent holds fd */ + exit(0); + } + + /* Parent process */ + close(pipe_child_ready[1]); + close(pipe_parent_ready[0]); + + TH_LOG("Parent: reading file handle from child"); + + /* Read file handle from child */ + ret = read(pipe_child_ready[0], buf, sizeof(buf)); + close(pipe_child_ready[0]); + ASSERT_GT(ret, 0); + handle = (struct file_handle *)buf; + + TH_LOG("Parent: received %d bytes, handle size=%u", ret, handle->handle_bytes); + + /* Open the child's namespace while it's still alive */ + snprintf(proc_path, sizeof(proc_path), "/proc/%d/ns/net", pid); + TH_LOG("Parent: opening child's namespace at %s", proc_path); + nsfd = open(proc_path, O_RDONLY); + if (nsfd < 0) { + TH_LOG("Parent: failed to open %s: %s", proc_path, strerror(errno)); + close(pipe_parent_ready[1]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open child's namespace"); + } + + TH_LOG("Parent: opened child's namespace, got fd %d", nsfd); + + /* Signal child that we have the fd */ + sync_byte = 'G'; + write(pipe_parent_ready[1], &sync_byte, 1); + close(pipe_parent_ready[1]); + TH_LOG("Parent: signaled child that we have the fd"); + + /* Wait for child to exit */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + TH_LOG("Child exited, parent holds fd %d to namespace", nsfd); + + /* + * Namespace should still be ACTIVE because we hold an fd. + * We should be able to reopen it via file handle. 
+ */ + TH_LOG("Attempting to reopen namespace via file handle (should succeed - fd held)"); + int fd2 = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_GE(fd2, 0); + + TH_LOG("Successfully reopened namespace via file handle, got fd %d", fd2); + + /* Verify it's the same namespace */ + struct stat st1, st2; + ASSERT_EQ(fstat(nsfd, &st1), 0); + ASSERT_EQ(fstat(fd2, &st2), 0); + TH_LOG("Namespace inodes: nsfd=%lu, fd2=%lu", st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_ino, st2.st_ino); + close(fd2); + + /* Now close the fd - namespace should become inactive */ + TH_LOG("Closing fd %d - namespace should become inactive", nsfd); + close(nsfd); + + /* Now reopening should fail - namespace is inactive */ + TH_LOG("Attempting to reopen namespace via file handle (should fail - inactive)"); + fd2 = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(fd2, 0); + /* Should fail with ENOENT (inactive) or ESTALE (gone) */ + TH_LOG("Reopen failed as expected: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); +} + +/* + * Test hierarchical active reference propagation. + * When a child namespace is active, its owning user namespace should also + * be active automatically due to hierarchical active reference propagation. + * This ensures parents are always reachable when children are active. 
+ */ +TEST(ns_parent_always_reachable) +{ + struct file_handle *parent_handle, *child_handle; + int ret; + int child_nsfd; + int pipefd[2]; + pid_t pid; + int status; + __u64 parent_id, child_id; + char parent_buf[sizeof(*parent_handle) + MAX_HANDLE_SZ]; + char child_buf[sizeof(*child_handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + TH_LOG("Child: creating parent user namespace and setting up mappings"); + + /* Create parent user namespace with mappings */ + ret = setup_userns(); + if (ret < 0) { + TH_LOG("Child: setup_userns() for parent failed: %s", strerror(errno)); + close(pipefd[1]); + exit(1); + } + + TH_LOG("Child: parent user namespace created, now uid=%d gid=%d", getuid(), getgid()); + + /* Get namespace ID for parent user namespace */ + int parent_fd = open("/proc/self/ns/user", O_RDONLY); + if (parent_fd < 0) { + TH_LOG("Child: failed to open parent /proc/self/ns/user: %s", strerror(errno)); + close(pipefd[1]); + exit(1); + } + + TH_LOG("Child: opened parent userns fd %d", parent_fd); + + if (ioctl(parent_fd, NS_GET_ID, &parent_id) < 0) { + TH_LOG("Child: NS_GET_ID for parent failed: %s", strerror(errno)); + close(parent_fd); + close(pipefd[1]); + exit(1); + } + close(parent_fd); + + TH_LOG("Child: got parent namespace ID %llu", (unsigned long long)parent_id); + + /* Create child user namespace within parent */ + TH_LOG("Child: creating nested child user namespace"); + ret = setup_userns(); + if (ret < 0) { + TH_LOG("Child: setup_userns() for child failed: %s", strerror(errno)); + close(pipefd[1]); + exit(1); + } + + TH_LOG("Child: nested child user namespace created, uid=%d gid=%d", getuid(), getgid()); + + /* Get namespace ID for child user namespace */ + int child_fd = open("/proc/self/ns/user", O_RDONLY); + if (child_fd < 0) { + TH_LOG("Child: failed to open child /proc/self/ns/user: %s", strerror(errno)); + close(pipefd[1]); + exit(1); + 
} + + TH_LOG("Child: opened child userns fd %d", child_fd); + + if (ioctl(child_fd, NS_GET_ID, &child_id) < 0) { + TH_LOG("Child: NS_GET_ID for child failed: %s", strerror(errno)); + close(child_fd); + close(pipefd[1]); + exit(1); + } + close(child_fd); + + TH_LOG("Child: got child namespace ID %llu", (unsigned long long)child_id); + + /* Send both namespace IDs to parent */ + TH_LOG("Child: sending both namespace IDs to parent"); + write(pipefd[1], &parent_id, sizeof(parent_id)); + write(pipefd[1], &child_id, sizeof(child_id)); + close(pipefd[1]); + + TH_LOG("Child: exiting - parent userns should become inactive"); + /* Exit - parent user namespace should become inactive */ + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + TH_LOG("Parent: reading both namespace IDs from child"); + + /* Read both namespace IDs - fixed size, no parsing needed */ + ret = read(pipefd[0], &parent_id, sizeof(parent_id)); + if (ret != sizeof(parent_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read parent namespace ID from child"); + } + + ret = read(pipefd[0], &child_id, sizeof(child_id)); + close(pipefd[0]); + if (ret != sizeof(child_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read child namespace ID from child"); + } + + TH_LOG("Parent: received parent_id=%llu, child_id=%llu", + (unsigned long long)parent_id, (unsigned long long)child_id); + + /* Construct file handles from namespace IDs */ + parent_handle = (struct file_handle *)parent_buf; + parent_handle->handle_bytes = sizeof(struct nsfs_file_handle); + parent_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *parent_fh = (struct nsfs_file_handle *)parent_handle->f_handle; + parent_fh->ns_id = parent_id; + parent_fh->ns_type = 0; + parent_fh->ns_inum = 0; + + child_handle = (struct file_handle *)child_buf; + child_handle->handle_bytes = sizeof(struct nsfs_file_handle); + child_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *child_fh = (struct 
nsfs_file_handle *)child_handle->f_handle; + child_fh->ns_id = child_id; + child_fh->ns_type = 0; + child_fh->ns_inum = 0; + + TH_LOG("Parent: opening child namespace BEFORE child exits"); + + /* Open child namespace while child is still alive to keep it active */ + child_nsfd = open_by_handle_at(FD_NSFS_ROOT, child_handle, O_RDONLY); + if (child_nsfd < 0) { + TH_LOG("Failed to open child namespace: %s (errno=%d)", strerror(errno), errno); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open child namespace"); + } + + TH_LOG("Opened child namespace fd %d", child_nsfd); + + /* Now wait for child to exit */ + TH_LOG("Parent: waiting for child to exit"); + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + TH_LOG("Child process exited, parent holds fd to child namespace"); + + /* + * With hierarchical active reference propagation: + * Since the child namespace is active (parent process holds fd), + * the parent user namespace should ALSO be active automatically. + * This is because when we took an active reference on the child, + * it propagated up to the owning user namespace. 
+ */ + TH_LOG("Attempting to reopen parent namespace (should SUCCEED - hierarchical propagation)"); + int parent_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY); + ASSERT_GE(parent_fd, 0); + + TH_LOG("SUCCESS: Parent namespace is active (fd=%d) due to active child", parent_fd); + + /* Verify we can also get parent via NS_GET_USERNS */ + TH_LOG("Verifying NS_GET_USERNS also works"); + int parent_fd2 = ioctl(child_nsfd, NS_GET_USERNS); + if (parent_fd2 < 0) { + close(parent_fd); + close(child_nsfd); + TH_LOG("NS_GET_USERNS failed: %s (errno=%d)", strerror(errno), errno); + SKIP(return, "NS_GET_USERNS not supported or failed"); + } + + TH_LOG("NS_GET_USERNS succeeded, got parent fd %d", parent_fd2); + + /* Verify both methods give us the same namespace */ + struct stat st1, st2; + ASSERT_EQ(fstat(parent_fd, &st1), 0); + ASSERT_EQ(fstat(parent_fd2, &st2), 0); + TH_LOG("Parent namespace inodes: parent_fd=%lu, parent_fd2=%lu", st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_ino, st2.st_ino); + + /* + * Close child fd - parent should remain active because we still + * hold direct references to it (parent_fd and parent_fd2). 
+ */ + TH_LOG("Closing child fd - parent should remain active (direct refs held)"); + close(child_nsfd); + + /* Parent should still be openable */ + TH_LOG("Verifying parent still active via file handle"); + int parent_fd3 = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY); + ASSERT_GE(parent_fd3, 0); + close(parent_fd3); + + TH_LOG("Closing all fds to parent namespace"); + close(parent_fd); + close(parent_fd2); + + /* Both should now be inactive */ + TH_LOG("Attempting to reopen parent (should fail - inactive, no refs)"); + parent_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY); + ASSERT_LT(parent_fd, 0); + TH_LOG("Parent inactive as expected: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); +} + +/* + * Test that bind mounts keep namespaces in the tree even when inactive + */ +TEST(ns_bind_mount_keeps_in_tree) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int pipefd[2]; + pid_t pid; + int status; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + char tmpfile[] = "/tmp/ns-test-XXXXXX"; + int tmpfd; + + /* Create temporary file for bind mount */ + tmpfd = mkstemp(tmpfile); + if (tmpfd < 0) { + SKIP(return, "Cannot create temporary file"); + } + close(tmpfd); + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Unshare mount namespace and make mounts private to avoid propagation */ + ret = unshare(CLONE_NEWNS); + if (ret < 0) { + close(pipefd[1]); + unlink(tmpfile); + exit(1); + } + ret = mount(NULL, "/", NULL, MS_PRIVATE | MS_REC, NULL); + if (ret < 0) { + close(pipefd[1]); + unlink(tmpfile); + exit(1); + } + + /* Create new network namespace */ + ret = unshare(CLONE_NEWNET); + if (ret < 0) { + close(pipefd[1]); + unlink(tmpfile); + exit(1); + } + + /* Bind mount the namespace */ + ret = mount("/proc/self/ns/net", tmpfile, NULL, MS_BIND, NULL); + if (ret < 0) { + close(pipefd[1]); + 
unlink(tmpfile); + exit(1); + } + + /* Get file handle */ + fd = open("/proc/self/ns/net", O_RDONLY); + if (fd < 0) { + umount(tmpfile); + close(pipefd[1]); + unlink(tmpfile); + exit(1); + } + + handle = (struct file_handle *)buf; + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH); + close(fd); + + if (ret < 0) { + umount(tmpfile); + close(pipefd[1]); + unlink(tmpfile); + exit(1); + } + + /* Send handle to parent */ + write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes); + close(pipefd[1]); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + ret = read(pipefd[0], buf, sizeof(buf)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + ASSERT_GT(ret, 0); + handle = (struct file_handle *)buf; + + /* + * Namespace should be inactive but still in tree due to bind mount. + * Reopening should fail with ENOENT (inactive) not ESTALE (not in tree). + */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(fd, 0); + /* Should be ENOENT (inactive) since bind mount keeps it in tree */ + if (errno != ENOENT && errno != ESTALE) { + TH_LOG("Unexpected error: %d", errno); + } + + /* Cleanup */ + umount(tmpfile); + unlink(tmpfile); +} + +/* + * Test multi-level hierarchy (3+ levels deep). + * Grandparent → Parent → Child + * When child is active, both parent AND grandparent should be active. 
+ */ +TEST(ns_multilevel_hierarchy) +{ + struct file_handle *gp_handle, *p_handle, *c_handle; + int ret, pipefd[2]; + pid_t pid; + int status; + __u64 gp_id, p_id, c_id; + char gp_buf[sizeof(*gp_handle) + MAX_HANDLE_SZ]; + char p_buf[sizeof(*p_handle) + MAX_HANDLE_SZ]; + char c_buf[sizeof(*c_handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + close(pipefd[0]); + + /* Create grandparent user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int gp_fd = open("/proc/self/ns/user", O_RDONLY); + if (gp_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(gp_fd, NS_GET_ID, &gp_id) < 0) { + close(gp_fd); + close(pipefd[1]); + exit(1); + } + close(gp_fd); + + /* Create parent user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int p_fd = open("/proc/self/ns/user", O_RDONLY); + if (p_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(p_fd, NS_GET_ID, &p_id) < 0) { + close(p_fd); + close(pipefd[1]); + exit(1); + } + close(p_fd); + + /* Create child user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int c_fd = open("/proc/self/ns/user", O_RDONLY); + if (c_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(c_fd, NS_GET_ID, &c_id) < 0) { + close(c_fd); + close(pipefd[1]); + exit(1); + } + close(c_fd); + + /* Send all three namespace IDs */ + write(pipefd[1], &gp_id, sizeof(gp_id)); + write(pipefd[1], &p_id, sizeof(p_id)); + write(pipefd[1], &c_id, sizeof(c_id)); + close(pipefd[1]); + exit(0); + } + + close(pipefd[1]); + + /* Read all three namespace IDs - fixed size, no parsing needed */ + ret = read(pipefd[0], &gp_id, sizeof(gp_id)); + if (ret != sizeof(gp_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read grandparent namespace ID from child"); + } + + ret = read(pipefd[0], &p_id, sizeof(p_id)); + if (ret != sizeof(p_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + 
SKIP(return, "Failed to read parent namespace ID from child"); + } + + ret = read(pipefd[0], &c_id, sizeof(c_id)); + close(pipefd[0]); + if (ret != sizeof(c_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read child namespace ID from child"); + } + + /* Construct file handles from namespace IDs */ + gp_handle = (struct file_handle *)gp_buf; + gp_handle->handle_bytes = sizeof(struct nsfs_file_handle); + gp_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *gp_fh = (struct nsfs_file_handle *)gp_handle->f_handle; + gp_fh->ns_id = gp_id; + gp_fh->ns_type = 0; + gp_fh->ns_inum = 0; + + p_handle = (struct file_handle *)p_buf; + p_handle->handle_bytes = sizeof(struct nsfs_file_handle); + p_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *p_fh = (struct nsfs_file_handle *)p_handle->f_handle; + p_fh->ns_id = p_id; + p_fh->ns_type = 0; + p_fh->ns_inum = 0; + + c_handle = (struct file_handle *)c_buf; + c_handle->handle_bytes = sizeof(struct nsfs_file_handle); + c_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *c_fh = (struct nsfs_file_handle *)c_handle->f_handle; + c_fh->ns_id = c_id; + c_fh->ns_type = 0; + c_fh->ns_inum = 0; + + /* Open child before process exits */ + int c_fd = open_by_handle_at(FD_NSFS_ROOT, c_handle, O_RDONLY); + if (c_fd < 0) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open child namespace"); + } + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* + * With 3-level hierarchy and child active: + * - Child is active (we hold fd) + * - Parent should be active (propagated from child) + * - Grandparent should be active (propagated from parent) + */ + TH_LOG("Testing parent active when child is active"); + int p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY); + ASSERT_GE(p_fd, 0); + + TH_LOG("Testing grandparent active when child is active"); + int gp_fd = open_by_handle_at(FD_NSFS_ROOT, gp_handle, O_RDONLY); + ASSERT_GE(gp_fd, 0); + + 
close(c_fd); + close(p_fd); + close(gp_fd); +} + +/* + * Test multiple children sharing same parent. + * Parent should stay active as long as ANY child is active. + */ +TEST(ns_multiple_children_same_parent) +{ + struct file_handle *p_handle, *c1_handle, *c2_handle; + int ret, pipefd[2]; + pid_t pid; + int status; + __u64 p_id, c1_id, c2_id; + char p_buf[sizeof(*p_handle) + MAX_HANDLE_SZ]; + char c1_buf[sizeof(*c1_handle) + MAX_HANDLE_SZ]; + char c2_buf[sizeof(*c2_handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + close(pipefd[0]); + + /* Create parent user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int p_fd = open("/proc/self/ns/user", O_RDONLY); + if (p_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(p_fd, NS_GET_ID, &p_id) < 0) { + close(p_fd); + close(pipefd[1]); + exit(1); + } + close(p_fd); + + /* Create first child user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int c1_fd = open("/proc/self/ns/user", O_RDONLY); + if (c1_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(c1_fd, NS_GET_ID, &c1_id) < 0) { + close(c1_fd); + close(pipefd[1]); + exit(1); + } + close(c1_fd); + + /* Return to parent user namespace and create second child */ + /* We can't actually do this easily, so let's create a sibling namespace + * by creating a network namespace instead */ + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + int c2_fd = open("/proc/self/ns/net", O_RDONLY); + if (c2_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(c2_fd, NS_GET_ID, &c2_id) < 0) { + close(c2_fd); + close(pipefd[1]); + exit(1); + } + close(c2_fd); + + /* Send all namespace IDs */ + write(pipefd[1], &p_id, sizeof(p_id)); + write(pipefd[1], &c1_id, sizeof(c1_id)); + write(pipefd[1], &c2_id, sizeof(c2_id)); + close(pipefd[1]); + exit(0); + } + + close(pipefd[1]); + + /* Read all three namespace IDs - fixed size, no 
parsing needed */ + ret = read(pipefd[0], &p_id, sizeof(p_id)); + if (ret != sizeof(p_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read parent namespace ID"); + } + + ret = read(pipefd[0], &c1_id, sizeof(c1_id)); + if (ret != sizeof(c1_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read first child namespace ID"); + } + + ret = read(pipefd[0], &c2_id, sizeof(c2_id)); + close(pipefd[0]); + if (ret != sizeof(c2_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read second child namespace ID"); + } + + /* Construct file handles from namespace IDs */ + p_handle = (struct file_handle *)p_buf; + p_handle->handle_bytes = sizeof(struct nsfs_file_handle); + p_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *p_fh = (struct nsfs_file_handle *)p_handle->f_handle; + p_fh->ns_id = p_id; + p_fh->ns_type = 0; + p_fh->ns_inum = 0; + + c1_handle = (struct file_handle *)c1_buf; + c1_handle->handle_bytes = sizeof(struct nsfs_file_handle); + c1_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *c1_fh = (struct nsfs_file_handle *)c1_handle->f_handle; + c1_fh->ns_id = c1_id; + c1_fh->ns_type = 0; + c1_fh->ns_inum = 0; + + c2_handle = (struct file_handle *)c2_buf; + c2_handle->handle_bytes = sizeof(struct nsfs_file_handle); + c2_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *c2_fh = (struct nsfs_file_handle *)c2_handle->f_handle; + c2_fh->ns_id = c2_id; + c2_fh->ns_type = 0; + c2_fh->ns_inum = 0; + + /* Open both children before process exits */ + int c1_fd = open_by_handle_at(FD_NSFS_ROOT, c1_handle, O_RDONLY); + int c2_fd = open_by_handle_at(FD_NSFS_ROOT, c2_handle, O_RDONLY); + + if (c1_fd < 0 || c2_fd < 0) { + if (c1_fd >= 0) close(c1_fd); + if (c2_fd >= 0) close(c2_fd); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open child namespaces"); + } + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Parent should be 
active (both children active) */ + TH_LOG("Both children active - parent should be active"); + int p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY); + ASSERT_GE(p_fd, 0); + close(p_fd); + + /* Close first child - parent should STILL be active */ + TH_LOG("Closing first child - parent should still be active"); + close(c1_fd); + p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY); + ASSERT_GE(p_fd, 0); + close(p_fd); + + /* Close second child - NOW parent should become inactive */ + TH_LOG("Closing second child - parent should become inactive"); + close(c2_fd); + p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY); + ASSERT_LT(p_fd, 0); +} + +/* + * Test that different namespace types with same owner all contribute + * active references to the owning user namespace. + */ +TEST(ns_different_types_same_owner) +{ + struct file_handle *u_handle, *n_handle, *ut_handle; + int ret, pipefd[2]; + pid_t pid; + int status; + __u64 u_id, n_id, ut_id; + char u_buf[sizeof(*u_handle) + MAX_HANDLE_SZ]; + char n_buf[sizeof(*n_handle) + MAX_HANDLE_SZ]; + char ut_buf[sizeof(*ut_handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + close(pipefd[0]); + + /* Create user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int u_fd = open("/proc/self/ns/user", O_RDONLY); + if (u_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(u_fd, NS_GET_ID, &u_id) < 0) { + close(u_fd); + close(pipefd[1]); + exit(1); + } + close(u_fd); + + /* Create network namespace (owned by user namespace) */ + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + int n_fd = open("/proc/self/ns/net", O_RDONLY); + if (n_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(n_fd, NS_GET_ID, &n_id) < 0) { + close(n_fd); + close(pipefd[1]); + exit(1); + } + close(n_fd); + + /* Create UTS namespace (also owned by user namespace) */ + if (unshare(CLONE_NEWUTS) < 0) { + 
close(pipefd[1]); + exit(1); + } + + int ut_fd = open("/proc/self/ns/uts", O_RDONLY); + if (ut_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(ut_fd, NS_GET_ID, &ut_id) < 0) { + close(ut_fd); + close(pipefd[1]); + exit(1); + } + close(ut_fd); + + /* Send all namespace IDs */ + write(pipefd[1], &u_id, sizeof(u_id)); + write(pipefd[1], &n_id, sizeof(n_id)); + write(pipefd[1], &ut_id, sizeof(ut_id)); + close(pipefd[1]); + exit(0); + } + + close(pipefd[1]); + + /* Read all three namespace IDs - fixed size, no parsing needed */ + ret = read(pipefd[0], &u_id, sizeof(u_id)); + if (ret != sizeof(u_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read user namespace ID"); + } + + ret = read(pipefd[0], &n_id, sizeof(n_id)); + if (ret != sizeof(n_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read network namespace ID"); + } + + ret = read(pipefd[0], &ut_id, sizeof(ut_id)); + close(pipefd[0]); + if (ret != sizeof(ut_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read UTS namespace ID"); + } + + /* Construct file handles from namespace IDs */ + u_handle = (struct file_handle *)u_buf; + u_handle->handle_bytes = sizeof(struct nsfs_file_handle); + u_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *u_fh = (struct nsfs_file_handle *)u_handle->f_handle; + u_fh->ns_id = u_id; + u_fh->ns_type = 0; + u_fh->ns_inum = 0; + + n_handle = (struct file_handle *)n_buf; + n_handle->handle_bytes = sizeof(struct nsfs_file_handle); + n_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *n_fh = (struct nsfs_file_handle *)n_handle->f_handle; + n_fh->ns_id = n_id; + n_fh->ns_type = 0; + n_fh->ns_inum = 0; + + ut_handle = (struct file_handle *)ut_buf; + ut_handle->handle_bytes = sizeof(struct nsfs_file_handle); + ut_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *ut_fh = (struct nsfs_file_handle *)ut_handle->f_handle; + ut_fh->ns_id = ut_id; + ut_fh->ns_type = 0; + ut_fh->ns_inum = 0; 
+ + /* Open both non-user namespaces before process exits */ + int n_fd = open_by_handle_at(FD_NSFS_ROOT, n_handle, O_RDONLY); + int ut_fd = open_by_handle_at(FD_NSFS_ROOT, ut_handle, O_RDONLY); + + if (n_fd < 0 || ut_fd < 0) { + if (n_fd >= 0) close(n_fd); + if (ut_fd >= 0) close(ut_fd); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open namespaces"); + } + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* + * Both network and UTS namespaces are active. + * User namespace should be active (gets 2 active refs). + */ + TH_LOG("Both net and uts active - user namespace should be active"); + int u_fd = open_by_handle_at(FD_NSFS_ROOT, u_handle, O_RDONLY); + ASSERT_GE(u_fd, 0); + close(u_fd); + + /* Close network namespace - user namespace should STILL be active */ + TH_LOG("Closing network ns - user ns should still be active (uts still active)"); + close(n_fd); + u_fd = open_by_handle_at(FD_NSFS_ROOT, u_handle, O_RDONLY); + ASSERT_GE(u_fd, 0); + close(u_fd); + + /* Close UTS namespace - user namespace should become inactive */ + TH_LOG("Closing uts ns - user ns should become inactive"); + close(ut_fd); + u_fd = open_by_handle_at(FD_NSFS_ROOT, u_handle, O_RDONLY); + ASSERT_LT(u_fd, 0); +} + +/* + * Test hierarchical propagation with deep namespace hierarchy. + * Create: init_user_ns -> user_A -> user_B -> net_ns + * When net_ns is active, both user_A and user_B should be active. + * This verifies the conditional recursion in __ns_ref_active_put() works. 
+ */ +TEST(ns_deep_hierarchy_propagation) +{ + struct file_handle *ua_handle, *ub_handle, *net_handle; + int ret, pipefd[2]; + pid_t pid; + int status; + __u64 ua_id, ub_id, net_id; + char ua_buf[sizeof(*ua_handle) + MAX_HANDLE_SZ]; + char ub_buf[sizeof(*ub_handle) + MAX_HANDLE_SZ]; + char net_buf[sizeof(*net_handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + close(pipefd[0]); + + /* Create user_A -> user_B -> net hierarchy */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int ua_fd = open("/proc/self/ns/user", O_RDONLY); + if (ua_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(ua_fd, NS_GET_ID, &ua_id) < 0) { + close(ua_fd); + close(pipefd[1]); + exit(1); + } + close(ua_fd); + + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int ub_fd = open("/proc/self/ns/user", O_RDONLY); + if (ub_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(ub_fd, NS_GET_ID, &ub_id) < 0) { + close(ub_fd); + close(pipefd[1]); + exit(1); + } + close(ub_fd); + + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + int net_fd = open("/proc/self/ns/net", O_RDONLY); + if (net_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(net_fd, NS_GET_ID, &net_id) < 0) { + close(net_fd); + close(pipefd[1]); + exit(1); + } + close(net_fd); + + /* Send all three namespace IDs */ + write(pipefd[1], &ua_id, sizeof(ua_id)); + write(pipefd[1], &ub_id, sizeof(ub_id)); + write(pipefd[1], &net_id, sizeof(net_id)); + close(pipefd[1]); + exit(0); + } + + close(pipefd[1]); + + /* Read all three namespace IDs - fixed size, no parsing needed */ + ret = read(pipefd[0], &ua_id, sizeof(ua_id)); + if (ret != sizeof(ua_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read user_A namespace ID"); + } + + ret = read(pipefd[0], &ub_id, sizeof(ub_id)); + if (ret != sizeof(ub_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read 
user_B namespace ID"); + } + + ret = read(pipefd[0], &net_id, sizeof(net_id)); + close(pipefd[0]); + if (ret != sizeof(net_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read network namespace ID"); + } + + /* Construct file handles from namespace IDs */ + ua_handle = (struct file_handle *)ua_buf; + ua_handle->handle_bytes = sizeof(struct nsfs_file_handle); + ua_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *ua_fh = (struct nsfs_file_handle *)ua_handle->f_handle; + ua_fh->ns_id = ua_id; + ua_fh->ns_type = 0; + ua_fh->ns_inum = 0; + + ub_handle = (struct file_handle *)ub_buf; + ub_handle->handle_bytes = sizeof(struct nsfs_file_handle); + ub_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *ub_fh = (struct nsfs_file_handle *)ub_handle->f_handle; + ub_fh->ns_id = ub_id; + ub_fh->ns_type = 0; + ub_fh->ns_inum = 0; + + net_handle = (struct file_handle *)net_buf; + net_handle->handle_bytes = sizeof(struct nsfs_file_handle); + net_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *net_fh = (struct nsfs_file_handle *)net_handle->f_handle; + net_fh->ns_id = net_id; + net_fh->ns_type = 0; + net_fh->ns_inum = 0; + + /* Open net_ns before child exits to keep it active */ + int net_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY); + if (net_fd < 0) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open network namespace"); + } + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* With net_ns active, both user_A and user_B should be active */ + TH_LOG("Testing user_B active (net_ns active causes propagation)"); + int ub_fd = open_by_handle_at(FD_NSFS_ROOT, ub_handle, O_RDONLY); + ASSERT_GE(ub_fd, 0); + + TH_LOG("Testing user_A active (propagated through user_B)"); + int ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY); + ASSERT_GE(ua_fd, 0); + + /* Close net_ns - user_B should stay active (we hold direct ref) */ + TH_LOG("Closing net_ns, user_B should 
remain active (direct ref held)"); + close(net_fd); + int ub_fd2 = open_by_handle_at(FD_NSFS_ROOT, ub_handle, O_RDONLY); + ASSERT_GE(ub_fd2, 0); + close(ub_fd2); + + /* Close user_B - user_A should stay active (we hold direct ref) */ + TH_LOG("Closing user_B, user_A should remain active (direct ref held)"); + close(ub_fd); + int ua_fd2 = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY); + ASSERT_GE(ua_fd2, 0); + close(ua_fd2); + + /* Close user_A - everything should become inactive */ + TH_LOG("Closing user_A, all should become inactive"); + close(ua_fd); + + /* All should now be inactive */ + ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY); + ASSERT_LT(ua_fd, 0); +} + +/* + * Test that parent stays active as long as ANY child is active. + * Create parent user namespace with two child net namespaces. + * Parent should remain active until BOTH children are inactive. + */ +TEST(ns_parent_multiple_children_refcount) +{ + struct file_handle *parent_handle, *net1_handle, *net2_handle; + int ret, pipefd[2], syncpipe[2]; + pid_t pid; + int status; + __u64 p_id, n1_id, n2_id; + char p_buf[sizeof(*parent_handle) + MAX_HANDLE_SZ]; + char n1_buf[sizeof(*net1_handle) + MAX_HANDLE_SZ]; + char n2_buf[sizeof(*net2_handle) + MAX_HANDLE_SZ]; + char sync_byte; + + ASSERT_EQ(pipe(pipefd), 0); + ASSERT_EQ(pipe(syncpipe), 0); + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + close(pipefd[0]); + close(syncpipe[1]); + + /* Create parent user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int p_fd = open("/proc/self/ns/user", O_RDONLY); + if (p_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(p_fd, NS_GET_ID, &p_id) < 0) { + close(p_fd); + close(pipefd[1]); + exit(1); + } + close(p_fd); + + /* Create first network namespace */ + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + + int n1_fd = open("/proc/self/ns/net", O_RDONLY); + if (n1_fd < 0) { + close(pipefd[1]); + 
close(syncpipe[0]); + exit(1); + } + if (ioctl(n1_fd, NS_GET_ID, &n1_id) < 0) { + close(n1_fd); + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + /* Keep n1_fd open so first namespace stays active */ + + /* Create second network namespace */ + if (unshare(CLONE_NEWNET) < 0) { + close(n1_fd); + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + + int n2_fd = open("/proc/self/ns/net", O_RDONLY); + if (n2_fd < 0) { + close(n1_fd); + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + if (ioctl(n2_fd, NS_GET_ID, &n2_id) < 0) { + close(n1_fd); + close(n2_fd); + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + /* Keep both n1_fd and n2_fd open */ + + /* Send all namespace IDs */ + write(pipefd[1], &p_id, sizeof(p_id)); + write(pipefd[1], &n1_id, sizeof(n1_id)); + write(pipefd[1], &n2_id, sizeof(n2_id)); + close(pipefd[1]); + + /* Wait for parent to signal before exiting */ + read(syncpipe[0], &sync_byte, 1); + close(syncpipe[0]); + exit(0); + } + + close(pipefd[1]); + close(syncpipe[0]); + + /* Read all three namespace IDs - fixed size, no parsing needed */ + ret = read(pipefd[0], &p_id, sizeof(p_id)); + if (ret != sizeof(p_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read parent namespace ID"); + } + + ret = read(pipefd[0], &n1_id, sizeof(n1_id)); + if (ret != sizeof(n1_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read first network namespace ID"); + } + + ret = read(pipefd[0], &n2_id, sizeof(n2_id)); + close(pipefd[0]); + if (ret != sizeof(n2_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read second network namespace ID"); + } + + /* Construct file handles from namespace IDs */ + parent_handle = (struct file_handle *)p_buf; + parent_handle->handle_bytes = sizeof(struct nsfs_file_handle); + parent_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *p_fh = (struct nsfs_file_handle *)parent_handle->f_handle; + p_fh->ns_id = p_id; + p_fh->ns_type = 0; + 
p_fh->ns_inum = 0; + + net1_handle = (struct file_handle *)n1_buf; + net1_handle->handle_bytes = sizeof(struct nsfs_file_handle); + net1_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *n1_fh = (struct nsfs_file_handle *)net1_handle->f_handle; + n1_fh->ns_id = n1_id; + n1_fh->ns_type = 0; + n1_fh->ns_inum = 0; + + net2_handle = (struct file_handle *)n2_buf; + net2_handle->handle_bytes = sizeof(struct nsfs_file_handle); + net2_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *n2_fh = (struct nsfs_file_handle *)net2_handle->f_handle; + n2_fh->ns_id = n2_id; + n2_fh->ns_type = 0; + n2_fh->ns_inum = 0; + + /* Open both net namespaces while child is still alive */ + int n1_fd = open_by_handle_at(FD_NSFS_ROOT, net1_handle, O_RDONLY); + int n2_fd = open_by_handle_at(FD_NSFS_ROOT, net2_handle, O_RDONLY); + if (n1_fd < 0 || n2_fd < 0) { + if (n1_fd >= 0) close(n1_fd); + if (n2_fd >= 0) close(n2_fd); + sync_byte = 'G'; + write(syncpipe[1], &sync_byte, 1); + close(syncpipe[1]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open net namespaces"); + } + + /* Signal child that we have opened the namespaces */ + sync_byte = 'G'; + write(syncpipe[1], &sync_byte, 1); + close(syncpipe[1]); + + /* Wait for child to exit */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Parent should be active (has 2 active children) */ + TH_LOG("Both net namespaces active - parent should be active"); + int p_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY); + ASSERT_GE(p_fd, 0); + close(p_fd); + + /* Close first net namespace - parent should STILL be active */ + TH_LOG("Closing first net ns - parent should still be active"); + close(n1_fd); + p_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY); + ASSERT_GE(p_fd, 0); + close(p_fd); + + /* Close second net namespace - parent should become inactive */ + TH_LOG("Closing second net ns - parent should become inactive"); + close(n2_fd); + p_fd = 
open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY); + ASSERT_LT(p_fd, 0); +} + +/* + * Test that user namespace as a child also propagates correctly. + * Create user_A -> user_B, verify when user_B is active that user_A + * is also active. This is different from non-user namespace children. + */ +TEST(ns_userns_child_propagation) +{ + struct file_handle *ua_handle, *ub_handle; + int ret, pipefd[2]; + pid_t pid; + int status; + __u64 ua_id, ub_id; + char ua_buf[sizeof(*ua_handle) + MAX_HANDLE_SZ]; + char ub_buf[sizeof(*ub_handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + close(pipefd[0]); + + /* Create user_A */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int ua_fd = open("/proc/self/ns/user", O_RDONLY); + if (ua_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(ua_fd, NS_GET_ID, &ua_id) < 0) { + close(ua_fd); + close(pipefd[1]); + exit(1); + } + close(ua_fd); + + /* Create user_B (child of user_A) */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int ub_fd = open("/proc/self/ns/user", O_RDONLY); + if (ub_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(ub_fd, NS_GET_ID, &ub_id) < 0) { + close(ub_fd); + close(pipefd[1]); + exit(1); + } + close(ub_fd); + + /* Send both namespace IDs */ + write(pipefd[1], &ua_id, sizeof(ua_id)); + write(pipefd[1], &ub_id, sizeof(ub_id)); + close(pipefd[1]); + exit(0); + } + + close(pipefd[1]); + + /* Read both namespace IDs - fixed size, no parsing needed */ + ret = read(pipefd[0], &ua_id, sizeof(ua_id)); + if (ret != sizeof(ua_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read user_A namespace ID"); + } + + ret = read(pipefd[0], &ub_id, sizeof(ub_id)); + close(pipefd[0]); + if (ret != sizeof(ub_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read user_B namespace ID"); + } + + /* Construct file handles from namespace IDs */ + ua_handle = (struct file_handle 
*)ua_buf; + ua_handle->handle_bytes = sizeof(struct nsfs_file_handle); + ua_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *ua_fh = (struct nsfs_file_handle *)ua_handle->f_handle; + ua_fh->ns_id = ua_id; + ua_fh->ns_type = 0; + ua_fh->ns_inum = 0; + + ub_handle = (struct file_handle *)ub_buf; + ub_handle->handle_bytes = sizeof(struct nsfs_file_handle); + ub_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *ub_fh = (struct nsfs_file_handle *)ub_handle->f_handle; + ub_fh->ns_id = ub_id; + ub_fh->ns_type = 0; + ub_fh->ns_inum = 0; + + /* Open user_B before child exits */ + int ub_fd = open_by_handle_at(FD_NSFS_ROOT, ub_handle, O_RDONLY); + if (ub_fd < 0) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open user_B"); + } + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* With user_B active, user_A should also be active */ + TH_LOG("Testing user_A active when child user_B is active"); + int ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY); + ASSERT_GE(ua_fd, 0); + + /* Close user_B */ + TH_LOG("Closing user_B"); + close(ub_fd); + + /* user_A should remain active (we hold direct ref) */ + int ua_fd2 = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY); + ASSERT_GE(ua_fd2, 0); + close(ua_fd2); + + /* Close user_A - should become inactive */ + TH_LOG("Closing user_A - should become inactive"); + close(ua_fd); + + ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY); + ASSERT_LT(ua_fd, 0); +} + +/* + * Test different namespace types (net, uts, ipc) all contributing + * active references to the same owning user namespace. 
+ */ +TEST(ns_mixed_types_same_owner) +{ + struct file_handle *user_handle, *net_handle, *uts_handle; + int ret, pipefd[2]; + pid_t pid; + int status; + __u64 u_id, n_id, ut_id; + char u_buf[sizeof(*user_handle) + MAX_HANDLE_SZ]; + char n_buf[sizeof(*net_handle) + MAX_HANDLE_SZ]; + char ut_buf[sizeof(*uts_handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + close(pipefd[0]); + + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int u_fd = open("/proc/self/ns/user", O_RDONLY); + if (u_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(u_fd, NS_GET_ID, &u_id) < 0) { + close(u_fd); + close(pipefd[1]); + exit(1); + } + close(u_fd); + + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + int n_fd = open("/proc/self/ns/net", O_RDONLY); + if (n_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(n_fd, NS_GET_ID, &n_id) < 0) { + close(n_fd); + close(pipefd[1]); + exit(1); + } + close(n_fd); + + if (unshare(CLONE_NEWUTS) < 0) { + close(pipefd[1]); + exit(1); + } + + int ut_fd = open("/proc/self/ns/uts", O_RDONLY); + if (ut_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(ut_fd, NS_GET_ID, &ut_id) < 0) { + close(ut_fd); + close(pipefd[1]); + exit(1); + } + close(ut_fd); + + /* Send all namespace IDs */ + write(pipefd[1], &u_id, sizeof(u_id)); + write(pipefd[1], &n_id, sizeof(n_id)); + write(pipefd[1], &ut_id, sizeof(ut_id)); + close(pipefd[1]); + exit(0); + } + + close(pipefd[1]); + + /* Read all three namespace IDs - fixed size, no parsing needed */ + ret = read(pipefd[0], &u_id, sizeof(u_id)); + if (ret != sizeof(u_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read user namespace ID"); + } + + ret = read(pipefd[0], &n_id, sizeof(n_id)); + if (ret != sizeof(n_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read network namespace ID"); + } + + ret = read(pipefd[0], &ut_id, sizeof(ut_id)); + 
close(pipefd[0]); + if (ret != sizeof(ut_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read UTS namespace ID"); + } + + /* Construct file handles from namespace IDs */ + user_handle = (struct file_handle *)u_buf; + user_handle->handle_bytes = sizeof(struct nsfs_file_handle); + user_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *u_fh = (struct nsfs_file_handle *)user_handle->f_handle; + u_fh->ns_id = u_id; + u_fh->ns_type = 0; + u_fh->ns_inum = 0; + + net_handle = (struct file_handle *)n_buf; + net_handle->handle_bytes = sizeof(struct nsfs_file_handle); + net_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *n_fh = (struct nsfs_file_handle *)net_handle->f_handle; + n_fh->ns_id = n_id; + n_fh->ns_type = 0; + n_fh->ns_inum = 0; + + uts_handle = (struct file_handle *)ut_buf; + uts_handle->handle_bytes = sizeof(struct nsfs_file_handle); + uts_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *ut_fh = (struct nsfs_file_handle *)uts_handle->f_handle; + ut_fh->ns_id = ut_id; + ut_fh->ns_type = 0; + ut_fh->ns_inum = 0; + + /* Open both non-user namespaces */ + int n_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY); + int ut_fd = open_by_handle_at(FD_NSFS_ROOT, uts_handle, O_RDONLY); + if (n_fd < 0 || ut_fd < 0) { + if (n_fd >= 0) close(n_fd); + if (ut_fd >= 0) close(ut_fd); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open namespaces"); + } + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* User namespace should be active (2 active children) */ + TH_LOG("Both net and uts active - user ns should be active"); + int u_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY); + ASSERT_GE(u_fd, 0); + close(u_fd); + + /* Close net - user ns should STILL be active (uts still active) */ + TH_LOG("Closing net - user ns should still be active"); + close(n_fd); + u_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY); + ASSERT_GE(u_fd, 0); + close(u_fd); 
+ + /* Close uts - user ns should become inactive */ + TH_LOG("Closing uts - user ns should become inactive"); + close(ut_fd); + u_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY); + ASSERT_LT(u_fd, 0); +} + +/* Thread test helpers and structures */ +struct thread_ns_info { + __u64 ns_id; + int pipefd; + int syncfd_read; + int syncfd_write; + int exit_code; +}; + +static void *thread_create_namespace(void *arg) +{ + struct thread_ns_info *info = (struct thread_ns_info *)arg; + int ret; + + /* Create new network namespace */ + ret = unshare(CLONE_NEWNET); + if (ret < 0) { + info->exit_code = 1; + return NULL; + } + + /* Get namespace ID */ + int fd = open("/proc/thread-self/ns/net", O_RDONLY); + if (fd < 0) { + info->exit_code = 2; + return NULL; + } + + ret = ioctl(fd, NS_GET_ID, &info->ns_id); + close(fd); + if (ret < 0) { + info->exit_code = 3; + return NULL; + } + + /* Send namespace ID to main thread */ + if (write(info->pipefd, &info->ns_id, sizeof(info->ns_id)) != sizeof(info->ns_id)) { + info->exit_code = 4; + return NULL; + } + + /* Wait for signal to exit */ + char sync_byte; + if (read(info->syncfd_read, &sync_byte, 1) != 1) { + info->exit_code = 5; + return NULL; + } + + info->exit_code = 0; + return NULL; +} + +/* + * Test that namespace becomes inactive after thread exits. + * This verifies active reference counting works with threads, not just processes. 
+ */ +TEST(thread_ns_inactive_after_exit) +{ + pthread_t thread; + struct thread_ns_info info; + struct file_handle *handle; + int pipefd[2]; + int syncpipe[2]; + int ret; + char sync_byte; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + ASSERT_EQ(pipe(syncpipe), 0); + + info.pipefd = pipefd[1]; + info.syncfd_read = syncpipe[0]; + info.syncfd_write = -1; + info.exit_code = -1; + + /* Create thread that will create a namespace */ + ret = pthread_create(&thread, NULL, thread_create_namespace, &info); + ASSERT_EQ(ret, 0); + + /* Read namespace ID from thread */ + __u64 ns_id; + ret = read(pipefd[0], &ns_id, sizeof(ns_id)); + if (ret != sizeof(ns_id)) { + sync_byte = 'X'; + write(syncpipe[1], &sync_byte, 1); + pthread_join(thread, NULL); + close(pipefd[0]); + close(pipefd[1]); + close(syncpipe[0]); + close(syncpipe[1]); + SKIP(return, "Failed to read namespace ID from thread"); + } + + TH_LOG("Thread created namespace with ID %llu", (unsigned long long)ns_id); + + /* Construct file handle */ + handle = (struct file_handle *)buf; + handle->handle_bytes = sizeof(struct nsfs_file_handle); + handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *fh = (struct nsfs_file_handle *)handle->f_handle; + fh->ns_id = ns_id; + fh->ns_type = 0; + fh->ns_inum = 0; + + /* Namespace should be active while thread is alive */ + TH_LOG("Attempting to open namespace while thread is alive (should succeed)"); + int nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_GE(nsfd, 0); + close(nsfd); + + /* Signal thread to exit */ + TH_LOG("Signaling thread to exit"); + sync_byte = 'X'; + ASSERT_EQ(write(syncpipe[1], &sync_byte, 1), 1); + close(syncpipe[1]); + + /* Wait for thread to exit */ + ASSERT_EQ(pthread_join(thread, NULL), 0); + close(pipefd[0]); + close(pipefd[1]); + close(syncpipe[0]); + + if (info.exit_code != 0) + SKIP(return, "Thread failed to create namespace"); + + TH_LOG("Thread exited, namespace should be inactive"); + + /* 
Namespace should now be inactive */ + nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(nsfd, 0); + /* Should fail with ENOENT (inactive) or ESTALE (gone) */ + TH_LOG("Namespace inactive as expected: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); +} + +/* + * Test that a namespace remains active while a thread holds an fd to it. + * Even after the thread exits, the namespace should remain active as long as + * another thread holds a file descriptor to it. + */ +TEST(thread_ns_fd_keeps_active) +{ + pthread_t thread; + struct thread_ns_info info; + struct file_handle *handle; + int pipefd[2]; + int syncpipe[2]; + int ret; + char sync_byte; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + ASSERT_EQ(pipe(syncpipe), 0); + + info.pipefd = pipefd[1]; + info.syncfd_read = syncpipe[0]; + info.syncfd_write = -1; + info.exit_code = -1; + + /* Create thread that will create a namespace */ + ret = pthread_create(&thread, NULL, thread_create_namespace, &info); + ASSERT_EQ(ret, 0); + + /* Read namespace ID from thread */ + __u64 ns_id; + ret = read(pipefd[0], &ns_id, sizeof(ns_id)); + if (ret != sizeof(ns_id)) { + sync_byte = 'X'; + write(syncpipe[1], &sync_byte, 1); + pthread_join(thread, NULL); + close(pipefd[0]); + close(pipefd[1]); + close(syncpipe[0]); + close(syncpipe[1]); + SKIP(return, "Failed to read namespace ID from thread"); + } + + TH_LOG("Thread created namespace with ID %llu", (unsigned long long)ns_id); + + /* Construct file handle */ + handle = (struct file_handle *)buf; + handle->handle_bytes = sizeof(struct nsfs_file_handle); + handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *fh = (struct nsfs_file_handle *)handle->f_handle; + fh->ns_id = ns_id; + fh->ns_type = 0; + fh->ns_inum = 0; + + /* Open namespace while thread is alive */ + TH_LOG("Opening namespace while thread is alive"); + int nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + 
ASSERT_GE(nsfd, 0); + + /* Signal thread to exit */ + TH_LOG("Signaling thread to exit"); + sync_byte = 'X'; + write(syncpipe[1], &sync_byte, 1); + close(syncpipe[1]); + + /* Wait for thread to exit */ + pthread_join(thread, NULL); + close(pipefd[0]); + close(pipefd[1]); + close(syncpipe[0]); + + if (info.exit_code != 0) { + close(nsfd); + SKIP(return, "Thread failed to create namespace"); + } + + TH_LOG("Thread exited, but main thread holds fd - namespace should remain active"); + + /* Namespace should still be active because we hold an fd */ + int nsfd2 = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_GE(nsfd2, 0); + + /* Verify it's the same namespace */ + struct stat st1, st2; + ASSERT_EQ(fstat(nsfd, &st1), 0); + ASSERT_EQ(fstat(nsfd2, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + close(nsfd2); + + TH_LOG("Closing fd - namespace should become inactive"); + close(nsfd); + + /* Now namespace should be inactive */ + nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(nsfd, 0); + /* Should fail with ENOENT (inactive) or ESTALE (gone) */ + TH_LOG("Namespace inactive as expected: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); +} + +/* Structure for thread data in subprocess */ +struct thread_sleep_data { + int syncfd_read; +}; + +static void *thread_sleep_and_wait(void *arg) +{ + struct thread_sleep_data *data = (struct thread_sleep_data *)arg; + char sync_byte; + + /* Wait for signal to exit - read will unblock when pipe is closed */ + (void)read(data->syncfd_read, &sync_byte, 1); + return NULL; +} + +/* + * Test that namespaces become inactive after subprocess with multiple threads exits. + * Create a subprocess that unshares user and network namespaces, then creates two + * threads that share those namespaces. Verify that after all threads and subprocess + * exit, the namespaces are no longer listed by listns() and cannot be opened by + * open_by_handle_at(). 
+ */ +TEST(thread_subprocess_ns_inactive_after_all_exit) +{ + int pipefd[2]; + int sv[2]; + pid_t pid; + int status; + __u64 user_id, net_id; + struct file_handle *user_handle, *net_handle; + char user_buf[sizeof(*user_handle) + MAX_HANDLE_SZ]; + char net_buf[sizeof(*net_handle) + MAX_HANDLE_SZ]; + char sync_byte; + int ret; + + ASSERT_EQ(pipe(pipefd), 0); + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + close(sv[0]); + + /* Create user namespace with mappings */ + if (setup_userns() < 0) { + fprintf(stderr, "Child: setup_userns() failed: %s\n", strerror(errno)); + close(pipefd[1]); + close(sv[1]); + exit(1); + } + fprintf(stderr, "Child: setup_userns() succeeded\n"); + + /* Get user namespace ID */ + int user_fd = open("/proc/self/ns/user", O_RDONLY); + if (user_fd < 0) { + fprintf(stderr, "Child: open(/proc/self/ns/user) failed: %s\n", strerror(errno)); + close(pipefd[1]); + close(sv[1]); + exit(1); + } + + if (ioctl(user_fd, NS_GET_ID, &user_id) < 0) { + fprintf(stderr, "Child: ioctl(NS_GET_ID) for user ns failed: %s\n", strerror(errno)); + close(user_fd); + close(pipefd[1]); + close(sv[1]); + exit(1); + } + close(user_fd); + fprintf(stderr, "Child: user ns ID = %llu\n", (unsigned long long)user_id); + + /* Unshare network namespace */ + if (unshare(CLONE_NEWNET) < 0) { + fprintf(stderr, "Child: unshare(CLONE_NEWNET) failed: %s\n", strerror(errno)); + close(pipefd[1]); + close(sv[1]); + exit(1); + } + fprintf(stderr, "Child: unshare(CLONE_NEWNET) succeeded\n"); + + /* Get network namespace ID */ + int net_fd = open("/proc/self/ns/net", O_RDONLY); + if (net_fd < 0) { + fprintf(stderr, "Child: open(/proc/self/ns/net) failed: %s\n", strerror(errno)); + close(pipefd[1]); + close(sv[1]); + exit(1); + } + + if (ioctl(net_fd, NS_GET_ID, &net_id) < 0) { + fprintf(stderr, "Child: ioctl(NS_GET_ID) for net ns failed: %s\n", strerror(errno)); + close(net_fd); + 
close(pipefd[1]); + close(sv[1]); + exit(1); + } + close(net_fd); + fprintf(stderr, "Child: net ns ID = %llu\n", (unsigned long long)net_id); + + /* Send namespace IDs to parent */ + if (write(pipefd[1], &user_id, sizeof(user_id)) != sizeof(user_id)) { + fprintf(stderr, "Child: write(user_id) failed: %s\n", strerror(errno)); + exit(1); + } + if (write(pipefd[1], &net_id, sizeof(net_id)) != sizeof(net_id)) { + fprintf(stderr, "Child: write(net_id) failed: %s\n", strerror(errno)); + exit(1); + } + close(pipefd[1]); + fprintf(stderr, "Child: sent namespace IDs to parent\n"); + + /* Create two threads that share the namespaces */ + pthread_t thread1, thread2; + struct thread_sleep_data data; + data.syncfd_read = sv[1]; + + int ret_thread = pthread_create(&thread1, NULL, thread_sleep_and_wait, &data); + if (ret_thread != 0) { + fprintf(stderr, "Child: pthread_create(thread1) failed: %s\n", strerror(ret_thread)); + close(sv[1]); + exit(1); + } + fprintf(stderr, "Child: created thread1\n"); + + ret_thread = pthread_create(&thread2, NULL, thread_sleep_and_wait, &data); + if (ret_thread != 0) { + fprintf(stderr, "Child: pthread_create(thread2) failed: %s\n", strerror(ret_thread)); + close(sv[1]); + pthread_cancel(thread1); + exit(1); + } + fprintf(stderr, "Child: created thread2\n"); + + /* Wait for threads to complete - they will unblock when parent writes */ + fprintf(stderr, "Child: waiting for threads to exit\n"); + pthread_join(thread1, NULL); + fprintf(stderr, "Child: thread1 exited\n"); + pthread_join(thread2, NULL); + fprintf(stderr, "Child: thread2 exited\n"); + + close(sv[1]); + + /* Exit - namespaces should become inactive */ + fprintf(stderr, "Child: all threads joined, exiting with success\n"); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + close(sv[1]); + + TH_LOG("Parent: waiting to read namespace IDs from child"); + + /* Read namespace IDs from child */ + ret = read(pipefd[0], &user_id, sizeof(user_id)); + if (ret != sizeof(user_id)) { + 
TH_LOG("Parent: failed to read user_id, ret=%d, errno=%s", ret, strerror(errno)); + close(pipefd[0]); + sync_byte = 'X'; + (void)write(sv[0], &sync_byte, 1); + close(sv[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read user namespace ID from child"); + } + + ret = read(pipefd[0], &net_id, sizeof(net_id)); + close(pipefd[0]); + if (ret != sizeof(net_id)) { + TH_LOG("Parent: failed to read net_id, ret=%d, errno=%s", ret, strerror(errno)); + sync_byte = 'X'; + (void)write(sv[0], &sync_byte, 1); + close(sv[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read network namespace ID from child"); + } + + TH_LOG("Child created user ns %llu and net ns %llu with 2 threads", + (unsigned long long)user_id, (unsigned long long)net_id); + + /* Construct file handles */ + user_handle = (struct file_handle *)user_buf; + user_handle->handle_bytes = sizeof(struct nsfs_file_handle); + user_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *user_fh = (struct nsfs_file_handle *)user_handle->f_handle; + user_fh->ns_id = user_id; + user_fh->ns_type = 0; + user_fh->ns_inum = 0; + + net_handle = (struct file_handle *)net_buf; + net_handle->handle_bytes = sizeof(struct nsfs_file_handle); + net_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *net_fh = (struct nsfs_file_handle *)net_handle->f_handle; + net_fh->ns_id = net_id; + net_fh->ns_type = 0; + net_fh->ns_inum = 0; + + /* Verify namespaces are active while subprocess and threads are alive */ + TH_LOG("Verifying namespaces are active while subprocess with threads is running"); + int user_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY); + ASSERT_GE(user_fd, 0); + + int net_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY); + ASSERT_GE(net_fd, 0); + + close(user_fd); + close(net_fd); + + /* Also verify they appear in listns() */ + TH_LOG("Verifying namespaces appear in listns() while active"); + struct ns_id_req req = { + .size = sizeof(struct ns_id_req), + .spare = 0, + .ns_id 
= 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + int nr_ids = sys_listns(&req, ns_ids, 256, 0); + if (nr_ids < 0) { + TH_LOG("listns() not available, skipping listns verification"); + } else { + /* Check if user_id is in the list */ + int found_user = 0; + for (int i = 0; i < nr_ids; i++) { + if (ns_ids[i] == user_id) { + found_user = 1; + break; + } + } + ASSERT_TRUE(found_user); + TH_LOG("User namespace found in listns() as expected"); + + /* Check network namespace */ + req.ns_type = CLONE_NEWNET; + nr_ids = sys_listns(&req, ns_ids, 256, 0); + if (nr_ids >= 0) { + int found_net = 0; + for (int i = 0; i < nr_ids; i++) { + if (ns_ids[i] == net_id) { + found_net = 1; + break; + } + } + ASSERT_TRUE(found_net); + TH_LOG("Network namespace found in listns() as expected"); + } + } + + /* Signal threads to exit */ + TH_LOG("Signaling threads to exit"); + sync_byte = 'X'; + /* Write two bytes - one for each thread */ + ASSERT_EQ(write(sv[0], &sync_byte, 1), 1); + ASSERT_EQ(write(sv[0], &sync_byte, 1), 1); + close(sv[0]); + + /* Wait for child process to exit */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + if (WEXITSTATUS(status) != 0) { + TH_LOG("Child process failed with exit code %d", WEXITSTATUS(status)); + SKIP(return, "Child process failed"); + } + + TH_LOG("Subprocess and all threads have exited successfully"); + + /* Verify namespaces are now inactive - open_by_handle_at should fail */ + TH_LOG("Verifying namespaces are inactive after subprocess and threads exit"); + user_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY); + ASSERT_LT(user_fd, 0); + TH_LOG("User namespace inactive as expected: %s (errno=%d)", + strerror(errno), errno); + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); + + net_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY); + ASSERT_LT(net_fd, 0); + TH_LOG("Network namespace inactive as expected: %s (errno=%d)", + strerror(errno), errno); + ASSERT_TRUE(errno == 
ENOENT || errno == ESTALE); + + /* Verify namespaces do NOT appear in listns() */ + TH_LOG("Verifying namespaces do NOT appear in listns() when inactive"); + memset(&req, 0, sizeof(req)); + req.size = sizeof(struct ns_id_req); + req.ns_type = CLONE_NEWUSER; + nr_ids = sys_listns(&req, ns_ids, 256, 0); + if (nr_ids >= 0) { + int found_user = 0; + for (int i = 0; i < nr_ids; i++) { + if (ns_ids[i] == user_id) { + found_user = 1; + break; + } + } + ASSERT_FALSE(found_user); + TH_LOG("User namespace correctly not listed in listns()"); + + /* Check network namespace */ + req.ns_type = CLONE_NEWNET; + nr_ids = sys_listns(&req, ns_ids, 256, 0); + if (nr_ids >= 0) { + int found_net = 0; + for (int i = 0; i < nr_ids; i++) { + if (ns_ids[i] == net_id) { + found_net = 1; + break; + } + } + ASSERT_FALSE(found_net); + TH_LOG("Network namespace correctly not listed in listns()"); + } + } +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/nsid_test.c b/tools/testing/selftests/namespaces/nsid_test.c index e28accd74a57..527ade0a8673 100644 --- a/tools/testing/selftests/namespaces/nsid_test.c +++ b/tools/testing/selftests/namespaces/nsid_test.c @@ -6,6 +6,7 @@ #include <libgen.h> #include <limits.h> #include <pthread.h> +#include <signal.h> #include <string.h> #include <sys/mount.h> #include <poll.h> @@ -14,12 +15,30 @@ #include <sys/stat.h> #include <sys/socket.h> #include <sys/un.h> +#include <sys/wait.h> #include <unistd.h> #include <linux/fs.h> #include <linux/limits.h> #include <linux/nsfs.h> #include "../kselftest_harness.h" +/* Fixture for tests that create child processes */ +FIXTURE(nsid) { + pid_t child_pid; +}; + +FIXTURE_SETUP(nsid) { + self->child_pid = 0; +} + +FIXTURE_TEARDOWN(nsid) { + /* Clean up any child process that may still be running */ + if (self->child_pid > 0) { + kill(self->child_pid, SIGKILL); + waitpid(self->child_pid, NULL, 0); + } +} + TEST(nsid_mntns_basic) { __u64 mnt_ns_id = 0; @@ -44,7 +63,7 @@ TEST(nsid_mntns_basic) 
close(fd_mntns); } -TEST(nsid_mntns_separate) +TEST_F(nsid, mntns_separate) { __u64 parent_mnt_ns_id = 0; __u64 child_mnt_ns_id = 0; @@ -90,6 +109,9 @@ TEST(nsid_mntns_separate) _exit(0); } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -99,8 +121,6 @@ TEST(nsid_mntns_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_mntns); SKIP(return, "No permission to create mount namespace"); } @@ -123,10 +143,6 @@ TEST(nsid_mntns_separate) close(fd_parent_mntns); close(fd_child_mntns); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST(nsid_cgroupns_basic) @@ -153,7 +169,7 @@ TEST(nsid_cgroupns_basic) close(fd_cgroupns); } -TEST(nsid_cgroupns_separate) +TEST_F(nsid, cgroupns_separate) { __u64 parent_cgroup_ns_id = 0; __u64 child_cgroup_ns_id = 0; @@ -199,6 +215,9 @@ TEST(nsid_cgroupns_separate) _exit(0); } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -208,8 +227,6 @@ TEST(nsid_cgroupns_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_cgroupns); SKIP(return, "No permission to create cgroup namespace"); } @@ -232,10 +249,6 @@ TEST(nsid_cgroupns_separate) close(fd_parent_cgroupns); close(fd_child_cgroupns); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST(nsid_ipcns_basic) @@ -262,7 +275,7 @@ TEST(nsid_ipcns_basic) close(fd_ipcns); } -TEST(nsid_ipcns_separate) +TEST_F(nsid, ipcns_separate) { __u64 parent_ipc_ns_id = 0; __u64 child_ipc_ns_id = 0; @@ -308,6 +321,9 @@ TEST(nsid_ipcns_separate) _exit(0); } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -317,8 +333,6 @@ TEST(nsid_ipcns_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, 
SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_ipcns); SKIP(return, "No permission to create IPC namespace"); } @@ -341,10 +355,6 @@ TEST(nsid_ipcns_separate) close(fd_parent_ipcns); close(fd_child_ipcns); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST(nsid_utsns_basic) @@ -371,7 +381,7 @@ TEST(nsid_utsns_basic) close(fd_utsns); } -TEST(nsid_utsns_separate) +TEST_F(nsid, utsns_separate) { __u64 parent_uts_ns_id = 0; __u64 child_uts_ns_id = 0; @@ -417,6 +427,9 @@ TEST(nsid_utsns_separate) _exit(0); } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -426,8 +439,6 @@ TEST(nsid_utsns_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_utsns); SKIP(return, "No permission to create UTS namespace"); } @@ -450,10 +461,6 @@ TEST(nsid_utsns_separate) close(fd_parent_utsns); close(fd_child_utsns); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST(nsid_userns_basic) @@ -480,7 +487,7 @@ TEST(nsid_userns_basic) close(fd_userns); } -TEST(nsid_userns_separate) +TEST_F(nsid, userns_separate) { __u64 parent_user_ns_id = 0; __u64 child_user_ns_id = 0; @@ -526,6 +533,9 @@ TEST(nsid_userns_separate) _exit(0); } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -535,8 +545,6 @@ TEST(nsid_userns_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_userns); SKIP(return, "No permission to create user namespace"); } @@ -559,10 +567,6 @@ TEST(nsid_userns_separate) close(fd_parent_userns); close(fd_child_userns); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST(nsid_timens_basic) @@ -591,7 +595,7 @@ TEST(nsid_timens_basic) close(fd_timens); } -TEST(nsid_timens_separate) +TEST_F(nsid, timens_separate) { 
__u64 parent_time_ns_id = 0; __u64 child_time_ns_id = 0; @@ -652,6 +656,9 @@ TEST(nsid_timens_separate) } } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -660,8 +667,6 @@ TEST(nsid_timens_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_timens); close(pipefd[0]); SKIP(return, "Cannot create time namespace"); @@ -689,10 +694,6 @@ TEST(nsid_timens_separate) close(fd_parent_timens); close(fd_child_timens); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST(nsid_pidns_basic) @@ -719,7 +720,7 @@ TEST(nsid_pidns_basic) close(fd_pidns); } -TEST(nsid_pidns_separate) +TEST_F(nsid, pidns_separate) { __u64 parent_pid_ns_id = 0; __u64 child_pid_ns_id = 0; @@ -776,6 +777,9 @@ TEST(nsid_pidns_separate) } } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -784,8 +788,6 @@ TEST(nsid_pidns_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_pidns); close(pipefd[0]); SKIP(return, "No permission to create PID namespace"); @@ -813,10 +815,6 @@ TEST(nsid_pidns_separate) close(fd_parent_pidns); close(fd_child_pidns); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST(nsid_netns_basic) @@ -860,7 +858,7 @@ TEST(nsid_netns_basic) close(fd_netns); } -TEST(nsid_netns_separate) +TEST_F(nsid, netns_separate) { __u64 parent_net_ns_id = 0; __u64 parent_netns_cookie = 0; @@ -920,6 +918,9 @@ TEST(nsid_netns_separate) _exit(0); } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -929,8 +930,6 @@ TEST(nsid_netns_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_netns); close(parent_sock); SKIP(return, "No 
permission to create network namespace"); @@ -977,10 +976,6 @@ TEST(nsid_netns_separate) close(fd_parent_netns); close(fd_child_netns); close(parent_sock); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c b/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c new file mode 100644 index 000000000000..753fd29dffd8 --- /dev/null +++ b/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <sched.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/socket.h> +#include <unistd.h> +#include "../pidfd/pidfd.h" +#include "../kselftest_harness.h" + +/* + * Regression tests for the setns(pidfd) active reference counting bug. + * + * These tests are based on the reproducers that triggered the race condition + * fixed by commit 1c465d0518dc ("ns: handle setns(pidfd, ...) cleanly"). + * + * The bug: When using setns() with a pidfd, if the target task exits between + * prepare_nsset() and commit_nsset(), the namespaces would become inactive. + * Then ns_ref_active_get() would increment from 0 without properly resurrecting + * the owner chain, causing active reference count underflows. + */ + +/* + * Simple pidfd setns test using create_child()+unshare(). + * + * Without the fix, this would trigger active refcount warnings when the + * parent exits after doing setns(pidfd) on a child that has already exited. 
+ */ +TEST(simple_pidfd_setns) +{ + pid_t child_pid; + int pidfd = -1; + int ret; + int sv[2]; + char c; + + /* Ignore SIGCHLD for autoreap */ + ASSERT_NE(signal(SIGCHLD, SIG_IGN), SIG_ERR); + + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); + + /* Create a child process without namespaces initially */ + child_pid = create_child(&pidfd, 0); + ASSERT_GE(child_pid, 0); + + if (child_pid == 0) { + close(sv[0]); + + if (unshare(CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET | CLONE_NEWUSER) < 0) { + close(sv[1]); + _exit(1); + } + + /* Signal parent that namespaces are ready */ + if (write_nointr(sv[1], "1", 1) < 0) { + close(sv[1]); + _exit(1); + } + + close(sv[1]); + _exit(0); + } + ASSERT_GE(pidfd, 0); + EXPECT_EQ(close(sv[1]), 0); + + ret = read_nointr(sv[0], &c, 1); + ASSERT_EQ(ret, 1); + EXPECT_EQ(close(sv[0]), 0); + + /* Set to child's namespaces via pidfd */ + ret = setns(pidfd, CLONE_NEWUTS | CLONE_NEWIPC); + TH_LOG("setns() returned %d", ret); + close(pidfd); +} + +/* + * Simple pidfd setns test using create_child(). + * + * This variation uses create_child() with namespace flags directly. + * Namespaces are created immediately at clone time. 
+ */ +TEST(simple_pidfd_setns_clone) +{ + pid_t child_pid; + int pidfd = -1; + int ret; + + /* Ignore SIGCHLD for autoreap */ + ASSERT_NE(signal(SIGCHLD, SIG_IGN), SIG_ERR); + + /* Create a child process with new namespaces using create_child() */ + child_pid = create_child(&pidfd, CLONE_NEWUSER | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET); + ASSERT_GE(child_pid, 0); + + if (child_pid == 0) { + /* Child: sleep for a while so parent can setns to us */ + sleep(2); + _exit(0); + } + + /* Parent: pidfd was already created by create_child() */ + ASSERT_GE(pidfd, 0); + + /* Set to child's namespaces via pidfd */ + ret = setns(pidfd, CLONE_NEWUTS | CLONE_NEWIPC); + close(pidfd); + TH_LOG("setns() returned %d", ret); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/siocgskns_test.c b/tools/testing/selftests/namespaces/siocgskns_test.c new file mode 100644 index 000000000000..ba689a22d82f --- /dev/null +++ b/tools/testing/selftests/namespaces/siocgskns_test.c @@ -0,0 +1,1824 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> +#include <linux/if.h> +#include <linux/sockios.h> +#include <linux/nsfs.h> +#include <arpa/inet.h> +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "wrappers.h" + +#ifndef SIOCGSKNS +#define SIOCGSKNS 0x894C +#endif + +#ifndef FD_NSFS_ROOT +#define FD_NSFS_ROOT -10003 +#endif + +#ifndef FILEID_NSFS +#define FILEID_NSFS 0xf1 +#endif + +/* + * Test basic SIOCGSKNS functionality. + * Create a socket and verify SIOCGSKNS returns the correct network namespace. 
+ */ +TEST(siocgskns_basic) +{ + int sock_fd, netns_fd, current_netns_fd; + struct stat st1, st2; + + /* Create a TCP socket */ + sock_fd = socket(AF_INET, SOCK_STREAM, 0); + ASSERT_GE(sock_fd, 0); + + /* Use SIOCGSKNS to get network namespace */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + close(sock_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + + /* Get current network namespace */ + current_netns_fd = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(current_netns_fd, 0); + + /* Verify they match */ + ASSERT_EQ(fstat(netns_fd, &st1), 0); + ASSERT_EQ(fstat(current_netns_fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + + close(sock_fd); + close(netns_fd); + close(current_netns_fd); +} + +/* + * Test that socket file descriptors keep network namespaces active. + * Create a network namespace, create a socket in it, then exit the namespace. + * The namespace should remain active while the socket FD is held. 
+ */ +TEST(siocgskns_keeps_netns_active) +{ + int sock_fd, netns_fd, test_fd; + int ipc_sockets[2]; + pid_t pid; + int status; + struct stat st; + + EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child: create new netns and socket */ + close(ipc_sockets[0]); + + if (unshare(CLONE_NEWNET) < 0) { + TH_LOG("unshare(CLONE_NEWNET) failed: %s", strerror(errno)); + close(ipc_sockets[1]); + exit(1); + } + + /* Create a socket in the new network namespace */ + sock_fd = socket(AF_INET, SOCK_DGRAM, 0); + if (sock_fd < 0) { + TH_LOG("socket() failed: %s", strerror(errno)); + close(ipc_sockets[1]); + exit(1); + } + + /* Send socket FD to parent via SCM_RIGHTS */ + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1] = {'X'}; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int)); + + if (sendmsg(ipc_sockets[1], &msg, 0) < 0) { + close(sock_fd); + close(ipc_sockets[1]); + exit(1); + } + + close(sock_fd); + close(ipc_sockets[1]); + exit(0); + } + + /* Parent: receive socket FD */ + close(ipc_sockets[1]); + + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1]; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + ssize_t n = recvmsg(ipc_sockets[0], &msg, 0); + close(ipc_sockets[0]); + ASSERT_EQ(n, 1); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + ASSERT_NE(cmsg, NULL); + ASSERT_EQ(cmsg->cmsg_type, SCM_RIGHTS); + + memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int)); + + 
/* Wait for child to exit */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Get network namespace from socket */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + close(sock_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + + ASSERT_EQ(fstat(netns_fd, &st), 0); + + /* + * Namespace should still be active because socket FD keeps it alive. + * Try to access it via /proc/self/fd/<fd>. + */ + char path[64]; + snprintf(path, sizeof(path), "/proc/self/fd/%d", netns_fd); + test_fd = open(path, O_RDONLY); + ASSERT_GE(test_fd, 0); + close(test_fd); + close(netns_fd); + + /* Close socket - namespace should become inactive */ + close(sock_fd); + + /* Try SIOCGSKNS again - should fail since socket is closed */ + ASSERT_LT(ioctl(sock_fd, SIOCGSKNS), 0); +} + +/* + * Test SIOCGSKNS with different socket types (TCP, UDP, RAW). + */ +TEST(siocgskns_socket_types) +{ + int sock_tcp, sock_udp, sock_raw; + int netns_tcp, netns_udp, netns_raw; + struct stat st_tcp, st_udp, st_raw; + + /* TCP socket */ + sock_tcp = socket(AF_INET, SOCK_STREAM, 0); + ASSERT_GE(sock_tcp, 0); + + /* UDP socket */ + sock_udp = socket(AF_INET, SOCK_DGRAM, 0); + ASSERT_GE(sock_udp, 0); + + /* RAW socket (may require privileges) */ + sock_raw = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP); + if (sock_raw < 0 && (errno == EPERM || errno == EACCES)) { + sock_raw = -1; /* Skip raw socket test */ + } + + /* Test SIOCGSKNS on TCP */ + netns_tcp = ioctl(sock_tcp, SIOCGSKNS); + if (netns_tcp < 0) { + close(sock_tcp); + close(sock_udp); + if (sock_raw >= 0) close(sock_raw); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_tcp, 0); + } + + /* Test SIOCGSKNS on UDP */ + netns_udp = ioctl(sock_udp, SIOCGSKNS); + ASSERT_GE(netns_udp, 0); + + /* Test SIOCGSKNS on RAW (if available) */ + if (sock_raw >= 0) { + netns_raw = 
ioctl(sock_raw, SIOCGSKNS); + ASSERT_GE(netns_raw, 0); + } + + /* Verify all return the same network namespace */ + ASSERT_EQ(fstat(netns_tcp, &st_tcp), 0); + ASSERT_EQ(fstat(netns_udp, &st_udp), 0); + ASSERT_EQ(st_tcp.st_ino, st_udp.st_ino); + + if (sock_raw >= 0) { + ASSERT_EQ(fstat(netns_raw, &st_raw), 0); + ASSERT_EQ(st_tcp.st_ino, st_raw.st_ino); + close(netns_raw); + close(sock_raw); + } + + close(netns_tcp); + close(netns_udp); + close(sock_tcp); + close(sock_udp); +} + +/* + * Test SIOCGSKNS across setns. + * Create a socket in netns A, switch to netns B, verify SIOCGSKNS still + * returns netns A. + */ +TEST(siocgskns_across_setns) +{ + int sock_fd, netns_a_fd, netns_b_fd, result_fd; + struct stat st_a; + + /* Get current netns (A) */ + netns_a_fd = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(netns_a_fd, 0); + ASSERT_EQ(fstat(netns_a_fd, &st_a), 0); + + /* Create socket in netns A */ + sock_fd = socket(AF_INET, SOCK_STREAM, 0); + ASSERT_GE(sock_fd, 0); + + /* Create new netns (B) */ + ASSERT_EQ(unshare(CLONE_NEWNET), 0); + + netns_b_fd = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(netns_b_fd, 0); + + /* Get netns from socket created in A */ + result_fd = ioctl(sock_fd, SIOCGSKNS); + if (result_fd < 0) { + close(sock_fd); + setns(netns_a_fd, CLONE_NEWNET); + close(netns_a_fd); + close(netns_b_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(result_fd, 0); + } + + /* Verify it still points to netns A */ + struct stat st_result_stat; + ASSERT_EQ(fstat(result_fd, &st_result_stat), 0); + ASSERT_EQ(st_a.st_ino, st_result_stat.st_ino); + + close(result_fd); + close(sock_fd); + close(netns_b_fd); + + /* Restore original netns */ + ASSERT_EQ(setns(netns_a_fd, CLONE_NEWNET), 0); + close(netns_a_fd); +} + +/* + * Test SIOCGSKNS fails on non-socket file descriptors. 
+ */ +TEST(siocgskns_non_socket) +{ + int fd; + int pipefd[2]; + + /* Test on regular file */ + fd = open("/dev/null", O_RDONLY); + ASSERT_GE(fd, 0); + + ASSERT_LT(ioctl(fd, SIOCGSKNS), 0); + ASSERT_TRUE(errno == ENOTTY || errno == EINVAL); + close(fd); + + /* Test on pipe */ + ASSERT_EQ(pipe(pipefd), 0); + + ASSERT_LT(ioctl(pipefd[0], SIOCGSKNS), 0); + ASSERT_TRUE(errno == ENOTTY || errno == EINVAL); + + close(pipefd[0]); + close(pipefd[1]); +} + +/* + * Test multiple sockets keep the same network namespace active. + * Create multiple sockets, verify closing some doesn't affect others. + */ +TEST(siocgskns_multiple_sockets) +{ + int socks[5]; + int netns_fds[5]; + int i; + struct stat st; + ino_t netns_ino; + + /* Create new network namespace */ + ASSERT_EQ(unshare(CLONE_NEWNET), 0); + + /* Create multiple sockets */ + for (i = 0; i < 5; i++) { + socks[i] = socket(AF_INET, SOCK_STREAM, 0); + ASSERT_GE(socks[i], 0); + } + + /* Get netns from all sockets */ + for (i = 0; i < 5; i++) { + netns_fds[i] = ioctl(socks[i], SIOCGSKNS); + if (netns_fds[i] < 0) { + int j; + for (j = 0; j <= i; j++) { + close(socks[j]); + if (j < i && netns_fds[j] >= 0) + close(netns_fds[j]); + } + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fds[i], 0); + } + } + + /* Verify all point to same netns */ + ASSERT_EQ(fstat(netns_fds[0], &st), 0); + netns_ino = st.st_ino; + + for (i = 1; i < 5; i++) { + ASSERT_EQ(fstat(netns_fds[i], &st), 0); + ASSERT_EQ(st.st_ino, netns_ino); + } + + /* Close some sockets */ + for (i = 0; i < 3; i++) { + close(socks[i]); + } + + /* Remaining netns FDs should still be valid */ + for (i = 3; i < 5; i++) { + char path[64]; + snprintf(path, sizeof(path), "/proc/self/fd/%d", netns_fds[i]); + int test_fd = open(path, O_RDONLY); + ASSERT_GE(test_fd, 0); + close(test_fd); + } + + /* Cleanup */ + for (i = 0; i < 5; i++) { + if (i >= 3) + close(socks[i]); + close(netns_fds[i]); + } +} + +/* + * Test socket keeps 
netns active after creating process exits. + * Verify that as long as the socket FD exists, the namespace remains active. + */ +TEST(siocgskns_netns_lifecycle) +{ + int sock_fd, netns_fd; + int ipc_sockets[2]; + int syncpipe[2]; + pid_t pid; + int status; + char sync_byte; + struct stat st; + ino_t netns_ino; + + EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); + + ASSERT_EQ(pipe(syncpipe), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child */ + close(ipc_sockets[0]); + close(syncpipe[1]); + + if (unshare(CLONE_NEWNET) < 0) { + close(ipc_sockets[1]); + close(syncpipe[0]); + exit(1); + } + + sock_fd = socket(AF_INET, SOCK_STREAM, 0); + if (sock_fd < 0) { + close(ipc_sockets[1]); + close(syncpipe[0]); + exit(1); + } + + /* Send socket to parent */ + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1] = {'X'}; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int)); + + if (sendmsg(ipc_sockets[1], &msg, 0) < 0) { + close(sock_fd); + close(ipc_sockets[1]); + close(syncpipe[0]); + exit(1); + } + + close(sock_fd); + close(ipc_sockets[1]); + + /* Wait for parent signal */ + read(syncpipe[0], &sync_byte, 1); + close(syncpipe[0]); + exit(0); + } + + /* Parent */ + close(ipc_sockets[1]); + close(syncpipe[0]); + + /* Receive socket FD */ + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1]; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + ssize_t n = recvmsg(ipc_sockets[0], &msg, 0); + 
close(ipc_sockets[0]); + ASSERT_EQ(n, 1); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + ASSERT_NE(cmsg, NULL); + memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int)); + + /* Get netns from socket while child is alive */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + sync_byte = 'G'; + write(syncpipe[1], &sync_byte, 1); + close(syncpipe[1]); + close(sock_fd); + waitpid(pid, NULL, 0); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + ASSERT_EQ(fstat(netns_fd, &st), 0); + netns_ino = st.st_ino; + + /* Signal child to exit */ + sync_byte = 'G'; + write(syncpipe[1], &sync_byte, 1); + close(syncpipe[1]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + + /* + * Socket FD should still keep namespace active even after + * the creating process exited. + */ + int test_fd = ioctl(sock_fd, SIOCGSKNS); + ASSERT_GE(test_fd, 0); + + struct stat st_test; + ASSERT_EQ(fstat(test_fd, &st_test), 0); + ASSERT_EQ(st_test.st_ino, netns_ino); + + close(test_fd); + close(netns_fd); + + /* Close socket - namespace should become inactive */ + close(sock_fd); +} + +/* + * Test IPv6 sockets also work with SIOCGSKNS. 
+ */ +TEST(siocgskns_ipv6) +{ + int sock_fd, netns_fd, current_netns_fd; + struct stat st1, st2; + + /* Create an IPv6 TCP socket */ + sock_fd = socket(AF_INET6, SOCK_STREAM, 0); + ASSERT_GE(sock_fd, 0); + + /* Use SIOCGSKNS */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + close(sock_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + + /* Verify it matches current namespace */ + current_netns_fd = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(current_netns_fd, 0); + + ASSERT_EQ(fstat(netns_fd, &st1), 0); + ASSERT_EQ(fstat(current_netns_fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + + close(sock_fd); + close(netns_fd); + close(current_netns_fd); +} + +/* + * Test that socket-kept netns appears in listns() output. + * Verify that a network namespace kept alive by a socket FD appears in + * listns() output even after the creating process exits, and that it + * disappears when the socket is closed. 
+ */ +TEST(siocgskns_listns_visibility) +{ + int sock_fd, netns_fd, owner_fd; + int ipc_sockets[2]; + pid_t pid; + int status; + __u64 netns_id, owner_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + int ret, i; + bool found_netns = false; + + EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child: create new netns and socket */ + close(ipc_sockets[0]); + + if (unshare(CLONE_NEWNET) < 0) { + close(ipc_sockets[1]); + exit(1); + } + + sock_fd = socket(AF_INET, SOCK_DGRAM, 0); + if (sock_fd < 0) { + close(ipc_sockets[1]); + exit(1); + } + + /* Send socket FD to parent via SCM_RIGHTS */ + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1] = {'X'}; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int)); + + if (sendmsg(ipc_sockets[1], &msg, 0) < 0) { + close(sock_fd); + close(ipc_sockets[1]); + exit(1); + } + + close(sock_fd); + close(ipc_sockets[1]); + exit(0); + } + + /* Parent: receive socket FD */ + close(ipc_sockets[1]); + + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1]; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + ssize_t n = recvmsg(ipc_sockets[0], &msg, 0); + close(ipc_sockets[0]); + ASSERT_EQ(n, 1); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + ASSERT_NE(cmsg, NULL); + memcpy(&sock_fd, CMSG_DATA(cmsg), 
sizeof(int)); + + /* Wait for child to exit */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Get network namespace from socket */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + close(sock_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + + /* Get namespace ID */ + ret = ioctl(netns_fd, NS_GET_ID, &netns_id); + if (ret < 0) { + close(sock_fd); + close(netns_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "NS_GET_ID not supported"); + ASSERT_EQ(ret, 0); + } + + /* Get owner user namespace */ + owner_fd = ioctl(netns_fd, NS_GET_USERNS); + if (owner_fd < 0) { + close(sock_fd); + close(netns_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "NS_GET_USERNS not supported"); + ASSERT_GE(owner_fd, 0); + } + + /* Get owner namespace ID */ + ret = ioctl(owner_fd, NS_GET_ID, &owner_id); + if (ret < 0) { + close(owner_fd); + close(sock_fd); + close(netns_fd); + ASSERT_EQ(ret, 0); + } + close(owner_fd); + + /* Namespace should appear in listns() output */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + close(sock_fd); + close(netns_fd); + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s", strerror(errno)); + ASSERT_GE(ret, 0); + } + + /* Search for our network namespace in the list */ + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_id) { + found_netns = true; + break; + } + } + + ASSERT_TRUE(found_netns); + TH_LOG("Found netns %llu in listns() output (kept alive by socket)", netns_id); + + /* Now verify with owner filtering */ + req.user_ns_id = owner_id; + found_netns = false; + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + ASSERT_GE(ret, 0); + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_id) { + found_netns = true; + break; + } + } + + ASSERT_TRUE(found_netns); + TH_LOG("Found netns %llu owned by userns 
%llu", netns_id, owner_id); + + /* Close socket - namespace should become inactive and disappear from listns() */ + close(sock_fd); + close(netns_fd); + + /* Verify it's no longer in listns() output */ + req.user_ns_id = 0; + found_netns = false; + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + ASSERT_GE(ret, 0); + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_id) { + found_netns = true; + break; + } + } + + ASSERT_FALSE(found_netns); + TH_LOG("Netns %llu correctly disappeared from listns() after socket closed", netns_id); +} + +/* + * Test that socket-kept netns can be reopened via file handle. + * Verify that a network namespace kept alive by a socket FD can be + * reopened using file handles even after the creating process exits. + */ +TEST(siocgskns_file_handle) +{ + int sock_fd, netns_fd, reopened_fd; + int ipc_sockets[2]; + pid_t pid; + int status; + struct stat st1, st2; + ino_t netns_ino; + __u64 netns_id; + struct file_handle *handle; + struct nsfs_file_handle *nsfs_fh; + int ret; + + /* Allocate file_handle structure for nsfs */ + handle = malloc(sizeof(struct file_handle) + sizeof(struct nsfs_file_handle)); + ASSERT_NE(handle, NULL); + handle->handle_bytes = sizeof(struct nsfs_file_handle); + handle->handle_type = FILEID_NSFS; + + EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child: create new netns and socket */ + close(ipc_sockets[0]); + + if (unshare(CLONE_NEWNET) < 0) { + close(ipc_sockets[1]); + exit(1); + } + + sock_fd = socket(AF_INET, SOCK_DGRAM, 0); + if (sock_fd < 0) { + close(ipc_sockets[1]); + exit(1); + } + + /* Send socket FD to parent via SCM_RIGHTS */ + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1] = {'X'}; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); 
+ + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int)); + + if (sendmsg(ipc_sockets[1], &msg, 0) < 0) { + close(sock_fd); + close(ipc_sockets[1]); + exit(1); + } + + close(sock_fd); + close(ipc_sockets[1]); + exit(0); + } + + /* Parent: receive socket FD */ + close(ipc_sockets[1]); + + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1]; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + ssize_t n = recvmsg(ipc_sockets[0], &msg, 0); + close(ipc_sockets[0]); + ASSERT_EQ(n, 1); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + ASSERT_NE(cmsg, NULL); + memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int)); + + /* Wait for child to exit */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Get network namespace from socket */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + free(handle); + close(sock_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + + ASSERT_EQ(fstat(netns_fd, &st1), 0); + netns_ino = st1.st_ino; + + /* Get namespace ID */ + ret = ioctl(netns_fd, NS_GET_ID, &netns_id); + if (ret < 0) { + free(handle); + close(sock_fd); + close(netns_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "NS_GET_ID not supported"); + ASSERT_EQ(ret, 0); + } + + /* Construct file handle from namespace ID */ + nsfs_fh = (struct nsfs_file_handle *)handle->f_handle; + nsfs_fh->ns_id = netns_id; + nsfs_fh->ns_type = 0; /* Type field not needed for reopening */ + nsfs_fh->ns_inum = 0; /* Inum field not needed for reopening */ + + TH_LOG("Constructed file handle for netns %lu (id=%llu)", netns_ino, netns_id); + + /* Reopen 
namespace using file handle (while socket still keeps it alive) */ + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (reopened_fd < 0) { + free(handle); + close(sock_fd); + if (errno == EOPNOTSUPP || errno == ENOSYS || errno == EBADF) + SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported"); + TH_LOG("open_by_handle_at failed: %s", strerror(errno)); + ASSERT_GE(reopened_fd, 0); + } + + /* Verify it's the same namespace */ + ASSERT_EQ(fstat(reopened_fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + TH_LOG("Successfully reopened netns %lu via file handle", netns_ino); + + close(reopened_fd); + + /* Close the netns FD */ + close(netns_fd); + + /* Try to reopen via file handle - should fail since namespace is now inactive */ + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(reopened_fd, 0); + TH_LOG("Correctly failed to reopen inactive netns: %s", strerror(errno)); + + /* Get network namespace from socket */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + free(handle); + close(sock_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + + /* Reopen namespace using file handle (while socket still keeps it alive) */ + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (reopened_fd < 0) { + free(handle); + close(sock_fd); + if (errno == EOPNOTSUPP || errno == ENOSYS || errno == EBADF) + SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported"); + TH_LOG("open_by_handle_at failed: %s", strerror(errno)); + ASSERT_GE(reopened_fd, 0); + } + + /* Verify it's the same namespace */ + ASSERT_EQ(fstat(reopened_fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + TH_LOG("Successfully reopened netns %lu via file handle", netns_ino); + + /* Close socket - namespace should become inactive */ + close(sock_fd); + free(handle); +} + +/* + 
* Test combined listns() and file handle operations with socket-kept netns. + * Create a netns, keep it alive with a socket, verify it appears in listns(), + * then reopen it via file handle obtained from listns() entry. + */ +TEST(siocgskns_listns_and_file_handle) +{ + int sock_fd, netns_fd, userns_fd, reopened_fd; + int ipc_sockets[2]; + pid_t pid; + int status; + struct stat st; + ino_t netns_ino; + __u64 netns_id, userns_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET | CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + int ret, i; + bool found_netns = false, found_userns = false; + struct file_handle *handle; + struct nsfs_file_handle *nsfs_fh; + + /* Allocate file_handle structure for nsfs */ + handle = malloc(sizeof(struct file_handle) + sizeof(struct nsfs_file_handle)); + ASSERT_NE(handle, NULL); + handle->handle_bytes = sizeof(struct nsfs_file_handle); + handle->handle_type = FILEID_NSFS; + + EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child: create new userns and netns with socket */ + close(ipc_sockets[0]); + + if (setup_userns() < 0) { + close(ipc_sockets[1]); + exit(1); + } + + if (unshare(CLONE_NEWNET) < 0) { + close(ipc_sockets[1]); + exit(1); + } + + sock_fd = socket(AF_INET, SOCK_DGRAM, 0); + if (sock_fd < 0) { + close(ipc_sockets[1]); + exit(1); + } + + /* Send socket FD to parent via SCM_RIGHTS */ + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1] = {'X'}; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &sock_fd, 
sizeof(int)); + + if (sendmsg(ipc_sockets[1], &msg, 0) < 0) { + close(sock_fd); + close(ipc_sockets[1]); + exit(1); + } + + close(sock_fd); + close(ipc_sockets[1]); + exit(0); + } + + /* Parent: receive socket FD */ + close(ipc_sockets[1]); + + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1]; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + ssize_t n = recvmsg(ipc_sockets[0], &msg, 0); + close(ipc_sockets[0]); + ASSERT_EQ(n, 1); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + ASSERT_NE(cmsg, NULL); + memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int)); + + /* Wait for child to exit */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Get network namespace from socket */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + free(handle); + close(sock_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + + ASSERT_EQ(fstat(netns_fd, &st), 0); + netns_ino = st.st_ino; + + /* Get namespace ID */ + ret = ioctl(netns_fd, NS_GET_ID, &netns_id); + if (ret < 0) { + free(handle); + close(sock_fd); + close(netns_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "NS_GET_ID not supported"); + ASSERT_EQ(ret, 0); + } + + /* Get owner user namespace */ + userns_fd = ioctl(netns_fd, NS_GET_USERNS); + if (userns_fd < 0) { + free(handle); + close(sock_fd); + close(netns_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "NS_GET_USERNS not supported"); + ASSERT_GE(userns_fd, 0); + } + + /* Get owner namespace ID */ + ret = ioctl(userns_fd, NS_GET_ID, &userns_id); + if (ret < 0) { + close(userns_fd); + free(handle); + close(sock_fd); + close(netns_fd); + ASSERT_EQ(ret, 0); + } + close(userns_fd); + + TH_LOG("Testing netns %lu (id=%llu) owned by userns id=%llu", 
netns_ino, netns_id, userns_id); + + /* Verify namespace appears in listns() */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + free(handle); + close(sock_fd); + close(netns_fd); + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s", strerror(errno)); + ASSERT_GE(ret, 0); + } + + found_netns = false; + found_userns = false; + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_id) + found_netns = true; + if (ns_ids[i] == userns_id) + found_userns = true; + } + ASSERT_TRUE(found_netns); + ASSERT_TRUE(found_userns); + TH_LOG("Found netns %llu in listns() output", netns_id); + + /* Construct file handle from namespace ID */ + nsfs_fh = (struct nsfs_file_handle *)handle->f_handle; + nsfs_fh->ns_id = netns_id; + nsfs_fh->ns_type = 0; + nsfs_fh->ns_inum = 0; + + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (reopened_fd < 0) { + free(handle); + close(sock_fd); + if (errno == EOPNOTSUPP || errno == ENOSYS || errno == EBADF) + SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported"); + TH_LOG("open_by_handle_at failed: %s", strerror(errno)); + ASSERT_GE(reopened_fd, 0); + } + + struct stat reopened_st; + ASSERT_EQ(fstat(reopened_fd, &reopened_st), 0); + ASSERT_EQ(reopened_st.st_ino, netns_ino); + + TH_LOG("Successfully reopened netns %lu via file handle (socket-kept)", netns_ino); + + close(reopened_fd); + close(netns_fd); + + /* Try to reopen via file handle - should fail since namespace is now inactive */ + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(reopened_fd, 0); + TH_LOG("Correctly failed to reopen inactive netns: %s", strerror(errno)); + + /* Get network namespace from socket */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + free(handle); + close(sock_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + + /* Verify namespace appears in listns() */ 
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + free(handle); + close(sock_fd); + close(netns_fd); + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s", strerror(errno)); + ASSERT_GE(ret, 0); + } + + found_netns = false; + found_userns = false; + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_id) + found_netns = true; + if (ns_ids[i] == userns_id) + found_userns = true; + } + ASSERT_TRUE(found_netns); + ASSERT_TRUE(found_userns); + TH_LOG("Found netns %llu in listns() output", netns_id); + + close(netns_fd); + + /* Verify namespace appears in listns() */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + free(handle); + close(sock_fd); + close(netns_fd); + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s", strerror(errno)); + ASSERT_GE(ret, 0); + } + + found_netns = false; + found_userns = false; + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_id) + found_netns = true; + if (ns_ids[i] == userns_id) + found_userns = true; + } + ASSERT_FALSE(found_netns); + ASSERT_FALSE(found_userns); + TH_LOG("Netns %llu correctly disappeared from listns() after socket closed", netns_id); + + close(sock_fd); + free(handle); +} + +/* + * Test multi-level namespace resurrection across three user namespace levels. + * + * This test creates a complex namespace hierarchy with three levels of user + * namespaces and a network namespace at the deepest level. It verifies that + * the resurrection semantics work correctly when SIOCGSKNS is called on a + * socket from an inactive namespace tree, and that listns() and + * open_by_handle_at() correctly respect visibility rules. 
+ * + * Hierarchy after child processes exit (all with 0 active refcount): + * + * net_L3A (0) <- Level 3 network namespace + * | + * + + * userns_L3 (0) <- Level 3 user namespace + * | + * + + * userns_L2 (0) <- Level 2 user namespace + * | + * + + * userns_L1 (0) <- Level 1 user namespace + * | + * x + * init_user_ns + * + * The test verifies: + * 1. SIOCGSKNS on a socket from inactive net_L3A resurrects the entire chain + * 2. After resurrection, all namespaces are visible in listns() + * 3. Resurrected namespaces can be reopened via file handles + * 4. Closing the netns FD cascades down: the entire ownership chain + * (userns_L3 -> userns_L2 -> userns_L1) becomes inactive again + * 5. Inactive namespaces disappear from listns() and cannot be reopened + * 6. Calling SIOCGSKNS again on the same socket resurrects the tree again + * 7. After second resurrection, namespaces are visible and can be reopened + */ +TEST(siocgskns_multilevel_resurrection) +{ + int ipc_sockets[2]; + pid_t pid_l1, pid_l2, pid_l3; + int status; + + /* Namespace file descriptors to be received from child */ + int sock_L3A_fd = -1; + int netns_L3A_fd = -1; + __u64 netns_L3A_id; + __u64 userns_L1_id, userns_L2_id, userns_L3_id; + + /* For listns() and file handle testing */ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET | CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + int ret, i; + struct file_handle *handle; + struct nsfs_file_handle *nsfs_fh; + int reopened_fd; + + /* Allocate file handle for testing */ + handle = malloc(sizeof(struct file_handle) + sizeof(struct nsfs_file_handle)); + ASSERT_NE(handle, NULL); + handle->handle_bytes = sizeof(struct nsfs_file_handle); + handle->handle_type = FILEID_NSFS; + + EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); + + /* + * Fork level 1 child that creates userns_L1 + */ + pid_l1 = fork(); + ASSERT_GE(pid_l1, 0); + + if (pid_l1 == 0) { 
+ /* Level 1 child */ + int ipc_L2[2]; + close(ipc_sockets[0]); + + /* Create userns_L1 */ + if (setup_userns() < 0) { + close(ipc_sockets[1]); + exit(1); + } + + /* Create socketpair for communicating with L2 child */ + if (socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_L2) < 0) { + close(ipc_sockets[1]); + exit(1); + } + + /* + * Fork level 2 child that creates userns_L2 + */ + pid_l2 = fork(); + if (pid_l2 < 0) { + close(ipc_sockets[1]); + close(ipc_L2[0]); + close(ipc_L2[1]); + exit(1); + } + + if (pid_l2 == 0) { + /* Level 2 child */ + int ipc_L3[2]; + close(ipc_L2[0]); + + /* Create userns_L2 (nested inside userns_L1) */ + if (setup_userns() < 0) { + close(ipc_L2[1]); + exit(1); + } + + /* Create socketpair for communicating with L3 child */ + if (socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_L3) < 0) { + close(ipc_L2[1]); + exit(1); + } + + /* + * Fork level 3 child that creates userns_L3 and network namespaces + */ + pid_l3 = fork(); + if (pid_l3 < 0) { + close(ipc_L2[1]); + close(ipc_L3[0]); + close(ipc_L3[1]); + exit(1); + } + + if (pid_l3 == 0) { + /* Level 3 child - the deepest level */ + int sock_fd; + close(ipc_L3[0]); + + /* Create userns_L3 (nested inside userns_L2) */ + if (setup_userns() < 0) { + close(ipc_L3[1]); + exit(1); + } + + /* Create network namespace at level 3 */ + if (unshare(CLONE_NEWNET) < 0) { + close(ipc_L3[1]); + exit(1); + } + + /* Create socket in net_L3A */ + sock_fd = socket(AF_INET, SOCK_DGRAM, 0); + if (sock_fd < 0) { + close(ipc_L3[1]); + exit(1); + } + + /* Send socket FD to L2 parent */ + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1] = {'X'}; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = 
CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int)); + + if (sendmsg(ipc_L3[1], &msg, 0) < 0) { + close(sock_fd); + close(ipc_L3[1]); + exit(1); + } + + close(sock_fd); + close(ipc_L3[1]); + exit(0); + } + + /* Level 2 child - receive from L3 and forward to L1 */ + close(ipc_L3[1]); + + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1]; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + int received_fd; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + ssize_t n = recvmsg(ipc_L3[0], &msg, 0); + close(ipc_L3[0]); + + if (n != 1) { + close(ipc_L2[1]); + waitpid(pid_l3, NULL, 0); + exit(1); + } + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + if (!cmsg) { + close(ipc_L2[1]); + waitpid(pid_l3, NULL, 0); + exit(1); + } + memcpy(&received_fd, CMSG_DATA(cmsg), sizeof(int)); + + /* Wait for L3 child */ + waitpid(pid_l3, NULL, 0); + + /* Forward the socket FD to L1 parent */ + memset(&msg, 0, sizeof(msg)); + buf[0] = 'Y'; + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &received_fd, sizeof(int)); + + if (sendmsg(ipc_L2[1], &msg, 0) < 0) { + close(received_fd); + close(ipc_L2[1]); + exit(1); + } + + close(received_fd); + close(ipc_L2[1]); + exit(0); + } + + /* Level 1 child - receive from L2 and forward to parent */ + close(ipc_L2[1]); + + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1]; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + int received_fd; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + ssize_t n = recvmsg(ipc_L2[0], &msg, 0); + 
close(ipc_L2[0]); + + if (n != 1) { + close(ipc_sockets[1]); + waitpid(pid_l2, NULL, 0); + exit(1); + } + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + if (!cmsg) { + close(ipc_sockets[1]); + waitpid(pid_l2, NULL, 0); + exit(1); + } + memcpy(&received_fd, CMSG_DATA(cmsg), sizeof(int)); + + /* Wait for L2 child */ + waitpid(pid_l2, NULL, 0); + + /* Forward the socket FD to parent */ + memset(&msg, 0, sizeof(msg)); + buf[0] = 'Z'; + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &received_fd, sizeof(int)); + + if (sendmsg(ipc_sockets[1], &msg, 0) < 0) { + close(received_fd); + close(ipc_sockets[1]); + exit(1); + } + + close(received_fd); + close(ipc_sockets[1]); + exit(0); + } + + /* Parent - receive the socket from the deepest level */ + close(ipc_sockets[1]); + + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1]; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + ssize_t n = recvmsg(ipc_sockets[0], &msg, 0); + close(ipc_sockets[0]); + + if (n != 1) { + free(handle); + waitpid(pid_l1, NULL, 0); + SKIP(return, "Failed to receive socket from child"); + } + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + if (!cmsg) { + free(handle); + waitpid(pid_l1, NULL, 0); + SKIP(return, "Failed to receive socket from child"); + } + memcpy(&sock_L3A_fd, CMSG_DATA(cmsg), sizeof(int)); + + /* Wait for L1 child */ + waitpid(pid_l1, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* + * At this point, all child processes have exited. 
The socket itself + * doesn't keep the namespace active - we need to call SIOCGSKNS which + * will resurrect the entire namespace tree by taking active references. + */ + + /* Get network namespace from socket - this resurrects the tree */ + netns_L3A_fd = ioctl(sock_L3A_fd, SIOCGSKNS); + if (netns_L3A_fd < 0) { + free(handle); + close(sock_L3A_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_L3A_fd, 0); + } + + /* Get namespace ID for net_L3A */ + ret = ioctl(netns_L3A_fd, NS_GET_ID, &netns_L3A_id); + if (ret < 0) { + free(handle); + close(sock_L3A_fd); + close(netns_L3A_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "NS_GET_ID not supported"); + ASSERT_EQ(ret, 0); + } + + /* Get owner user namespace chain: userns_L3 -> userns_L2 -> userns_L1 */ + int userns_L3_fd = ioctl(netns_L3A_fd, NS_GET_USERNS); + if (userns_L3_fd < 0) { + free(handle); + close(sock_L3A_fd); + close(netns_L3A_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "NS_GET_USERNS not supported"); + ASSERT_GE(userns_L3_fd, 0); + } + + ret = ioctl(userns_L3_fd, NS_GET_ID, &userns_L3_id); + ASSERT_EQ(ret, 0); + + int userns_L2_fd = ioctl(userns_L3_fd, NS_GET_USERNS); + ASSERT_GE(userns_L2_fd, 0); + ret = ioctl(userns_L2_fd, NS_GET_ID, &userns_L2_id); + ASSERT_EQ(ret, 0); + + int userns_L1_fd = ioctl(userns_L2_fd, NS_GET_USERNS); + ASSERT_GE(userns_L1_fd, 0); + ret = ioctl(userns_L1_fd, NS_GET_ID, &userns_L1_id); + ASSERT_EQ(ret, 0); + + close(userns_L1_fd); + close(userns_L2_fd); + close(userns_L3_fd); + + TH_LOG("Multi-level hierarchy: net_L3A (id=%llu) -> userns_L3 (id=%llu) -> userns_L2 (id=%llu) -> userns_L1 (id=%llu)", + netns_L3A_id, userns_L3_id, userns_L2_id, userns_L1_id); + + /* + * Test 1: Verify net_L3A is visible in listns() after resurrection. + * The entire ownership chain should be resurrected and visible. 
+ */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + free(handle); + close(sock_L3A_fd); + close(netns_L3A_fd); + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret, 0); + } + + bool found_netns_L3A = false; + bool found_userns_L1 = false; + bool found_userns_L2 = false; + bool found_userns_L3 = false; + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_L3A_id) + found_netns_L3A = true; + if (ns_ids[i] == userns_L1_id) + found_userns_L1 = true; + if (ns_ids[i] == userns_L2_id) + found_userns_L2 = true; + if (ns_ids[i] == userns_L3_id) + found_userns_L3 = true; + } + + ASSERT_TRUE(found_netns_L3A); + ASSERT_TRUE(found_userns_L1); + ASSERT_TRUE(found_userns_L2); + ASSERT_TRUE(found_userns_L3); + TH_LOG("Resurrection verified: all namespaces in hierarchy visible in listns()"); + + /* + * Test 2: Verify net_L3A can be reopened via file handle. + */ + nsfs_fh = (struct nsfs_file_handle *)handle->f_handle; + nsfs_fh->ns_id = netns_L3A_id; + nsfs_fh->ns_type = 0; + nsfs_fh->ns_inum = 0; + + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (reopened_fd < 0) { + free(handle); + close(sock_L3A_fd); + close(netns_L3A_fd); + if (errno == EOPNOTSUPP || errno == ENOSYS || errno == EBADF) + SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported"); + TH_LOG("open_by_handle_at failed: %s", strerror(errno)); + ASSERT_GE(reopened_fd, 0); + } + + close(reopened_fd); + TH_LOG("File handle test passed: net_L3A can be reopened"); + + /* + * Test 3: Verify that when we close the netns FD (dropping the last + * active reference), the entire tree becomes inactive and disappears + * from listns(). The cascade goes: net_L3A drops -> userns_L3 drops -> + * userns_L2 drops -> userns_L1 drops. 
+ */ + close(netns_L3A_fd); + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + ASSERT_GE(ret, 0); + + found_netns_L3A = false; + found_userns_L1 = false; + found_userns_L2 = false; + found_userns_L3 = false; + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_L3A_id) + found_netns_L3A = true; + if (ns_ids[i] == userns_L1_id) + found_userns_L1 = true; + if (ns_ids[i] == userns_L2_id) + found_userns_L2 = true; + if (ns_ids[i] == userns_L3_id) + found_userns_L3 = true; + } + + ASSERT_FALSE(found_netns_L3A); + ASSERT_FALSE(found_userns_L1); + ASSERT_FALSE(found_userns_L2); + ASSERT_FALSE(found_userns_L3); + TH_LOG("Cascade test passed: all namespaces disappeared after netns FD closed"); + + /* + * Test 4: Verify file handle no longer works for inactive namespace. + */ + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (reopened_fd >= 0) { + close(reopened_fd); + free(handle); + ASSERT_TRUE(false); /* Should have failed */ + } + TH_LOG("Inactive namespace correctly cannot be reopened via file handle"); + + /* + * Test 5: Verify that calling SIOCGSKNS again resurrects the tree again. + * The socket is still valid, so we can call SIOCGSKNS on it to resurrect + * the namespace tree once more. 
+ */ + netns_L3A_fd = ioctl(sock_L3A_fd, SIOCGSKNS); + ASSERT_GE(netns_L3A_fd, 0); + + TH_LOG("Called SIOCGSKNS again to resurrect the namespace tree"); + + /* Verify the namespace tree is resurrected and visible in listns() */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + ASSERT_GE(ret, 0); + + found_netns_L3A = false; + found_userns_L1 = false; + found_userns_L2 = false; + found_userns_L3 = false; + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_L3A_id) + found_netns_L3A = true; + if (ns_ids[i] == userns_L1_id) + found_userns_L1 = true; + if (ns_ids[i] == userns_L2_id) + found_userns_L2 = true; + if (ns_ids[i] == userns_L3_id) + found_userns_L3 = true; + } + + ASSERT_TRUE(found_netns_L3A); + ASSERT_TRUE(found_userns_L1); + ASSERT_TRUE(found_userns_L2); + ASSERT_TRUE(found_userns_L3); + TH_LOG("Second resurrection verified: all namespaces in hierarchy visible in listns() again"); + + /* Verify we can reopen via file handle again */ + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (reopened_fd < 0) { + free(handle); + close(sock_L3A_fd); + close(netns_L3A_fd); + TH_LOG("open_by_handle_at failed after second resurrection: %s", strerror(errno)); + ASSERT_GE(reopened_fd, 0); + } + + close(reopened_fd); + TH_LOG("File handle test passed: net_L3A can be reopened after second resurrection"); + + /* Final cleanup */ + close(sock_L3A_fd); + close(netns_L3A_fd); + free(handle); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/stress_test.c b/tools/testing/selftests/namespaces/stress_test.c new file mode 100644 index 000000000000..dd7df7d6cb27 --- /dev/null +++ b/tools/testing/selftests/namespaces/stress_test.c @@ -0,0 +1,626 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include 
<sys/syscall.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> +#include <linux/nsfs.h> +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "wrappers.h" + +/* + * Stress tests for namespace active reference counting. + * + * These tests validate that the active reference counting system can handle + * high load scenarios including rapid namespace creation/destruction, large + * numbers of concurrent namespaces, and various edge cases under stress. + */ + +/* + * Test rapid creation and destruction of user namespaces. + * Create and destroy namespaces in quick succession to stress the + * active reference tracking and ensure no leaks occur. + */ +TEST(rapid_namespace_creation_destruction) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids_before[256], ns_ids_after[256]; + ssize_t ret_before, ret_after; + int i; + + /* Get baseline count of active user namespaces */ + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + if (ret_before < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret_before, 0); + } + + TH_LOG("Baseline: %zd active user namespaces", ret_before); + + /* Rapidly create and destroy 100 user namespaces */ + for (i = 0; i < 100; i++) { + pid_t pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child: create user namespace and immediately exit */ + if (setup_userns() < 0) + exit(1); + exit(0); + } + + /* Parent: wait for child */ + int status; + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + } + + /* Verify we're back to baseline (no leaked namespaces) */ + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + ASSERT_GE(ret_after, 0); + + TH_LOG("After 100 rapid create/destroy cycles: %zd active user namespaces", ret_after); + ASSERT_EQ(ret_before, 
ret_after); +} + +/* + * Test creating many concurrent namespaces. + * Verify that listns() correctly tracks all of them and that they all + * become inactive after processes exit. + */ +TEST(many_concurrent_namespaces) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids_before[512], ns_ids_during[512], ns_ids_after[512]; + ssize_t ret_before, ret_during, ret_after; + pid_t pids[50]; + int num_children = 50; + int i; + int sv[2]; + + /* Get baseline */ + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + if (ret_before < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret_before, 0); + } + + TH_LOG("Baseline: %zd active user namespaces", ret_before); + + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); + + /* Create many children, each with their own user namespace */ + for (i = 0; i < num_children; i++) { + pids[i] = fork(); + ASSERT_GE(pids[i], 0); + + if (pids[i] == 0) { + /* Child: create user namespace and wait for parent signal */ + char c; + + close(sv[0]); + + if (setup_userns() < 0) { + close(sv[1]); + exit(1); + } + + /* Signal parent we're ready */ + if (write(sv[1], &c, 1) != 1) { + close(sv[1]); + exit(1); + } + + /* Wait for parent signal to exit */ + if (read(sv[1], &c, 1) != 1) { + close(sv[1]); + exit(1); + } + + close(sv[1]); + exit(0); + } + } + + close(sv[1]); + + /* Wait for all children to signal ready */ + for (i = 0; i < num_children; i++) { + char c; + if (read(sv[0], &c, 1) != 1) { + /* If we fail to read, kill all children and exit */ + close(sv[0]); + for (int j = 0; j < num_children; j++) + kill(pids[j], SIGKILL); + for (int j = 0; j < num_children; j++) + waitpid(pids[j], NULL, 0); + ASSERT_TRUE(false); + } + } + + /* List namespaces while all children are running */ + ret_during = sys_listns(&req, ns_ids_during, ARRAY_SIZE(ns_ids_during), 0); + 
ASSERT_GE(ret_during, 0); + + TH_LOG("With %d children running: %zd active user namespaces", num_children, ret_during); + + /* Should have at least num_children more namespaces than baseline */ + ASSERT_GE(ret_during, ret_before + num_children); + + /* Signal all children to exit */ + for (i = 0; i < num_children; i++) { + char c = 'X'; + if (write(sv[0], &c, 1) != 1) { + /* If we fail to write, kill remaining children */ + close(sv[0]); + for (int j = i; j < num_children; j++) + kill(pids[j], SIGKILL); + for (int j = 0; j < num_children; j++) + waitpid(pids[j], NULL, 0); + ASSERT_TRUE(false); + } + } + + close(sv[0]); + + /* Wait for all children */ + for (i = 0; i < num_children; i++) { + int status; + waitpid(pids[i], &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + } + + /* Verify we're back to baseline */ + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + ASSERT_GE(ret_after, 0); + + TH_LOG("After all children exit: %zd active user namespaces", ret_after); + ASSERT_EQ(ret_before, ret_after); +} + +/* + * Test rapid namespace creation with different namespace types. + * Create multiple types of namespaces rapidly to stress the tracking system. 
+ */ +TEST(rapid_mixed_namespace_creation) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, /* All types */ + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids_before[512], ns_ids_after[512]; + ssize_t ret_before, ret_after; + int i; + + /* Get baseline count */ + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + if (ret_before < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret_before, 0); + } + + TH_LOG("Baseline: %zd active namespaces (all types)", ret_before); + + /* Rapidly create and destroy namespaces with multiple types */ + for (i = 0; i < 50; i++) { + pid_t pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child: create multiple namespace types */ + if (setup_userns() < 0) + exit(1); + + /* Create additional namespace types */ + if (unshare(CLONE_NEWNET) < 0) + exit(1); + if (unshare(CLONE_NEWUTS) < 0) + exit(1); + if (unshare(CLONE_NEWIPC) < 0) + exit(1); + + exit(0); + } + + /* Parent: wait for child */ + int status; + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + } + + /* Verify we're back to baseline */ + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + ASSERT_GE(ret_after, 0); + + TH_LOG("After 50 rapid mixed namespace cycles: %zd active namespaces", ret_after); + ASSERT_EQ(ret_before, ret_after); +} + +/* + * Test nested namespace creation under stress. + * Create deeply nested namespace hierarchies and verify proper cleanup. 
+ */ +TEST(nested_namespace_stress) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids_before[512], ns_ids_after[512]; + ssize_t ret_before, ret_after; + int i; + + /* Get baseline */ + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + if (ret_before < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret_before, 0); + } + + TH_LOG("Baseline: %zd active user namespaces", ret_before); + + /* Create 20 processes, each with nested user namespaces */ + for (i = 0; i < 20; i++) { + pid_t pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int userns_fd; + uid_t orig_uid = getuid(); + int depth; + + /* Create nested user namespaces (up to 5 levels) */ + for (depth = 0; depth < 5; depth++) { + userns_fd = get_userns_fd(0, (depth == 0) ? orig_uid : 0, 1); + if (userns_fd < 0) + exit(1); + + if (setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + exit(1); + } + close(userns_fd); + } + + exit(0); + } + + /* Parent: wait for child */ + int status; + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + } + + /* Verify we're back to baseline */ + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + ASSERT_GE(ret_after, 0); + + TH_LOG("After 20 nested namespace hierarchies: %zd active user namespaces", ret_after); + ASSERT_EQ(ret_before, ret_after); +} + +/* + * Test listns() pagination under stress. + * Create many namespaces and verify pagination works correctly. 
+ */ +TEST(listns_pagination_stress) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + pid_t pids[30]; + int num_children = 30; + int i; + int sv[2]; + __u64 all_ns_ids[512]; + int total_found = 0; + + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); + + /* Create many children with user namespaces */ + for (i = 0; i < num_children; i++) { + pids[i] = fork(); + ASSERT_GE(pids[i], 0); + + if (pids[i] == 0) { + char c; + close(sv[0]); + + if (setup_userns() < 0) { + close(sv[1]); + exit(1); + } + + /* Signal parent we're ready */ + if (write(sv[1], &c, 1) != 1) { + close(sv[1]); + exit(1); + } + + /* Wait for parent signal to exit */ + if (read(sv[1], &c, 1) != 1) { + close(sv[1]); + exit(1); + } + + close(sv[1]); + exit(0); + } + } + + close(sv[1]); + + /* Wait for all children to signal ready */ + for (i = 0; i < num_children; i++) { + char c; + if (read(sv[0], &c, 1) != 1) { + /* If we fail to read, kill all children and exit */ + close(sv[0]); + for (int j = 0; j < num_children; j++) + kill(pids[j], SIGKILL); + for (int j = 0; j < num_children; j++) + waitpid(pids[j], NULL, 0); + ASSERT_TRUE(false); + } + } + + /* Paginate through all namespaces using small batch sizes */ + req.ns_id = 0; + while (1) { + __u64 batch[5]; /* Small batch size to force pagination */ + ssize_t ret; + + ret = sys_listns(&req, batch, ARRAY_SIZE(batch), 0); + if (ret < 0) { + if (errno == ENOSYS) { + close(sv[0]); + for (i = 0; i < num_children; i++) + kill(pids[i], SIGKILL); + for (i = 0; i < num_children; i++) + waitpid(pids[i], NULL, 0); + SKIP(return, "listns() not supported"); + } + ASSERT_GE(ret, 0); + } + + if (ret == 0) + break; + + /* Store results */ + for (i = 0; i < ret && total_found < 512; i++) { + all_ns_ids[total_found++] = batch[i]; + } + + /* Update cursor for next batch */ + if (ret == ARRAY_SIZE(batch)) + req.ns_id = batch[ret - 1]; + else + break; + } + + 
TH_LOG("Paginated through %d user namespaces", total_found); + + /* Verify no duplicates in pagination */ + for (i = 0; i < total_found; i++) { + for (int j = i + 1; j < total_found; j++) { + if (all_ns_ids[i] == all_ns_ids[j]) { + TH_LOG("Found duplicate ns_id: %llu at positions %d and %d", + (unsigned long long)all_ns_ids[i], i, j); + ASSERT_TRUE(false); + } + } + } + + /* Signal all children to exit */ + for (i = 0; i < num_children; i++) { + char c = 'X'; + if (write(sv[0], &c, 1) != 1) { + close(sv[0]); + for (int j = i; j < num_children; j++) + kill(pids[j], SIGKILL); + for (int j = 0; j < num_children; j++) + waitpid(pids[j], NULL, 0); + ASSERT_TRUE(false); + } + } + + close(sv[0]); + + /* Wait for all children */ + for (i = 0; i < num_children; i++) { + int status; + waitpid(pids[i], &status, 0); + } +} + +/* + * Test concurrent namespace operations. + * Multiple processes creating, querying, and destroying namespaces concurrently. + */ +TEST(concurrent_namespace_operations) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids_before[512], ns_ids_after[512]; + ssize_t ret_before, ret_after; + pid_t pids[20]; + int num_workers = 20; + int i; + + /* Get baseline */ + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + if (ret_before < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret_before, 0); + } + + TH_LOG("Baseline: %zd active namespaces", ret_before); + + /* Create worker processes that do concurrent operations */ + for (i = 0; i < num_workers; i++) { + pids[i] = fork(); + ASSERT_GE(pids[i], 0); + + if (pids[i] == 0) { + /* Each worker: create namespaces, list them, repeat */ + int iterations; + + for (iterations = 0; iterations < 10; iterations++) { + int userns_fd; + __u64 temp_ns_ids[100]; + ssize_t ret; + + /* Create a user namespace */ + userns_fd = get_userns_fd(0, getuid(), 1); + if 
(userns_fd < 0) + continue; + + /* List namespaces */ + ret = sys_listns(&req, temp_ns_ids, ARRAY_SIZE(temp_ns_ids), 0); + (void)ret; + + close(userns_fd); + + /* Small delay */ + usleep(1000); + } + + exit(0); + } + } + + /* Wait for all workers */ + for (i = 0; i < num_workers; i++) { + int status; + waitpid(pids[i], &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + } + + /* Verify we're back to baseline */ + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + ASSERT_GE(ret_after, 0); + + TH_LOG("After concurrent operations: %zd active namespaces", ret_after); + ASSERT_EQ(ret_before, ret_after); +} + +/* + * Test namespace churn - continuous creation and destruction. + * Simulates high-churn scenarios like container orchestration. + */ +TEST(namespace_churn) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWUTS, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids_before[512], ns_ids_after[512]; + ssize_t ret_before, ret_after; + int cycle; + + /* Get baseline */ + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + if (ret_before < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret_before, 0); + } + + TH_LOG("Baseline: %zd active namespaces", ret_before); + + /* Simulate churn: batches of namespaces created and destroyed */ + for (cycle = 0; cycle < 10; cycle++) { + pid_t batch_pids[10]; + int i; + + /* Create batch */ + for (i = 0; i < 10; i++) { + batch_pids[i] = fork(); + ASSERT_GE(batch_pids[i], 0); + + if (batch_pids[i] == 0) { + /* Create multiple namespace types */ + if (setup_userns() < 0) + exit(1); + if (unshare(CLONE_NEWNET) < 0) + exit(1); + if (unshare(CLONE_NEWUTS) < 0) + exit(1); + + /* Keep namespaces alive briefly */ + usleep(10000); + exit(0); + } + } + + /* Wait for batch to complete */ + for (i = 0; i < 10; i++) { + int status; + 
waitpid(batch_pids[i], &status, 0); + } + } + + /* Verify we're back to baseline */ + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + ASSERT_GE(ret_after, 0); + + TH_LOG("After 10 churn cycles (100 namespace sets): %zd active namespaces", ret_after); + ASSERT_EQ(ret_before, ret_after); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/wrappers.h b/tools/testing/selftests/namespaces/wrappers.h new file mode 100644 index 000000000000..9741a64a5b1d --- /dev/null +++ b/tools/testing/selftests/namespaces/wrappers.h @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/nsfs.h> +#include <linux/types.h> +#include <sys/syscall.h> +#include <unistd.h> + +#ifndef __SELFTESTS_NAMESPACES_WRAPPERS_H__ +#define __SELFTESTS_NAMESPACES_WRAPPERS_H__ + +#ifndef __NR_listns + #if defined __alpha__ + #define __NR_listns 580 + #elif defined _MIPS_SIM + #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ + #define __NR_listns 4470 + #endif + #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */ + #define __NR_listns 6470 + #endif + #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */ + #define __NR_listns 5470 + #endif + #else + #define __NR_listns 470 + #endif +#endif + +static inline int sys_listns(const struct ns_id_req *req, __u64 *ns_ids, + size_t nr_ns_ids, unsigned int flags) +{ + return syscall(__NR_listns, req, ns_ids, nr_ns_ids, flags); +} + +#endif /* __SELFTESTS_NAMESPACES_WRAPPERS_H__ */ diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 8f9850a71f54..6930fe926c58 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -4,10 +4,8 @@ bind_timewait bind_wildcard busy_poller cmsg_sender -diag_uid epoll_busy_poll fin_ack_lat -gro hwtstamp_config io_uring_zerocopy_tx ioam6_parser @@ -18,7 +16,6 @@ ipv6_flowlabel ipv6_flowlabel_mgr ipv6_fragmentation log.txt -msg_oob msg_zerocopy netlink-dumps nettest @@ -35,9 +32,6 @@ reuseport_bpf_numa 
reuseport_dualstack rxtimestamp sctp_hello -scm_inq -scm_pidfd -scm_rights sk_bind_sendto_listen sk_connect_zero_addr sk_so_peek_off @@ -45,7 +39,6 @@ skf_net_off socket so_incoming_cpu so_netns_cookie -so_peek_off so_txtime so_rcv_listener stress_reuseport_listen @@ -57,7 +50,6 @@ tcp_port_share tfo timestamping tls -toeplitz tools tun txring_overwrite @@ -65,4 +57,3 @@ txtimestamp udpgso udpgso_bench_rx udpgso_bench_tx -unix_connect diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index b5127e968108..b66ba04f19d9 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -38,7 +38,6 @@ TEST_PROGS := \ fq_band_pktlimit.sh \ gre_gso.sh \ gre_ipv6_lladdr.sh \ - gro.sh \ icmp.sh \ icmp_redirect.sh \ io_uring_zerocopy_tx.sh \ @@ -121,8 +120,6 @@ TEST_PROGS := \ # end of TEST_PROGS TEST_PROGS_EXTENDED := \ - toeplitz.sh \ - toeplitz_client.sh \ xfrm_policy_add_speed.sh \ # end of TEST_PROGS_EXTENDED @@ -130,7 +127,6 @@ TEST_GEN_FILES := \ bind_bhash \ cmsg_sender \ fin_ack_lat \ - gro \ hwtstamp_config \ io_uring_zerocopy_tx \ ioam6_parser \ @@ -159,7 +155,6 @@ TEST_GEN_FILES := \ tcp_mmap \ tfo \ timestamping \ - toeplitz \ txring_overwrite \ txtimestamp \ udpgso \ @@ -193,8 +188,6 @@ TEST_FILES := \ in_netns.sh \ lib.sh \ settings \ - setup_loopback.sh \ - setup_veth.sh \ # end of TEST_FILES # YNL files, must be before "include ..lib.mk" diff --git a/tools/testing/selftests/net/af_unix/.gitignore b/tools/testing/selftests/net/af_unix/.gitignore new file mode 100644 index 000000000000..240b26740c9e --- /dev/null +++ b/tools/testing/selftests/net/af_unix/.gitignore @@ -0,0 +1,8 @@ +diag_uid +msg_oob +scm_inq +scm_pidfd +scm_rights +so_peek_off +unix_connect +unix_connreset diff --git a/tools/testing/selftests/net/af_unix/Makefile b/tools/testing/selftests/net/af_unix/Makefile index 528d14c598bb..3cd677b72072 100644 --- a/tools/testing/selftests/net/af_unix/Makefile +++ 
b/tools/testing/selftests/net/af_unix/Makefile @@ -8,6 +8,7 @@ TEST_GEN_PROGS := \ scm_rights \ so_peek_off \ unix_connect \ + unix_connreset \ # end of TEST_GEN_PROGS include ../../lib.mk diff --git a/tools/testing/selftests/net/af_unix/so_peek_off.c b/tools/testing/selftests/net/af_unix/so_peek_off.c index 1a77728128e5..86e7b0fb522d 100644 --- a/tools/testing/selftests/net/af_unix/so_peek_off.c +++ b/tools/testing/selftests/net/af_unix/so_peek_off.c @@ -36,8 +36,8 @@ FIXTURE_VARIANT_ADD(so_peek_off, seqpacket) FIXTURE_SETUP(so_peek_off) { struct timeval timeout = { - .tv_sec = 0, - .tv_usec = 3000, + .tv_sec = 5, + .tv_usec = 0, }; int ret; diff --git a/tools/testing/selftests/net/af_unix/unix_connreset.c b/tools/testing/selftests/net/af_unix/unix_connreset.c new file mode 100644 index 000000000000..08c1de8f5a98 --- /dev/null +++ b/tools/testing/selftests/net/af_unix/unix_connreset.c @@ -0,0 +1,180 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Selftest for AF_UNIX socket close and ECONNRESET behaviour. + * + * This test verifies: + * 1. SOCK_STREAM returns EOF when the peer closes normally. + * 2. SOCK_STREAM returns ECONNRESET if peer closes with unread data. + * 3. SOCK_SEQPACKET returns EOF when the peer closes normally. + * 4. SOCK_SEQPACKET returns ECONNRESET if the peer closes with unread data. + * 5. SOCK_DGRAM does not return ECONNRESET when the peer closes. + * + * These tests document the intended Linux behaviour. 
+ * + */ + +#define _GNU_SOURCE +#include <string.h> +#include <fcntl.h> +#include <unistd.h> +#include <errno.h> +#include <sys/socket.h> +#include <sys/un.h> +#include "../../kselftest_harness.h" + +#define SOCK_PATH "/tmp/af_unix_connreset.sock" + +static void remove_socket_file(void) +{ + unlink(SOCK_PATH); +} + +FIXTURE(unix_sock) +{ + int server; + int client; + int child; +}; + +FIXTURE_VARIANT(unix_sock) +{ + int socket_type; + const char *name; +}; + +FIXTURE_VARIANT_ADD(unix_sock, stream) { + .socket_type = SOCK_STREAM, + .name = "SOCK_STREAM", +}; + +FIXTURE_VARIANT_ADD(unix_sock, dgram) { + .socket_type = SOCK_DGRAM, + .name = "SOCK_DGRAM", +}; + +FIXTURE_VARIANT_ADD(unix_sock, seqpacket) { + .socket_type = SOCK_SEQPACKET, + .name = "SOCK_SEQPACKET", +}; + +FIXTURE_SETUP(unix_sock) +{ + struct sockaddr_un addr = {}; + int err; + + addr.sun_family = AF_UNIX; + strcpy(addr.sun_path, SOCK_PATH); + remove_socket_file(); + + self->server = socket(AF_UNIX, variant->socket_type, 0); + ASSERT_LT(-1, self->server); + + err = bind(self->server, (struct sockaddr *)&addr, sizeof(addr)); + ASSERT_EQ(0, err); + + if (variant->socket_type == SOCK_STREAM || + variant->socket_type == SOCK_SEQPACKET) { + err = listen(self->server, 1); + ASSERT_EQ(0, err); + } + + self->client = socket(AF_UNIX, variant->socket_type | SOCK_NONBLOCK, 0); + ASSERT_LT(-1, self->client); + + err = connect(self->client, (struct sockaddr *)&addr, sizeof(addr)); + ASSERT_EQ(0, err); +} + +FIXTURE_TEARDOWN(unix_sock) +{ + if (variant->socket_type == SOCK_STREAM || + variant->socket_type == SOCK_SEQPACKET) + close(self->child); + + close(self->client); + close(self->server); + remove_socket_file(); +} + +/* Test 1: peer closes normally */ +TEST_F(unix_sock, eof) +{ + char buf[16] = {}; + ssize_t n; + + if (variant->socket_type == SOCK_STREAM || + variant->socket_type == SOCK_SEQPACKET) { + self->child = accept(self->server, NULL, NULL); + ASSERT_LT(-1, self->child); + + close(self->child); + } else 
{ + close(self->server); + } + + n = recv(self->client, buf, sizeof(buf), 0); + + if (variant->socket_type == SOCK_STREAM || + variant->socket_type == SOCK_SEQPACKET) { + ASSERT_EQ(0, n); + } else { + ASSERT_EQ(-1, n); + ASSERT_EQ(EAGAIN, errno); + } +} + +/* Test 2: peer closes with unread data */ +TEST_F(unix_sock, reset_unread_behavior) +{ + char buf[16] = {}; + ssize_t n; + + /* Send data that will remain unread */ + send(self->client, "hello", 5, 0); + + if (variant->socket_type == SOCK_DGRAM) { + /* No real connection, just close the server */ + close(self->server); + } else { + self->child = accept(self->server, NULL, NULL); + ASSERT_LT(-1, self->child); + + /* Peer closes before client reads */ + close(self->child); + } + + n = recv(self->client, buf, sizeof(buf), 0); + ASSERT_EQ(-1, n); + + if (variant->socket_type == SOCK_STREAM || + variant->socket_type == SOCK_SEQPACKET) { + ASSERT_EQ(ECONNRESET, errno); + } else { + ASSERT_EQ(EAGAIN, errno); + } +} + +/* Test 3: closing unaccepted (embryo) server socket should reset client. */ +TEST_F(unix_sock, reset_closed_embryo) +{ + char buf[16] = {}; + ssize_t n; + + if (variant->socket_type == SOCK_DGRAM) { + snprintf(_metadata->results->reason, + sizeof(_metadata->results->reason), + "Test only applies to SOCK_STREAM and SOCK_SEQPACKET"); + exit(KSFT_XFAIL); + } + + /* Close server without accept()ing */ + close(self->server); + + n = recv(self->client, buf, sizeof(buf), 0); + + ASSERT_EQ(-1, n); + ASSERT_EQ(ECONNRESET, errno); +} + +TEST_HARNESS_MAIN + diff --git a/tools/testing/selftests/net/arp_ndisc_evict_nocarrier.sh b/tools/testing/selftests/net/arp_ndisc_evict_nocarrier.sh index 92eb880c52f2..00758f00efbf 100755 --- a/tools/testing/selftests/net/arp_ndisc_evict_nocarrier.sh +++ b/tools/testing/selftests/net/arp_ndisc_evict_nocarrier.sh @@ -75,7 +75,7 @@ setup_v4() { ip neigh get $V4_ADDR1 dev veth0 >/dev/null 2>&1 if [ $? 
-ne 0 ]; then cleanup_v4 - echo "failed" + echo "failed; is the system using MACAddressPolicy=persistent ?" exit 1 fi diff --git a/tools/testing/selftests/net/busy_poll_test.sh b/tools/testing/selftests/net/busy_poll_test.sh index 7d2d40812074..5ec1c85c1623 100755 --- a/tools/testing/selftests/net/busy_poll_test.sh +++ b/tools/testing/selftests/net/busy_poll_test.sh @@ -27,6 +27,8 @@ NAPI_DEFER_HARD_IRQS=100 GRO_FLUSH_TIMEOUT=50000 SUSPEND_TIMEOUT=20000000 +NAPI_THREADED_MODE_BUSY_POLL=2 + setup_ns() { set -e @@ -62,6 +64,9 @@ cleanup_ns() test_busypoll() { suspend_value=${1:-0} + napi_threaded_value=${2:-0} + prefer_busy_poll_value=${3:-$PREFER_BUSY_POLL} + tmp_file=$(mktemp) out_file=$(mktemp) @@ -73,10 +78,11 @@ test_busypoll() -b${SERVER_IP} \ -m${MAX_EVENTS} \ -u${BUSY_POLL_USECS} \ - -P${PREFER_BUSY_POLL} \ + -P${prefer_busy_poll_value} \ -g${BUSY_POLL_BUDGET} \ -i${NSIM_SV_IFIDX} \ -s${suspend_value} \ + -t${napi_threaded_value} \ -o${out_file}& wait_local_port_listen nssv ${SERVER_PORT} tcp @@ -109,6 +115,15 @@ test_busypoll_with_suspend() return $? } +test_busypoll_with_napi_threaded() +{ + # Only enable napi threaded poll. Set suspend timeout and prefer busy + # poll to 0. + test_busypoll 0 ${NAPI_THREADED_MODE_BUSY_POLL} 0 + + return $? +} + ### ### Code start ### @@ -154,6 +169,13 @@ if [ $? -ne 0 ]; then exit 1 fi +test_busypoll_with_napi_threaded +if [ $? 
-ne 0 ]; then + echo "test_busypoll_with_napi_threaded failed" + cleanup_ns + exit 1 +fi + echo "$NSIM_SV_FD:$NSIM_SV_IFIDX" > $NSIM_DEV_SYS_UNLINK echo $NSIM_CL_ID > $NSIM_DEV_SYS_DEL diff --git a/tools/testing/selftests/net/busy_poller.c b/tools/testing/selftests/net/busy_poller.c index 04c7ff577bb8..3a81f9c94795 100644 --- a/tools/testing/selftests/net/busy_poller.c +++ b/tools/testing/selftests/net/busy_poller.c @@ -65,15 +65,16 @@ static uint32_t cfg_busy_poll_usecs; static uint16_t cfg_busy_poll_budget; static uint8_t cfg_prefer_busy_poll; -/* IRQ params */ +/* NAPI params */ static uint32_t cfg_defer_hard_irqs; static uint64_t cfg_gro_flush_timeout; static uint64_t cfg_irq_suspend_timeout; +static enum netdev_napi_threaded cfg_napi_threaded_poll = NETDEV_NAPI_THREADED_DISABLED; static void usage(const char *filepath) { error(1, 0, - "Usage: %s -p<port> -b<addr> -m<max_events> -u<busy_poll_usecs> -P<prefer_busy_poll> -g<busy_poll_budget> -o<outfile> -d<defer_hard_irqs> -r<gro_flush_timeout> -s<irq_suspend_timeout> -i<ifindex>", + "Usage: %s -p<port> -b<addr> -m<max_events> -u<busy_poll_usecs> -P<prefer_busy_poll> -g<busy_poll_budget> -o<outfile> -d<defer_hard_irqs> -r<gro_flush_timeout> -s<irq_suspend_timeout> -t<napi_threaded_poll> -i<ifindex>", filepath); } @@ -86,7 +87,7 @@ static void parse_opts(int argc, char **argv) if (argc <= 1) usage(argv[0]); - while ((c = getopt(argc, argv, "p:m:b:u:P:g:o:d:r:s:i:")) != -1) { + while ((c = getopt(argc, argv, "p:m:b:u:P:g:o:d:r:s:i:t:")) != -1) { /* most options take integer values, except o and b, so reduce * code duplication a bit for the common case by calling * strtoull here and leave bounds checking and casting per @@ -168,6 +169,12 @@ static void parse_opts(int argc, char **argv) cfg_ifindex = (int)tmp; break; + case 't': + if (tmp > 2) + error(1, ERANGE, "napi threaded poll value must be 0-2"); + + cfg_napi_threaded_poll = (enum netdev_napi_threaded)tmp; + break; } } @@ -247,6 +254,9 @@ static void 
setup_queue(void) netdev_napi_set_req_set_irq_suspend_timeout(set_req, cfg_irq_suspend_timeout); + if (cfg_napi_threaded_poll) + netdev_napi_set_req_set_threaded(set_req, cfg_napi_threaded_poll); + if (netdev_napi_set(ys, set_req)) error(1, 0, "can't set NAPI params: %s\n", yerr.msg); diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh index a94b73a53f72..a88f797c549a 100755 --- a/tools/testing/selftests/net/fib_tests.sh +++ b/tools/testing/selftests/net/fib_tests.sh @@ -11,7 +11,8 @@ TESTS="unregister down carrier nexthop suppress ipv6_notify ipv4_notify \ ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics \ ipv4_route_metrics ipv4_route_v6_gw rp_filter ipv4_del_addr \ ipv6_del_addr ipv4_mangle ipv6_mangle ipv4_bcast_neigh fib6_gc_test \ - ipv4_mpath_list ipv6_mpath_list ipv4_mpath_balance ipv6_mpath_balance" + ipv4_mpath_list ipv6_mpath_list ipv4_mpath_balance ipv6_mpath_balance \ + fib6_ra_to_static" VERBOSE=0 PAUSE_ON_FAIL=no @@ -1476,6 +1477,68 @@ ipv6_route_metrics_test() route_cleanup } +fib6_ra_to_static() +{ + setup + + echo + echo "Fib6 route promotion from RA-learned to static test" + set -e + + # ra6 is required for the test. (ipv6toolkit) + if [ ! -x "$(command -v ra6)" ]; then + echo "SKIP: ra6 not found." + set +e + cleanup &> /dev/null + return + fi + + # Create a pair of veth devices to send a RA message from one + # device to another. + $IP link add veth1 type veth peer name veth2 + $IP link set dev veth1 up + $IP link set dev veth2 up + $IP -6 address add 2001:10::1/64 dev veth1 nodad + $IP -6 address add 2001:10::2/64 dev veth2 nodad + + # Make veth1 ready to receive RA messages. + $NS_EXEC sysctl -wq net.ipv6.conf.veth1.accept_ra=2 + + # Send a RA message with a prefix from veth2. + $NS_EXEC ra6 -i veth2 -d 2001:10::1 -P 2001:12::/64\#LA\#120\#60 + + # Wait for the RA message. + sleep 1 + + # systemd may mess up the test. 
Make sure that + # systemd-networkd.service and systemd-networkd.socket are stopped. + check_rt_num_clean 2 $($IP -6 route list|grep expires|wc -l) || return + + # Configure static address on the same prefix + $IP -6 address add 2001:12::dead/64 dev veth1 nodad + + # On-link route won't expire anymore, default route still owned by RA + check_rt_num 1 $($IP -6 route list |grep expires|wc -l) + + # Send a second RA message with a prefix from veth2. + $NS_EXEC ra6 -i veth2 -d 2001:10::1 -P 2001:12::/64\#LA\#120\#60 + sleep 1 + + # Expire is not back, on-link route is still static + check_rt_num 1 $($IP -6 route list |grep expires|wc -l) + + $IP -6 address del 2001:12::dead/64 dev veth1 nodad + + # Expire is back, on-link route is now owned by RA again + check_rt_num 2 $($IP -6 route list |grep expires|wc -l) + + log_test $ret 0 "ipv6 promote RA route to static" + + set +e + + cleanup &> /dev/null +} + # add route for a prefix, flushing any existing routes first # expected to be the first step of a test add_route() @@ -2798,6 +2861,7 @@ do ipv6_mpath_list) ipv6_mpath_list_test;; ipv4_mpath_balance) ipv4_mpath_balance_test;; ipv6_mpath_balance) ipv6_mpath_balance_test;; + fib6_ra_to_static) fib6_ra_to_static;; help) echo "Test names: $TESTS"; exit 0;; esac diff --git a/tools/testing/selftests/net/forwarding/bridge_mdb.sh b/tools/testing/selftests/net/forwarding/bridge_mdb.sh index 8c1597ebc2d3..e86d77946585 100755 --- a/tools/testing/selftests/net/forwarding/bridge_mdb.sh +++ b/tools/testing/selftests/net/forwarding/bridge_mdb.sh @@ -28,6 +28,7 @@ ALL_TESTS=" cfg_test fwd_test ctrl_test + disable_test " NUM_NETIFS=4 @@ -64,7 +65,10 @@ h2_destroy() switch_create() { - ip link add name br0 type bridge vlan_filtering 1 vlan_default_pvid 0 \ + local vlan_filtering=$1; shift + + ip link add name br0 type bridge \ + vlan_filtering "$vlan_filtering" vlan_default_pvid 0 \ mcast_snooping 1 mcast_igmp_version 3 mcast_mld_version 2 bridge vlan add vid 10 dev br0 self bridge vlan 
add vid 20 dev br0 self @@ -118,7 +122,7 @@ setup_prepare() h1_create h2_create - switch_create + switch_create 1 } cleanup() @@ -1357,6 +1361,98 @@ ctrl_test() ctrl_mldv2_is_in_test } +check_group() +{ + local group=$1; shift + local vid=$1; shift + local should_fail=$1; shift + local when=$1; shift + local -a vidkws + + if ((vid)); then + vidkws=(vid "$vid") + fi + + bridge mdb get dev br0 grp "$group" "${vidkws[@]}" 2>/dev/null | + grep -q "port $swp1" + check_err_fail "$should_fail" $? "$group seen $when snooping disable:" +} + +__disable_test() +{ + local vid=$1; shift + local what=$1; shift + local -a vidkws + + if ((vid)); then + vidkws=(vid "$vid") + fi + + RET=0 + + bridge mdb add dev br0 port "$swp1" grp ff0e::1 permanent \ + "${vidkws[@]}" filter_mode include source_list 2001:db8:1::1 + bridge mdb add dev br0 port "$swp1" grp ff0e::2 permanent \ + "${vidkws[@]}" filter_mode exclude + + bridge mdb add dev br0 port "$swp1" grp ff0e::3 \ + "${vidkws[@]}" filter_mode include source_list 2001:db8:1::2 + bridge mdb add dev br0 port "$swp1" grp ff0e::4 \ + "${vidkws[@]}" filter_mode exclude + + bridge mdb add dev br0 port "$swp1" grp 239.1.1.1 permanent \ + "${vidkws[@]}" filter_mode include source_list 192.0.2.1 + bridge mdb add dev br0 port "$swp1" grp 239.1.1.2 permanent \ + "${vidkws[@]}" filter_mode exclude + + bridge mdb add dev br0 port "$swp1" grp 239.1.1.3 \ + "${vidkws[@]}" filter_mode include source_list 192.0.2.2 + bridge mdb add dev br0 port "$swp1" grp 239.1.1.4 \ + "${vidkws[@]}" filter_mode exclude + + check_group ff0e::1 "$vid" 0 "before" + check_group ff0e::2 "$vid" 0 "before" + check_group ff0e::3 "$vid" 0 "before" + check_group ff0e::4 "$vid" 0 "before" + + check_group 239.1.1.1 "$vid" 0 "before" + check_group 239.1.1.2 "$vid" 0 "before" + check_group 239.1.1.3 "$vid" 0 "before" + check_group 239.1.1.4 "$vid" 0 "before" + + ip link set dev br0 type bridge mcast_snooping 0 + + check_group ff0e::1 "$vid" 0 "after" + check_group ff0e::2 "$vid" 
0 "after" + check_group ff0e::3 "$vid" 1 "after" + check_group ff0e::4 "$vid" 1 "after" + + check_group 239.1.1.1 "$vid" 0 "after" + check_group 239.1.1.2 "$vid" 0 "after" + check_group 239.1.1.3 "$vid" 1 "after" + check_group 239.1.1.4 "$vid" 1 "after" + + log_test "$what: Flush after disable" + + ip link set dev br0 type bridge mcast_snooping 1 + sleep 10 +} + +disable_test() +{ + __disable_test 10 802.1q + + switch_destroy + switch_create 0 + setup_wait + + __disable_test 0 802.1d + + switch_destroy + switch_create 1 + setup_wait +} + if ! bridge mdb help 2>&1 | grep -q "flush"; then echo "SKIP: iproute2 too old, missing bridge mdb flush support" exit $ksft_skip diff --git a/tools/testing/selftests/net/gro.sh b/tools/testing/selftests/net/gro.sh deleted file mode 100755 index 4c5144c6f652..000000000000 --- a/tools/testing/selftests/net/gro.sh +++ /dev/null @@ -1,105 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -readonly SERVER_MAC="aa:00:00:00:00:02" -readonly CLIENT_MAC="aa:00:00:00:00:01" -readonly TESTS=("data" "ack" "flags" "tcp" "ip" "large") -readonly PROTOS=("ipv4" "ipv6" "ipip") -dev="" -test="all" -proto="ipv4" - -run_test() { - local server_pid=0 - local exit_code=0 - local protocol=$1 - local test=$2 - local ARGS=( "--${protocol}" "--dmac" "${SERVER_MAC}" \ - "--smac" "${CLIENT_MAC}" "--test" "${test}" "--verbose" ) - - setup_ns - # Each test is run 6 times to deflake, because given the receive timing, - # not all packets that should coalesce will be considered in the same flow - # on every try. - for tries in {1..6}; do - # Actual test starts here - ip netns exec $server_ns ./gro "${ARGS[@]}" "--rx" "--iface" "server" \ - 1>>log.txt & - server_pid=$! - sleep 0.5 # to allow for socket init - ip netns exec $client_ns ./gro "${ARGS[@]}" "--iface" "client" \ - 1>>log.txt - wait "${server_pid}" - exit_code=$? 
- if [[ ${test} == "large" && -n "${KSFT_MACHINE_SLOW}" && \ - ${exit_code} -ne 0 ]]; then - echo "Ignoring errors due to slow environment" 1>&2 - exit_code=0 - fi - if [[ "${exit_code}" -eq 0 ]]; then - break; - fi - done - cleanup_ns - echo ${exit_code} -} - -run_all_tests() { - local failed_tests=() - for proto in "${PROTOS[@]}"; do - for test in "${TESTS[@]}"; do - echo "running test ${proto} ${test}" >&2 - exit_code=$(run_test $proto $test) - if [[ "${exit_code}" -ne 0 ]]; then - failed_tests+=("${proto}_${test}") - fi; - done; - done - if [[ ${#failed_tests[@]} -ne 0 ]]; then - echo "failed tests: ${failed_tests[*]}. \ - Please see log.txt for more logs" - exit 1 - else - echo "All Tests Succeeded!" - fi; -} - -usage() { - echo "Usage: $0 \ - [-i <DEV>] \ - [-t data|ack|flags|tcp|ip|large] \ - [-p <ipv4|ipv6>]" 1>&2; - exit 1; -} - -while getopts "i:t:p:" opt; do - case "${opt}" in - i) - dev="${OPTARG}" - ;; - t) - test="${OPTARG}" - ;; - p) - proto="${OPTARG}" - ;; - *) - usage - ;; - esac -done - -if [ -n "$dev" ]; then - source setup_loopback.sh -else - source setup_veth.sh -fi - -setup -trap cleanup EXIT -if [[ "${test}" == "all" ]]; then - run_all_tests -else - exit_code=$(run_test "${proto}" "${test}") - exit $exit_code -fi; diff --git a/tools/testing/selftests/net/io_uring_zerocopy_tx.c b/tools/testing/selftests/net/io_uring_zerocopy_tx.c index 76e604e4810e..7bfeeb133705 100644 --- a/tools/testing/selftests/net/io_uring_zerocopy_tx.c +++ b/tools/testing/selftests/net/io_uring_zerocopy_tx.c @@ -106,14 +106,14 @@ static void do_tx(int domain, int type, int protocol) ret = io_uring_queue_init(512, &ring, 0); if (ret) - error(1, ret, "io_uring: queue init"); + error(1, -ret, "io_uring: queue init"); iov.iov_base = payload; iov.iov_len = cfg_payload_len; ret = io_uring_register_buffers(&ring, &iov, 1); if (ret) - error(1, ret, "io_uring: buffer registration"); + error(1, -ret, "io_uring: buffer registration"); tstop = gettimeofday_ms() + cfg_runtime_ms; do 
{ @@ -149,24 +149,24 @@ static void do_tx(int domain, int type, int protocol) ret = io_uring_submit(&ring); if (ret != cfg_nr_reqs) - error(1, ret, "submit"); + error(1, -ret, "submit"); if (cfg_cork) do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 0); for (i = 0; i < cfg_nr_reqs; i++) { ret = io_uring_wait_cqe(&ring, &cqe); if (ret) - error(1, ret, "wait cqe"); + error(1, -ret, "wait cqe"); if (cqe->user_data != NONZC_TAG && cqe->user_data != ZC_TAG) - error(1, -EINVAL, "invalid cqe->user_data"); + error(1, EINVAL, "invalid cqe->user_data"); if (cqe->flags & IORING_CQE_F_NOTIF) { if (cqe->flags & IORING_CQE_F_MORE) - error(1, -EINVAL, "invalid notif flags"); + error(1, EINVAL, "invalid notif flags"); if (compl_cqes <= 0) - error(1, -EINVAL, "notification mismatch"); + error(1, EINVAL, "notification mismatch"); compl_cqes--; i--; io_uring_cqe_seen(&ring); @@ -174,14 +174,14 @@ static void do_tx(int domain, int type, int protocol) } if (cqe->flags & IORING_CQE_F_MORE) { if (cqe->user_data != ZC_TAG) - error(1, cqe->res, "unexpected F_MORE"); + error(1, -cqe->res, "unexpected F_MORE"); compl_cqes++; } if (cqe->res >= 0) { packets++; bytes += cqe->res; } else if (cqe->res != -EAGAIN) { - error(1, cqe->res, "send failed"); + error(1, -cqe->res, "send failed"); } io_uring_cqe_seen(&ring); } @@ -190,11 +190,11 @@ static void do_tx(int domain, int type, int protocol) while (compl_cqes) { ret = io_uring_wait_cqe(&ring, &cqe); if (ret) - error(1, ret, "wait cqe"); + error(1, -ret, "wait cqe"); if (cqe->flags & IORING_CQE_F_MORE) - error(1, -EINVAL, "invalid notif flags"); + error(1, EINVAL, "invalid notif flags"); if (!(cqe->flags & IORING_CQE_F_NOTIF)) - error(1, -EINVAL, "missing notif flag"); + error(1, EINVAL, "missing notif flag"); io_uring_cqe_seen(&ring); compl_cqes--; diff --git a/tools/testing/selftests/net/lib/Makefile b/tools/testing/selftests/net/lib/Makefile index ce795bc0a1af..5339f56329e1 100644 --- a/tools/testing/selftests/net/lib/Makefile +++ 
b/tools/testing/selftests/net/lib/Makefile @@ -8,6 +8,7 @@ CFLAGS += -I../../ TEST_FILES := \ ../../../../net/ynl \ ../../../../../Documentation/netlink/specs \ + ksft_setup_loopback.sh \ # end of TEST_FILES TEST_GEN_FILES := \ diff --git a/tools/testing/selftests/net/lib/ksft_setup_loopback.sh b/tools/testing/selftests/net/lib/ksft_setup_loopback.sh new file mode 100755 index 000000000000..3defbb1919c5 --- /dev/null +++ b/tools/testing/selftests/net/lib/ksft_setup_loopback.sh @@ -0,0 +1,111 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Setup script for running ksft tests over a real interface in loopback mode. +# This scripts replaces the historical setup_loopback.sh. It puts +# a (presumably) real hardware interface into loopback mode, creates macvlan +# interfaces on top and places them in a network namespace for isolation. +# +# NETIF env variable must be exported to indicate the real target device. +# Note that the test will override NETIF with one of the macvlans, the +# actual ksft test will only see the macvlans. +# +# Example use: +# export NETIF=eth0 +# ./net/lib/ksft_setup_loopback.sh ./drivers/net/gro.py + +if [ -z "$NETIF" ]; then + echo "Error: NETIF variable not set" + exit 1 +fi +if ! [ -d "/sys/class/net/$NETIF" ]; then + echo "Error: Can't find $NETIF, invalid netdevice" + exit 1 +fi + +# Save original settings for cleanup +readonly FLUSH_PATH="/sys/class/net/${NETIF}/gro_flush_timeout" +readonly IRQ_PATH="/sys/class/net/${NETIF}/napi_defer_hard_irqs" +FLUSH_TIMEOUT="$(< "${FLUSH_PATH}")" +readonly FLUSH_TIMEOUT +HARD_IRQS="$(< "${IRQ_PATH}")" +readonly HARD_IRQS + +SERVER_NS=$(mktemp -u server-XXXXXXXX) +readonly SERVER_NS +CLIENT_NS=$(mktemp -u client-XXXXXXXX) +readonly CLIENT_NS +readonly SERVER_MAC="aa:00:00:00:00:02" +readonly CLIENT_MAC="aa:00:00:00:00:01" + +# ksft expects addresses to communicate with remote +export LOCAL_V6=2001:db8:1::1 +export REMOTE_V6=2001:db8:1::2 + +cleanup() { + local exit_code=$? 
+ + echo "Cleaning up..." + + # Remove macvlan interfaces and namespaces + ip -netns "${SERVER_NS}" link del dev server 2>/dev/null || true + ip netns del "${SERVER_NS}" 2>/dev/null || true + ip -netns "${CLIENT_NS}" link del dev client 2>/dev/null || true + ip netns del "${CLIENT_NS}" 2>/dev/null || true + + # Disable loopback + ethtool -K "${NETIF}" loopback off 2>/dev/null || true + sleep 1 + + echo "${FLUSH_TIMEOUT}" >"${FLUSH_PATH}" + echo "${HARD_IRQS}" >"${IRQ_PATH}" + + exit $exit_code +} + +trap cleanup EXIT INT TERM + +# Enable loopback mode +echo "Enabling loopback on ${NETIF}..." +ethtool -K "${NETIF}" loopback on || { + echo "Failed to enable loopback mode" + exit 1 +} +# The interface may need time to get carrier back, but selftests +# will wait for carrier, so no need to wait / sleep here. + +# Use timer on host to trigger the network stack +# Also disable device interrupt to not depend on NIC interrupt +# Reduce test flakiness caused by unexpected interrupts +echo 100000 >"${FLUSH_PATH}" +echo 50 >"${IRQ_PATH}" + +# Create server namespace with macvlan +ip netns add "${SERVER_NS}" +ip link add link "${NETIF}" dev server address "${SERVER_MAC}" type macvlan +ip link set dev server netns "${SERVER_NS}" +ip -netns "${SERVER_NS}" link set dev server up +ip -netns "${SERVER_NS}" addr add $LOCAL_V6/64 dev server +ip -netns "${SERVER_NS}" link set dev lo up + +# Create client namespace with macvlan +ip netns add "${CLIENT_NS}" +ip link add link "${NETIF}" dev client address "${CLIENT_MAC}" type macvlan +ip link set dev client netns "${CLIENT_NS}" +ip -netns "${CLIENT_NS}" link set dev client up +ip -netns "${CLIENT_NS}" addr add $REMOTE_V6/64 dev client +ip -netns "${CLIENT_NS}" link set dev lo up + +echo "Setup complete!" 
+echo " Device: ${NETIF}" +echo " Server NS: ${SERVER_NS}" +echo " Client NS: ${CLIENT_NS}" +echo "" + +# Setup environment variables for tests +export NETIF=server +export REMOTE_TYPE=netns +export REMOTE_ARGS="${CLIENT_NS}" + +# Run the command +ip netns exec "${SERVER_NS}" "$@" diff --git a/tools/testing/selftests/net/lib/py/__init__.py b/tools/testing/selftests/net/lib/py/__init__.py index 97b7cf2b20eb..40f9ce307dd1 100644 --- a/tools/testing/selftests/net/lib/py/__init__.py +++ b/tools/testing/selftests/net/lib/py/__init__.py @@ -8,7 +8,8 @@ from .consts import KSRC from .ksft import KsftFailEx, KsftSkipEx, KsftXfailEx, ksft_pr, ksft_eq, \ ksft_ne, ksft_true, ksft_not_none, ksft_in, ksft_not_in, ksft_is, \ ksft_ge, ksft_gt, ksft_lt, ksft_raises, ksft_busy_wait, \ - ktap_result, ksft_disruptive, ksft_setup, ksft_run, ksft_exit + ktap_result, ksft_disruptive, ksft_setup, ksft_run, ksft_exit, \ + ksft_variants, KsftNamedVariant from .netns import NetNS, NetNSEnter from .nsim import NetdevSim, NetdevSimDev from .utils import CmdExitFailure, fd_read_timeout, cmd, bkg, defer, \ @@ -21,7 +22,7 @@ __all__ = ["KSRC", "ksft_ne", "ksft_true", "ksft_not_none", "ksft_in", "ksft_not_in", "ksft_is", "ksft_ge", "ksft_gt", "ksft_lt", "ksft_raises", "ksft_busy_wait", "ktap_result", "ksft_disruptive", "ksft_setup", - "ksft_run", "ksft_exit", + "ksft_run", "ksft_exit", "ksft_variants", "KsftNamedVariant", "NetNS", "NetNSEnter", "CmdExitFailure", "fd_read_timeout", "cmd", "bkg", "defer", "bpftool", "ip", "ethtool", "bpftrace", "rand_port", diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py index 83b1574f7719..531e7fa1b3ea 100644 --- a/tools/testing/selftests/net/lib/py/ksft.py +++ b/tools/testing/selftests/net/lib/py/ksft.py @@ -1,12 +1,12 @@ # SPDX-License-Identifier: GPL-2.0 -import builtins import functools import inspect import signal import sys import time import traceback +from collections import namedtuple from .consts 
import KSFT_MAIN_NAME from .utils import global_defer_queue @@ -136,7 +136,7 @@ def ksft_busy_wait(cond, sleep=0.005, deadline=1, comment=""): time.sleep(sleep) -def ktap_result(ok, cnt=1, case="", comment=""): +def ktap_result(ok, cnt=1, case_name="", comment=""): global KSFT_RESULT_ALL KSFT_RESULT_ALL = KSFT_RESULT_ALL and ok @@ -146,8 +146,8 @@ def ktap_result(ok, cnt=1, case="", comment=""): res += "ok " res += str(cnt) + " " res += KSFT_MAIN_NAME - if case: - res += "." + str(case.__name__) + if case_name: + res += "." + case_name if comment: res += " # " + comment print(res, flush=True) @@ -163,7 +163,7 @@ def ksft_flush_defer(): entry = global_defer_queue.pop() try: entry.exec_only() - except: + except Exception: ksft_pr(f"Exception while handling defer / cleanup (callback {i} of {qlen_start})!") tb = traceback.format_exc() for line in tb.strip().split('\n'): @@ -171,6 +171,10 @@ def ksft_flush_defer(): KSFT_RESULT = False +KsftCaseFunction = namedtuple("KsftCaseFunction", + ['name', 'original_func', 'variants']) + + def ksft_disruptive(func): """ Decorator that marks the test as disruptive (e.g. the test @@ -181,11 +185,47 @@ def ksft_disruptive(func): @functools.wraps(func) def wrapper(*args, **kwargs): if not KSFT_DISRUPTIVE: - raise KsftSkipEx(f"marked as disruptive") + raise KsftSkipEx("marked as disruptive") return func(*args, **kwargs) return wrapper +class KsftNamedVariant: + """ Named string name + argument list tuple for @ksft_variants """ + + def __init__(self, name, *params): + self.params = params + self.name = name or "_".join([str(x) for x in self.params]) + + +def ksft_variants(params): + """ + Decorator defining the sets of inputs for a test. + The parameters will be included in the name of the resulting sub-case. + Parameters can be either single object, tuple or a KsftNamedVariant. + The argument can be a list or a generator. 
+ + Example: + + @ksft_variants([ + (1, "a"), + (2, "b"), + KsftNamedVariant("three", 3, "c"), + ]) + def my_case(cfg, a, b): + pass # ... + + ksft_run(cases=[my_case], args=(cfg, )) + + Will generate cases: + my_case.1_a + my_case.2_b + my_case.three + """ + + return lambda func: KsftCaseFunction(func.__name__, func, params) + + def ksft_setup(env): """ Setup test framework global state from the environment. @@ -199,7 +239,7 @@ def ksft_setup(env): return False try: return bool(int(value)) - except: + except Exception: raise Exception(f"failed to parse {name}") if "DISRUPTIVE" in env: @@ -220,9 +260,13 @@ def _ksft_intr(signum, frame): ksft_pr(f"Ignoring SIGTERM (cnt: {term_cnt}), already exiting...") -def ksft_run(cases=None, globs=None, case_pfx=None, args=()): +def _ksft_generate_test_cases(cases, globs, case_pfx, args): + """Generate a flat list of (func, args, name) tuples""" + cases = cases or [] + test_cases = [] + # If using the globs method find all relevant functions if globs and case_pfx: for key, value in globs.items(): if not callable(value): @@ -232,6 +276,27 @@ def ksft_run(cases=None, globs=None, case_pfx=None, args=()): cases.append(value) break + for func in cases: + if isinstance(func, KsftCaseFunction): + # Parametrized test - create case for each param + for param in func.variants: + if not isinstance(param, KsftNamedVariant): + if not isinstance(param, tuple): + param = (param, ) + param = KsftNamedVariant(None, *param) + + test_cases.append((func.original_func, + (*args, *param.params), + func.name + "." 
+ param.name)) + else: + test_cases.append((func, args, func.__name__)) + + return test_cases + + +def ksft_run(cases=None, globs=None, case_pfx=None, args=()): + test_cases = _ksft_generate_test_cases(cases, globs, case_pfx, args) + global term_cnt term_cnt = 0 prev_sigterm = signal.signal(signal.SIGTERM, _ksft_intr) @@ -239,19 +304,19 @@ def ksft_run(cases=None, globs=None, case_pfx=None, args=()): totals = {"pass": 0, "fail": 0, "skip": 0, "xfail": 0} print("TAP version 13", flush=True) - print("1.." + str(len(cases)), flush=True) + print("1.." + str(len(test_cases)), flush=True) global KSFT_RESULT cnt = 0 stop = False - for case in cases: + for func, args, name in test_cases: KSFT_RESULT = True cnt += 1 comment = "" cnt_key = "" try: - case(*args) + func(*args) except KsftSkipEx as e: comment = "SKIP " + str(e) cnt_key = 'skip' @@ -268,12 +333,26 @@ def ksft_run(cases=None, globs=None, case_pfx=None, args=()): KSFT_RESULT = False cnt_key = 'fail' - ksft_flush_defer() + try: + ksft_flush_defer() + except BaseException as e: + tb = traceback.format_exc() + for line in tb.strip().split('\n'): + ksft_pr("Exception|", line) + if isinstance(e, KeyboardInterrupt): + ksft_pr() + ksft_pr("WARN: defer() interrupted, cleanup may be incomplete.") + ksft_pr(" Attempting to finish cleanup before exiting.") + ksft_pr(" Interrupt again to exit immediately.") + ksft_pr() + stop = True + # Flush was interrupted, try to finish the job best we can + ksft_flush_defer() if not cnt_key: cnt_key = 'pass' if KSFT_RESULT else 'fail' - ktap_result(KSFT_RESULT, cnt, case, comment=comment) + ktap_result(KSFT_RESULT, cnt, name, comment=comment) totals[cnt_key] += 1 if stop: diff --git a/tools/testing/selftests/net/lib/py/nsim.py b/tools/testing/selftests/net/lib/py/nsim.py index 1a8cbe9acc48..7c640ed64c0b 100644 --- a/tools/testing/selftests/net/lib/py/nsim.py +++ b/tools/testing/selftests/net/lib/py/nsim.py @@ -27,7 +27,7 @@ class NetdevSim: self.port_index = port_index self.ns = ns 
self.dfs_dir = "%s/ports/%u/" % (nsimdev.dfs_dir, port_index) - ret = ip("-j link show dev %s" % ifname, ns=ns) + ret = ip("-d -j link show dev %s" % ifname, ns=ns) self.dev = json.loads(ret.stdout)[0] self.ifindex = self.dev["ifindex"] diff --git a/tools/testing/selftests/net/lib/py/utils.py b/tools/testing/selftests/net/lib/py/utils.py index cb40ecef9456..106ee1f2df86 100644 --- a/tools/testing/selftests/net/lib/py/utils.py +++ b/tools/testing/selftests/net/lib/py/utils.py @@ -32,7 +32,7 @@ class cmd: Use bkg() instead to run a command in the background. """ def __init__(self, comm, shell=None, fail=True, ns=None, background=False, - host=None, timeout=5, ksft_wait=None): + host=None, timeout=5, ksft_ready=None, ksft_wait=None): if ns: comm = f'ip netns exec {ns} ' + comm @@ -52,21 +52,25 @@ class cmd: # ksft_wait lets us wait for the background process to fully start, # we pass an FD to the child process, and wait for it to write back. # Similarly term_fd tells child it's time to exit. 
- pass_fds = () + pass_fds = [] env = os.environ.copy() if ksft_wait is not None: - rfd, ready_fd = os.pipe() wait_fd, self.ksft_term_fd = os.pipe() - pass_fds = (ready_fd, wait_fd, ) - env["KSFT_READY_FD"] = str(ready_fd) + pass_fds.append(wait_fd) env["KSFT_WAIT_FD"] = str(wait_fd) + ksft_ready = True # ksft_wait implies ready + if ksft_ready is not None: + rfd, ready_fd = os.pipe() + pass_fds.append(ready_fd) + env["KSFT_READY_FD"] = str(ready_fd) self.proc = subprocess.Popen(comm, shell=shell, stdout=subprocess.PIPE, stderr=subprocess.PIPE, pass_fds=pass_fds, env=env) if ksft_wait is not None: - os.close(ready_fd) os.close(wait_fd) + if ksft_ready is not None: + os.close(ready_fd) msg = fd_read_timeout(rfd, ksft_wait) os.close(rfd) if not msg: @@ -116,10 +120,10 @@ class bkg(cmd): with bkg("my_binary", ksft_wait=5): """ def __init__(self, comm, shell=None, fail=None, ns=None, host=None, - exit_wait=False, ksft_wait=None): + exit_wait=False, ksft_ready=None, ksft_wait=None): super().__init__(comm, background=True, shell=shell, fail=fail, ns=ns, host=host, - ksft_wait=ksft_wait) + ksft_ready=ksft_ready, ksft_wait=ksft_wait) self.terminate = not exit_wait and not ksft_wait self._exit_wait = exit_wait self.check_fail = fail diff --git a/tools/testing/selftests/net/lib/xdp_native.bpf.c b/tools/testing/selftests/net/lib/xdp_native.bpf.c index c368fc045f4b..64f05229ab24 100644 --- a/tools/testing/selftests/net/lib/xdp_native.bpf.c +++ b/tools/testing/selftests/net/lib/xdp_native.bpf.c @@ -332,7 +332,7 @@ static __u16 csum_fold_helper(__u32 csum) } static int xdp_adjst_tail_shrnk_data(struct xdp_md *ctx, __u16 offset, - __u32 hdr_len) + unsigned long hdr_len) { char tmp_buff[MAX_ADJST_OFFSET]; __u32 buff_pos, udp_csum = 0; @@ -422,8 +422,9 @@ static int xdp_adjst_tail(struct xdp_md *ctx, __u16 port) { struct udphdr *udph = NULL; __s32 *adjust_offset, *val; - __u32 key, hdr_len; + unsigned long hdr_len; void *offset_ptr; + __u32 key; __u8 tag; int ret; diff --git 
a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index fc7e22b503d3..404a77bf366a 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -1072,6 +1072,8 @@ static void check_getpeername_connect(int fd) socklen_t salen = sizeof(ss); char a[INET6_ADDRSTRLEN]; char b[INET6_ADDRSTRLEN]; + const char *iface; + size_t len; if (getpeername(fd, (struct sockaddr *)&ss, &salen) < 0) { perror("getpeername"); @@ -1081,7 +1083,13 @@ static void check_getpeername_connect(int fd) xgetnameinfo((struct sockaddr *)&ss, salen, a, sizeof(a), b, sizeof(b)); - if (strcmp(cfg_host, a) || strcmp(cfg_port, b)) + iface = strchr(cfg_host, '%'); + if (iface) + len = iface - cfg_host; + else + len = strlen(cfg_host) + 1; + + if (strncmp(cfg_host, a, len) || strcmp(cfg_port, b)) fprintf(stderr, "%s: %s vs %s, %s vs %s\n", __func__, cfg_host, a, cfg_port, b); } diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index 9b7b93f8eb0c..a6447f7a31fe 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -375,81 +375,75 @@ do_transfer() local capfile="${rndh}-${connector_ns:0:3}-${listener_ns:0:3}-${cl_proto}-${srv_proto}-${connect_addr}-${port}" local capopt="-i any -s 65535 -B 32768 ${capuser}" - ip netns exec ${listener_ns} tcpdump ${capopt} -w "${capfile}-listener.pcap" >> "${capout}" 2>&1 & + ip netns exec ${listener_ns} tcpdump ${capopt} \ + -w "${capfile}-listener.pcap" >> "${capout}" 2>&1 & local cappid_listener=$! - ip netns exec ${connector_ns} tcpdump ${capopt} -w "${capfile}-connector.pcap" >> "${capout}" 2>&1 & - local cappid_connector=$! + if [ ${listener_ns} != ${connector_ns} ]; then + ip netns exec ${connector_ns} tcpdump ${capopt} \ + -w "${capfile}-connector.pcap" >> "${capout}" 2>&1 & + local cappid_connector=$! 
+ fi sleep 1 fi - NSTAT_HISTORY=/tmp/${listener_ns}.nstat ip netns exec ${listener_ns} \ - nstat -n + mptcp_lib_nstat_init "${listener_ns}" if [ ${listener_ns} != ${connector_ns} ]; then - NSTAT_HISTORY=/tmp/${connector_ns}.nstat ip netns exec ${connector_ns} \ - nstat -n - fi - - local stat_synrx_last_l - local stat_ackrx_last_l - local stat_cookietx_last - local stat_cookierx_last - local stat_csum_err_s - local stat_csum_err_c - local stat_tcpfb_last_l - stat_synrx_last_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableSYNRX") - stat_ackrx_last_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableACKRX") - stat_cookietx_last=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtSyncookiesSent") - stat_cookierx_last=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtSyncookiesRecv") - stat_csum_err_s=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtDataCsumErr") - stat_csum_err_c=$(mptcp_lib_get_counter "${connector_ns}" "MPTcpExtDataCsumErr") - stat_tcpfb_last_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableFallbackACK") - - timeout ${timeout_test} \ - ip netns exec ${listener_ns} \ - ./mptcp_connect -t ${timeout_poll} -l -p $port -s ${srv_proto} \ - $extra_args $local_addr < "$sin" > "$sout" & + mptcp_lib_nstat_init "${connector_ns}" + fi + + ip netns exec ${listener_ns} \ + ./mptcp_connect -t ${timeout_poll} -l -p $port -s ${srv_proto} \ + $extra_args $local_addr < "$sin" > "$sout" & local spid=$! mptcp_lib_wait_local_port_listen "${listener_ns}" "${port}" local start start=$(date +%s%3N) - timeout ${timeout_test} \ - ip netns exec ${connector_ns} \ - ./mptcp_connect -t ${timeout_poll} -p $port -s ${cl_proto} \ - $extra_args $connect_addr < "$cin" > "$cout" & + ip netns exec ${connector_ns} \ + ./mptcp_connect -t ${timeout_poll} -p $port -s ${cl_proto} \ + $extra_args $connect_addr < "$cin" > "$cout" & local cpid=$! 
+ mptcp_lib_wait_timeout "${timeout_test}" "${listener_ns}" \ + "${connector_ns}" "${port}" "${cpid}" "${spid}" & + local timeout_pid=$! + wait $cpid local retc=$? wait $spid local rets=$? + if kill -0 $timeout_pid; then + # Finished before the timeout: kill the background job + mptcp_lib_kill_group_wait $timeout_pid + timeout_pid=0 + fi + local stop stop=$(date +%s%3N) if $capture; then sleep 1 kill ${cappid_listener} - kill ${cappid_connector} + if [ ${listener_ns} != ${connector_ns} ]; then + kill ${cappid_connector} + fi fi - NSTAT_HISTORY=/tmp/${listener_ns}.nstat ip netns exec ${listener_ns} \ - nstat | grep Tcp > /tmp/${listener_ns}.out + mptcp_lib_nstat_get "${listener_ns}" if [ ${listener_ns} != ${connector_ns} ]; then - NSTAT_HISTORY=/tmp/${connector_ns}.nstat ip netns exec ${connector_ns} \ - nstat | grep Tcp > /tmp/${connector_ns}.out + mptcp_lib_nstat_get "${connector_ns}" fi local duration duration=$((stop-start)) printf "(duration %05sms) " "${duration}" - if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then + if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ] || [ ${timeout_pid} -ne 0 ]; then mptcp_lib_pr_fail "client exit code $retc, server $rets" - mptcp_lib_pr_err_stats "${listener_ns}" "${connector_ns}" "${port}" \ - "/tmp/${listener_ns}.out" "/tmp/${connector_ns}.out" + mptcp_lib_pr_err_stats "${listener_ns}" "${connector_ns}" "${port}" echo cat "$capout" @@ -463,38 +457,38 @@ do_transfer() rets=$? 
local extra="" - local stat_synrx_now_l - local stat_ackrx_now_l - local stat_cookietx_now - local stat_cookierx_now - local stat_ooo_now - local stat_tcpfb_now_l - stat_synrx_now_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableSYNRX") - stat_ackrx_now_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableACKRX") - stat_cookietx_now=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtSyncookiesSent") - stat_cookierx_now=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtSyncookiesRecv") - stat_ooo_now=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtTCPOFOQueue") - stat_tcpfb_now_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableFallbackACK") - - expect_synrx=$((stat_synrx_last_l)) - expect_ackrx=$((stat_ackrx_last_l)) + local stat_synrx + local stat_ackrx + local stat_cookietx + local stat_cookierx + local stat_ooo + local stat_tcpfb + stat_synrx=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableSYNRX") + stat_ackrx=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableACKRX") + stat_cookietx=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtSyncookiesSent") + stat_cookierx=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtSyncookiesRecv") + stat_ooo=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtTCPOFOQueue") + stat_tcpfb=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableFallbackACK") + + expect_synrx=0 + expect_ackrx=0 cookies=$(ip netns exec ${listener_ns} sysctl net.ipv4.tcp_syncookies) cookies=${cookies##*=} if [ ${cl_proto} = "MPTCP" ] && [ ${srv_proto} = "MPTCP" ]; then - expect_synrx=$((stat_synrx_last_l+connect_per_transfer)) - expect_ackrx=$((stat_ackrx_last_l+connect_per_transfer)) + expect_synrx=${connect_per_transfer} + expect_ackrx=${connect_per_transfer} fi - if [ ${stat_synrx_now_l} -lt ${expect_synrx} ]; then - mptcp_lib_pr_fail "lower MPC SYN rx (${stat_synrx_now_l})" \ + if [ ${stat_synrx} -lt ${expect_synrx} ]; then + mptcp_lib_pr_fail "lower MPC SYN rx (${stat_synrx})" \ "than expected 
(${expect_synrx})" retc=1 fi - if [ ${stat_ackrx_now_l} -lt ${expect_ackrx} ]; then - if [ ${stat_ooo_now} -eq 0 ]; then - mptcp_lib_pr_fail "lower MPC ACK rx (${stat_ackrx_now_l})" \ + if [ ${stat_ackrx} -lt ${expect_ackrx} ]; then + if [ ${stat_ooo} -eq 0 ]; then + mptcp_lib_pr_fail "lower MPC ACK rx (${stat_ackrx})" \ "than expected (${expect_ackrx})" rets=1 else @@ -508,47 +502,45 @@ do_transfer() csum_err_s=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtDataCsumErr") csum_err_c=$(mptcp_lib_get_counter "${connector_ns}" "MPTcpExtDataCsumErr") - local csum_err_s_nr=$((csum_err_s - stat_csum_err_s)) - if [ $csum_err_s_nr -gt 0 ]; then - mptcp_lib_pr_fail "server got ${csum_err_s_nr} data checksum error[s]" + if [ $csum_err_s -gt 0 ]; then + mptcp_lib_pr_fail "server got ${csum_err_s} data checksum error[s]" rets=1 fi - local csum_err_c_nr=$((csum_err_c - stat_csum_err_c)) - if [ $csum_err_c_nr -gt 0 ]; then - mptcp_lib_pr_fail "client got ${csum_err_c_nr} data checksum error[s]" + if [ $csum_err_c -gt 0 ]; then + mptcp_lib_pr_fail "client got ${csum_err_c} data checksum error[s]" retc=1 fi fi - if [ ${stat_ooo_now} -eq 0 ] && [ ${stat_tcpfb_last_l} -ne ${stat_tcpfb_now_l} ]; then + if [ ${stat_ooo} -eq 0 ] && [ ${stat_tcpfb} -gt 0 ]; then mptcp_lib_pr_fail "unexpected fallback to TCP" rets=1 fi if [ $cookies -eq 2 ];then - if [ $stat_cookietx_last -ge $stat_cookietx_now ] ;then + if [ $stat_cookietx -eq 0 ] ;then extra+=" WARN: CookieSent: did not advance" fi - if [ $stat_cookierx_last -ge $stat_cookierx_now ] ;then + if [ $stat_cookierx -eq 0 ] ;then extra+=" WARN: CookieRecv: did not advance" fi else - if [ $stat_cookietx_last -ne $stat_cookietx_now ] ;then + if [ $stat_cookietx -gt 0 ] ;then extra+=" WARN: CookieSent: changed" fi - if [ $stat_cookierx_last -ne $stat_cookierx_now ] ;then + if [ $stat_cookierx -gt 0 ] ;then extra+=" WARN: CookieRecv: changed" fi fi - if [ ${stat_synrx_now_l} -gt ${expect_synrx} ]; then + if [ ${stat_synrx} -gt ${expect_synrx} 
]; then extra+=" WARN: SYNRX: expect ${expect_synrx}," - extra+=" got ${stat_synrx_now_l} (probably retransmissions)" + extra+=" got ${stat_synrx} (probably retransmissions)" fi - if [ ${stat_ackrx_now_l} -gt ${expect_ackrx} ]; then + if [ ${stat_ackrx} -gt ${expect_ackrx} ]; then extra+=" WARN: ACKRX: expect ${expect_ackrx}," - extra+=" got ${stat_ackrx_now_l} (probably retransmissions)" + extra+=" got ${stat_ackrx} (probably retransmissions)" fi if [ $retc -eq 0 ] && [ $rets -eq 0 ]; then diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 43f31f8d587f..b2e6e548f796 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -62,6 +62,7 @@ unset sflags unset fastclose unset fullmesh unset speed +unset bind_addr unset join_syn_rej unset join_csum_ns1 unset join_csum_ns2 @@ -645,6 +646,27 @@ wait_mpj() done } +wait_ll_ready() +{ + local ns="${1}" + + local i + for i in $(seq 50); do + ip -n "${ns}" -6 addr show scope link | grep "inet6 fe80" | + grep -qw "tentative" || break + sleep 0.1 + done +} + +get_ll_addr() +{ + local ns="${1}" + local iface="${2}" + + ip -n "${ns}" -6 addr show dev "${iface}" scope link | + grep "inet6 fe80" | sed 's#.*\(fe80::.*\)/.*#\1#' +} + kill_events_pids() { mptcp_lib_kill_wait $evts_ns1_pid @@ -951,6 +973,9 @@ do_transfer() local FAILING_LINKS=${FAILING_LINKS:-""} local fastclose=${fastclose:-""} local speed=${speed:-"fast"} + local bind_addr=${bind_addr:-"::"} + local listener_in="${sin}" + local connector_in="${cin}" port=$(get_port) :> "$cout" @@ -958,10 +983,8 @@ do_transfer() cond_start_capture ${listener_ns} - NSTAT_HISTORY=/tmp/${listener_ns}.nstat ip netns exec ${listener_ns} \ - nstat -n - NSTAT_HISTORY=/tmp/${connector_ns}.nstat ip netns exec ${connector_ns} \ - nstat -n + mptcp_lib_nstat_init "${listener_ns}" + mptcp_lib_nstat_init "${connector_ns}" local extra_args if [ $speed = "fast" ]; then @@ 
-999,42 +1022,40 @@ do_transfer() extra_srv_args="$extra_args $extra_srv_args" if [ "$test_linkfail" -gt 1 ];then - timeout ${timeout_test} \ - ip netns exec ${listener_ns} \ - ./mptcp_connect -t ${timeout_poll} -l -p $port -s ${srv_proto} \ - $extra_srv_args "::" < "$sinfail" > "$sout" & - else - timeout ${timeout_test} \ - ip netns exec ${listener_ns} \ - ./mptcp_connect -t ${timeout_poll} -l -p $port -s ${srv_proto} \ - $extra_srv_args "::" < "$sin" > "$sout" & + listener_in="${sinfail}" fi + ip netns exec ${listener_ns} \ + ./mptcp_connect -t ${timeout_poll} -l -p ${port} -s ${srv_proto} \ + ${extra_srv_args} "${bind_addr}" < "${listener_in}" > "${sout}" & local spid=$! mptcp_lib_wait_local_port_listen "${listener_ns}" "${port}" extra_cl_args="$extra_args $extra_cl_args" if [ "$test_linkfail" -eq 0 ];then - timeout ${timeout_test} \ - ip netns exec ${connector_ns} \ - ./mptcp_connect -t ${timeout_poll} -p $port -s ${cl_proto} \ - $extra_cl_args $connect_addr < "$cin" > "$cout" & + ip netns exec ${connector_ns} \ + ./mptcp_connect -t ${timeout_poll} -p $port -s ${cl_proto} \ + $extra_cl_args $connect_addr < "$cin" > "$cout" & elif [ "$test_linkfail" -eq 1 ] || [ "$test_linkfail" -eq 2 ];then + connector_in="${cinsent}" ( cat "$cinfail" ; sleep 2; link_failure $listener_ns ; cat "$cinfail" ) | \ tee "$cinsent" | \ - timeout ${timeout_test} \ ip netns exec ${connector_ns} \ ./mptcp_connect -t ${timeout_poll} -p $port -s ${cl_proto} \ $extra_cl_args $connect_addr > "$cout" & else + connector_in="${cinsent}" tee "$cinsent" < "$cinfail" | \ - timeout ${timeout_test} \ - ip netns exec ${connector_ns} \ - ./mptcp_connect -t ${timeout_poll} -p $port -s ${cl_proto} \ - $extra_cl_args $connect_addr > "$cout" & + ip netns exec ${connector_ns} \ + ./mptcp_connect -t ${timeout_poll} -p $port -s ${cl_proto} \ + $extra_cl_args $connect_addr > "$cout" & fi local cpid=$! 
+ mptcp_lib_wait_timeout "${timeout_test}" "${listener_ns}" \ + "${connector_ns}" "${port}" "${cpid}" "${spid}" & + local timeout_pid=$! + pm_nl_set_endpoint $listener_ns $connector_ns $connect_addr check_cestab $listener_ns $connector_ns @@ -1043,31 +1064,26 @@ do_transfer() wait $spid local rets=$? + if kill -0 $timeout_pid; then + # Finished before the timeout: kill the background job + mptcp_lib_kill_group_wait $timeout_pid + timeout_pid=0 + fi + cond_stop_capture - NSTAT_HISTORY=/tmp/${listener_ns}.nstat ip netns exec ${listener_ns} \ - nstat | grep Tcp > /tmp/${listener_ns}.out - NSTAT_HISTORY=/tmp/${connector_ns}.nstat ip netns exec ${connector_ns} \ - nstat | grep Tcp > /tmp/${connector_ns}.out + mptcp_lib_nstat_get "${listener_ns}" + mptcp_lib_nstat_get "${connector_ns}" - if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then + if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ] || [ ${timeout_pid} -ne 0 ]; then fail_test "client exit code $retc, server $rets" - mptcp_lib_pr_err_stats "${listener_ns}" "${connector_ns}" "${port}" \ - "/tmp/${listener_ns}.out" "/tmp/${connector_ns}.out" + mptcp_lib_pr_err_stats "${listener_ns}" "${connector_ns}" "${port}" return 1 fi - if [ "$test_linkfail" -gt 1 ];then - check_transfer $sinfail $cout "file received by client" $trunc_size - else - check_transfer $sin $cout "file received by client" $trunc_size - fi + check_transfer $listener_in $cout "file received by client" $trunc_size retc=$? - if [ "$test_linkfail" -eq 0 ];then - check_transfer $cin $sout "file received by server" $trunc_size - else - check_transfer $cinsent $sout "file received by server" $trunc_size - fi + check_transfer $connector_in $sout "file received by server" $trunc_size rets=$? 
[ $retc -eq 0 ] && [ $rets -eq 0 ] @@ -1136,12 +1152,20 @@ run_tests() do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP ${connect_addr} } +_dump_stats() +{ + local ns="${1}" + local side="${2}" + + mptcp_lib_print_err "${side} ns stats (${ns2})" + mptcp_lib_pr_nstat "${ns}" + echo +} + dump_stats() { - echo Server ns stats - ip netns exec $ns1 nstat -as | grep Tcp - echo Client ns stats - ip netns exec $ns2 nstat -as | grep Tcp + _dump_stats "${ns1}" "Server" + _dump_stats "${ns2}" "Client" } chk_csum_nr() @@ -2952,7 +2976,11 @@ mixed_tests() pm_nl_add_endpoint $ns1 10.0.1.1 flags signal speed=slow \ run_tests $ns1 $ns2 dead:beef:2::1 - chk_join_nr 1 1 1 + if mptcp_lib_kallsyms_has "mptcp_pm_get_endp_fullmesh_max$"; then + chk_join_nr 0 0 0 + else + chk_join_nr 1 1 1 + fi fi # fullmesh still tries to create all the possibly subflows with @@ -3233,6 +3261,133 @@ add_addr_ports_tests() fi } +bind_tests() +{ + # bind to one address should not allow extra subflows to other addresses + if reset "bind main address v4, no join v4"; then + pm_nl_set_limits $ns1 0 2 + pm_nl_set_limits $ns2 2 2 + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal + bind_addr="10.0.1.1" \ + run_tests $ns1 $ns2 10.0.1.1 + join_syn_tx=1 \ + chk_join_nr 0 0 0 + chk_add_nr 1 1 + fi + + # bind to one address should not allow extra subflows to other addresses + if reset "bind main address v6, no join v6"; then + pm_nl_set_limits $ns1 0 2 + pm_nl_set_limits $ns2 2 2 + pm_nl_add_endpoint $ns1 dead:beef:2::1 flags signal + bind_addr="dead:beef:1::1" \ + run_tests $ns1 $ns2 dead:beef:1::1 + join_syn_tx=1 \ + chk_join_nr 0 0 0 + chk_add_nr 1 1 + fi + + # multiple binds to allow extra subflows to other addresses + if reset "multiple bind to allow joins v4"; then + local extra_bind + + pm_nl_set_limits $ns1 0 2 + pm_nl_set_limits $ns2 2 2 + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal + + # Launching another app listening on a different address + # Note: it could be a totally different app, e.g. 
nc, socat, ... + ip netns exec ${ns1} ./mptcp_connect -l -t -1 -p "$(get_port)" \ + -s MPTCP 10.0.2.1 & + extra_bind=$! + + bind_addr="10.0.1.1" \ + run_tests $ns1 $ns2 10.0.1.1 + chk_join_nr 1 1 1 + chk_add_nr 1 1 + + kill ${extra_bind} + fi + + # multiple binds to allow extra subflows to other addresses + if reset "multiple bind to allow joins v6"; then + local extra_bind + + pm_nl_set_limits $ns1 0 2 + pm_nl_set_limits $ns2 2 2 + pm_nl_add_endpoint $ns1 dead:beef:2::1 flags signal + + # Launching another app listening on a different address + # Note: it could be a totally different app, e.g. nc, socat, ... + ip netns exec ${ns1} ./mptcp_connect -l -t -1 -p "$(get_port)" \ + -s MPTCP dead:beef:2::1 & + extra_bind=$! + + bind_addr="dead:beef:1::1" \ + run_tests $ns1 $ns2 dead:beef:1::1 + chk_join_nr 1 1 1 + chk_add_nr 1 1 + + kill ${extra_bind} + fi + + # multiple binds to allow extra subflows to other addresses: v6 LL case + if reset "multiple bind to allow joins v6 link-local routing"; then + local extra_bind ns1ll1 ns1ll2 + + ns1ll1="$(get_ll_addr $ns1 ns1eth1)" + ns1ll2="$(get_ll_addr $ns1 ns1eth2)" + + pm_nl_set_limits $ns1 0 2 + pm_nl_set_limits $ns2 2 2 + pm_nl_add_endpoint $ns1 "${ns1ll2}" flags signal + + wait_ll_ready $ns1 # to be able to bind + wait_ll_ready $ns2 # also needed to bind on the client side + ip netns exec ${ns1} ./mptcp_connect -l -t -1 -p "$(get_port)" \ + -s MPTCP "${ns1ll2}%ns1eth2" & + extra_bind=$! + + bind_addr="${ns1ll1}%ns1eth1" \ + run_tests $ns1 $ns2 "${ns1ll1}%ns2eth1" + # it is not possible to connect to the announced LL addr without + # specifying the outgoing interface. 
+ join_connect_err=1 \ + chk_join_nr 0 0 0 + chk_add_nr 1 1 + + kill ${extra_bind} + fi + + # multiple binds to allow extra subflows to v6 LL addresses: laminar + if reset "multiple bind to allow joins v6 link-local laminar" && + continue_if mptcp_lib_kallsyms_has "mptcp_pm_get_endp_laminar_max$"; then + local extra_bind ns1ll1 ns1ll2 ns2ll2 + + ns1ll1="$(get_ll_addr $ns1 ns1eth1)" + ns1ll2="$(get_ll_addr $ns1 ns1eth2)" + ns2ll2="$(get_ll_addr $ns2 ns2eth2)" + + pm_nl_set_limits $ns1 0 2 + pm_nl_set_limits $ns2 2 2 + pm_nl_add_endpoint $ns1 "${ns1ll2}" flags signal + pm_nl_add_endpoint $ns2 "${ns2ll2}" flags laminar dev ns2eth2 + + wait_ll_ready $ns1 # to be able to bind + wait_ll_ready $ns2 # also needed to bind on the client side + ip netns exec ${ns1} ./mptcp_connect -l -t -1 -p "$(get_port)" \ + -s MPTCP "${ns1ll2}%ns1eth2" & + extra_bind=$! + + bind_addr="${ns1ll1}%ns1eth1" \ + run_tests $ns1 $ns2 "${ns1ll1}%ns2eth1" + chk_join_nr 1 1 1 + chk_add_nr 1 1 + + kill ${extra_bind} + fi +} + syncookies_tests() { # single subflow, syncookies @@ -4192,6 +4347,7 @@ all_tests_sorted=( M@mixed_tests b@backup_tests p@add_addr_ports_tests + B@bind_tests k@syncookies_tests S@checksum_tests d@deny_join_id0_tests diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh index f4388900016a..5fea7e7df628 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -106,23 +106,32 @@ mptcp_lib_pr_info() { mptcp_lib_print_info "INFO: ${*}" } -# $1-2: listener/connector ns ; $3 port ; $4-5 listener/connector stat file +mptcp_lib_pr_nstat() { + local ns="${1}" + local hist="/tmp/${ns}.out" + + if [ -f "${hist}" ]; then + awk '$2 != 0 { print " "$0 }' "${hist}" + else + ip netns exec "${ns}" nstat -as | grep Tcp + fi +} + +# $1-2: listener/connector ns ; $3 port mptcp_lib_pr_err_stats() { local lns="${1}" local cns="${2}" local port="${3}" - local lstat="${4}" - local 
cstat="${5}" echo -en "${MPTCP_LIB_COLOR_RED}" { printf "\nnetns %s (listener) socket stat for %d:\n" "${lns}" "${port}" ip netns exec "${lns}" ss -Menitam -o "sport = :${port}" - cat "${lstat}" + mptcp_lib_pr_nstat "${lns}" printf "\nnetns %s (connector) socket stat for %d:\n" "${cns}" "${port}" ip netns exec "${cns}" ss -Menitam -o "dport = :${port}" - [ "${lstat}" != "${cstat}" ] && cat "${cstat}" + [ "${lns}" != "${cns}" ] && mptcp_lib_pr_nstat "${cns}" } 1>&2 echo -en "${MPTCP_LIB_COLOR_RESET}" } @@ -341,6 +350,19 @@ mptcp_lib_evts_get_info() { mptcp_lib_get_info_value "${1}" "^type:${3:-1}," } +mptcp_lib_wait_timeout() { + local timeout_test="${1}" + local listener_ns="${2}" + local connector_ns="${3}" + local port="${4}" + shift 4 # rest are PIDs + + sleep "${timeout_test}" + mptcp_lib_print_err "timeout" + mptcp_lib_pr_err_stats "${listener_ns}" "${connector_ns}" "${port}" + kill "${@}" 2>/dev/null +} + # $1: PID mptcp_lib_kill_wait() { [ "${1}" -eq 0 ] && return 0 @@ -376,14 +398,36 @@ mptcp_lib_is_v6() { [ -z "${1##*:*}" ] } +mptcp_lib_nstat_init() { + local ns="${1}" + + rm -f "/tmp/${ns}."{nstat,out} + NSTAT_HISTORY="/tmp/${ns}.nstat" ip netns exec "${ns}" nstat -n +} + +mptcp_lib_nstat_get() { + local ns="${1}" + + # filter out non-*TCP stats, and the rate (last column) + NSTAT_HISTORY="/tmp/${ns}.nstat" ip netns exec "${ns}" nstat -sz | + grep -o ".*Tcp\S\+\s\+[0-9]\+" > "/tmp/${ns}.out" +} + # $1: ns, $2: MIB counter +# Get the counter from the history (mptcp_lib_nstat_{init,get}()) if available. +# If not, get the counter from nstat ignoring any history. 
mptcp_lib_get_counter() { local ns="${1}" local counter="${2}" + local hist="/tmp/${ns}.out" local count - count=$(ip netns exec "${ns}" nstat -asz "${counter}" | - awk 'NR==1 {next} {print $2}') + if [[ -s "${hist}" && "${counter}" == *"Tcp"* ]]; then + count=$(awk "/^${counter} / {print \$2; exit}" "${hist}") + else + count=$(ip netns exec "${ns}" nstat -asz "${counter}" | + awk 'NR==1 {next} {print $2}') + fi if [ -z "${count}" ]; then mptcp_lib_fail_if_expected_feature "${counter} counter" return 1 diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh index f01989be6e9b..ab8bce06b262 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh @@ -169,41 +169,44 @@ do_transfer() cmsg+=",TCPINQ" fi - NSTAT_HISTORY=/tmp/${listener_ns}.nstat ip netns exec ${listener_ns} \ - nstat -n - NSTAT_HISTORY=/tmp/${connector_ns}.nstat ip netns exec ${connector_ns} \ - nstat -n - - timeout ${timeout_test} \ - ip netns exec ${listener_ns} \ - $mptcp_connect -t ${timeout_poll} -l -M 1 -p $port -s ${srv_proto} -c "${cmsg}" \ - ${local_addr} < "$sin" > "$sout" & + mptcp_lib_nstat_init "${listener_ns}" + mptcp_lib_nstat_init "${connector_ns}" + + ip netns exec ${listener_ns} \ + $mptcp_connect -t ${timeout_poll} -l -M 1 -p $port -s ${srv_proto} -c "${cmsg}" \ + ${local_addr} < "$sin" > "$sout" & local spid=$! - sleep 1 + mptcp_lib_wait_local_port_listen "${listener_ns}" "${port}" - timeout ${timeout_test} \ - ip netns exec ${connector_ns} \ - $mptcp_connect -t ${timeout_poll} -M 2 -p $port -s ${cl_proto} -c "${cmsg}" \ - $connect_addr < "$cin" > "$cout" & + ip netns exec ${connector_ns} \ + $mptcp_connect -t ${timeout_poll} -M 2 -p $port -s ${cl_proto} -c "${cmsg}" \ + $connect_addr < "$cin" > "$cout" & local cpid=$! + mptcp_lib_wait_timeout "${timeout_test}" "${listener_ns}" \ + "${connector_ns}" "${port}" "${cpid}" "${spid}" & + local timeout_pid=$! 
+ wait $cpid local retc=$? wait $spid local rets=$? - NSTAT_HISTORY=/tmp/${listener_ns}.nstat ip netns exec ${listener_ns} \ - nstat | grep Tcp > /tmp/${listener_ns}.out - NSTAT_HISTORY=/tmp/${connector_ns}.nstat ip netns exec ${connector_ns} \ - nstat | grep Tcp > /tmp/${connector_ns}.out + if kill -0 $timeout_pid; then + # Finished before the timeout: kill the background job + mptcp_lib_kill_group_wait $timeout_pid + timeout_pid=0 + fi + + mptcp_lib_nstat_get "${listener_ns}" + mptcp_lib_nstat_get "${connector_ns}" print_title "Transfer ${ip:2}" - if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then + if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ] || [ ${timeout_pid} -ne 0 ]; then mptcp_lib_pr_fail "client exit code $retc, server $rets" - mptcp_lib_pr_err_stats "${listener_ns}" "${connector_ns}" "${port}" \ - "/tmp/${listener_ns}.out" "/tmp/${connector_ns}.out" + mptcp_lib_pr_err_stats "${listener_ns}" "${connector_ns}" "${port}" mptcp_lib_result_fail "transfer ${ip}" diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh index 1903e8e84a31..806aaa7d2d61 100755 --- a/tools/testing/selftests/net/mptcp/simult_flows.sh +++ b/tools/testing/selftests/net/mptcp/simult_flows.sh @@ -155,48 +155,53 @@ do_transfer() sleep 1 fi - NSTAT_HISTORY=/tmp/${ns3}.nstat ip netns exec ${ns3} \ - nstat -n - NSTAT_HISTORY=/tmp/${ns1}.nstat ip netns exec ${ns1} \ - nstat -n - - timeout ${timeout_test} \ - ip netns exec ${ns3} \ - ./mptcp_connect -jt ${timeout_poll} -l -p $port -T $max_time \ - 0.0.0.0 < "$sin" > "$sout" & + mptcp_lib_nstat_init "${ns3}" + mptcp_lib_nstat_init "${ns1}" + + ip netns exec ${ns3} \ + ./mptcp_connect -jt ${timeout_poll} -l -p $port -T $max_time \ + 0.0.0.0 < "$sin" > "$sout" & local spid=$! 
mptcp_lib_wait_local_port_listen "${ns3}" "${port}" - timeout ${timeout_test} \ - ip netns exec ${ns1} \ - ./mptcp_connect -jt ${timeout_poll} -p $port -T $max_time \ - 10.0.3.3 < "$cin" > "$cout" & + ip netns exec ${ns1} \ + ./mptcp_connect -jt ${timeout_poll} -p $port -T $max_time \ + 10.0.3.3 < "$cin" > "$cout" & local cpid=$! + mptcp_lib_wait_timeout "${timeout_test}" "${ns3}" "${ns1}" "${port}" \ + "${cpid}" "${spid}" & + local timeout_pid=$! + wait $cpid local retc=$? wait $spid local rets=$? + if kill -0 $timeout_pid; then + # Finished before the timeout: kill the background job + mptcp_lib_kill_group_wait $timeout_pid + timeout_pid=0 + fi + if $capture; then sleep 1 kill ${cappid_listener} kill ${cappid_connector} fi - NSTAT_HISTORY=/tmp/${ns3}.nstat ip netns exec ${ns3} \ - nstat | grep Tcp > /tmp/${ns3}.out - NSTAT_HISTORY=/tmp/${ns1}.nstat ip netns exec ${ns1} \ - nstat | grep Tcp > /tmp/${ns1}.out + mptcp_lib_nstat_get "${ns3}" + mptcp_lib_nstat_get "${ns1}" cmp $sin $cout > /dev/null 2>&1 local cmps=$? cmp $cin $sout > /dev/null 2>&1 local cmpc=$? 
- if [ $retc -eq 0 ] && [ $rets -eq 0 ] && \ - [ $cmpc -eq 0 ] && [ $cmps -eq 0 ]; then + if [ $retc -eq 0 ] && [ $rets -eq 0 ] && + [ $cmpc -eq 0 ] && [ $cmps -eq 0 ] && + [ $timeout_pid -eq 0 ]; then printf "%-16s" " max $max_time " mptcp_lib_pr_ok cat "$capout" @@ -204,8 +209,7 @@ do_transfer() fi mptcp_lib_pr_fail "client exit code $retc, server $rets" - mptcp_lib_pr_err_stats "${ns3}" "${ns1}" "${port}" \ - "/tmp/${ns3}.out" "/tmp/${ns1}.out" + mptcp_lib_pr_err_stats "${ns3}" "${ns1}" "${port}" ls -l $sin $cout ls -l $cin $sout diff --git a/tools/testing/selftests/net/mptcp/userspace_pm.sh b/tools/testing/selftests/net/mptcp/userspace_pm.sh index 87323942cb8a..e9ae1806ab07 100755 --- a/tools/testing/selftests/net/mptcp/userspace_pm.sh +++ b/tools/testing/selftests/net/mptcp/userspace_pm.sh @@ -211,7 +211,8 @@ make_connection() ip netns exec "$ns1" \ ./mptcp_connect -s MPTCP -w 300 -p $app_port -l $listen_addr > /dev/null 2>&1 & local server_pid=$! - sleep 0.5 + + mptcp_lib_wait_local_port_listen "${ns1}" "${port}" # Run the client, transfer $file and stay connected to the server # to conduct tests diff --git a/tools/testing/selftests/net/netfilter/nft_flowtable.sh b/tools/testing/selftests/net/netfilter/nft_flowtable.sh index 45832df98295..a68bc882fa4e 100755 --- a/tools/testing/selftests/net/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/net/netfilter/nft_flowtable.sh @@ -127,6 +127,8 @@ ip -net "$nsr1" addr add fee1:2::1/64 dev veth1 nodad ip -net "$nsr2" addr add 192.168.10.2/24 dev veth0 ip -net "$nsr2" addr add fee1:2::2/64 dev veth0 nodad +ip netns exec "$nsr1" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null +ip netns exec "$nsr2" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null for i in 0 1; do ip netns exec "$nsr1" sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null ip netns exec "$nsr2" sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null @@ -153,7 +155,9 @@ ip -net "$ns1" route add default via dead:1::1 ip -net "$ns2" route add default 
via dead:2::1 ip -net "$nsr1" route add default via 192.168.10.2 +ip -6 -net "$nsr1" route add default via fee1:2::2 ip -net "$nsr2" route add default via 192.168.10.1 +ip -6 -net "$nsr2" route add default via fee1:2::1 ip netns exec "$nsr1" nft -f - <<EOF table inet filter { @@ -352,8 +356,9 @@ test_tcp_forwarding_ip() local nsa=$1 local nsb=$2 local pmtu=$3 - local dstip=$4 - local dstport=$5 + local proto=$4 + local dstip=$5 + local dstport=$6 local lret=0 local socatc local socatl @@ -363,12 +368,14 @@ test_tcp_forwarding_ip() infile="$nsin_small" fi - timeout "$SOCAT_TIMEOUT" ip netns exec "$nsb" socat -4 TCP-LISTEN:12345,reuseaddr STDIO < "$infile" > "$ns2out" & + timeout "$SOCAT_TIMEOUT" ip netns exec "$nsb" socat -${proto} \ + TCP"${proto}"-LISTEN:12345,reuseaddr STDIO < "$infile" > "$ns2out" & lpid=$! busywait 1000 listener_ready - timeout "$SOCAT_TIMEOUT" ip netns exec "$nsa" socat -4 TCP:"$dstip":"$dstport" STDIO < "$infile" > "$ns1out" + timeout "$SOCAT_TIMEOUT" ip netns exec "$nsa" socat -${proto} \ + TCP"${proto}":"$dstip":"$dstport" STDIO < "$infile" > "$ns1out" socatc=$? wait $lpid @@ -394,8 +401,11 @@ test_tcp_forwarding_ip() test_tcp_forwarding() { local pmtu="$3" + local proto="$4" + local dstip="$5" + local dstport="$6" - test_tcp_forwarding_ip "$1" "$2" "$pmtu" 10.0.2.99 12345 + test_tcp_forwarding_ip "$1" "$2" "$pmtu" "$proto" "$dstip" "$dstport" return $? } @@ -403,6 +413,9 @@ test_tcp_forwarding() test_tcp_forwarding_set_dscp() { local pmtu="$3" + local proto="$4" + local dstip="$5" + local dstport="$6" ip netns exec "$nsr1" nft -f - <<EOF table netdev dscpmangle { @@ -413,7 +426,7 @@ table netdev dscpmangle { } EOF if [ $? -eq 0 ]; then - test_tcp_forwarding_ip "$1" "$2" "$3" 10.0.2.99 12345 + test_tcp_forwarding_ip "$1" "$2" "$pmtu" "$proto" "$dstip" "$dstport" check_dscp "dscp_ingress" "$pmtu" ip netns exec "$nsr1" nft delete table netdev dscpmangle @@ -430,7 +443,7 @@ table netdev dscpmangle { } EOF if [ $? 
-eq 0 ]; then - test_tcp_forwarding_ip "$1" "$2" "$pmtu" 10.0.2.99 12345 + test_tcp_forwarding_ip "$1" "$2" "$pmtu" "$proto" "$dstip" "$dstport" check_dscp "dscp_egress" "$pmtu" ip netns exec "$nsr1" nft delete table netdev dscpmangle @@ -441,7 +454,7 @@ fi # partial. If flowtable really works, then both dscp-is-0 and dscp-is-cs3 # counters should have seen packets (before and after ft offload kicks in). ip netns exec "$nsr1" nft -a insert rule inet filter forward ip dscp set cs3 - test_tcp_forwarding_ip "$1" "$2" "$pmtu" 10.0.2.99 12345 + test_tcp_forwarding_ip "$1" "$2" "$pmtu" "$proto" "$dstip" "$dstport" check_dscp "dscp_fwd" "$pmtu" } @@ -455,7 +468,7 @@ test_tcp_forwarding_nat() [ "$pmtu" -eq 0 ] && what="$what (pmtu disabled)" - test_tcp_forwarding_ip "$nsa" "$nsb" "$pmtu" 10.0.2.99 12345 + test_tcp_forwarding_ip "$nsa" "$nsb" "$pmtu" 4 10.0.2.99 12345 lret=$? if [ "$lret" -eq 0 ] ; then @@ -465,7 +478,7 @@ test_tcp_forwarding_nat() echo "PASS: flow offload for ns1/ns2 with masquerade $what" fi - test_tcp_forwarding_ip "$1" "$2" "$pmtu" 10.6.6.6 1666 + test_tcp_forwarding_ip "$1" "$2" "$pmtu" 4 10.6.6.6 1666 lret=$? if [ "$pmtu" -eq 1 ] ;then check_counters "flow offload for ns1/ns2 with dnat $what" @@ -487,7 +500,7 @@ make_file "$nsin_small" "$filesize_small" # Due to MTU mismatch in both directions, all packets (except small packets like pure # acks) have to be handled by normal forwarding path. Therefore, packet counters # are not checked. -if test_tcp_forwarding "$ns1" "$ns2" 0; then +if test_tcp_forwarding "$ns1" "$ns2" 0 4 10.0.2.99 12345; then echo "PASS: flow offloaded for ns1/ns2" else echo "FAIL: flow offload for ns1/ns2:" 1>&2 @@ -495,6 +508,14 @@ else ret=1 fi +if test_tcp_forwarding "$ns1" "$ns2" 0 6 "[dead:2::99]" 12345; then + echo "PASS: IPv6 flow offloaded for ns1/ns2" +else + echo "FAIL: IPv6 flow offload for ns1/ns2:" 1>&2 + ip netns exec "$nsr1" nft list ruleset + ret=1 +fi + # delete default route, i.e. 
ns2 won't be able to reach ns1 and # will depend on ns1 being masqueraded in nsr1. # expect ns1 has nsr1 address. @@ -520,7 +541,7 @@ table ip nat { EOF check_dscp "dscp_none" "0" -if ! test_tcp_forwarding_set_dscp "$ns1" "$ns2" 0 ""; then +if ! test_tcp_forwarding_set_dscp "$ns1" "$ns2" 0 4 10.0.2.99 12345; then echo "FAIL: flow offload for ns1/ns2 with dscp update and no pmtu discovery" 1>&2 exit 0 fi @@ -546,7 +567,7 @@ ip netns exec "$ns2" sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null ip netns exec "$nsr1" nft reset counters table inet filter >/dev/null ip netns exec "$ns2" nft reset counters table inet filter >/dev/null -if ! test_tcp_forwarding_set_dscp "$ns1" "$ns2" 1 ""; then +if ! test_tcp_forwarding_set_dscp "$ns1" "$ns2" 1 4 10.0.2.99 12345; then echo "FAIL: flow offload for ns1/ns2 with dscp update and pmtu discovery" 1>&2 exit 0 fi @@ -558,6 +579,73 @@ if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 ""; then ip netns exec "$nsr1" nft list ruleset fi +# IPIP tunnel test: +# Add IPIP tunnel interfaces and check flowtable acceleration. +test_ipip() { +if ! 
ip -net "$nsr1" link add name tun0 type ipip \ + local 192.168.10.1 remote 192.168.10.2 >/dev/null;then + echo "SKIP: could not add ipip tunnel" + [ "$ret" -eq 0 ] && ret=$ksft_skip + return +fi +ip -net "$nsr1" link set tun0 up +ip -net "$nsr1" addr add 192.168.100.1/24 dev tun0 +ip netns exec "$nsr1" sysctl net.ipv4.conf.tun0.forwarding=1 > /dev/null + +ip -net "$nsr2" link add name tun0 type ipip local 192.168.10.2 remote 192.168.10.1 +ip -net "$nsr2" link set tun0 up +ip -net "$nsr2" addr add 192.168.100.2/24 dev tun0 +ip netns exec "$nsr2" sysctl net.ipv4.conf.tun0.forwarding=1 > /dev/null + +ip -net "$nsr1" route change default via 192.168.100.2 +ip -net "$nsr2" route change default via 192.168.100.1 +ip -net "$ns2" route add default via 10.0.2.1 + +ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun0 accept' +ip netns exec "$nsr1" nft -a insert rule inet filter forward \ + 'meta oif "veth0" tcp sport 12345 ct mark set 1 flow add @f1 counter name routed_repl accept' + +if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "IPIP tunnel"; then + echo "FAIL: flow offload for ns1/ns2 with IPIP tunnel" 1>&2 + ip netns exec "$nsr1" nft list ruleset + ret=1 +fi + +# Create vlan tagged devices for IPIP traffic. 
+ip -net "$nsr1" link add link veth1 name veth1.10 type vlan id 10 +ip -net "$nsr1" link set veth1.10 up +ip -net "$nsr1" addr add 192.168.20.1/24 dev veth1.10 +ip netns exec "$nsr1" sysctl net.ipv4.conf.veth1/10.forwarding=1 > /dev/null +ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif veth1.10 accept' +ip -net "$nsr1" link add name tun1 type ipip local 192.168.20.1 remote 192.168.20.2 +ip -net "$nsr1" link set tun1 up +ip -net "$nsr1" addr add 192.168.200.1/24 dev tun1 +ip -net "$nsr1" route change default via 192.168.200.2 +ip netns exec "$nsr1" sysctl net.ipv4.conf.tun1.forwarding=1 > /dev/null +ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun1 accept' + +ip -net "$nsr2" link add link veth0 name veth0.10 type vlan id 10 +ip -net "$nsr2" link set veth0.10 up +ip -net "$nsr2" addr add 192.168.20.2/24 dev veth0.10 +ip netns exec "$nsr2" sysctl net.ipv4.conf.veth0/10.forwarding=1 > /dev/null +ip -net "$nsr2" link add name tun1 type ipip local 192.168.20.2 remote 192.168.20.1 +ip -net "$nsr2" link set tun1 up +ip -net "$nsr2" addr add 192.168.200.2/24 dev tun1 +ip -net "$nsr2" route change default via 192.168.200.1 +ip netns exec "$nsr2" sysctl net.ipv4.conf.tun1.forwarding=1 > /dev/null + +if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "IPIP tunnel over vlan"; then + echo "FAIL: flow offload for ns1/ns2 with IPIP tunnel over vlan" 1>&2 + ip netns exec "$nsr1" nft list ruleset + ret=1 +fi + +# Restore the previous configuration +ip -net "$nsr1" route change default via 192.168.10.2 +ip -net "$nsr2" route change default via 192.168.10.1 +ip -net "$ns2" route del default via 10.0.2.1 +} + # Another test: # Add bridge interface br0 to Router1, with NAT enabled. 
test_bridge() { @@ -643,6 +731,8 @@ ip -net "$nsr1" addr add dead:1::1/64 dev veth0 nodad ip -net "$nsr1" link set up dev veth0 } +test_ipip + test_bridge KEY_SHA="0x"$(ps -af | sha1sum | cut -d " " -f 1) @@ -683,7 +773,7 @@ ip -net "$ns2" route del 192.168.10.1 via 10.0.2.1 ip -net "$ns2" route add default via 10.0.2.1 ip -net "$ns2" route add default via dead:2::1 -if test_tcp_forwarding "$ns1" "$ns2" 1; then +if test_tcp_forwarding "$ns1" "$ns2" 1 4 10.0.2.99 12345; then check_counters "ipsec tunnel mode for ns1/ns2" else echo "FAIL: ipsec tunnel mode for ns1/ns2" @@ -691,6 +781,14 @@ else ip netns exec "$nsr1" cat /proc/net/xfrm_stat 1>&2 fi +if test_tcp_forwarding "$ns1" "$ns2" 1 6 "[dead:2::99]" 12345; then + check_counters "IPv6 ipsec tunnel mode for ns1/ns2" +else + echo "FAIL: IPv6 ipsec tunnel mode for ns1/ns2" + ip netns exec "$nsr1" nft list ruleset 1>&2 + ip netns exec "$nsr1" cat /proc/net/xfrm_stat 1>&2 +fi + if [ "$1" = "" ]; then low=1280 mtu=$((65536 - low)) diff --git a/tools/testing/selftests/net/netfilter/sctp_collision.c b/tools/testing/selftests/net/netfilter/sctp_collision.c index 21bb1cfd8a85..b282d1785c9b 100644 --- a/tools/testing/selftests/net/netfilter/sctp_collision.c +++ b/tools/testing/selftests/net/netfilter/sctp_collision.c @@ -9,9 +9,10 @@ int main(int argc, char *argv[]) { struct sockaddr_in saddr = {}, daddr = {}; - int sd, ret, len = sizeof(daddr); + socklen_t len = sizeof(daddr); struct timeval tv = {25, 0}; char buf[] = "hello"; + int sd, ret; if (argc != 6 || (strcmp(argv[1], "server") && strcmp(argv[1], "client"))) { printf("%s <server|client> <LOCAL_IP> <LOCAL_PORT> <REMOTE_IP> <REMOTE_PORT>\n", diff --git a/tools/testing/selftests/net/netlink-dumps.c b/tools/testing/selftests/net/netlink-dumps.c index 7618ebe528a4..679b6c77ace7 100644 --- a/tools/testing/selftests/net/netlink-dumps.c +++ b/tools/testing/selftests/net/netlink-dumps.c @@ -143,6 +143,7 @@ TEST(dump_extack) EXPECT_EQ(n, -1); EXPECT_EQ(errno, ENOBUFS); + ret = 
NO_CTRL; for (i = 0; i < cnt; i++) { struct ext_ack ea = {}; diff --git a/tools/testing/selftests/net/packetdrill/tcp_rto_synack_rto_max.pkt b/tools/testing/selftests/net/packetdrill/tcp_rto_synack_rto_max.pkt new file mode 100644 index 000000000000..47550df124ce --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_rto_synack_rto_max.pkt @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0 +// +// Test SYN+ACK RTX with 1s RTO. +// +`./defaults.sh + ./set_sysctls.py /proc/sys/net/ipv4/tcp_rto_max_ms=1000` + +// +// Test 1: TFO SYN+ACK +// + 0 socket(..., SOCK_STREAM|SOCK_NONBLOCK, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + +0 setsockopt(3, SOL_TCP, TCP_FASTOPEN, [1], 4) = 0 + + +0 < S 0:10(10) win 1000 <mss 1460,sackOK,nop,nop,FO TFO_COOKIE,nop,nop> + +0 > S. 0:0(0) ack 11 <mss 1460,nop,nop,sackOK> + +// RTO must be capped to 1s + +1 > S. 0:0(0) ack 11 <mss 1460,nop,nop,sackOK> + +1 > S. 0:0(0) ack 11 <mss 1460,nop,nop,sackOK> + +1 > S. 0:0(0) ack 11 <mss 1460,nop,nop,sackOK> + + +0 < . 11:11(0) ack 1 win 1000 <mss 1460,nop,nop,sackOK> + +0 accept(3, ..., ...) = 4 + +0 %{ assert (tcpi_options & TCPI_OPT_SYN_DATA) != 0, tcpi_options }% + + +0 close(4) = 0 + +0 close(3) = 0 + + +// +// Test 2: non-TFO SYN+ACK +// + +0 socket(..., SOCK_STREAM|SOCK_NONBLOCK, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 1000 <mss 1460,sackOK,nop,nop> + +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK> + +// RTO must be capped to 1s + +1 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK> + +1 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK> + +1 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK> + + +0 < . 1:1(0) ack 1 win 1000 <mss 1460,nop,nop,sackOK> + +0 accept(3, ..., ...) 
= 4 + +0 %{ assert (tcpi_options & TCPI_OPT_SYN_DATA) == 0, tcpi_options }% + + +0 close(4) = 0 + +0 close(3) = 0 diff --git a/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_sendmsg-empty-iov.pkt b/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_sendmsg-empty-iov.pkt index b2b2cdf27e20..454441e7ecff 100644 --- a/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_sendmsg-empty-iov.pkt +++ b/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_sendmsg-empty-iov.pkt @@ -1,6 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 // Test that we correctly skip zero-length IOVs. + +--send_omit_free // do not reuse send buffers with zerocopy + `./defaults.sh` + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 +0 setsockopt(3, SOL_SOCKET, SO_ZEROCOPY, [1], 4) = 0 +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 diff --git a/tools/testing/selftests/net/packetdrill/tcp_user_timeout_user-timeout-probe.pkt b/tools/testing/selftests/net/packetdrill/tcp_user_timeout_user-timeout-probe.pkt index 183051ba0cae..6882b8240a8a 100644 --- a/tools/testing/selftests/net/packetdrill/tcp_user_timeout_user-timeout-probe.pkt +++ b/tools/testing/selftests/net/packetdrill/tcp_user_timeout_user-timeout-probe.pkt @@ -23,14 +23,16 @@ // install a qdisc dropping all packets +0 `tc qdisc delete dev tun0 root 2>/dev/null ; tc qdisc add dev tun0 root pfifo limit 0` + +0 write(4, ..., 24) = 24 // When qdisc is congested we retry every 500ms // (TCP_RESOURCE_PROBE_INTERVAL) and therefore // we retry 6 times before hitting 3s timeout. 
// First verify that the connection is alive: -+3.250 write(4, ..., 24) = 24 ++3 write(4, ..., 24) = 24 + // Now verify that shortly after that the socket is dead: - +.100 write(4, ..., 24) = -1 ETIMEDOUT (Connection timed out) ++1 write(4, ..., 24) = -1 ETIMEDOUT (Connection timed out) +0 %{ assert tcpi_probes == 6, tcpi_probes; \ assert tcpi_backoff == 0, tcpi_backoff }% diff --git a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_basic.pkt b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_basic.pkt index a82c8899d36b..0a0700afdaa3 100644 --- a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_basic.pkt +++ b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_basic.pkt @@ -4,6 +4,8 @@ // send a packet with MSG_ZEROCOPY and receive the notification ID // repeat and verify IDs are consecutive +--send_omit_free // do not reuse send buffers with zerocopy + `./defaults.sh` 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 diff --git a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_batch.pkt b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_batch.pkt index c01915e7f4a1..df91675d2991 100644 --- a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_batch.pkt +++ b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_batch.pkt @@ -3,6 +3,8 @@ // // send multiple packets, then read one range of all notifications. 
+--send_omit_free // do not reuse send buffers with zerocopy + `./defaults.sh` 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 diff --git a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_client.pkt b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_client.pkt index 6509882932e9..2963cfcb14df 100644 --- a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_client.pkt +++ b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_client.pkt @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 // Minimal client-side zerocopy test +--send_omit_free // do not reuse send buffers with zerocopy + `./defaults.sh` 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 diff --git a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_closed.pkt b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_closed.pkt index 2cd78755cb2a..ea0c2fa73c2d 100644 --- a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_closed.pkt +++ b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_closed.pkt @@ -7,6 +7,8 @@ // First send on a closed socket and wait for (absent) notification. // Then connect and send and verify that notification nr. is zero. +--send_omit_free // do not reuse send buffers with zerocopy + `./defaults.sh` 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 diff --git a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_epoll_edge.pkt b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_epoll_edge.pkt index 7671c20e01cf..4df978a9b82e 100644 --- a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_epoll_edge.pkt +++ b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_epoll_edge.pkt @@ -7,6 +7,9 @@ // fire two sends with MSG_ZEROCOPY and receive the acks. confirm that EPOLLERR // is correctly fired only once, when EPOLLET is set. send another packet with // MSG_ZEROCOPY. confirm that EPOLLERR is correctly fired again only once. 
+ +--send_omit_free // do not reuse send buffers with zerocopy + `./defaults.sh` 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 diff --git a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_epoll_exclusive.pkt b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_epoll_exclusive.pkt index fadc480fdb7f..36b6edc4858c 100644 --- a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_epoll_exclusive.pkt +++ b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_epoll_exclusive.pkt @@ -8,6 +8,9 @@ // fire two sends with MSG_ZEROCOPY and receive the acks. confirm that EPOLLERR // is correctly fired only once, when EPOLLET is set. send another packet with // MSG_ZEROCOPY. confirm that EPOLLERR is correctly fired again only once. + +--send_omit_free // do not reuse send buffers with zerocopy + `./defaults.sh` 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 diff --git a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_epoll_oneshot.pkt b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_epoll_oneshot.pkt index 5bfa0d1d2f4a..1bea6f3b4558 100644 --- a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_epoll_oneshot.pkt +++ b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_epoll_oneshot.pkt @@ -8,6 +8,9 @@ // is correctly fired only once, when EPOLLONESHOT is set. send another packet // with MSG_ZEROCOPY. confirm that EPOLLERR is not fired. Rearm the FD and // confirm that EPOLLERR is correctly set. + +--send_omit_free // do not reuse send buffers with zerocopy + `./defaults.sh` 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 diff --git a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_fastopen-client.pkt b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_fastopen-client.pkt index 4a73bbf46961..e27c21ff5d18 100644 --- a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_fastopen-client.pkt +++ b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_fastopen-client.pkt @@ -8,6 +8,8 @@ // one will have no data in the initial send. 
On return 0 the // zerocopy notification counter is not incremented. Verify this too. +--send_omit_free // do not reuse send buffers with zerocopy + `./defaults.sh` // Send a FastOpen request, no cookie yet so no data in SYN diff --git a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_fastopen-server.pkt b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_fastopen-server.pkt index 36086c5877ce..b1fa77c77dfa 100644 --- a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_fastopen-server.pkt +++ b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_fastopen-server.pkt @@ -4,6 +4,8 @@ // send data with MSG_FASTOPEN | MSG_ZEROCOPY and verify that the // kernel returns the notification ID. +--send_omit_free // do not reuse send buffers with zerocopy + `./defaults.sh ./set_sysctls.py /proc/sys/net/ipv4/tcp_fastopen=0x207` diff --git a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_maxfrags.pkt b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_maxfrags.pkt index 672f817faca0..2f5317d0a9fa 100644 --- a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_maxfrags.pkt +++ b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_maxfrags.pkt @@ -7,6 +7,8 @@ // because each iovec element becomes a frag // 3) the PSH bit is set on an skb when it runs out of fragments +--send_omit_free // do not reuse send buffers with zerocopy + `./defaults.sh` 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 diff --git a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_small.pkt b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_small.pkt index a9a1ac0aea4f..9d5272c6b207 100644 --- a/tools/testing/selftests/net/packetdrill/tcp_zerocopy_small.pkt +++ b/tools/testing/selftests/net/packetdrill/tcp_zerocopy_small.pkt @@ -4,6 +4,8 @@ // verify that SO_EE_CODE_ZEROCOPY_COPIED is set on zerocopy // packets of all sizes, including the smallest payload, 1B. 
+--send_omit_free // do not reuse send buffers with zerocopy + `./defaults.sh` 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh index 163a084d525d..248c2b91fe42 100755 --- a/tools/testing/selftests/net/rtnetlink.sh +++ b/tools/testing/selftests/net/rtnetlink.sh @@ -8,6 +8,7 @@ ALL_TESTS=" kci_test_polrouting kci_test_route_get kci_test_addrlft + kci_test_addrlft_route_cleanup kci_test_promote_secondaries kci_test_tc kci_test_gre @@ -323,6 +324,25 @@ kci_test_addrlft() end_test "PASS: preferred_lft addresses have expired" } +kci_test_addrlft_route_cleanup() +{ + local ret=0 + local test_addr="2001:db8:99::1/64" + local test_prefix="2001:db8:99::/64" + + run_cmd ip -6 addr add $test_addr dev "$devdummy" valid_lft 300 preferred_lft 300 + run_cmd_grep "$test_prefix proto kernel" ip -6 route show dev "$devdummy" + run_cmd ip -6 addr del $test_addr dev "$devdummy" + run_cmd_grep_fail "$test_prefix" ip -6 route show dev "$devdummy" + + if [ $ret -ne 0 ]; then + end_test "FAIL: route not cleaned up when address with valid_lft deleted" + return 1 + fi + + end_test "PASS: route cleaned up when address with valid_lft deleted" +} + kci_test_promote_secondaries() { run_cmd ifconfig "$devdummy" diff --git a/tools/testing/selftests/net/setup_loopback.sh b/tools/testing/selftests/net/setup_loopback.sh deleted file mode 100644 index 2070b57849de..000000000000 --- a/tools/testing/selftests/net/setup_loopback.sh +++ /dev/null @@ -1,120 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -readonly FLUSH_PATH="/sys/class/net/${dev}/gro_flush_timeout" -readonly IRQ_PATH="/sys/class/net/${dev}/napi_defer_hard_irqs" -readonly FLUSH_TIMEOUT="$(< ${FLUSH_PATH})" -readonly HARD_IRQS="$(< ${IRQ_PATH})" -readonly server_ns=$(mktemp -u server-XXXXXXXX) -readonly client_ns=$(mktemp -u client-XXXXXXXX) - -netdev_check_for_carrier() { - local -r dev="$1" - - for i in {1..5}; do - carrier="$(cat 
/sys/class/net/${dev}/carrier)" - if [[ "${carrier}" -ne 1 ]] ; then - echo "carrier not ready yet..." >&2 - sleep 1 - else - echo "carrier ready" >&2 - break - fi - done - echo "${carrier}" -} - -# Assumes that there is no existing ipvlan device on the physical device -setup_loopback_environment() { - local dev="$1" - - # Fail hard if cannot turn on loopback mode for current NIC - ethtool -K "${dev}" loopback on || exit 1 - sleep 1 - - # Check for the carrier - carrier=$(netdev_check_for_carrier ${dev}) - if [[ "${carrier}" -ne 1 ]] ; then - echo "setup_loopback_environment failed" - exit 1 - fi -} - -setup_macvlan_ns(){ - local -r link_dev="$1" - local -r ns_name="$2" - local -r ns_dev="$3" - local -r ns_mac="$4" - local -r addr="$5" - - ip link add link "${link_dev}" dev "${ns_dev}" \ - address "${ns_mac}" type macvlan - exit_code=$? - if [[ "${exit_code}" -ne 0 ]]; then - echo "setup_macvlan_ns failed" - exit $exit_code - fi - - [[ -e /var/run/netns/"${ns_name}" ]] || ip netns add "${ns_name}" - ip link set dev "${ns_dev}" netns "${ns_name}" - ip -netns "${ns_name}" link set dev "${ns_dev}" up - if [[ -n "${addr}" ]]; then - ip -netns "${ns_name}" addr add dev "${ns_dev}" "${addr}" - fi - - sleep 1 -} - -cleanup_macvlan_ns(){ - while (( $# >= 2 )); do - ns_name="$1" - ns_dev="$2" - ip -netns "${ns_name}" link del dev "${ns_dev}" - ip netns del "${ns_name}" - shift 2 - done -} - -cleanup_loopback(){ - local -r dev="$1" - - ethtool -K "${dev}" loopback off - sleep 1 - - # Check for the carrier - carrier=$(netdev_check_for_carrier ${dev}) - if [[ "${carrier}" -ne 1 ]] ; then - echo "setup_loopback_environment failed" - exit 1 - fi -} - -setup_interrupt() { - # Use timer on host to trigger the network stack - # Also disable device interrupt to not depend on NIC interrupt - # Reduce test flakiness caused by unexpected interrupts - echo 100000 >"${FLUSH_PATH}" - echo 50 >"${IRQ_PATH}" -} - -setup_ns() { - # Set up server_ns namespace and client_ns namespace - 
setup_macvlan_ns "${dev}" ${server_ns} server "${SERVER_MAC}" - setup_macvlan_ns "${dev}" ${client_ns} client "${CLIENT_MAC}" -} - -cleanup_ns() { - cleanup_macvlan_ns ${server_ns} server ${client_ns} client -} - -setup() { - setup_loopback_environment "${dev}" - setup_interrupt -} - -cleanup() { - cleanup_loopback "${dev}" - - echo "${FLUSH_TIMEOUT}" >"${FLUSH_PATH}" - echo "${HARD_IRQS}" >"${IRQ_PATH}" -} diff --git a/tools/testing/selftests/net/setup_veth.sh b/tools/testing/selftests/net/setup_veth.sh deleted file mode 100644 index 152bf4c65747..000000000000 --- a/tools/testing/selftests/net/setup_veth.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -readonly server_ns=$(mktemp -u server-XXXXXXXX) -readonly client_ns=$(mktemp -u client-XXXXXXXX) - -setup_veth_ns() { - local -r link_dev="$1" - local -r ns_name="$2" - local -r ns_dev="$3" - local -r ns_mac="$4" - - [[ -e /var/run/netns/"${ns_name}" ]] || ip netns add "${ns_name}" - echo 200000 > "/sys/class/net/${ns_dev}/gro_flush_timeout" - echo 1 > "/sys/class/net/${ns_dev}/napi_defer_hard_irqs" - ip link set dev "${ns_dev}" netns "${ns_name}" mtu 65535 - ip -netns "${ns_name}" link set dev "${ns_dev}" up - - ip netns exec "${ns_name}" ethtool -K "${ns_dev}" gro on tso off -} - -setup_ns() { - # Set up server_ns namespace and client_ns namespace - ip link add name server type veth peer name client - - setup_veth_ns "${dev}" ${server_ns} server "${SERVER_MAC}" - setup_veth_ns "${dev}" ${client_ns} client "${CLIENT_MAC}" -} - -cleanup_ns() { - local ns_name - - for ns_name in ${client_ns} ${server_ns}; do - [[ -e /var/run/netns/"${ns_name}" ]] && ip netns del "${ns_name}" - done -} - -setup() { - # no global init setup step needed - : -} - -cleanup() { - cleanup_ns -} diff --git a/tools/testing/selftests/net/so_txtime.c b/tools/testing/selftests/net/so_txtime.c index 8457b7ccbc09..b76df1efc2ef 100644 --- a/tools/testing/selftests/net/so_txtime.c +++ 
b/tools/testing/selftests/net/so_txtime.c @@ -174,7 +174,7 @@ static int do_recv_errqueue_timeout(int fdt) msg.msg_controllen = sizeof(control); while (1) { - const char *reason; + const char *reason = NULL; ret = recvmsg(fdt, &msg, MSG_ERRQUEUE); if (ret == -1 && errno == EAGAIN) diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index 5c6d8215021c..da1b50b30719 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -2856,6 +2856,147 @@ TEST_F(tls_err, oob_pressure) EXPECT_EQ(send(self->fd2, buf, 5, MSG_OOB), 5); } +/* + * Parse a stream of TLS records and ensure that each record respects + * the specified @max_payload_len. + */ +static size_t parse_tls_records(struct __test_metadata *_metadata, + const __u8 *rx_buf, int rx_len, int overhead, + __u16 max_payload_len) +{ + const __u8 *rec = rx_buf; + size_t total_plaintext_rx = 0; + const __u8 rec_header_len = 5; + + while (rec < rx_buf + rx_len) { + __u16 record_payload_len; + __u16 plaintext_len; + + /* Sanity check that it's a TLS header for application data */ + ASSERT_EQ(rec[0], 23); + ASSERT_EQ(rec[1], 0x3); + ASSERT_EQ(rec[2], 0x3); + + memcpy(&record_payload_len, rec + 3, 2); + record_payload_len = ntohs(record_payload_len); + ASSERT_GE(record_payload_len, overhead); + + plaintext_len = record_payload_len - overhead; + total_plaintext_rx += plaintext_len; + + /* Plaintext must not exceed the specified limit */ + ASSERT_LE(plaintext_len, max_payload_len); + rec += rec_header_len + record_payload_len; + } + + return total_plaintext_rx; +} + +TEST(tls_12_tx_max_payload_len) +{ + struct tls_crypto_info_keys tls12; + int cfd, ret, fd, overhead; + size_t total_plaintext_rx = 0; + __u8 tx[1024], rx[2000]; + __u16 limit = 128; + __u16 opt = 0; + unsigned int optlen = sizeof(opt); + bool notls; + + tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_CCM_128, + &tls12, 0); + + ulp_sock_pair(_metadata, &fd, &cfd, ¬ls); + + if (notls) + 
exit(KSFT_SKIP); + + /* Don't install keys on fd, we'll parse raw records */ + ret = setsockopt(cfd, SOL_TLS, TLS_TX, &tls12, tls12.len); + ASSERT_EQ(ret, 0); + + ret = setsockopt(cfd, SOL_TLS, TLS_TX_MAX_PAYLOAD_LEN, &limit, + sizeof(limit)); + ASSERT_EQ(ret, 0); + + ret = getsockopt(cfd, SOL_TLS, TLS_TX_MAX_PAYLOAD_LEN, &opt, &optlen); + EXPECT_EQ(ret, 0); + EXPECT_EQ(limit, opt); + EXPECT_EQ(optlen, sizeof(limit)); + + memset(tx, 0, sizeof(tx)); + ASSERT_EQ(send(cfd, tx, sizeof(tx), 0), sizeof(tx)); + close(cfd); + + ret = recv(fd, rx, sizeof(rx), 0); + + /* + * 16B tag + 8B IV -- record header (5B) is not counted but we'll + * need it to walk the record stream + */ + overhead = 16 + 8; + total_plaintext_rx = parse_tls_records(_metadata, rx, ret, overhead, + limit); + + ASSERT_EQ(total_plaintext_rx, sizeof(tx)); + close(fd); +} + +TEST(tls_12_tx_max_payload_len_open_rec) +{ + struct tls_crypto_info_keys tls12; + int cfd, ret, fd, overhead; + size_t total_plaintext_rx = 0; + __u8 tx[1024], rx[2000]; + __u16 tx_partial = 256; + __u16 og_limit = 512, limit = 128; + bool notls; + + tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_CCM_128, + &tls12, 0); + + ulp_sock_pair(_metadata, &fd, &cfd, ¬ls); + + if (notls) + exit(KSFT_SKIP); + + /* Don't install keys on fd, we'll parse raw records */ + ret = setsockopt(cfd, SOL_TLS, TLS_TX, &tls12, tls12.len); + ASSERT_EQ(ret, 0); + + ret = setsockopt(cfd, SOL_TLS, TLS_TX_MAX_PAYLOAD_LEN, &og_limit, + sizeof(og_limit)); + ASSERT_EQ(ret, 0); + + memset(tx, 0, sizeof(tx)); + ASSERT_EQ(send(cfd, tx, tx_partial, MSG_MORE), tx_partial); + + /* + * Changing the payload limit with a pending open record should + * not be allowed. 
+ */ + ret = setsockopt(cfd, SOL_TLS, TLS_TX_MAX_PAYLOAD_LEN, &limit, + sizeof(limit)); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EBUSY); + + ASSERT_EQ(send(cfd, tx + tx_partial, sizeof(tx) - tx_partial, MSG_EOR), + sizeof(tx) - tx_partial); + close(cfd); + + ret = recv(fd, rx, sizeof(rx), 0); + + /* + * 16B tag + 8B IV -- record header (5B) is not counted but we'll + * need it to walk the record stream + */ + overhead = 16 + 8; + total_plaintext_rx = parse_tls_records(_metadata, rx, ret, overhead, + og_limit); + ASSERT_EQ(total_plaintext_rx, sizeof(tx)); + close(fd); +} + TEST(non_established) { struct tls12_crypto_info_aes_gcm_256 tls12; struct sockaddr_in addr; diff --git a/tools/testing/selftests/net/toeplitz.sh b/tools/testing/selftests/net/toeplitz.sh deleted file mode 100755 index 8ff172f7bb1b..000000000000 --- a/tools/testing/selftests/net/toeplitz.sh +++ /dev/null @@ -1,199 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# extended toeplitz test: test rxhash plus, optionally, either (1) rss mapping -# from rxhash to rx queue ('-rss') or (2) rps mapping from rxhash to cpu -# ('-rps <rps_map>') -# -# irq-pattern-prefix can be derived from /sys/kernel/irq/*/action, -# which is a driver-specific encoding. -# -# invoke as ./toeplitz.sh (-i <iface>) -u|-t -4|-6 \ -# [(-rss -irq_prefix <irq-pattern-prefix>)|(-rps <rps_map>)] - -source setup_loopback.sh -readonly SERVER_IP4="192.168.1.200/24" -readonly SERVER_IP6="fda8::1/64" -readonly SERVER_MAC="aa:00:00:00:00:02" - -readonly CLIENT_IP4="192.168.1.100/24" -readonly CLIENT_IP6="fda8::2/64" -readonly CLIENT_MAC="aa:00:00:00:00:01" - -PORT=8000 -KEY="$(</proc/sys/net/core/netdev_rss_key)" -TEST_RSS=false -RPS_MAP="" -PROTO_FLAG="" -IP_FLAG="" -DEV="eth0" - -# Return the number of rxqs among which RSS is configured to spread packets. -# This is determined by reading the RSS indirection table using ethtool. 
-get_rss_cfg_num_rxqs() { - echo $(ethtool -x "${DEV}" | - grep -E [[:space:]]+[0-9]+:[[:space:]]+ | - cut -d: -f2- | - awk '{$1=$1};1' | - tr ' ' '\n' | - sort -u | - wc -l) -} - -# Return a list of the receive irq handler cpus. -# The list is ordered by the irqs, so first rxq-0 cpu, then rxq-1 cpu, etc. -# Reads /sys/kernel/irq/ in order, so algorithm depends on -# irq_{rxq-0} < irq_{rxq-1}, etc. -get_rx_irq_cpus() { - CPUS="" - # sort so that irq 2 is read before irq 10 - SORTED_IRQS=$(for i in /sys/kernel/irq/*; do echo $i; done | sort -V) - # Consider only as many queues as RSS actually uses. We assume that - # if RSS_CFG_NUM_RXQS=N, then RSS uses rxqs 0-(N-1). - RSS_CFG_NUM_RXQS=$(get_rss_cfg_num_rxqs) - RXQ_COUNT=0 - - for i in ${SORTED_IRQS} - do - [[ "${RXQ_COUNT}" -lt "${RSS_CFG_NUM_RXQS}" ]] || break - # lookup relevant IRQs by action name - [[ -e "$i/actions" ]] || continue - cat "$i/actions" | grep -q "${IRQ_PATTERN}" || continue - irqname=$(<"$i/actions") - - # does the IRQ get called - irqcount=$(cat "$i/per_cpu_count" | tr -d '0,') - [[ -n "${irqcount}" ]] || continue - - # lookup CPU - irq=$(basename "$i") - cpu=$(cat "/proc/irq/$irq/smp_affinity_list") - - if [[ -z "${CPUS}" ]]; then - CPUS="${cpu}" - else - CPUS="${CPUS},${cpu}" - fi - RXQ_COUNT=$((RXQ_COUNT+1)) - done - - echo "${CPUS}" -} - -get_disable_rfs_cmd() { - echo "echo 0 > /proc/sys/net/core/rps_sock_flow_entries;" -} - -get_set_rps_bitmaps_cmd() { - CMD="" - for i in /sys/class/net/${DEV}/queues/rx-*/rps_cpus - do - CMD="${CMD} echo $1 > ${i};" - done - - echo "${CMD}" -} - -get_disable_rps_cmd() { - echo "$(get_set_rps_bitmaps_cmd 0)" -} - -die() { - echo "$1" - exit 1 -} - -check_nic_rxhash_enabled() { - local -r pattern="receive-hashing:\ on" - - ethtool -k "${DEV}" | grep -q "${pattern}" || die "rxhash must be enabled" -} - -parse_opts() { - local prog=$0 - shift 1 - - while [[ "$1" =~ "-" ]]; do - if [[ "$1" = "-irq_prefix" ]]; then - shift - IRQ_PATTERN="^$1-[0-9]*$" - elif [[ 
"$1" = "-u" || "$1" = "-t" ]]; then - PROTO_FLAG="$1" - elif [[ "$1" = "-4" ]]; then - IP_FLAG="$1" - SERVER_IP="${SERVER_IP4}" - CLIENT_IP="${CLIENT_IP4}" - elif [[ "$1" = "-6" ]]; then - IP_FLAG="$1" - SERVER_IP="${SERVER_IP6}" - CLIENT_IP="${CLIENT_IP6}" - elif [[ "$1" = "-rss" ]]; then - TEST_RSS=true - elif [[ "$1" = "-rps" ]]; then - shift - RPS_MAP="$1" - elif [[ "$1" = "-i" ]]; then - shift - DEV="$1" - else - die "Usage: ${prog} (-i <iface>) -u|-t -4|-6 \ - [(-rss -irq_prefix <irq-pattern-prefix>)|(-rps <rps_map>)]" - fi - shift - done -} - -setup() { - setup_loopback_environment "${DEV}" - - # Set up server_ns namespace and client_ns namespace - setup_macvlan_ns "${DEV}" $server_ns server \ - "${SERVER_MAC}" "${SERVER_IP}" - setup_macvlan_ns "${DEV}" $client_ns client \ - "${CLIENT_MAC}" "${CLIENT_IP}" -} - -cleanup() { - cleanup_macvlan_ns $server_ns server $client_ns client - cleanup_loopback "${DEV}" -} - -parse_opts $0 $@ - -setup -trap cleanup EXIT - -check_nic_rxhash_enabled - -# Actual test starts here -if [[ "${TEST_RSS}" = true ]]; then - # RPS/RFS must be disabled because they move packets between cpus, - # which breaks the PACKET_FANOUT_CPU identification of RSS decisions. - eval "$(get_disable_rfs_cmd) $(get_disable_rps_cmd)" \ - ip netns exec $server_ns ./toeplitz "${IP_FLAG}" "${PROTO_FLAG}" \ - -d "${PORT}" -i "${DEV}" -k "${KEY}" -T 1000 \ - -C "$(get_rx_irq_cpus)" -s -v & -elif [[ ! -z "${RPS_MAP}" ]]; then - eval "$(get_disable_rfs_cmd) $(get_set_rps_bitmaps_cmd ${RPS_MAP})" \ - ip netns exec $server_ns ./toeplitz "${IP_FLAG}" "${PROTO_FLAG}" \ - -d "${PORT}" -i "${DEV}" -k "${KEY}" -T 1000 \ - -r "0x${RPS_MAP}" -s -v & -else - ip netns exec $server_ns ./toeplitz "${IP_FLAG}" "${PROTO_FLAG}" \ - -d "${PORT}" -i "${DEV}" -k "${KEY}" -T 1000 -s -v & -fi - -server_pid=$! - -ip netns exec $client_ns ./toeplitz_client.sh "${PROTO_FLAG}" \ - "${IP_FLAG}" "${SERVER_IP%%/*}" "${PORT}" & - -client_pid=$! - -wait "${server_pid}" -exit_code=$? 
-kill -9 "${client_pid}" -if [[ "${exit_code}" -eq 0 ]]; then - echo "Test Succeeded!" -fi -exit "${exit_code}" diff --git a/tools/testing/selftests/net/toeplitz_client.sh b/tools/testing/selftests/net/toeplitz_client.sh deleted file mode 100755 index 2fef34f4aba1..000000000000 --- a/tools/testing/selftests/net/toeplitz_client.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# A simple program for generating traffic for the toeplitz test. -# -# This program sends packets periodically for, conservatively, 20 seconds. The -# intent is for the calling program to kill this program once it is no longer -# needed, rather than waiting for the 20 second expiration. - -send_traffic() { - expiration=$((SECONDS+20)) - while [[ "${SECONDS}" -lt "${expiration}" ]] - do - if [[ "${PROTO}" == "-u" ]]; then - echo "msg $i" | nc "${IPVER}" -u -w 0 "${ADDR}" "${PORT}" - else - echo "msg $i" | nc "${IPVER}" -w 0 "${ADDR}" "${PORT}" - fi - sleep 0.001 - done -} - -PROTO=$1 -IPVER=$2 -ADDR=$3 -PORT=$4 - -send_traffic diff --git a/tools/testing/selftests/net/traceroute.sh b/tools/testing/selftests/net/traceroute.sh index dbb34c7e09ce..a7c6ab8a0347 100755 --- a/tools/testing/selftests/net/traceroute.sh +++ b/tools/testing/selftests/net/traceroute.sh @@ -36,6 +36,35 @@ run_cmd() return $rc } +__check_traceroute_version() +{ + local cmd=$1; shift + local req_ver=$1; shift + local ver + + req_ver=$(echo "$req_ver" | sed 's/\.//g') + ver=$($cmd -V 2>&1 | grep -Eo '[0-9]+.[0-9]+.[0-9]+' | sed 's/\.//g') + if [[ $ver -lt $req_ver ]]; then + return 1 + else + return 0 + fi +} + +check_traceroute6_version() +{ + local req_ver=$1; shift + + __check_traceroute_version traceroute6 "$req_ver" +} + +check_traceroute_version() +{ + local req_ver=$1; shift + + __check_traceroute_version traceroute "$req_ver" +} + ################################################################################ # create namespaces and interconnects @@ -59,6 +88,8 @@ create_ns() ip 
netns exec ${ns} ip -6 ro add unreachable default metric 8192 ip netns exec ${ns} sysctl -qw net.ipv4.ip_forward=1 + ip netns exec ${ns} sysctl -qw net.ipv4.icmp_ratelimit=0 + ip netns exec ${ns} sysctl -qw net.ipv6.icmp.ratelimit=0 ip netns exec ${ns} sysctl -qw net.ipv6.conf.all.keep_addr_on_down=1 ip netns exec ${ns} sysctl -qw net.ipv6.conf.all.forwarding=1 ip netns exec ${ns} sysctl -qw net.ipv6.conf.default.forwarding=1 @@ -298,6 +329,144 @@ run_traceroute6_vrf() } ################################################################################ +# traceroute6 with ICMP extensions test +# +# Verify that in this scenario +# +# ---- ---- ---- +# |H1|--------------------------|R1|--------------------------|H2| +# ---- N1 ---- N2 ---- +# +# ICMP extensions are correctly reported. The loopback interfaces on all the +# nodes are assigned global addresses and the interfaces connecting the nodes +# are assigned IPv6 link-local addresses. + +cleanup_traceroute6_ext() +{ + cleanup_all_ns +} + +setup_traceroute6_ext() +{ + # Start clean + cleanup_traceroute6_ext + + setup_ns h1 r1 h2 + create_ns "$h1" + create_ns "$r1" + create_ns "$h2" + + # Setup N1 + connect_ns "$h1" eth1 - fe80::1/64 "$r1" eth1 - fe80::2/64 + # Setup N2 + connect_ns "$r1" eth2 - fe80::3/64 "$h2" eth2 - fe80::4/64 + + # Setup H1 + ip -n "$h1" address add 2001:db8:1::1/128 dev lo + ip -n "$h1" route add ::/0 nexthop via fe80::2 dev eth1 + + # Setup R1 + ip -n "$r1" address add 2001:db8:1::2/128 dev lo + ip -n "$r1" route add 2001:db8:1::1/128 nexthop via fe80::1 dev eth1 + ip -n "$r1" route add 2001:db8:1::3/128 nexthop via fe80::4 dev eth2 + + # Setup H2 + ip -n "$h2" address add 2001:db8:1::3/128 dev lo + ip -n "$h2" route add ::/0 nexthop via fe80::3 dev eth2 + + # Prime the network + ip netns exec "$h1" ping6 -c5 2001:db8:1::3 >/dev/null 2>&1 +} + +traceroute6_ext_iio_iif_test() +{ + local r1_ifindex h2_ifindex + local pkt_len=$1; shift + + # Test that incoming interface info is not appended by 
default. + run_cmd "$h1" "traceroute6 -e 2001:db8:1::3 $pkt_len | grep INC" + check_fail $? "Incoming interface info appended by default when should not" + + # Test that the extension is appended when enabled. + run_cmd "$r1" "bash -c \"echo 0x01 > /proc/sys/net/ipv6/icmp/errors_extension_mask\"" + check_err $? "Failed to enable incoming interface info extension on R1" + + run_cmd "$h1" "traceroute6 -e 2001:db8:1::3 $pkt_len | grep INC" + check_err $? "Incoming interface info not appended after enable" + + # Test that the extension is not appended when disabled. + run_cmd "$r1" "bash -c \"echo 0x00 > /proc/sys/net/ipv6/icmp/errors_extension_mask\"" + check_err $? "Failed to disable incoming interface info extension on R1" + + run_cmd "$h1" "traceroute6 -e 2001:db8:1::3 $pkt_len | grep INC" + check_fail $? "Incoming interface info appended after disable" + + # Test that the extension is sent correctly from both R1 and H2. + run_cmd "$r1" "sysctl -w net.ipv6.icmp.errors_extension_mask=0x01" + r1_ifindex=$(ip -n "$r1" -j link show dev eth1 | jq '.[]["ifindex"]') + run_cmd "$h1" "traceroute6 -e 2001:db8:1::3 $pkt_len | grep '<INC:$r1_ifindex,\"eth1\",mtu=1500>'" + check_err $? "Wrong incoming interface info reported from R1" + + run_cmd "$h2" "sysctl -w net.ipv6.icmp.errors_extension_mask=0x01" + h2_ifindex=$(ip -n "$h2" -j link show dev eth2 | jq '.[]["ifindex"]') + run_cmd "$h1" "traceroute6 -e 2001:db8:1::3 $pkt_len | grep '<INC:$h2_ifindex,\"eth2\",mtu=1500>'" + check_err $? "Wrong incoming interface info reported from H2" + + # Add a global address on the incoming interface of R1 and check that + # it is reported. + run_cmd "$r1" "ip address add 2001:db8:100::1/64 dev eth1 nodad" + run_cmd "$h1" "traceroute6 -e 2001:db8:1::3 $pkt_len | grep '<INC:$r1_ifindex,2001:db8:100::1,\"eth1\",mtu=1500>'" + check_err $? 
"Wrong incoming interface info reported from R1 after address addition" + run_cmd "$r1" "ip address del 2001:db8:100::1/64 dev eth1" + + # Change name and MTU and make sure the result is still correct. + run_cmd "$r1" "ip link set dev eth1 name eth1tag mtu 1501" + run_cmd "$h1" "traceroute6 -e 2001:db8:1::3 $pkt_len | grep '<INC:$r1_ifindex,\"eth1tag\",mtu=1501>'" + check_err $? "Wrong incoming interface info reported from R1 after name and MTU change" + run_cmd "$r1" "ip link set dev eth1tag name eth1 mtu 1500" + + run_cmd "$r1" "sysctl -w net.ipv6.icmp.errors_extension_mask=0x00" + run_cmd "$h2" "sysctl -w net.ipv6.icmp.errors_extension_mask=0x00" +} + +run_traceroute6_ext() +{ + # Need at least version 2.1.5 for RFC 5837 support. + if ! check_traceroute6_version 2.1.5; then + log_test_skip "traceroute6 too old, missing ICMP extensions support" + return + fi + + setup_traceroute6_ext + + RET=0 + + ## General ICMP extensions tests + + # Test that ICMP extensions are disabled by default. + run_cmd "$h1" "sysctl net.ipv6.icmp.errors_extension_mask | grep \"= 0$\"" + check_err $? "ICMP extensions are not disabled by default" + + # Test that unsupported values are rejected. Do not use "sysctl" as + # older versions do not return an error code upon failure. + run_cmd "$h1" "bash -c \"echo 0x80 > /proc/sys/net/ipv6/icmp/errors_extension_mask\"" + check_fail $? "Unsupported sysctl value was not rejected" + + ## Extension-specific tests + + # Incoming interface info test. Test with various packet sizes, + # including the default one. 
+ traceroute6_ext_iio_iif_test + traceroute6_ext_iio_iif_test 127 + traceroute6_ext_iio_iif_test 128 + traceroute6_ext_iio_iif_test 129 + + log_test "IPv6 traceroute with ICMP extensions" + + cleanup_traceroute6_ext +} + +################################################################################ # traceroute test # # Verify that traceroute from H1 to H2 shows 1.0.3.1 and 1.0.1.1 when @@ -438,14 +607,157 @@ run_traceroute_vrf() } ################################################################################ +# traceroute with ICMP extensions test +# +# Verify that in this scenario +# +# ---- ---- ---- +# |H1|--------------------------|R1|--------------------------|H2| +# ---- N1 ---- N2 ---- +# +# ICMP extensions are correctly reported. The loopback interfaces on all the +# nodes are assigned global addresses and the interfaces connecting the nodes +# are assigned IPv6 link-local addresses. + +cleanup_traceroute_ext() +{ + cleanup_all_ns +} + +setup_traceroute_ext() +{ + # Start clean + cleanup_traceroute_ext + + setup_ns h1 r1 h2 + create_ns "$h1" + create_ns "$r1" + create_ns "$h2" + + # Setup N1 + connect_ns "$h1" eth1 - fe80::1/64 "$r1" eth1 - fe80::2/64 + # Setup N2 + connect_ns "$r1" eth2 - fe80::3/64 "$h2" eth2 - fe80::4/64 + + # Setup H1 + ip -n "$h1" address add 192.0.2.1/32 dev lo + ip -n "$h1" route add 0.0.0.0/0 nexthop via inet6 fe80::2 dev eth1 + + # Setup R1 + ip -n "$r1" address add 192.0.2.2/32 dev lo + ip -n "$r1" route add 192.0.2.1/32 nexthop via inet6 fe80::1 dev eth1 + ip -n "$r1" route add 192.0.2.3/32 nexthop via inet6 fe80::4 dev eth2 + + # Setup H2 + ip -n "$h2" address add 192.0.2.3/32 dev lo + ip -n "$h2" route add 0.0.0.0/0 nexthop via inet6 fe80::3 dev eth2 + + # Prime the network + ip netns exec "$h1" ping -c5 192.0.2.3 >/dev/null 2>&1 +} + +traceroute_ext_iio_iif_test() +{ + local r1_ifindex h2_ifindex + local pkt_len=$1; shift + + # Test that incoming interface info is not appended by default. 
+ run_cmd "$h1" "traceroute -e 192.0.2.3 $pkt_len | grep INC" + check_fail $? "Incoming interface info appended by default when should not" + + # Test that the extension is appended when enabled. + run_cmd "$r1" "bash -c \"echo 0x01 > /proc/sys/net/ipv4/icmp_errors_extension_mask\"" + check_err $? "Failed to enable incoming interface info extension on R1" + + run_cmd "$h1" "traceroute -e 192.0.2.3 $pkt_len | grep INC" + check_err $? "Incoming interface info not appended after enable" + + # Test that the extension is not appended when disabled. + run_cmd "$r1" "bash -c \"echo 0x00 > /proc/sys/net/ipv4/icmp_errors_extension_mask\"" + check_err $? "Failed to disable incoming interface info extension on R1" + + run_cmd "$h1" "traceroute -e 192.0.2.3 $pkt_len | grep INC" + check_fail $? "Incoming interface info appended after disable" + + # Test that the extension is sent correctly from both R1 and H2. + run_cmd "$r1" "sysctl -w net.ipv4.icmp_errors_extension_mask=0x01" + r1_ifindex=$(ip -n "$r1" -j link show dev eth1 | jq '.[]["ifindex"]') + run_cmd "$h1" "traceroute -e 192.0.2.3 $pkt_len | grep '<INC:$r1_ifindex,\"eth1\",mtu=1500>'" + check_err $? "Wrong incoming interface info reported from R1" + + run_cmd "$h2" "sysctl -w net.ipv4.icmp_errors_extension_mask=0x01" + h2_ifindex=$(ip -n "$h2" -j link show dev eth2 | jq '.[]["ifindex"]') + run_cmd "$h1" "traceroute -e 192.0.2.3 $pkt_len | grep '<INC:$h2_ifindex,\"eth2\",mtu=1500>'" + check_err $? "Wrong incoming interface info reported from H2" + + # Add a global address on the incoming interface of R1 and check that + # it is reported. + run_cmd "$r1" "ip address add 198.51.100.1/24 dev eth1" + run_cmd "$h1" "traceroute -e 192.0.2.3 $pkt_len | grep '<INC:$r1_ifindex,198.51.100.1,\"eth1\",mtu=1500>'" + check_err $? "Wrong incoming interface info reported from R1 after address addition" + run_cmd "$r1" "ip address del 198.51.100.1/24 dev eth1" + + # Change name and MTU and make sure the result is still correct. 
+ # Re-add the route towards H1 since it was deleted when we removed the + # last IPv4 address from eth1 on R1. + run_cmd "$r1" "ip route add 192.0.2.1/32 nexthop via inet6 fe80::1 dev eth1" + run_cmd "$r1" "ip link set dev eth1 name eth1tag mtu 1501" + run_cmd "$h1" "traceroute -e 192.0.2.3 $pkt_len | grep '<INC:$r1_ifindex,\"eth1tag\",mtu=1501>'" + check_err $? "Wrong incoming interface info reported from R1 after name and MTU change" + run_cmd "$r1" "ip link set dev eth1tag name eth1 mtu 1500" + + run_cmd "$r1" "sysctl -w net.ipv4.icmp_errors_extension_mask=0x00" + run_cmd "$h2" "sysctl -w net.ipv4.icmp_errors_extension_mask=0x00" +} + +run_traceroute_ext() +{ + # Need at least version 2.1.5 for RFC 5837 support. + if ! check_traceroute_version 2.1.5; then + log_test_skip "traceroute too old, missing ICMP extensions support" + return + fi + + setup_traceroute_ext + + RET=0 + + ## General ICMP extensions tests + + # Test that ICMP extensions are disabled by default. + run_cmd "$h1" "sysctl net.ipv4.icmp_errors_extension_mask | grep \"= 0$\"" + check_err $? "ICMP extensions are not disabled by default" + + # Test that unsupported values are rejected. Do not use "sysctl" as + # older versions do not return an error code upon failure. + run_cmd "$h1" "bash -c \"echo 0x80 > /proc/sys/net/ipv4/icmp_errors_extension_mask\"" + check_fail $? "Unsupported sysctl value was not rejected" + + ## Extension-specific tests + + # Incoming interface info test. Test with various packet sizes, + # including the default one. 
+ traceroute_ext_iio_iif_test + traceroute_ext_iio_iif_test 127 + traceroute_ext_iio_iif_test 128 + traceroute_ext_iio_iif_test 129 + + log_test "IPv4 traceroute with ICMP extensions" + + cleanup_traceroute_ext +} + +################################################################################ # Run tests run_tests() { run_traceroute6 run_traceroute6_vrf + run_traceroute6_ext run_traceroute run_traceroute_vrf + run_traceroute_ext } ################################################################################ @@ -462,6 +774,7 @@ done require_command traceroute6 require_command traceroute +require_command jq run_tests diff --git a/tools/testing/selftests/net/txtimestamp.c b/tools/testing/selftests/net/txtimestamp.c index dae91eb97d69..bcc14688661d 100644 --- a/tools/testing/selftests/net/txtimestamp.c +++ b/tools/testing/selftests/net/txtimestamp.c @@ -217,7 +217,7 @@ static void print_timestamp_usr(void) static void print_timestamp(struct scm_timestamping *tss, int tstype, int tskey, int payload_len) { - const char *tsname; + const char *tsname = NULL; validate_key(tskey, tstype); diff --git a/tools/testing/selftests/nolibc/Makefile.nolibc b/tools/testing/selftests/nolibc/Makefile.nolibc index 330e000baeb1..f9d43cbdc894 100644 --- a/tools/testing/selftests/nolibc/Makefile.nolibc +++ b/tools/testing/selftests/nolibc/Makefile.nolibc @@ -87,7 +87,6 @@ IMAGE_riscv = arch/riscv/boot/Image IMAGE_riscv32 = arch/riscv/boot/Image IMAGE_riscv64 = arch/riscv/boot/Image IMAGE_s390x = arch/s390/boot/bzImage -IMAGE_s390 = arch/s390/boot/bzImage IMAGE_loongarch = arch/loongarch/boot/vmlinuz.efi IMAGE_sparc32 = arch/sparc/boot/image IMAGE_sparc64 = arch/sparc/boot/image @@ -117,7 +116,6 @@ DEFCONFIG_riscv = defconfig DEFCONFIG_riscv32 = rv32_defconfig DEFCONFIG_riscv64 = defconfig DEFCONFIG_s390x = defconfig -DEFCONFIG_s390 = defconfig compat.config DEFCONFIG_loongarch = defconfig DEFCONFIG_sparc32 = sparc32_defconfig DEFCONFIG_sparc64 = sparc64_defconfig @@ -156,7 +154,6 @@ 
QEMU_ARCH_riscv = riscv64 QEMU_ARCH_riscv32 = riscv32 QEMU_ARCH_riscv64 = riscv64 QEMU_ARCH_s390x = s390x -QEMU_ARCH_s390 = s390x QEMU_ARCH_loongarch = loongarch64 QEMU_ARCH_sparc32 = sparc QEMU_ARCH_sparc64 = sparc64 @@ -197,7 +194,6 @@ QEMU_ARGS_riscv = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_T QEMU_ARGS_riscv32 = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_riscv64 = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_s390x = -M s390-ccw-virtio -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" -QEMU_ARGS_s390 = -M s390-ccw-virtio -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_loongarch = -M virt -append "console=ttyS0,115200 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_sparc32 = -M SS-5 -m 256M -append "console=ttyS0,115200 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_sparc64 = -M sun4u -append "console=ttyS0,115200 panic=-1 $(TEST:%=NOLIBC_TEST=%)" @@ -223,13 +219,13 @@ CFLAGS_ppc = -m32 -mbig-endian -mno-vsx $(call cc-option,-mmultiple) CFLAGS_ppc64 = -m64 -mbig-endian -mno-vsx $(call cc-option,-mmultiple) CFLAGS_ppc64le = -m64 -mlittle-endian -mno-vsx $(call cc-option,-mabi=elfv2) CFLAGS_s390x = -m64 -CFLAGS_s390 = -m31 CFLAGS_mips32le = -EL -mabi=32 -fPIC CFLAGS_mips32be = -EB -mabi=32 CFLAGS_mipsn32le = -EL -mabi=n32 -fPIC -march=mips64r2 CFLAGS_mipsn32be = -EB -mabi=n32 -march=mips64r6 CFLAGS_mips64le = -EL -mabi=64 -march=mips64r6 CFLAGS_mips64be = -EB -mabi=64 -march=mips64r2 +CFLAGS_loongarch = $(if $(LLVM),-fuse-ld=lld) CFLAGS_sparc32 = $(call cc-option,-m32) CFLAGS_sh4 = -ml -m4 ifeq ($(origin XARCH),command line) diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index 29de21595fc9..3c5a226dad3a 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -25,6 +25,7 @@ #include <sys/sysmacros.h> #include <sys/time.h> #include <sys/timerfd.h> +#include 
<sys/uio.h> #include <sys/utsname.h> #include <sys/wait.h> #include <dirent.h> @@ -1282,6 +1283,10 @@ int run_syscall(int min, int max) int proc; int test; int tmp; + struct iovec iov_one = { + .iov_base = &tmp, + .iov_len = 1, + }; int ret = 0; void *p1, *p2; int has_gettid = 1; @@ -1343,6 +1348,8 @@ int run_syscall(int min, int max) CASE_TEST(dup3_0); tmp = dup3(0, 100, 0); EXPECT_SYSNE(1, tmp, -1); close(tmp); break; CASE_TEST(dup3_m1); tmp = dup3(-1, 100, 0); EXPECT_SYSER(1, tmp, -1, EBADF); if (tmp != -1) close(tmp); break; CASE_TEST(execve_root); EXPECT_SYSER(1, execve("/", (char*[]){ [0] = "/", [1] = NULL }, NULL), -1, EACCES); break; + CASE_TEST(fchdir_stdin); EXPECT_SYSER(1, fchdir(STDIN_FILENO), -1, ENOTDIR); break; + CASE_TEST(fchdir_badfd); EXPECT_SYSER(1, fchdir(-1), -1, EBADF); break; CASE_TEST(file_stream); EXPECT_SYSZR(1, test_file_stream()); break; CASE_TEST(fork); EXPECT_SYSZR(1, test_fork(FORK_STANDARD)); break; CASE_TEST(getdents64_root); EXPECT_SYSNE(1, test_getdents64("/"), -1); break; @@ -1395,6 +1402,10 @@ int run_syscall(int min, int max) CASE_TEST(waitpid_child); EXPECT_SYSER(1, waitpid(getpid(), &tmp, WNOHANG), -1, ECHILD); break; CASE_TEST(write_badf); EXPECT_SYSER(1, write(-1, &tmp, 1), -1, EBADF); break; CASE_TEST(write_zero); EXPECT_SYSZR(1, write(1, &tmp, 0)); break; + CASE_TEST(readv_badf); EXPECT_SYSER(1, readv(-1, &iov_one, 1), -1, EBADF); break; + CASE_TEST(readv_zero); EXPECT_SYSZR(1, readv(1, NULL, 0)); break; + CASE_TEST(writev_badf); EXPECT_SYSER(1, writev(-1, &iov_one, 1), -1, EBADF); break; + CASE_TEST(writev_zero); EXPECT_SYSZR(1, writev(1, NULL, 0)); break; CASE_TEST(syscall_noargs); EXPECT_SYSEQ(1, syscall(__NR_getpid), getpid()); break; CASE_TEST(syscall_args); EXPECT_SYSER(1, syscall(__NR_statx, 0, NULL, 0, 0, NULL), -1, EFAULT); break; CASE_TEST(namespace); EXPECT_SYSZR(euid0 && proc, test_namespace()); break; @@ -1540,6 +1551,8 @@ int run_stdlib(int min, int max) CASE_TEST(abs); EXPECT_EQ(1, abs(-10), 10); break; 
CASE_TEST(abs_noop); EXPECT_EQ(1, abs(10), 10); break; CASE_TEST(difftime); EXPECT_ZR(1, test_difftime()); break; + CASE_TEST(memchr_foobar6_o); EXPECT_STREQ(1, memchr("foobar", 'o', 6), "oobar"); break; + CASE_TEST(memchr_foobar3_b); EXPECT_STRZR(1, memchr("foobar", 'b', 3)); break; case __LINE__: return ret; /* must be last */ diff --git a/tools/testing/selftests/nolibc/run-tests.sh b/tools/testing/selftests/nolibc/run-tests.sh index e8af1fb505cf..3917cfb8fdc4 100755 --- a/tools/testing/selftests/nolibc/run-tests.sh +++ b/tools/testing/selftests/nolibc/run-tests.sh @@ -23,7 +23,7 @@ all_archs=( mips32le mips32be mipsn32le mipsn32be mips64le mips64be ppc ppc64 ppc64le riscv32 riscv64 - s390x s390 + s390x loongarch sparc32 sparc64 m68k @@ -169,7 +169,7 @@ test_arch() { cross_compile=$(realpath "${download_location}gcc-${crosstool_version}-nolibc/${ct_arch}-${ct_abi}/bin/${ct_arch}-${ct_abi}-") build_dir="${build_location}/${arch}" if [ "$werror" -ne 0 ]; then - CFLAGS_EXTRA="$CFLAGS_EXTRA -Werror" + CFLAGS_EXTRA="$CFLAGS_EXTRA -Werror -Wl,--fatal-warnings" fi MAKE=(make -f Makefile.nolibc -j"${nproc}" XARCH="${arch}" CROSS_COMPILE="${cross_compile}" LLVM="${llvm}" O="${build_dir}") @@ -185,10 +185,6 @@ test_arch() { exit 1 esac printf '%-15s' "$arch:" - if [ "$arch" = "s390" ] && ([ "$llvm" = "1" ] || [ "$test_mode" = "user" ]); then - echo "Unsupported configuration" - return - fi if [ "$arch" = "m68k" -o "$arch" = "sh4" ] && [ "$llvm" = "1" ]; then echo "Unsupported configuration" return diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h index f87993def738..d60f10a873bb 100644 --- a/tools/testing/selftests/pidfd/pidfd.h +++ b/tools/testing/selftests/pidfd/pidfd.h @@ -148,6 +148,14 @@ #define PIDFD_INFO_COREDUMP (1UL << 4) #endif +#ifndef PIDFD_INFO_SUPPORTED_MASK +#define PIDFD_INFO_SUPPORTED_MASK (1UL << 5) +#endif + +#ifndef PIDFD_INFO_COREDUMP_SIGNAL +#define PIDFD_INFO_COREDUMP_SIGNAL (1UL << 6) +#endif + #ifndef 
PIDFD_COREDUMPED #define PIDFD_COREDUMPED (1U << 0) /* Did crash and... */ #endif @@ -183,8 +191,11 @@ struct pidfd_info { __u32 fsuid; __u32 fsgid; __s32 exit_code; - __u32 coredump_mask; - __u32 __spare1; + struct { + __u32 coredump_mask; + __u32 coredump_signal; + }; + __u64 supported_mask; }; /* diff --git a/tools/testing/selftests/pidfd/pidfd_info_test.c b/tools/testing/selftests/pidfd/pidfd_info_test.c index a0eb6e81eaa2..cb5430a2fd75 100644 --- a/tools/testing/selftests/pidfd/pidfd_info_test.c +++ b/tools/testing/selftests/pidfd/pidfd_info_test.c @@ -690,4 +690,77 @@ TEST_F(pidfd_info, thread_group_exec_thread) EXPECT_EQ(close(pidfd_thread), 0); } +/* + * Test: PIDFD_INFO_SUPPORTED_MASK field + * + * Verify that when PIDFD_INFO_SUPPORTED_MASK is requested, the kernel + * returns the supported_mask field indicating which flags the kernel supports. + */ +TEST(supported_mask_field) +{ + struct pidfd_info info = { + .mask = PIDFD_INFO_SUPPORTED_MASK, + }; + int pidfd; + pid_t pid; + + pid = create_child(&pidfd, 0); + ASSERT_GE(pid, 0); + + if (pid == 0) + pause(); + + /* Request supported_mask field */ + ASSERT_EQ(ioctl(pidfd, PIDFD_GET_INFO, &info), 0); + + /* Verify PIDFD_INFO_SUPPORTED_MASK is set in the reply */ + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_SUPPORTED_MASK)); + + /* Verify supported_mask contains expected flags */ + ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_PID)); + ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_CREDS)); + ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_CGROUPID)); + ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_EXIT)); + ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_COREDUMP)); + ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_SUPPORTED_MASK)); + ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_COREDUMP_SIGNAL)); + + /* Clean up */ + sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0); + sys_waitid(P_PIDFD, pidfd, NULL, WEXITED); + close(pidfd); +} + +/* + * Test: PIDFD_INFO_SUPPORTED_MASK always available + * + * Verify that 
supported_mask is returned even when other fields are requested. + */ +TEST(supported_mask_with_other_fields) +{ + struct pidfd_info info = { + .mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_SUPPORTED_MASK, + }; + int pidfd; + pid_t pid; + + pid = create_child(&pidfd, 0); + ASSERT_GE(pid, 0); + + if (pid == 0) + pause(); + + ASSERT_EQ(ioctl(pidfd, PIDFD_GET_INFO, &info), 0); + + /* Both fields should be present */ + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_CGROUPID)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_SUPPORTED_MASK)); + ASSERT_NE(info.supported_mask, 0); + + /* Clean up */ + sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0); + sys_waitid(P_PIDFD, pidfd, NULL, WEXITED); + close(pidfd); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/rcutorture/bin/kvm-again.sh b/tools/testing/selftests/rcutorture/bin/kvm-again.sh index 88ca4e368489..b5239b52cb5d 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-again.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-again.sh @@ -31,7 +31,7 @@ fi if ! cp "$oldrun/scenarios" $T/scenarios.oldrun then # Later on, can reconstitute this from console.log files. 
- echo Prior run batches file does not exist: $oldrun/batches + echo Prior run scenarios file does not exist: $oldrun/scenarios exit 1 fi @@ -68,7 +68,7 @@ usage () { echo " --datestamp string" echo " --dryrun" echo " --duration minutes | <seconds>s | <hours>h | <days>d" - echo " --link hard|soft|copy" + echo " --link hard|soft|copy|inplace|inplace-force" echo " --remote" echo " --rundir /new/res/path" echo "Command line: $scriptname $args" @@ -121,7 +121,7 @@ do shift ;; --link) - checkarg --link "hard|soft|copy" "$#" "$2" 'hard\|soft\|copy' '^--' + checkarg --link "hard|soft|copy|inplace|inplace-force" "$#" "$2" 'hard\|soft\|copy\|inplace\|inplace-force' '^--' case "$2" in copy) arg_link="cp -R" @@ -132,6 +132,14 @@ do soft) arg_link="cp -Rs" ;; + inplace) + arg_link="inplace" + rundir="$oldrun" + ;; + inplace-force) + arg_link="inplace-force" + rundir="$oldrun" + ;; esac shift ;; @@ -172,21 +180,37 @@ fi echo ---- Re-run results directory: $rundir -# Copy old run directory tree over and adjust. -mkdir -p "`dirname "$rundir"`" -if ! $arg_link "$oldrun" "$rundir" -then - echo "Cannot copy from $oldrun to $rundir." - usage -fi -rm -f "$rundir"/*/{console.log,console.log.diags,qemu_pid,qemu-pid,qemu-retval,Warnings,kvm-test-1-run.sh.out,kvm-test-1-run-qemu.sh.out,vmlinux} "$rundir"/log -touch "$rundir/log" -echo $scriptname $args | tee -a "$rundir/log" -echo $oldrun > "$rundir/re-run" -if ! test -d "$rundir/../../bin" +if test "$oldrun" != "$rundir" then - $arg_link "$oldrun/../../bin" "$rundir/../.." + # Copy old run directory tree over and adjust. + mkdir -p "`dirname "$rundir"`" + if ! $arg_link "$oldrun" "$rundir" + then + echo "Cannot copy from $oldrun to $rundir." + usage + fi + rm -f "$rundir"/*/{console.log,console.log.diags,qemu_pid,qemu-pid,qemu-retval,Warnings,kvm-test-1-run.sh.out,kvm-test-1-run-qemu.sh.out,vmlinux} "$rundir"/log + touch "$rundir/log" + echo $scriptname $args | tee -a "$rundir/log" + echo $oldrun > "$rundir/re-run" + if ! 
test -d "$rundir/../../bin" + then + $arg_link "$oldrun/../../bin" "$rundir/../.." + fi +else + # Check for a run having already happened. + find "$rundir" -name console.log -print > $T/oldrun-console.log + if test -s $T/oldrun-console.log + then + echo Run already took place in $rundir + if test "$arg_link" = inplace + then + usage + fi + fi fi + +# Find runs to be done based on their qemu-cmd files. for i in $rundir/*/qemu-cmd do cp "$i" $T diff --git a/tools/testing/selftests/rcutorture/bin/kvm-series.sh b/tools/testing/selftests/rcutorture/bin/kvm-series.sh new file mode 100755 index 000000000000..2ff905a1853b --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kvm-series.sh @@ -0,0 +1,116 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Usage: kvm-series.sh config-list commit-id-list [ kvm.sh parameters ] +# +# Tests the specified list of unadorned configs ("TREE01 SRCU-P" but not +# "CFLIST" or "3*TRACE01") and an indication of a set of commits to test, +# then runs each commit through the specified list of commits using kvm.sh. +# The runs are grouped into a -series/config/commit directory tree. +# Each run defaults to a duration of one minute. +# +# Run in top-level Linux source directory. Please note that this is in +# no way a replacement for "git bisect"!!! +# +# This script is intended to replace kvm-check-branches.sh by providing +# ease of use and faster execution. + +T="`mktemp -d ${TMPDIR-/tmp}/kvm-series.sh.XXXXXX`" +trap 'rm -rf $T' 0 + +scriptname=$0 +args="$*" + +config_list="${1}" +if test -z "${config_list}" +then + echo "$0: Need a quoted list of --config arguments for first argument." + exit 1 +fi +if test -z "${config_list}" || echo "${config_list}" | grep -q '\*' +then + echo "$0: Repetition ('*') not allowed in config list." + exit 1 +fi + +commit_list="${2}" +if test -z "${commit_list}" +then + echo "$0: Need a list of commits (e.g., HEAD^^^..) for second argument." 
+ exit 2 +fi +git log --pretty=format:"%h" "${commit_list}" > $T/commits +ret=$? +if test "${ret}" -ne 0 +then + echo "$0: Invalid commit list ('${commit_list}')." + exit 2 +fi +sha1_list=`cat $T/commits` + +shift +shift + +RCUTORTURE="`pwd`/tools/testing/selftests/rcutorture"; export RCUTORTURE +PATH=${RCUTORTURE}/bin:$PATH; export PATH +. functions.sh + +ret=0 +nfail=0 +nsuccess=0 +faillist= +successlist= +cursha1="`git rev-parse --abbrev-ref HEAD`" +ds="`date +%Y.%m.%d-%H.%M.%S`-series" +startdate="`date`" +starttime="`get_starttime`" + +echo " --- " $scriptname $args | tee -a $T/log +echo " --- Results directory: " $ds | tee -a $T/log + +for config in ${config_list} +do + sha_n=0 + for sha in ${sha1_list} + do + sha1=${sha_n}.${sha} # Enable "sort -k1nr" to list commits in order. + echo Starting ${config}/${sha1} at `date` | tee -a $T/log + git checkout "${sha}" + time tools/testing/selftests/rcutorture/bin/kvm.sh --configs "$config" --datestamp "$ds/${config}/${sha1}" --duration 1 "$@" + curret=$? + if test "${curret}" -ne 0 + then + nfail=$((nfail+1)) + faillist="$faillist ${config}/${sha1}(${curret})" + else + nsuccess=$((nsuccess+1)) + successlist="$successlist ${config}/${sha1}" + # Successful run, so remove large files. + rm -f ${RCUTORTURE}/$ds/${config}/${sha1}/{vmlinux,bzImage,System.map,Module.symvers} + fi + if test "${ret}" -eq 0 + then + ret=${curret} + fi + sha_n=$((sha_n+1)) + done +done +git checkout "${cursha1}" + +echo ${nsuccess} SUCCESSES: | tee -a $T/log +echo ${successlist} | fmt | tee -a $T/log +echo | tee -a $T/log +echo ${nfail} FAILURES: | tee -a $T/log +echo ${faillist} | fmt | tee -a $T/log +if test -n "${faillist}" +then + echo | tee -a $T/log + echo Failures across commits: | tee -a $T/log + echo ${faillist} | tr ' ' '\012' | sed -e 's,^[^/]*/,,' -e 's/([0-9]*)//' | + sort | uniq -c | sort -k2n | tee -a $T/log +fi +echo Started at $startdate, ended at `date`, duration `get_starttime_duration $starttime`. 
| tee -a $T/log +echo Summary: Successes: ${nsuccess} Failures: ${nfail} | tee -a $T/log +cp $T/log tools/testing/selftests/rcutorture/res/${ds} + +exit "${ret}" diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 617cba339d28..fff15821c44c 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -199,7 +199,7 @@ do fi ;; --kconfig|--kconfigs) - checkarg --kconfig "(Kconfig options)" $# "$2" '^\(#CHECK#\)\?CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\|"[^"]*"\)\( \+\(#CHECK#\)\?CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\|"[^"]*"\)\)* *$' '^error$' + checkarg --kconfig "(Kconfig options)" $# "$2" '^\(#CHECK#\)\?CONFIG_[A-Z0-9_]\+=\([ynm]\|-\?[0-9]\+\|"[^"]*"\)\( \+\(#CHECK#\)\?CONFIG_[A-Z0-9_]\+=\([ynm]\|-\?[0-9]\+\|"[^"]*"\)\)* *$' '^error$' TORTURE_KCONFIG_ARG="`echo "$TORTURE_KCONFIG_ARG $2" | sed -e 's/^ *//' -e 's/ *$//'`" shift ;; diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 b/tools/testing/selftests/rcutorture/configs/rcu/TREE04 index dc4985064b3a..67caf4276bb0 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04 @@ -16,3 +16,4 @@ CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_RCU_EXPERT=y CONFIG_RCU_EQS_DEBUG=y CONFIG_RCU_LAZY=y +CONFIG_RCU_DYNTICKS_TORTURE=y diff --git a/tools/testing/selftests/riscv/hwprobe/cbo.c b/tools/testing/selftests/riscv/hwprobe/cbo.c index 5e96ef785d0d..6d99726aceac 100644 --- a/tools/testing/selftests/riscv/hwprobe/cbo.c +++ b/tools/testing/selftests/riscv/hwprobe/cbo.c @@ -15,24 +15,31 @@ #include <linux/compiler.h> #include <linux/kernel.h> #include <asm/ucontext.h> +#include <getopt.h> #include "hwprobe.h" #include "../../kselftest.h" #define MK_CBO(fn) le32_bswap((uint32_t)(fn) << 20 | 10 << 15 | 2 << 12 | 0 << 7 | 15) +#define MK_PREFETCH(fn) \ + le32_bswap(0 << 25 | (uint32_t)(fn) << 20 | 10 << 15 | 6 << 12 | 0 << 7 | 19) static char 
mem[4096] __aligned(4096) = { [0 ... 4095] = 0xa5 }; -static bool illegal_insn; +static bool got_fault; -static void sigill_handler(int sig, siginfo_t *info, void *context) +static void fault_handler(int sig, siginfo_t *info, void *context) { unsigned long *regs = (unsigned long *)&((ucontext_t *)context)->uc_mcontext; uint32_t insn = *(uint32_t *)regs[0]; - assert(insn == MK_CBO(regs[11])); + if (sig == SIGILL) + assert(insn == MK_CBO(regs[11])); - illegal_insn = true; + if (sig == SIGSEGV || sig == SIGBUS) + assert(insn == MK_PREFETCH(regs[11])); + + got_fault = true; regs[0] += 4; } @@ -45,39 +52,51 @@ static void sigill_handler(int sig, siginfo_t *info, void *context) : : "r" (base), "i" (fn), "i" (MK_CBO(fn)) : "a0", "a1", "memory"); \ }) +#define prefetch_insn(base, fn) \ +({ \ + asm volatile( \ + "mv a0, %0\n" \ + "li a1, %1\n" \ + ".4byte %2\n" \ + : : "r" (base), "i" (fn), "i" (MK_PREFETCH(fn)) : "a0", "a1"); \ +}) + static void cbo_inval(char *base) { cbo_insn(base, 0); } static void cbo_clean(char *base) { cbo_insn(base, 1); } static void cbo_flush(char *base) { cbo_insn(base, 2); } static void cbo_zero(char *base) { cbo_insn(base, 4); } +static void prefetch_i(char *base) { prefetch_insn(base, 0); } +static void prefetch_r(char *base) { prefetch_insn(base, 1); } +static void prefetch_w(char *base) { prefetch_insn(base, 3); } static void test_no_cbo_inval(void *arg) { ksft_print_msg("Testing cbo.inval instruction remain privileged\n"); - illegal_insn = false; + got_fault = false; cbo_inval(&mem[0]); - ksft_test_result(illegal_insn, "No cbo.inval\n"); + ksft_test_result(got_fault, "No cbo.inval\n"); } static void test_no_zicbom(void *arg) { ksft_print_msg("Testing Zicbom instructions remain privileged\n"); - illegal_insn = false; + got_fault = false; cbo_clean(&mem[0]); - ksft_test_result(illegal_insn, "No cbo.clean\n"); + ksft_test_result(got_fault, "No cbo.clean\n"); - illegal_insn = false; + got_fault = false; cbo_flush(&mem[0]); - 
ksft_test_result(illegal_insn, "No cbo.flush\n"); + ksft_test_result(got_fault, "No cbo.flush\n"); } static void test_no_zicboz(void *arg) { ksft_print_msg("No Zicboz, testing cbo.zero remains privileged\n"); - illegal_insn = false; + got_fault = false; cbo_zero(&mem[0]); - ksft_test_result(illegal_insn, "No cbo.zero\n"); + ksft_test_result(got_fault, "No cbo.zero\n"); } static bool is_power_of_2(__u64 n) @@ -85,6 +104,51 @@ static bool is_power_of_2(__u64 n) return n != 0 && (n & (n - 1)) == 0; } +static void test_zicbop(void *arg) +{ + struct riscv_hwprobe pair = { + .key = RISCV_HWPROBE_KEY_ZICBOP_BLOCK_SIZE, + }; + struct sigaction act = { + .sa_sigaction = &fault_handler, + .sa_flags = SA_SIGINFO + }; + struct sigaction dfl = { + .sa_handler = SIG_DFL + }; + cpu_set_t *cpus = (cpu_set_t *)arg; + __u64 block_size; + long rc; + + rc = sigaction(SIGSEGV, &act, NULL); + assert(rc == 0); + rc = sigaction(SIGBUS, &act, NULL); + assert(rc == 0); + + rc = riscv_hwprobe(&pair, 1, sizeof(cpu_set_t), (unsigned long *)cpus, 0); + block_size = pair.value; + ksft_test_result(rc == 0 && pair.key == RISCV_HWPROBE_KEY_ZICBOP_BLOCK_SIZE && + is_power_of_2(block_size), "Zicbop block size\n"); + ksft_print_msg("Zicbop block size: %llu\n", block_size); + + got_fault = false; + prefetch_i(&mem[0]); + prefetch_r(&mem[0]); + prefetch_w(&mem[0]); + ksft_test_result(!got_fault, "Zicbop prefetch.* on valid address\n"); + + got_fault = false; + prefetch_i(NULL); + prefetch_r(NULL); + prefetch_w(NULL); + ksft_test_result(!got_fault, "Zicbop prefetch.* on NULL\n"); + + rc = sigaction(SIGBUS, &dfl, NULL); + assert(rc == 0); + rc = sigaction(SIGSEGV, &dfl, NULL); + assert(rc == 0); +} + static void test_zicbom(void *arg) { struct riscv_hwprobe pair = { @@ -100,13 +164,13 @@ static void test_zicbom(void *arg) is_power_of_2(block_size), "Zicbom block size\n"); ksft_print_msg("Zicbom block size: %llu\n", block_size); - illegal_insn = false; + got_fault = false; cbo_clean(&mem[block_size]); - 
ksft_test_result(!illegal_insn, "cbo.clean\n"); + ksft_test_result(!got_fault, "cbo.clean\n"); - illegal_insn = false; + got_fault = false; cbo_flush(&mem[block_size]); - ksft_test_result(!illegal_insn, "cbo.flush\n"); + ksft_test_result(!got_fault, "cbo.flush\n"); } static void test_zicboz(void *arg) @@ -125,11 +189,11 @@ static void test_zicboz(void *arg) is_power_of_2(block_size), "Zicboz block size\n"); ksft_print_msg("Zicboz block size: %llu\n", block_size); - illegal_insn = false; + got_fault = false; cbo_zero(&mem[block_size]); - ksft_test_result(!illegal_insn, "cbo.zero\n"); + ksft_test_result(!got_fault, "cbo.zero\n"); - if (illegal_insn || !is_power_of_2(block_size)) { + if (got_fault || !is_power_of_2(block_size)) { ksft_test_result_skip("cbo.zero check\n"); return; } @@ -177,7 +241,19 @@ static void check_no_zicbo_cpus(cpu_set_t *cpus, __u64 cbo) rc = riscv_hwprobe(&pair, 1, sizeof(cpu_set_t), (unsigned long *)&one_cpu, 0); assert(rc == 0 && pair.key == RISCV_HWPROBE_KEY_IMA_EXT_0); - cbostr = cbo == RISCV_HWPROBE_EXT_ZICBOZ ? 
"Zicboz" : "Zicbom"; + switch (cbo) { + case RISCV_HWPROBE_EXT_ZICBOZ: + cbostr = "Zicboz"; + break; + case RISCV_HWPROBE_EXT_ZICBOM: + cbostr = "Zicbom"; + break; + case RISCV_HWPROBE_EXT_ZICBOP: + cbostr = "Zicbop"; + break; + default: + ksft_exit_fail_msg("Internal error: invalid cbo %llu\n", cbo); + } if (pair.value & cbo) ksft_exit_fail_msg("%s is only present on a subset of harts.\n" @@ -194,6 +270,7 @@ enum { TEST_ZICBOM, TEST_NO_ZICBOM, TEST_NO_CBO_INVAL, + TEST_ZICBOP, }; static struct test_info { @@ -206,26 +283,51 @@ static struct test_info { [TEST_ZICBOM] = { .nr_tests = 3, test_zicbom }, [TEST_NO_ZICBOM] = { .nr_tests = 2, test_no_zicbom }, [TEST_NO_CBO_INVAL] = { .nr_tests = 1, test_no_cbo_inval }, + [TEST_ZICBOP] = { .nr_tests = 3, test_zicbop }, +}; + +static const struct option long_opts[] = { + {"zicbom-raises-sigill", no_argument, 0, 'm'}, + {"zicboz-raises-sigill", no_argument, 0, 'z'}, + {0, 0, 0, 0} }; int main(int argc, char **argv) { struct sigaction act = { - .sa_sigaction = &sigill_handler, + .sa_sigaction = &fault_handler, .sa_flags = SA_SIGINFO, }; struct riscv_hwprobe pair; unsigned int plan = 0; cpu_set_t cpus; long rc; - int i; - - if (argc > 1 && !strcmp(argv[1], "--sigill")) { - rc = sigaction(SIGILL, &act, NULL); - assert(rc == 0); - tests[TEST_NO_ZICBOZ].enabled = true; - tests[TEST_NO_ZICBOM].enabled = true; - tests[TEST_NO_CBO_INVAL].enabled = true; + int i, opt, long_index; + + long_index = 0; + + while ((opt = getopt_long(argc, argv, "mz", long_opts, &long_index)) != -1) { + switch (opt) { + case 'm': + tests[TEST_NO_ZICBOM].enabled = true; + tests[TEST_NO_CBO_INVAL].enabled = true; + rc = sigaction(SIGILL, &act, NULL); + assert(rc == 0); + break; + case 'z': + tests[TEST_NO_ZICBOZ].enabled = true; + tests[TEST_NO_CBO_INVAL].enabled = true; + rc = sigaction(SIGILL, &act, NULL); + assert(rc == 0); + break; + case '?': + fprintf(stderr, + "Usage: %s [--zicbom-raises-sigill|-m] [--zicboz-raises-sigill|-z]\n", + argv[0]); + 
exit(1); + default: + break; + } } rc = sched_getaffinity(0, sizeof(cpu_set_t), &cpus); @@ -253,6 +355,11 @@ int main(int argc, char **argv) check_no_zicbo_cpus(&cpus, RISCV_HWPROBE_EXT_ZICBOM); } + if (pair.value & RISCV_HWPROBE_EXT_ZICBOP) + tests[TEST_ZICBOP].enabled = true; + else + check_no_zicbo_cpus(&cpus, RISCV_HWPROBE_EXT_ZICBOP); + for (i = 0; i < ARRAY_SIZE(tests); ++i) plan += tests[i].enabled ? tests[i].nr_tests : 0; diff --git a/tools/testing/selftests/riscv/vector/Makefile b/tools/testing/selftests/riscv/vector/Makefile index 6f7497f4e7b3..2c2a33fc083e 100644 --- a/tools/testing/selftests/riscv/vector/Makefile +++ b/tools/testing/selftests/riscv/vector/Makefile @@ -2,7 +2,7 @@ # Copyright (C) 2021 ARM Limited # Originally tools/testing/arm64/abi/Makefile -TEST_GEN_PROGS := v_initval vstate_prctl +TEST_GEN_PROGS := v_initval vstate_prctl vstate_ptrace TEST_GEN_PROGS_EXTENDED := vstate_exec_nolibc v_exec_initval_nolibc include ../../lib.mk @@ -26,3 +26,6 @@ $(OUTPUT)/v_initval: v_initval.c $(OUTPUT)/sys_hwprobe.o $(OUTPUT)/v_helpers.o $(OUTPUT)/v_exec_initval_nolibc: v_exec_initval_nolibc.c $(CC) -nostdlib -static -include ../../../../include/nolibc/nolibc.h \ -Wall $(CFLAGS) $(LDFLAGS) $^ -o $@ -lgcc + +$(OUTPUT)/vstate_ptrace: vstate_ptrace.c $(OUTPUT)/sys_hwprobe.o $(OUTPUT)/v_helpers.o + $(CC) -static -o$@ $(CFLAGS) $(LDFLAGS) $^ diff --git a/tools/testing/selftests/riscv/vector/vstate_ptrace.c b/tools/testing/selftests/riscv/vector/vstate_ptrace.c new file mode 100644 index 000000000000..1479abc0c9cb --- /dev/null +++ b/tools/testing/selftests/riscv/vector/vstate_ptrace.c @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <stdio.h> +#include <stdlib.h> +#include <asm/ptrace.h> +#include <linux/elf.h> +#include <sys/ptrace.h> +#include <sys/uio.h> +#include <sys/wait.h> +#include "../../kselftest.h" +#include "v_helpers.h" + +int parent_set_val, child_set_val; + +static long do_ptrace(enum __ptrace_request op, pid_t pid, long 
type, size_t size, void *data) +{ + struct iovec v_iovec = { + .iov_len = size, + .iov_base = data + }; + + return ptrace(op, pid, type, &v_iovec); +} + +static int do_child(void) +{ + int out; + + if (ptrace(PTRACE_TRACEME, -1, NULL, NULL)) { + ksft_perror("PTRACE_TRACEME failed\n"); + return EXIT_FAILURE; + } + + asm volatile (".option push\n\t" + ".option arch, +v\n\t" + ".option norvc\n\t" + "vsetivli x0, 1, e32, m1, ta, ma\n\t" + "vmv.s.x v31, %[in]\n\t" + "ebreak\n\t" + "vmv.x.s %[out], v31\n\t" + ".option pop\n\t" + : [out] "=r" (out) + : [in] "r" (child_set_val)); + + if (out != parent_set_val) + return EXIT_FAILURE; + + return EXIT_SUCCESS; +} + +static void do_parent(pid_t child) +{ + int status; + void *data = NULL; + + /* Attach to the child */ + while (waitpid(child, &status, 0)) { + if (WIFEXITED(status)) { + ksft_test_result(WEXITSTATUS(status) == 0, "SETREGSET vector\n"); + goto out; + } else if (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGTRAP)) { + size_t size; + void *data, *v31; + struct __riscv_v_regset_state *v_regset_hdr; + struct user_regs_struct *gpreg; + + size = sizeof(*v_regset_hdr); + data = malloc(size); + if (!data) + goto out; + v_regset_hdr = (struct __riscv_v_regset_state *)data; + + if (do_ptrace(PTRACE_GETREGSET, child, NT_RISCV_VECTOR, size, data)) + goto out; + + ksft_print_msg("vlenb %ld\n", v_regset_hdr->vlenb); + data = realloc(data, size + v_regset_hdr->vlenb * 32); + if (!data) + goto out; + v_regset_hdr = (struct __riscv_v_regset_state *)data; + v31 = (void *)(data + size + v_regset_hdr->vlenb * 31); + size += v_regset_hdr->vlenb * 32; + + if (do_ptrace(PTRACE_GETREGSET, child, NT_RISCV_VECTOR, size, data)) + goto out; + + ksft_test_result(*(int *)v31 == child_set_val, "GETREGSET vector\n"); + + *(int *)v31 = parent_set_val; + if (do_ptrace(PTRACE_SETREGSET, child, NT_RISCV_VECTOR, size, data)) + goto out; + + /* move the pc forward */ + size = sizeof(*gpreg); + data = realloc(data, size); + gpreg = (struct 
user_regs_struct *)data; + + if (do_ptrace(PTRACE_GETREGSET, child, NT_PRSTATUS, size, data)) + goto out; + + gpreg->pc += 4; + if (do_ptrace(PTRACE_SETREGSET, child, NT_PRSTATUS, size, data)) + goto out; + } + + ptrace(PTRACE_CONT, child, NULL, NULL); + } + +out: + free(data); +} + +int main(void) +{ + pid_t child; + + ksft_set_plan(2); + if (!is_vector_supported() && !is_xtheadvector_supported()) + ksft_exit_skip("Vector not supported\n"); + + srandom(getpid()); + parent_set_val = rand(); + child_set_val = rand(); + + child = fork(); + if (child < 0) + ksft_exit_fail_msg("Fork failed %d\n", child); + + if (!child) + return do_child(); + + do_parent(child); + + ksft_finished(); +} diff --git a/tools/testing/selftests/rseq/rseq-s390.h b/tools/testing/selftests/rseq/rseq-s390.h index 33baaa9f9997..e7b858cd3736 100644 --- a/tools/testing/selftests/rseq/rseq-s390.h +++ b/tools/testing/selftests/rseq/rseq-s390.h @@ -28,8 +28,6 @@ do { \ RSEQ_WRITE_ONCE(*(p), v); \ } while (0) -#ifdef __s390x__ - #define LONG_L "lg" #define LONG_S "stg" #define LONG_LT_R "ltgr" @@ -63,43 +61,6 @@ do { \ ".quad " __rseq_str(start_ip) ", " __rseq_str(exit_ip) "\n\t" \ ".popsection\n\t" -#elif __s390__ - -#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, \ - start_ip, post_commit_offset, abort_ip) \ - ".pushsection __rseq_cs, \"aw\"\n\t" \ - ".balign 32\n\t" \ - __rseq_str(label) ":\n\t" \ - ".long " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \ - ".long 0x0, " __rseq_str(start_ip) ", 0x0, " __rseq_str(post_commit_offset) ", 0x0, " __rseq_str(abort_ip) "\n\t" \ - ".popsection\n\t" \ - ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \ - ".long 0x0, " __rseq_str(label) "b\n\t" \ - ".popsection\n\t" - -/* - * Exit points of a rseq critical section consist of all instructions outside - * of the critical section where a critical section can either branch to or - * reach through the normal course of its execution. 
The abort IP and the - * post-commit IP are already part of the __rseq_cs section and should not be - * explicitly defined as additional exit points. Knowing all exit points is - * useful to assist debuggers stepping over the critical section. - */ -#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \ - ".pushsection __rseq_exit_point_array, \"aw\"\n\t" \ - ".long 0x0, " __rseq_str(start_ip) ", 0x0, " __rseq_str(exit_ip) "\n\t" \ - ".popsection\n\t" - -#define LONG_L "l" -#define LONG_S "st" -#define LONG_LT_R "ltr" -#define LONG_CMP "c" -#define LONG_CMP_R "cr" -#define LONG_ADDI "ahi" -#define LONG_ADD_R "ar" - -#endif - #define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip) \ __RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip, \ (post_commit_ip - start_ip), abort_ip) diff --git a/tools/testing/selftests/run_kselftest.sh b/tools/testing/selftests/run_kselftest.sh index 0443beacf362..d4be97498b32 100755 --- a/tools/testing/selftests/run_kselftest.sh +++ b/tools/testing/selftests/run_kselftest.sh @@ -33,6 +33,7 @@ Usage: $0 [OPTIONS] -c | --collection COLLECTION Run all tests from COLLECTION -l | --list List the available collection:test entries -d | --dry-run Don't actually run any tests + -f | --no-error-on-fail Don't exit with an error just because tests failed -n | --netns Run each test in namespace -h | --help Show this usage info -o | --override-timeout Number of seconds after which we timeout @@ -44,6 +45,7 @@ COLLECTIONS="" TESTS="" dryrun="" kselftest_override_timeout="" +ERROR_ON_FAIL=true while true; do case "$1" in -s | --summary) @@ -65,6 +67,9 @@ while true; do -d | --dry-run) dryrun="echo" shift ;; + -f | --no-error-on-fail) + ERROR_ON_FAIL=false + shift ;; -n | --netns) RUN_IN_NETNS=1 shift ;; @@ -105,9 +110,18 @@ if [ -n "$TESTS" ]; then available="$(echo "$valid" | sed -e 's/ /\n/g')" fi +kselftest_failures_file="$(mktemp --tmpdir kselftest-failures-XXXXXX)" +export kselftest_failures_file + collections=$(echo "$available" | 
cut -d: -f1 | sort | uniq) for collection in $collections ; do [ -w /dev/kmsg ] && echo "kselftest: Running tests in $collection" >> /dev/kmsg tests=$(echo "$available" | grep "^$collection:" | cut -d: -f2) ($dryrun cd "$collection" && $dryrun run_many $tests) done + +failures="$(cat "$kselftest_failures_file")" +rm "$kselftest_failures_file" +if "$ERROR_ON_FAIL" && [ "$failures" ]; then + exit 1 +fi diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile index 9d9d6b4c38b0..5fe45f9c5f8f 100644 --- a/tools/testing/selftests/sched_ext/Makefile +++ b/tools/testing/selftests/sched_ext/Makefile @@ -174,6 +174,7 @@ auto-test-targets := \ minimal \ numa \ allowed_cpus \ + peek_dsq \ prog_run \ reload_loop \ select_cpu_dfl \ diff --git a/tools/testing/selftests/sched_ext/peek_dsq.bpf.c b/tools/testing/selftests/sched_ext/peek_dsq.bpf.c new file mode 100644 index 000000000000..a3faf5bb49d6 --- /dev/null +++ b/tools/testing/selftests/sched_ext/peek_dsq.bpf.c @@ -0,0 +1,251 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A BPF program for testing DSQ operations and peek in particular. + * + * Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2025 Ryan Newton <ryan.newton@alum.mit.edu> + */ + +#include <scx/common.bpf.h> +#include <scx/compat.bpf.h> + +char _license[] SEC("license") = "GPL"; + +UEI_DEFINE(uei); /* Error handling */ + +#define MAX_SAMPLES 100 +#define MAX_CPUS 512 +#define DSQ_POOL_SIZE 8 +int max_samples = MAX_SAMPLES; +int max_cpus = MAX_CPUS; +int dsq_pool_size = DSQ_POOL_SIZE; + +/* Global variables to store test results */ +int dsq_peek_result1 = -1; +long dsq_inserted_pid = -1; +int insert_test_cpu = -1; /* Set to the cpu that performs the test */ +long dsq_peek_result2 = -1; +long dsq_peek_result2_pid = -1; +long dsq_peek_result2_expected = -1; +int test_dsq_id = 1234; /* Use a simple ID like create_dsq example */ +int real_dsq_id = 1235; /* DSQ for normal operation */ +int enqueue_count = -1; +int dispatch_count = -1; +bool debug_ksym_exists; + +/* DSQ pool for stress testing */ +int dsq_pool_base_id = 2000; +int phase1_complete = -1; +long total_peek_attempts = -1; +long successful_peeks = -1; + +/* BPF map for sharing peek results with userspace */ +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, MAX_SAMPLES); + __type(key, u32); + __type(value, long); +} peek_results SEC(".maps"); + +static int get_random_dsq_id(void) +{ + u64 time = bpf_ktime_get_ns(); + + return dsq_pool_base_id + (time % DSQ_POOL_SIZE); +} + +static void record_peek_result(long pid) +{ + u32 slot_key; + long *slot_pid_ptr; + int ix; + + if (pid <= 0) + return; + + /* Find an empty slot or one with the same PID */ + bpf_for(ix, 0, 10) { + slot_key = (pid + ix) % MAX_SAMPLES; + slot_pid_ptr = bpf_map_lookup_elem(&peek_results, &slot_key); + if (!slot_pid_ptr) + continue; + + if (*slot_pid_ptr == -1 || *slot_pid_ptr == pid) { + *slot_pid_ptr = pid; + break; + } + } +} + +/* Scan all DSQs in the pool and try to move a task to local */ +static int scan_dsq_pool(void) +{ + struct task_struct *task; + int moved = 0; + int i; + + bpf_for(i, 0, DSQ_POOL_SIZE) { + int dsq_id = 
dsq_pool_base_id + i; + + total_peek_attempts++; + + task = __COMPAT_scx_bpf_dsq_peek(dsq_id); + if (task) { + successful_peeks++; + record_peek_result(task->pid); + + /* Try to move this task to local */ + if (!moved && scx_bpf_dsq_move_to_local(dsq_id) == 0) { + moved = 1; + break; + } + } + } + return moved; +} + +/* Struct_ops scheduler for testing DSQ peek operations */ +void BPF_STRUCT_OPS(peek_dsq_enqueue, struct task_struct *p, u64 enq_flags) +{ + struct task_struct *peek_result; + int last_insert_test_cpu, cpu; + + enqueue_count++; + cpu = bpf_get_smp_processor_id(); + last_insert_test_cpu = __sync_val_compare_and_swap(&insert_test_cpu, -1, cpu); + + /* Phase 1: Simple insert-then-peek test (only on first task) */ + if (last_insert_test_cpu == -1) { + bpf_printk("peek_dsq_enqueue beginning phase 1 peek test on cpu %d", cpu); + + /* Test 1: Peek empty DSQ - should return NULL */ + peek_result = __COMPAT_scx_bpf_dsq_peek(test_dsq_id); + dsq_peek_result1 = (long)peek_result; /* Should be 0 (NULL) */ + + /* Test 2: Insert task into test DSQ for testing in dispatch callback */ + dsq_inserted_pid = p->pid; + scx_bpf_dsq_insert(p, test_dsq_id, 0, enq_flags); + dsq_peek_result2_expected = (long)p; /* Expected the task we just inserted */ + } else if (!phase1_complete) { + /* Still in phase 1, use real DSQ */ + scx_bpf_dsq_insert(p, real_dsq_id, 0, enq_flags); + } else { + /* Phase 2: Random DSQ insertion for stress testing */ + int random_dsq_id = get_random_dsq_id(); + + scx_bpf_dsq_insert(p, random_dsq_id, 0, enq_flags); + } +} + +void BPF_STRUCT_OPS(peek_dsq_dispatch, s32 cpu, struct task_struct *prev) +{ + dispatch_count++; + + /* Phase 1: Complete the simple peek test if we inserted a task but + * haven't tested peek yet + */ + if (insert_test_cpu == cpu && dsq_peek_result2 == -1) { + struct task_struct *peek_result; + + bpf_printk("peek_dsq_dispatch completing phase 1 peek test on cpu %d", cpu); + + /* Test 3: Peek DSQ after insert - should return the task 
we inserted */ + peek_result = __COMPAT_scx_bpf_dsq_peek(test_dsq_id); + /* Store the PID of the peeked task for comparison */ + dsq_peek_result2 = (long)peek_result; + dsq_peek_result2_pid = peek_result ? peek_result->pid : -1; + + /* Now consume the task since we've peeked at it */ + scx_bpf_dsq_move_to_local(test_dsq_id); + + /* Mark phase 1 as complete */ + phase1_complete = 1; + bpf_printk("Phase 1 complete, starting phase 2 stress testing"); + } else if (!phase1_complete) { + /* Still in phase 1, use real DSQ */ + scx_bpf_dsq_move_to_local(real_dsq_id); + } else { + /* Phase 2: Scan all DSQs in the pool and try to move a task */ + if (!scan_dsq_pool()) { + /* No tasks found in DSQ pool, fall back to real DSQ */ + scx_bpf_dsq_move_to_local(real_dsq_id); + } + } +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(peek_dsq_init) +{ + s32 err; + int i; + + /* Always set debug values so we can see which version we're using */ + debug_ksym_exists = bpf_ksym_exists(scx_bpf_dsq_peek) ? 1 : 0; + + /* Initialize state first */ + insert_test_cpu = -1; + enqueue_count = 0; + dispatch_count = 0; + phase1_complete = 0; + total_peek_attempts = 0; + successful_peeks = 0; + + /* Create the test and real DSQs */ + err = scx_bpf_create_dsq(test_dsq_id, -1); + if (err) { + scx_bpf_error("Failed to create DSQ %d: %d", test_dsq_id, err); + return err; + } + err = scx_bpf_create_dsq(real_dsq_id, -1); + if (err) { + scx_bpf_error("Failed to create DSQ %d: %d", test_dsq_id, err); + return err; + } + + /* Create the DSQ pool for stress testing */ + bpf_for(i, 0, DSQ_POOL_SIZE) { + int dsq_id = dsq_pool_base_id + i; + + err = scx_bpf_create_dsq(dsq_id, -1); + if (err) { + scx_bpf_error("Failed to create DSQ pool entry %d: %d", dsq_id, err); + return err; + } + } + + /* Initialize the peek results map */ + bpf_for(i, 0, MAX_SAMPLES) { + u32 key = i; + long pid = -1; + + bpf_map_update_elem(&peek_results, &key, &pid, BPF_ANY); + } + + return 0; +} + +void BPF_STRUCT_OPS(peek_dsq_exit, struct 
scx_exit_info *ei) +{ + int i; + + /* Destroy the primary DSQs */ + scx_bpf_destroy_dsq(test_dsq_id); + scx_bpf_destroy_dsq(real_dsq_id); + + /* Destroy the DSQ pool */ + bpf_for(i, 0, DSQ_POOL_SIZE) { + int dsq_id = dsq_pool_base_id + i; + + scx_bpf_destroy_dsq(dsq_id); + } + + UEI_RECORD(uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops peek_dsq_ops = { + .enqueue = (void *)peek_dsq_enqueue, + .dispatch = (void *)peek_dsq_dispatch, + .init = (void *)peek_dsq_init, + .exit = (void *)peek_dsq_exit, + .name = "peek_dsq", +}; diff --git a/tools/testing/selftests/sched_ext/peek_dsq.c b/tools/testing/selftests/sched_ext/peek_dsq.c new file mode 100644 index 000000000000..a717384a3224 --- /dev/null +++ b/tools/testing/selftests/sched_ext/peek_dsq.c @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test for DSQ operations including create, destroy, and peek operations. + * + * Copyright (c) 2025 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2025 Ryan Newton <ryan.newton@alum.mit.edu> + */ +#include <bpf/bpf.h> +#include <scx/common.h> +#include <sys/wait.h> +#include <unistd.h> +#include <pthread.h> +#include <string.h> +#include <sched.h> +#include "peek_dsq.bpf.skel.h" +#include "scx_test.h" + +#define NUM_WORKERS 4 + +static bool workload_running = true; +static pthread_t workload_threads[NUM_WORKERS]; + +/** + * Background workload thread that sleeps and wakes rapidly to exercise + * the scheduler's enqueue operations and ensure DSQ operations get tested. 
+ */ +static void *workload_thread_fn(void *arg) +{ + while (workload_running) { + /* Sleep for a very short time to trigger scheduler activity */ + usleep(1000); /* 1ms sleep */ + /* Yield to ensure we go through the scheduler */ + sched_yield(); + } + return NULL; +} + +static enum scx_test_status setup(void **ctx) +{ + struct peek_dsq *skel; + + skel = peek_dsq__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(peek_dsq__load(skel), "Failed to load skel"); + + *ctx = skel; + + return SCX_TEST_PASS; +} + +static int print_observed_pids(struct bpf_map *map, int max_samples, const char *dsq_name) +{ + long count = 0; + + printf("Observed %s DSQ peek pids:\n", dsq_name); + for (int i = 0; i < max_samples; i++) { + long pid; + int err; + + err = bpf_map_lookup_elem(bpf_map__fd(map), &i, &pid); + if (err == 0) { + if (pid == 0) { + printf(" Sample %d: NULL peek\n", i); + } else if (pid > 0) { + printf(" Sample %d: pid %ld\n", i, pid); + count++; + } + } else { + printf(" Sample %d: error reading pid (err=%d)\n", i, err); + } + } + printf("Observed ~%ld pids in the %s DSQ(s)\n", count, dsq_name); + return count; +} + +static enum scx_test_status run(void *ctx) +{ + struct peek_dsq *skel = ctx; + bool failed = false; + int seconds = 3; + int err; + + /* Enable the scheduler to test DSQ operations */ + printf("Enabling scheduler to test DSQ insert operations...\n"); + + struct bpf_link *link = + bpf_map__attach_struct_ops(skel->maps.peek_dsq_ops); + + if (!link) { + SCX_ERR("Failed to attach struct_ops"); + return SCX_TEST_FAIL; + } + + printf("Starting %d background workload threads...\n", NUM_WORKERS); + workload_running = true; + for (int i = 0; i < NUM_WORKERS; i++) { + err = pthread_create(&workload_threads[i], NULL, workload_thread_fn, NULL); + if (err) { + SCX_ERR("Failed to create workload thread %d: %s", i, strerror(err)); + /* Stop already created threads */ + workload_running = false; + for (int j = 0; j < i; j++) + 
pthread_join(workload_threads[j], NULL); + bpf_link__destroy(link); + return SCX_TEST_FAIL; + } + } + + printf("Waiting for enqueue events.\n"); + sleep(seconds); + while (skel->data->enqueue_count <= 0) { + printf("."); + fflush(stdout); + sleep(1); + seconds++; + if (seconds >= 30) { + printf("\n\u2717 Timeout waiting for enqueue events\n"); + /* Stop workload threads and cleanup */ + workload_running = false; + for (int i = 0; i < NUM_WORKERS; i++) + pthread_join(workload_threads[i], NULL); + bpf_link__destroy(link); + return SCX_TEST_FAIL; + } + } + + workload_running = false; + for (int i = 0; i < NUM_WORKERS; i++) { + err = pthread_join(workload_threads[i], NULL); + if (err) { + SCX_ERR("Failed to join workload thread %d: %s", i, strerror(err)); + bpf_link__destroy(link); + return SCX_TEST_FAIL; + } + } + printf("Background workload threads stopped.\n"); + + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_NONE)); + + /* Detach the scheduler */ + bpf_link__destroy(link); + + printf("Enqueue/dispatch count over %d seconds: %d / %d\n", seconds, + skel->data->enqueue_count, skel->data->dispatch_count); + printf("Debug: ksym_exists=%d\n", + skel->bss->debug_ksym_exists); + + /* Check DSQ insert result */ + printf("DSQ insert test done on cpu: %d\n", skel->data->insert_test_cpu); + if (skel->data->insert_test_cpu != -1) + printf("\u2713 DSQ insert succeeded !\n"); + else { + printf("\u2717 DSQ insert failed or not attempted\n"); + failed = true; + } + + /* Check DSQ peek results */ + printf(" DSQ peek result 1 (before insert): %d\n", + skel->data->dsq_peek_result1); + if (skel->data->dsq_peek_result1 == 0) + printf("\u2713 DSQ peek verification success: peek returned NULL!\n"); + else { + printf("\u2717 DSQ peek verification failed\n"); + failed = true; + } + + printf(" DSQ peek result 2 (after insert): %ld\n", + skel->data->dsq_peek_result2); + printf(" DSQ peek result 2, expected: %ld\n", + skel->data->dsq_peek_result2_expected); + if 
(skel->data->dsq_peek_result2 == + skel->data->dsq_peek_result2_expected) + printf("\u2713 DSQ peek verification success: peek returned the inserted task!\n"); + else { + printf("\u2717 DSQ peek verification failed\n"); + failed = true; + } + + printf(" Inserted test task -> pid: %ld\n", skel->data->dsq_inserted_pid); + printf(" DSQ peek result 2 -> pid: %ld\n", skel->data->dsq_peek_result2_pid); + + int pid_count; + + pid_count = print_observed_pids(skel->maps.peek_results, + skel->data->max_samples, "DSQ pool"); + printf("Total non-null peek observations: %ld out of %ld\n", + skel->data->successful_peeks, skel->data->total_peek_attempts); + + if (skel->bss->debug_ksym_exists && pid_count == 0) { + printf("\u2717 DSQ pool test failed: no successful peeks in native mode\n"); + failed = true; + } + if (skel->bss->debug_ksym_exists && pid_count > 0) + printf("\u2713 DSQ pool test success: observed successful peeks in native mode\n"); + + if (failed) + return SCX_TEST_FAIL; + else + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct peek_dsq *skel = ctx; + + if (workload_running) { + workload_running = false; + for (int i = 0; i < NUM_WORKERS; i++) + pthread_join(workload_threads[i], NULL); + } + + peek_dsq__destroy(skel); +} + +struct scx_test peek_dsq = { + .name = "peek_dsq", + .description = + "Test DSQ create/destroy operations and future peek functionality", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&peek_dsq) diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json index 0091bcd91c2c..47de27fd4f90 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json +++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json @@ -1005,5 +1005,33 @@ "teardown": [ "$TC qdisc del dev $DUMMY clsact" ] + }, + { + "id": "4366", + "name": "CAKE with QFQ Parent - CAKE enqueue with packets dropping", + "category": [ + 
"qdisc", + "cake", + "netem" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup":[ + "$TC qdisc add dev $DUMMY handle 1: root qfq", + "$TC class add dev $DUMMY parent 1: classid 1:1 qfq maxpkt 1024", + "$TC qdisc add dev $DUMMY parent 1:1 handle 2: cake memlimit 9", + "$TC filter add dev $DUMMY protocol ip parent 1: prio 1 u32 match ip protocol 1 0xff flowid 1:1", + "ping -I$DUMMY -f -c1 -s64 -W1 10.10.10.1 || true", + "$TC qdisc replace dev $DUMMY parent 1:1 handle 3: netem delay 0ms" + ], + "cmdUnderTest": "ping -I$DUMMY -f -c1 -s64 -W1 10.10.10.1 || true", + "expExitCode": "0", + "verifyCmd": "$TC -s qdisc show dev $DUMMY", + "matchPattern": "qdisc qfq 1:", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root" + ] } ] diff --git a/tools/testing/selftests/timers/nanosleep.c b/tools/testing/selftests/timers/nanosleep.c index 252c6308c569..10badae13ebe 100644 --- a/tools/testing/selftests/timers/nanosleep.c +++ b/tools/testing/selftests/timers/nanosleep.c @@ -116,6 +116,56 @@ int nanosleep_test(int clockid, long long ns) return 0; } +static void dummy_event_handler(int val) +{ + /* No action needed */ +} + +static int nanosleep_test_remaining(int clockid) +{ + struct timespec rqtp = {}, rmtp = {}; + struct itimerspec itimer = {}; + struct sigaction sa = {}; + timer_t timer; + int ret; + + sa.sa_handler = dummy_event_handler; + ret = sigaction(SIGALRM, &sa, NULL); + if (ret) + return -1; + + ret = timer_create(clockid, NULL, &timer); + if (ret) + return -1; + + itimer.it_value.tv_nsec = NSEC_PER_SEC / 4; + ret = timer_settime(timer, 0, &itimer, NULL); + if (ret) + return -1; + + rqtp.tv_nsec = NSEC_PER_SEC / 2; + ret = clock_nanosleep(clockid, 0, &rqtp, &rmtp); + if (ret != EINTR) + return -1; + + ret = timer_delete(timer); + if (ret) + return -1; + + sa.sa_handler = SIG_DFL; + ret = sigaction(SIGALRM, &sa, NULL); + if (ret) + return -1; + + if (!in_order((struct timespec) {}, rmtp)) + return -1; + + if (!in_order(rmtp, rqtp)) + 
return -1; + + return 0; +} + int main(int argc, char **argv) { long long length; @@ -150,6 +200,11 @@ int main(int argc, char **argv) } length *= 100; } + ret = nanosleep_test_remaining(clockid); + if (ret < 0) { + ksft_test_result_fail("%-31s\n", clockstring(clockid)); + ksft_exit_fail(); + } ksft_test_result_pass("%-31s\n", clockstring(clockid)); next: ret = 0; diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index f0eceb0faf34..a563c438ac79 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -18,6 +18,7 @@ #include <time.h> #include <include/vdso/time64.h> #include <pthread.h> +#include <stdbool.h> #include "../kselftest.h" @@ -670,8 +671,14 @@ static void check_timer_create_exact(void) int main(int argc, char **argv) { + bool run_sig_ign_tests = ksft_min_kernel_version(6, 13); + ksft_print_header(); - ksft_set_plan(19); + if (run_sig_ign_tests) { + ksft_set_plan(19); + } else { + ksft_set_plan(10); + } ksft_print_msg("Testing posix timers. 
False negative may happen on CPU execution \n"); ksft_print_msg("based timers if other threads run on the CPU...\n"); @@ -695,15 +702,20 @@ int main(int argc, char **argv) check_timer_create(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); check_timer_distribution(); - check_sig_ign(0); - check_sig_ign(1); - check_rearm(); - check_delete(); - check_sigev_none(CLOCK_MONOTONIC, "CLOCK_MONOTONIC"); - check_sigev_none(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); - check_gettime(CLOCK_MONOTONIC, "CLOCK_MONOTONIC"); - check_gettime(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); - check_gettime(CLOCK_THREAD_CPUTIME_ID, "CLOCK_THREAD_CPUTIME_ID"); + if (run_sig_ign_tests) { + check_sig_ign(0); + check_sig_ign(1); + check_rearm(); + check_delete(); + check_sigev_none(CLOCK_MONOTONIC, "CLOCK_MONOTONIC"); + check_sigev_none(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); + check_gettime(CLOCK_MONOTONIC, "CLOCK_MONOTONIC"); + check_gettime(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); + check_gettime(CLOCK_THREAD_CPUTIME_ID, "CLOCK_THREAD_CPUTIME_ID"); + } else { + ksft_print_msg("Skipping SIG_IGN tests on kernel < 6.13\n"); + } + check_overrun(CLOCK_MONOTONIC, "CLOCK_MONOTONIC"); check_overrun(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); check_overrun(CLOCK_THREAD_CPUTIME_ID, "CLOCK_THREAD_CPUTIME_ID"); diff --git a/tools/testing/selftests/tpm2/tpm2.py b/tools/testing/selftests/tpm2/tpm2.py index bba8cb54548e..3d130c30bc7c 100644 --- a/tools/testing/selftests/tpm2/tpm2.py +++ b/tools/testing/selftests/tpm2/tpm2.py @@ -437,7 +437,7 @@ class Client: def extend_pcr(self, i, dig, bank_alg = TPM2_ALG_SHA1): ds = get_digest_size(bank_alg) - assert(ds == len(dig)) + assert ds == len(dig) auth_cmd = AuthCommand() @@ -589,7 +589,7 @@ class Client: def seal(self, parent_key, data, auth_value, policy_dig, name_alg = TPM2_ALG_SHA1): ds = get_digest_size(name_alg) - assert(not policy_dig or ds == len(policy_dig)) + assert not policy_dig or 
ds == len(policy_dig) attributes = 0 if not policy_dig: diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 6b8123c12a7a..f8fa102a627f 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -836,56 +836,70 @@ static int ublk_process_io(struct ublk_thread *t) return reapped; } -static void ublk_thread_set_sched_affinity(const struct ublk_thread *t, - cpu_set_t *cpuset) -{ - if (sched_setaffinity(0, sizeof(*cpuset), cpuset) < 0) - ublk_err("ublk dev %u thread %u set affinity failed", - t->dev->dev_info.dev_id, t->idx); -} - struct ublk_thread_info { struct ublk_dev *dev; + pthread_t thread; unsigned idx; sem_t *ready; cpu_set_t *affinity; unsigned long long extra_flags; }; -static void *ublk_io_handler_fn(void *data) +static void ublk_thread_set_sched_affinity(const struct ublk_thread_info *info) { - struct ublk_thread_info *info = data; - struct ublk_thread *t = &info->dev->threads[info->idx]; + if (pthread_setaffinity_np(pthread_self(), sizeof(*info->affinity), info->affinity) < 0) + ublk_err("ublk dev %u thread %u set affinity failed", + info->dev->dev_info.dev_id, info->idx); +} + +static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_info *info) +{ + struct ublk_thread t = { + .dev = info->dev, + .idx = info->idx, + }; int dev_id = info->dev->dev_info.dev_id; int ret; - t->dev = info->dev; - t->idx = info->idx; - - ret = ublk_thread_init(t, info->extra_flags); + ret = ublk_thread_init(&t, info->extra_flags); if (ret) { ublk_err("ublk dev %d thread %u init failed\n", - dev_id, t->idx); - return NULL; + dev_id, t.idx); + return ret; } - /* IO perf is sensitive with queue pthread affinity on NUMA machine*/ - if (info->affinity) - ublk_thread_set_sched_affinity(t, info->affinity); sem_post(info->ready); ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %u started\n", - gettid(), dev_id, t->idx); + gettid(), dev_id, t.idx); /* submit all io commands to ublk 
driver */ - ublk_submit_fetch_commands(t); + ublk_submit_fetch_commands(&t); do { - if (ublk_process_io(t) < 0) + if (ublk_process_io(&t) < 0) break; } while (1); ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %d exiting\n", - gettid(), dev_id, t->idx); - ublk_thread_deinit(t); + gettid(), dev_id, t.idx); + ublk_thread_deinit(&t); + return 0; +} + +static void *ublk_io_handler_fn(void *data) +{ + struct ublk_thread_info *info = data; + + /* + * IO perf is sensitive with queue pthread affinity on NUMA machine + * + * Set sched_affinity at beginning, so following allocated memory/pages + * could be CPU/NUMA aware. + */ + if (info->affinity) + ublk_thread_set_sched_affinity(info); + + __ublk_io_handler_fn(info); + return NULL; } @@ -983,14 +997,13 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) */ if (dev->nthreads == dinfo->nr_hw_queues) tinfo[i].affinity = &affinity_buf[i]; - pthread_create(&dev->threads[i].thread, NULL, + pthread_create(&tinfo[i].thread, NULL, ublk_io_handler_fn, &tinfo[i]); } for (i = 0; i < dev->nthreads; i++) sem_wait(&ready); - free(tinfo); free(affinity_buf); /* everything is fine now, start us */ @@ -1013,7 +1026,8 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) /* wait until we are terminated */ for (i = 0; i < dev->nthreads; i++) - pthread_join(dev->threads[i].thread, &thread_ret); + pthread_join(tinfo[i].thread, &thread_ret); + free(tinfo); fail: for (i = 0; i < dinfo->nr_hw_queues; i++) ublk_queue_deinit(&dev->q[i]); diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 5e55484fb0aa..fe42705c6d42 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -175,23 +175,20 @@ struct ublk_queue { struct ublk_thread { struct ublk_dev *dev; - struct io_uring ring; - unsigned int cmd_inflight; - unsigned int io_inflight; - - pthread_t thread; unsigned idx; #define UBLKS_T_STOPPING (1U << 0) 
#define UBLKS_T_IDLE (1U << 1) unsigned state; + unsigned int cmd_inflight; + unsigned int io_inflight; + struct io_uring ring; }; struct ublk_dev { struct ublk_tgt tgt; struct ublksrv_ctrl_dev_info dev_info; struct ublk_queue q[UBLK_MAX_QUEUES]; - struct ublk_thread threads[UBLK_MAX_THREADS]; unsigned nthreads; unsigned per_io_tasks; diff --git a/tools/testing/selftests/vDSO/vdso_config.h b/tools/testing/selftests/vDSO/vdso_config.h index 5fdd0f362337..50c261005111 100644 --- a/tools/testing/selftests/vDSO/vdso_config.h +++ b/tools/testing/selftests/vDSO/vdso_config.h @@ -25,10 +25,6 @@ #define VDSO_VERSION 1 #define VDSO_NAMES 0 #define VDSO_32BIT 1 -#elif defined (__s390__) && !defined(__s390x__) -#define VDSO_VERSION 2 -#define VDSO_NAMES 0 -#define VDSO_32BIT 1 #elif defined (__s390x__) #define VDSO_VERSION 2 #define VDSO_NAMES 0 diff --git a/tools/bpf/runqslower/.gitignore b/tools/testing/selftests/verification/.gitignore index ffdb70230c8b..2659417cb2c7 100644 --- a/tools/bpf/runqslower/.gitignore +++ b/tools/testing/selftests/verification/.gitignore @@ -1,2 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -/.output +logs diff --git a/tools/testing/selftests/verification/Makefile b/tools/testing/selftests/verification/Makefile new file mode 100644 index 000000000000..aa8790c22a71 --- /dev/null +++ b/tools/testing/selftests/verification/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0 +all: + +TEST_PROGS := verificationtest-ktap +TEST_FILES := test.d settings +EXTRA_CLEAN := $(OUTPUT)/logs/* + +include ../lib.mk diff --git a/tools/testing/selftests/verification/config b/tools/testing/selftests/verification/config new file mode 100644 index 000000000000..43072c1c38f4 --- /dev/null +++ b/tools/testing/selftests/verification/config @@ -0,0 +1 @@ +CONFIG_RV=y diff --git a/tools/testing/selftests/verification/settings b/tools/testing/selftests/verification/settings new file mode 100644 index 000000000000..e7b9417537fb --- /dev/null +++ 
b/tools/testing/selftests/verification/settings @@ -0,0 +1 @@ +timeout=0 diff --git a/tools/testing/selftests/verification/test.d/functions b/tools/testing/selftests/verification/test.d/functions new file mode 100644 index 000000000000..ec36a092f56e --- /dev/null +++ b/tools/testing/selftests/verification/test.d/functions @@ -0,0 +1,39 @@ +check_requires() { # Check required files, monitors and reactors + for i in "$@" ; do + p=${i%:program} + m=${i%:monitor} + r=${i%:reactor} + if [ $p != $i ]; then + if ! which $p ; then + echo "Required program $p is not found." + exit_unresolved + fi + elif [ $m != $i ]; then + if ! grep -wq $m available_monitors ; then + echo "Required monitor $m is not configured." + exit_unsupported + fi + elif [ $r != $i ]; then + if ! grep -wq $r available_reactors ; then + echo "Required reactor $r is not configured." + exit_unsupported + fi + elif [ ! -e $i ]; then + echo "Required feature interface $i doesn't exist." + exit_unsupported + fi + done +} + +initialize_system() { # Reset RV to initial-state + echo > enabled_monitors + for m in monitors/*; do + echo nop > $m/reactors || true + done + echo 1 > monitoring_on + echo 1 > reacting_on || true +} + +finish_system() { + initialize_system +} diff --git a/tools/testing/selftests/verification/test.d/rv_monitor_enable_disable.tc b/tools/testing/selftests/verification/test.d/rv_monitor_enable_disable.tc new file mode 100644 index 000000000000..f29236defb5a --- /dev/null +++ b/tools/testing/selftests/verification/test.d/rv_monitor_enable_disable.tc @@ -0,0 +1,75 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0-or-later +# description: Test monitor enable/disable + +test_simple_monitor() { + local monitor="$1" + local prefix="$2" # nested monitors + + echo 1 > "monitors/$prefix$monitor/enable" + grep -q "$monitor$" enabled_monitors + + echo 0 > "monitors/$prefix$monitor/enable" + ! 
grep -q "$monitor$" enabled_monitors + + echo "$monitor" >> enabled_monitors + grep -q 1 "monitors/$prefix$monitor/enable" + + echo "!$monitor" >> enabled_monitors + grep -q 0 "monitors/$prefix$monitor/enable" +} + +test_container_monitor() { + local monitor="$1" + local nested + + echo 1 > "monitors/$monitor/enable" + grep -q "^$monitor$" enabled_monitors + + for nested_dir in "monitors/$monitor"/*; do + [ -d "$nested_dir" ] || continue + nested=$(basename "$nested_dir") + grep -q "^$monitor:$nested$" enabled_monitors + done + test -n "$nested" + + echo 0 > "monitors/$monitor/enable" + ! grep -q "^$monitor$" enabled_monitors + + for nested_dir in "monitors/$monitor"/*; do + [ -d "$nested_dir" ] || continue + nested=$(basename "$nested_dir") + ! grep -q "^$monitor:$nested$" enabled_monitors + done + + echo "$monitor" >> enabled_monitors + grep -q 1 "monitors/$monitor/enable" + + for nested_dir in "monitors/$monitor"/*; do + [ -d "$nested_dir" ] || continue + nested=$(basename "$nested_dir") + grep -q "^$monitor:$nested$" enabled_monitors + done + + echo "!$monitor" >> enabled_monitors + grep -q 0 "monitors/$monitor/enable" + + for nested_dir in "monitors/$monitor"/*; do + [ -d "$nested_dir" ] || continue + nested=$(basename "$nested_dir") + test_simple_monitor "$nested" "$monitor/" + done +} + +for monitor_dir in monitors/*; do + monitor=$(basename "$monitor_dir") + + if find "$monitor_dir" -mindepth 1 -type d | grep -q .; then + test_container_monitor "$monitor" + else + test_simple_monitor "$monitor" + fi +done + +! echo non_existent_monitor > enabled_monitors +! 
grep -q "^non_existent_monitor$" enabled_monitors diff --git a/tools/testing/selftests/verification/test.d/rv_monitor_reactor.tc b/tools/testing/selftests/verification/test.d/rv_monitor_reactor.tc new file mode 100644 index 000000000000..2958bf849338 --- /dev/null +++ b/tools/testing/selftests/verification/test.d/rv_monitor_reactor.tc @@ -0,0 +1,68 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0-or-later +# description: Test monitor reactor setting +# requires: available_reactors + +test_monitor_reactor() { + local monitor="$1" + local prefix="$2" # nested monitors + + while read -r reactor; do + [ "$reactor" = nop ] && continue + + echo "$reactor" > "monitors/$prefix$monitor/reactors" + grep -q "\\[$reactor\\]" "monitors/$prefix$monitor/reactors" + done < available_reactors + + echo nop > "monitors/$prefix$monitor/reactors" + grep -q "\\[nop\\]" "monitors/$prefix$monitor/reactors" +} + +test_container_monitor() { + local monitor="$1" + local nested + + while read -r reactor; do + [ "$reactor" = nop ] && continue + + echo "$reactor" > "monitors/$monitor/reactors" + grep -q "\\[$reactor\\]" "monitors/$monitor/reactors" + + for nested_dir in "monitors/$monitor"/*; do + [ -d "$nested_dir" ] || continue + nested=$(basename "$nested_dir") + grep -q "\\[$reactor\\]" "monitors/$monitor/$nested/reactors" + done + done < available_reactors + test -n "$nested" + + echo nop > "monitors/$monitor/reactors" + grep -q "\\[nop\\]" "monitors/$monitor/reactors" + + for nested_dir in "monitors/$monitor"/*; do + [ -d "$nested_dir" ] || continue + nested=$(basename "$nested_dir") + grep -q "\\[nop\\]" "monitors/$monitor/$nested/reactors" + done + + for nested_dir in "monitors/$monitor"/*; do + [ -d "$nested_dir" ] || continue + nested=$(basename "$nested_dir") + test_monitor_reactor "$nested" "$monitor/" + done +} + +for monitor_dir in monitors/*; do + monitor=$(basename "$monitor_dir") + + if find "$monitor_dir" -mindepth 1 -type d | grep -q .; then + test_container_monitor 
"$monitor" + else + test_monitor_reactor "$monitor" + fi +done + +monitor=$(ls /sys/kernel/tracing/rv/monitors -1 | head -n 1) +test -f "monitors/$monitor/reactors" +! echo non_existent_reactor > "monitors/$monitor/reactors" +! grep -q "\\[non_existent_reactor\\]" "monitors/$monitor/reactors" diff --git a/tools/testing/selftests/verification/test.d/rv_monitors_available.tc b/tools/testing/selftests/verification/test.d/rv_monitors_available.tc new file mode 100644 index 000000000000..e6a4a1410690 --- /dev/null +++ b/tools/testing/selftests/verification/test.d/rv_monitors_available.tc @@ -0,0 +1,18 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0-or-later +# description: Check available monitors + +for monitor_dir in monitors/*; do + monitor=$(basename "$monitor_dir") + + grep -q "^$monitor$" available_monitors + grep -q . "$monitor_dir"/desc + + for nested_dir in "$monitor_dir"/*; do + [ -d "$nested_dir" ] || continue + nested=$(basename "$nested_dir") + + grep -q "^$monitor:$nested$" available_monitors + grep -q . "$nested_dir"/desc + done +done diff --git a/tools/testing/selftests/verification/test.d/rv_wwnr_printk.tc b/tools/testing/selftests/verification/test.d/rv_wwnr_printk.tc new file mode 100644 index 000000000000..5a59432b1d93 --- /dev/null +++ b/tools/testing/selftests/verification/test.d/rv_wwnr_printk.tc @@ -0,0 +1,30 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0-or-later +# description: Test wwnr monitor with printk reactor +# requires: available_reactors wwnr:monitor printk:reactor stress-ng:program + +load() { # returns true if there was a reaction + local lines_before num + num=$((($(nproc) + 1) / 2)) + lines_before=$(dmesg | wc -l) + stress-ng --cpu-sched "$num" --timer "$num" -t 5 -q + dmesg | tail -n $((lines_before + 1)) | grep -q "rv: monitor wwnr does not allow event" +} + +echo 1 > monitors/wwnr/enable +echo printk > monitors/wwnr/reactors + +load + +echo 0 > monitoring_on +! load +echo 1 > monitoring_on + +load + +echo 0 > reacting_on +! 
load +echo 1 > reacting_on + +echo nop > monitors/wwnr/reactors +echo 0 > monitors/wwnr/enable diff --git a/tools/testing/selftests/verification/verificationtest-ktap b/tools/testing/selftests/verification/verificationtest-ktap new file mode 100755 index 000000000000..18f7fe324e2f --- /dev/null +++ b/tools/testing/selftests/verification/verificationtest-ktap @@ -0,0 +1,8 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0-only +# +# ftracetest-ktap: Wrapper to integrate ftracetest with the kselftest runner +# +# Copyright (C) Arm Ltd., 2023 + +../ftrace/ftracetest -K -v --rv ../verification diff --git a/tools/testing/selftests/vfio/Makefile b/tools/testing/selftests/vfio/Makefile index 324ba0175a33..3c796ca99a50 100644 --- a/tools/testing/selftests/vfio/Makefile +++ b/tools/testing/selftests/vfio/Makefile @@ -2,8 +2,14 @@ CFLAGS = $(KHDR_INCLUDES) TEST_GEN_PROGS += vfio_dma_mapping_test TEST_GEN_PROGS += vfio_iommufd_setup_test TEST_GEN_PROGS += vfio_pci_device_test +TEST_GEN_PROGS += vfio_pci_device_init_perf_test TEST_GEN_PROGS += vfio_pci_driver_test -TEST_PROGS_EXTENDED := run.sh + +TEST_FILES += scripts/cleanup.sh +TEST_FILES += scripts/lib.sh +TEST_FILES += scripts/run.sh +TEST_FILES += scripts/setup.sh + include ../lib.mk include lib/libvfio.mk @@ -11,6 +17,8 @@ CFLAGS += -I$(top_srcdir)/tools/include CFLAGS += -MD CFLAGS += $(EXTRA_CFLAGS) +LDFLAGS += -pthread + $(TEST_GEN_PROGS): %: %.o $(LIBVFIO_O) $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $< $(LIBVFIO_O) $(LDLIBS) -o $@ diff --git a/tools/testing/selftests/vfio/lib/drivers/dsa/dsa.c b/tools/testing/selftests/vfio/lib/drivers/dsa/dsa.c index 0ca2cbc2a316..c75045bcab79 100644 --- a/tools/testing/selftests/vfio/lib/drivers/dsa/dsa.c +++ b/tools/testing/selftests/vfio/lib/drivers/dsa/dsa.c @@ -9,7 +9,7 @@ #include <linux/pci_ids.h> #include <linux/sizes.h> -#include <vfio_util.h> +#include <libvfio.h> #include "registers.h" @@ -70,7 +70,7 @@ static int dsa_probe(struct vfio_pci_device *device) return -EINVAL; 
if (dsa_int_handle_request_required(device)) { - printf("Device requires requesting interrupt handles\n"); + dev_err(device, "Device requires requesting interrupt handles\n"); return -EINVAL; } @@ -91,23 +91,23 @@ static void dsa_check_sw_err(struct vfio_pci_device *device) return; } - fprintf(stderr, "SWERR: 0x%016lx 0x%016lx 0x%016lx 0x%016lx\n", + dev_err(device, "SWERR: 0x%016lx 0x%016lx 0x%016lx 0x%016lx\n", err.bits[0], err.bits[1], err.bits[2], err.bits[3]); - fprintf(stderr, " valid: 0x%x\n", err.valid); - fprintf(stderr, " overflow: 0x%x\n", err.overflow); - fprintf(stderr, " desc_valid: 0x%x\n", err.desc_valid); - fprintf(stderr, " wq_idx_valid: 0x%x\n", err.wq_idx_valid); - fprintf(stderr, " batch: 0x%x\n", err.batch); - fprintf(stderr, " fault_rw: 0x%x\n", err.fault_rw); - fprintf(stderr, " priv: 0x%x\n", err.priv); - fprintf(stderr, " error: 0x%x\n", err.error); - fprintf(stderr, " wq_idx: 0x%x\n", err.wq_idx); - fprintf(stderr, " operation: 0x%x\n", err.operation); - fprintf(stderr, " pasid: 0x%x\n", err.pasid); - fprintf(stderr, " batch_idx: 0x%x\n", err.batch_idx); - fprintf(stderr, " invalid_flags: 0x%x\n", err.invalid_flags); - fprintf(stderr, " fault_addr: 0x%lx\n", err.fault_addr); + dev_err(device, " valid: 0x%x\n", err.valid); + dev_err(device, " overflow: 0x%x\n", err.overflow); + dev_err(device, " desc_valid: 0x%x\n", err.desc_valid); + dev_err(device, " wq_idx_valid: 0x%x\n", err.wq_idx_valid); + dev_err(device, " batch: 0x%x\n", err.batch); + dev_err(device, " fault_rw: 0x%x\n", err.fault_rw); + dev_err(device, " priv: 0x%x\n", err.priv); + dev_err(device, " error: 0x%x\n", err.error); + dev_err(device, " wq_idx: 0x%x\n", err.wq_idx); + dev_err(device, " operation: 0x%x\n", err.operation); + dev_err(device, " pasid: 0x%x\n", err.pasid); + dev_err(device, " batch_idx: 0x%x\n", err.batch_idx); + dev_err(device, " invalid_flags: 0x%x\n", err.invalid_flags); + dev_err(device, " fault_addr: 0x%lx\n", err.fault_addr); VFIO_FAIL("Software Error 
Detected!\n"); } @@ -256,7 +256,7 @@ static int dsa_completion_wait(struct vfio_pci_device *device, if (status == DSA_COMP_SUCCESS) return 0; - printf("Error detected during memcpy operation: 0x%x\n", status); + dev_err(device, "Error detected during memcpy operation: 0x%x\n", status); return -1; } diff --git a/tools/testing/selftests/vfio/lib/drivers/ioat/ioat.c b/tools/testing/selftests/vfio/lib/drivers/ioat/ioat.c index c3b91d9b1f59..a871b935542b 100644 --- a/tools/testing/selftests/vfio/lib/drivers/ioat/ioat.c +++ b/tools/testing/selftests/vfio/lib/drivers/ioat/ioat.c @@ -7,7 +7,7 @@ #include <linux/pci_ids.h> #include <linux/sizes.h> -#include <vfio_util.h> +#include <libvfio.h> #include "hw.h" #include "registers.h" @@ -51,7 +51,7 @@ static int ioat_probe(struct vfio_pci_device *device) r = 0; break; default: - printf("ioat: Unsupported version: 0x%x\n", version); + dev_err(device, "ioat: Unsupported version: 0x%x\n", version); r = -EINVAL; } return r; @@ -135,13 +135,13 @@ static void ioat_handle_error(struct vfio_pci_device *device) { void *registers = ioat_channel_registers(device); - printf("Error detected during memcpy operation!\n" - " CHANERR: 0x%x\n" - " CHANERR_INT: 0x%x\n" - " DMAUNCERRSTS: 0x%x\n", - readl(registers + IOAT_CHANERR_OFFSET), - vfio_pci_config_readl(device, IOAT_PCI_CHANERR_INT_OFFSET), - vfio_pci_config_readl(device, IOAT_PCI_DMAUNCERRSTS_OFFSET)); + dev_err(device, "Error detected during memcpy operation!\n" + " CHANERR: 0x%x\n" + " CHANERR_INT: 0x%x\n" + " DMAUNCERRSTS: 0x%x\n", + readl(registers + IOAT_CHANERR_OFFSET), + vfio_pci_config_readl(device, IOAT_PCI_CHANERR_INT_OFFSET), + vfio_pci_config_readl(device, IOAT_PCI_DMAUNCERRSTS_OFFSET)); ioat_reset(device); } diff --git a/tools/testing/selftests/vfio/lib/include/libvfio.h b/tools/testing/selftests/vfio/lib/include/libvfio.h new file mode 100644 index 000000000000..279ddcd70194 --- /dev/null +++ b/tools/testing/selftests/vfio/lib/include/libvfio.h @@ -0,0 +1,26 @@ +/* 
SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_H +#define SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_H + +#include <libvfio/assert.h> +#include <libvfio/iommu.h> +#include <libvfio/iova_allocator.h> +#include <libvfio/vfio_pci_device.h> +#include <libvfio/vfio_pci_driver.h> + +/* + * Return the BDF string of the device that the test should use. + * + * If a BDF string is provided by the user on the command line (as the last + * element of argv[]), then this function will return that and decrement argc + * by 1. + * + * Otherwise this function will attempt to use the environment variable + * $VFIO_SELFTESTS_BDF. + * + * If BDF cannot be determined then the test will exit with KSFT_SKIP. + */ +const char *vfio_selftests_get_bdf(int *argc, char *argv[]); +char **vfio_selftests_get_bdfs(int *argc, char *argv[], int *nr_bdfs); + +#endif /* SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_H */ diff --git a/tools/testing/selftests/vfio/lib/include/libvfio/assert.h b/tools/testing/selftests/vfio/lib/include/libvfio/assert.h new file mode 100644 index 000000000000..f4ebd122d9b6 --- /dev/null +++ b/tools/testing/selftests/vfio/lib/include/libvfio/assert.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_ASSERT_H +#define SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_ASSERT_H + +#include <stdio.h> +#include <string.h> +#include <sys/ioctl.h> + +#include "../../../../kselftest.h" + +#define VFIO_LOG_AND_EXIT(...) do { \ + fprintf(stderr, " " __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + exit(KSFT_FAIL); \ +} while (0) + +#define VFIO_ASSERT_OP(_lhs, _rhs, _op, ...) 
do { \ + typeof(_lhs) __lhs = (_lhs); \ + typeof(_rhs) __rhs = (_rhs); \ + \ + if (__lhs _op __rhs) \ + break; \ + \ + fprintf(stderr, "%s:%u: Assertion Failure\n\n", __FILE__, __LINE__); \ + fprintf(stderr, " Expression: " #_lhs " " #_op " " #_rhs "\n"); \ + fprintf(stderr, " Observed: %#lx %s %#lx\n", \ + (u64)__lhs, #_op, (u64)__rhs); \ + fprintf(stderr, " [errno: %d - %s]\n", errno, strerror(errno)); \ + VFIO_LOG_AND_EXIT(__VA_ARGS__); \ +} while (0) + +#define VFIO_ASSERT_EQ(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, ==, ##__VA_ARGS__) +#define VFIO_ASSERT_NE(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, !=, ##__VA_ARGS__) +#define VFIO_ASSERT_LT(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, <, ##__VA_ARGS__) +#define VFIO_ASSERT_LE(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, <=, ##__VA_ARGS__) +#define VFIO_ASSERT_GT(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, >, ##__VA_ARGS__) +#define VFIO_ASSERT_GE(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, >=, ##__VA_ARGS__) +#define VFIO_ASSERT_TRUE(_a, ...) VFIO_ASSERT_NE(false, (_a), ##__VA_ARGS__) +#define VFIO_ASSERT_FALSE(_a, ...) VFIO_ASSERT_EQ(false, (_a), ##__VA_ARGS__) +#define VFIO_ASSERT_NULL(_a, ...) VFIO_ASSERT_EQ(NULL, _a, ##__VA_ARGS__) +#define VFIO_ASSERT_NOT_NULL(_a, ...) VFIO_ASSERT_NE(NULL, _a, ##__VA_ARGS__) + +#define VFIO_FAIL(_fmt, ...) 
do { \ + fprintf(stderr, "%s:%u: FAIL\n\n", __FILE__, __LINE__); \ + VFIO_LOG_AND_EXIT(_fmt, ##__VA_ARGS__); \ +} while (0) + +#define ioctl_assert(_fd, _op, _arg) do { \ + void *__arg = (_arg); \ + int __ret = ioctl((_fd), (_op), (__arg)); \ + VFIO_ASSERT_EQ(__ret, 0, "ioctl(%s, %s, %s) returned %d\n", #_fd, #_op, #_arg, __ret); \ +} while (0) + +#endif /* SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_ASSERT_H */ diff --git a/tools/testing/selftests/vfio/lib/include/libvfio/iommu.h b/tools/testing/selftests/vfio/lib/include/libvfio/iommu.h new file mode 100644 index 000000000000..5c9b9dc6d993 --- /dev/null +++ b/tools/testing/selftests/vfio/lib/include/libvfio/iommu.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_IOMMU_H +#define SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_IOMMU_H + +#include <linux/list.h> +#include <linux/types.h> + +#include <libvfio/assert.h> + +typedef u64 iova_t; + +struct iommu_mode { + const char *name; + const char *container_path; + unsigned long iommu_type; +}; + +extern const char *default_iommu_mode; + +struct dma_region { + struct list_head link; + void *vaddr; + iova_t iova; + u64 size; +}; + +struct iommu { + const struct iommu_mode *mode; + int container_fd; + int iommufd; + u32 ioas_id; + struct list_head dma_regions; +}; + +struct iommu *iommu_init(const char *iommu_mode); +void iommu_cleanup(struct iommu *iommu); + +int __iommu_map(struct iommu *iommu, struct dma_region *region); + +static inline void iommu_map(struct iommu *iommu, struct dma_region *region) +{ + VFIO_ASSERT_EQ(__iommu_map(iommu, region), 0); +} + +int __iommu_unmap(struct iommu *iommu, struct dma_region *region, u64 *unmapped); + +static inline void iommu_unmap(struct iommu *iommu, struct dma_region *region) +{ + VFIO_ASSERT_EQ(__iommu_unmap(iommu, region, NULL), 0); +} + +int __iommu_unmap_all(struct iommu *iommu, u64 *unmapped); + +static inline void iommu_unmap_all(struct iommu *iommu) +{ + 
VFIO_ASSERT_EQ(__iommu_unmap_all(iommu, NULL), 0); +} + +int __iommu_hva2iova(struct iommu *iommu, void *vaddr, iova_t *iova); +iova_t iommu_hva2iova(struct iommu *iommu, void *vaddr); + +struct iommu_iova_range *iommu_iova_ranges(struct iommu *iommu, u32 *nranges); + +/* + * Generator for VFIO selftests fixture variants that replicate across all + * possible IOMMU modes. Tests must define FIXTURE_VARIANT_ADD_IOMMU_MODE() + * which should then use FIXTURE_VARIANT_ADD() to create the variant. + */ +#define FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(...) \ +FIXTURE_VARIANT_ADD_IOMMU_MODE(vfio_type1_iommu, ##__VA_ARGS__); \ +FIXTURE_VARIANT_ADD_IOMMU_MODE(vfio_type1v2_iommu, ##__VA_ARGS__); \ +FIXTURE_VARIANT_ADD_IOMMU_MODE(iommufd_compat_type1, ##__VA_ARGS__); \ +FIXTURE_VARIANT_ADD_IOMMU_MODE(iommufd_compat_type1v2, ##__VA_ARGS__); \ +FIXTURE_VARIANT_ADD_IOMMU_MODE(iommufd, ##__VA_ARGS__) + +#endif /* SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_IOMMU_H */ diff --git a/tools/testing/selftests/vfio/lib/include/libvfio/iova_allocator.h b/tools/testing/selftests/vfio/lib/include/libvfio/iova_allocator.h new file mode 100644 index 000000000000..8f1d994e9ea2 --- /dev/null +++ b/tools/testing/selftests/vfio/lib/include/libvfio/iova_allocator.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_IOVA_ALLOCATOR_H +#define SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_IOVA_ALLOCATOR_H + +#include <uapi/linux/types.h> +#include <linux/list.h> +#include <linux/types.h> +#include <linux/iommufd.h> + +#include <libvfio/iommu.h> + +struct iova_allocator { + struct iommu_iova_range *ranges; + u32 nranges; + u32 range_idx; + u64 range_offset; +}; + +struct iova_allocator *iova_allocator_init(struct iommu *iommu); +void iova_allocator_cleanup(struct iova_allocator *allocator); +iova_t iova_allocator_alloc(struct iova_allocator *allocator, size_t size); + +#endif /* SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_IOVA_ALLOCATOR_H */ diff --git 
a/tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_device.h b/tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_device.h new file mode 100644 index 000000000000..2858885a89bb --- /dev/null +++ b/tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_device.h @@ -0,0 +1,125 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_VFIO_PCI_DEVICE_H +#define SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_VFIO_PCI_DEVICE_H + +#include <fcntl.h> +#include <linux/vfio.h> +#include <linux/pci_regs.h> + +#include <libvfio/assert.h> +#include <libvfio/iommu.h> +#include <libvfio/vfio_pci_driver.h> + +struct vfio_pci_bar { + struct vfio_region_info info; + void *vaddr; +}; + +struct vfio_pci_device { + const char *bdf; + int fd; + int group_fd; + + struct iommu *iommu; + + struct vfio_device_info info; + struct vfio_region_info config_space; + struct vfio_pci_bar bars[PCI_STD_NUM_BARS]; + + struct vfio_irq_info msi_info; + struct vfio_irq_info msix_info; + + /* eventfds for MSI and MSI-x interrupts */ + int msi_eventfds[PCI_MSIX_FLAGS_QSIZE + 1]; + + struct vfio_pci_driver driver; +}; + +#define dev_info(_dev, _fmt, ...) printf("%s: " _fmt, (_dev)->bdf, ##__VA_ARGS__) +#define dev_err(_dev, _fmt, ...) 
fprintf(stderr, "%s: " _fmt, (_dev)->bdf, ##__VA_ARGS__) + +struct vfio_pci_device *vfio_pci_device_init(const char *bdf, struct iommu *iommu); +void vfio_pci_device_cleanup(struct vfio_pci_device *device); + +void vfio_pci_device_reset(struct vfio_pci_device *device); + +void vfio_pci_config_access(struct vfio_pci_device *device, bool write, + size_t config, size_t size, void *data); + +#define vfio_pci_config_read(_device, _offset, _type) ({ \ + _type __data; \ + vfio_pci_config_access((_device), false, _offset, sizeof(__data), &__data); \ + __data; \ +}) + +#define vfio_pci_config_readb(_d, _o) vfio_pci_config_read(_d, _o, u8) +#define vfio_pci_config_readw(_d, _o) vfio_pci_config_read(_d, _o, u16) +#define vfio_pci_config_readl(_d, _o) vfio_pci_config_read(_d, _o, u32) + +#define vfio_pci_config_write(_device, _offset, _value, _type) do { \ + _type __data = (_value); \ + vfio_pci_config_access((_device), true, _offset, sizeof(_type), &__data); \ +} while (0) + +#define vfio_pci_config_writeb(_d, _o, _v) vfio_pci_config_write(_d, _o, _v, u8) +#define vfio_pci_config_writew(_d, _o, _v) vfio_pci_config_write(_d, _o, _v, u16) +#define vfio_pci_config_writel(_d, _o, _v) vfio_pci_config_write(_d, _o, _v, u32) + +void vfio_pci_irq_enable(struct vfio_pci_device *device, u32 index, + u32 vector, int count); +void vfio_pci_irq_disable(struct vfio_pci_device *device, u32 index); +void vfio_pci_irq_trigger(struct vfio_pci_device *device, u32 index, u32 vector); + +static inline void fcntl_set_nonblock(int fd) +{ + int r; + + r = fcntl(fd, F_GETFL, 0); + VFIO_ASSERT_NE(r, -1, "F_GETFL failed for fd %d\n", fd); + + r = fcntl(fd, F_SETFL, r | O_NONBLOCK); + VFIO_ASSERT_NE(r, -1, "F_SETFL O_NONBLOCK failed for fd %d\n", fd); +} + +static inline void vfio_pci_msi_enable(struct vfio_pci_device *device, + u32 vector, int count) +{ + vfio_pci_irq_enable(device, VFIO_PCI_MSI_IRQ_INDEX, vector, count); +} + +static inline void vfio_pci_msi_disable(struct vfio_pci_device *device) +{ 
+ vfio_pci_irq_disable(device, VFIO_PCI_MSI_IRQ_INDEX); +} + +static inline void vfio_pci_msix_enable(struct vfio_pci_device *device, + u32 vector, int count) +{ + vfio_pci_irq_enable(device, VFIO_PCI_MSIX_IRQ_INDEX, vector, count); +} + +static inline void vfio_pci_msix_disable(struct vfio_pci_device *device) +{ + vfio_pci_irq_disable(device, VFIO_PCI_MSIX_IRQ_INDEX); +} + +static inline int __to_iova(struct vfio_pci_device *device, void *vaddr, iova_t *iova) +{ + return __iommu_hva2iova(device->iommu, vaddr, iova); +} + +static inline iova_t to_iova(struct vfio_pci_device *device, void *vaddr) +{ + return iommu_hva2iova(device->iommu, vaddr); +} + +static inline bool vfio_pci_device_match(struct vfio_pci_device *device, + u16 vendor_id, u16 device_id) +{ + return (vendor_id == vfio_pci_config_readw(device, PCI_VENDOR_ID)) && + (device_id == vfio_pci_config_readw(device, PCI_DEVICE_ID)); +} + +const char *vfio_pci_get_cdev_path(const char *bdf); + +#endif /* SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_VFIO_PCI_DEVICE_H */ diff --git a/tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_driver.h b/tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_driver.h new file mode 100644 index 000000000000..e5ada209b1d1 --- /dev/null +++ b/tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_driver.h @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_VFIO_PCI_DRIVER_H +#define SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_VFIO_PCI_DRIVER_H + +#include <libvfio/iommu.h> + +struct vfio_pci_device; + +struct vfio_pci_driver_ops { + const char *name; + + /** + * @probe() - Check if the driver supports the given device. + * + * Return: 0 on success, non-0 on failure. + */ + int (*probe)(struct vfio_pci_device *device); + + /** + * @init() - Initialize the driver for @device. + * + * Must be called after device->driver.region has been initialized. 
+ */ + void (*init)(struct vfio_pci_device *device); + + /** + * remove() - Deinitialize the driver for @device. + */ + void (*remove)(struct vfio_pci_device *device); + + /** + * memcpy_start() - Kick off @count repeated memcpy operations from + * [@src, @src + @size) to [@dst, @dst + @size). + * + * Guarantees: + * - The device will attempt DMA reads on [src, src + size). + * - The device will attempt DMA writes on [dst, dst + size). + * - The device will not generate any interrupts. + * + * memcpy_start() returns immediately, it does not wait for the + * copies to complete. + */ + void (*memcpy_start)(struct vfio_pci_device *device, + iova_t src, iova_t dst, u64 size, u64 count); + + /** + * memcpy_wait() - Wait until the memcpy operations started by + * memcpy_start() have finished. + * + * Guarantees: + * - All in-flight DMAs initiated by memcpy_start() are fully complete + * before memcpy_wait() returns. + * + * Returns non-0 if the driver detects that an error occurred during the + * memcpy, 0 otherwise. + */ + int (*memcpy_wait)(struct vfio_pci_device *device); + + /** + * send_msi() - Make the device send the MSI device->driver.msi. + * + * Guarantees: + * - The device will send the MSI once. + */ + void (*send_msi)(struct vfio_pci_device *device); +}; + +struct vfio_pci_driver { + const struct vfio_pci_driver_ops *ops; + bool initialized; + bool memcpy_in_progress; + + /* Region to be used by the driver (e.g. for in-memory descriptors) */ + struct dma_region region; + + /* The maximum size that can be passed to memcpy_start(). */ + u64 max_memcpy_size; + + /* The maximum count that can be passed to memcpy_start(). */ + u64 max_memcpy_count; + + /* The MSI vector the device will signal in ops->send_msi(). 
*/ + int msi; +}; + +void vfio_pci_driver_probe(struct vfio_pci_device *device); +void vfio_pci_driver_init(struct vfio_pci_device *device); +void vfio_pci_driver_remove(struct vfio_pci_device *device); +int vfio_pci_driver_memcpy(struct vfio_pci_device *device, + iova_t src, iova_t dst, u64 size); +void vfio_pci_driver_memcpy_start(struct vfio_pci_device *device, + iova_t src, iova_t dst, u64 size, + u64 count); +int vfio_pci_driver_memcpy_wait(struct vfio_pci_device *device); +void vfio_pci_driver_send_msi(struct vfio_pci_device *device); + +#endif /* SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_VFIO_PCI_DRIVER_H */ diff --git a/tools/testing/selftests/vfio/lib/include/vfio_util.h b/tools/testing/selftests/vfio/lib/include/vfio_util.h deleted file mode 100644 index 69ec0c856481..000000000000 --- a/tools/testing/selftests/vfio/lib/include/vfio_util.h +++ /dev/null @@ -1,331 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef SELFTESTS_VFIO_LIB_INCLUDE_VFIO_UTIL_H -#define SELFTESTS_VFIO_LIB_INCLUDE_VFIO_UTIL_H - -#include <fcntl.h> -#include <string.h> - -#include <uapi/linux/types.h> -#include <linux/iommufd.h> -#include <linux/list.h> -#include <linux/pci_regs.h> -#include <linux/vfio.h> - -#include "../../../kselftest.h" - -#define VFIO_LOG_AND_EXIT(...) do { \ - fprintf(stderr, " " __VA_ARGS__); \ - fprintf(stderr, "\n"); \ - exit(KSFT_FAIL); \ -} while (0) - -#define VFIO_ASSERT_OP(_lhs, _rhs, _op, ...) do { \ - typeof(_lhs) __lhs = (_lhs); \ - typeof(_rhs) __rhs = (_rhs); \ - \ - if (__lhs _op __rhs) \ - break; \ - \ - fprintf(stderr, "%s:%u: Assertion Failure\n\n", __FILE__, __LINE__); \ - fprintf(stderr, " Expression: " #_lhs " " #_op " " #_rhs "\n"); \ - fprintf(stderr, " Observed: %#lx %s %#lx\n", \ - (u64)__lhs, #_op, (u64)__rhs); \ - fprintf(stderr, " [errno: %d - %s]\n", errno, strerror(errno)); \ - VFIO_LOG_AND_EXIT(__VA_ARGS__); \ -} while (0) - -#define VFIO_ASSERT_EQ(_a, _b, ...) 
VFIO_ASSERT_OP(_a, _b, ==, ##__VA_ARGS__) -#define VFIO_ASSERT_NE(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, !=, ##__VA_ARGS__) -#define VFIO_ASSERT_LT(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, <, ##__VA_ARGS__) -#define VFIO_ASSERT_LE(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, <=, ##__VA_ARGS__) -#define VFIO_ASSERT_GT(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, >, ##__VA_ARGS__) -#define VFIO_ASSERT_GE(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, >=, ##__VA_ARGS__) -#define VFIO_ASSERT_TRUE(_a, ...) VFIO_ASSERT_NE(false, (_a), ##__VA_ARGS__) -#define VFIO_ASSERT_FALSE(_a, ...) VFIO_ASSERT_EQ(false, (_a), ##__VA_ARGS__) -#define VFIO_ASSERT_NULL(_a, ...) VFIO_ASSERT_EQ(NULL, _a, ##__VA_ARGS__) -#define VFIO_ASSERT_NOT_NULL(_a, ...) VFIO_ASSERT_NE(NULL, _a, ##__VA_ARGS__) - -#define VFIO_FAIL(_fmt, ...) do { \ - fprintf(stderr, "%s:%u: FAIL\n\n", __FILE__, __LINE__); \ - VFIO_LOG_AND_EXIT(_fmt, ##__VA_ARGS__); \ -} while (0) - -struct vfio_iommu_mode { - const char *name; - const char *container_path; - unsigned long iommu_type; -}; - -/* - * Generator for VFIO selftests fixture variants that replicate across all - * possible IOMMU modes. Tests must define FIXTURE_VARIANT_ADD_IOMMU_MODE() - * which should then use FIXTURE_VARIANT_ADD() to create the variant. - */ -#define FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(...) 
\ -FIXTURE_VARIANT_ADD_IOMMU_MODE(vfio_type1_iommu, ##__VA_ARGS__); \ -FIXTURE_VARIANT_ADD_IOMMU_MODE(vfio_type1v2_iommu, ##__VA_ARGS__); \ -FIXTURE_VARIANT_ADD_IOMMU_MODE(iommufd_compat_type1, ##__VA_ARGS__); \ -FIXTURE_VARIANT_ADD_IOMMU_MODE(iommufd_compat_type1v2, ##__VA_ARGS__); \ -FIXTURE_VARIANT_ADD_IOMMU_MODE(iommufd, ##__VA_ARGS__) - -struct vfio_pci_bar { - struct vfio_region_info info; - void *vaddr; -}; - -typedef u64 iova_t; - -#define INVALID_IOVA UINT64_MAX - -struct vfio_dma_region { - struct list_head link; - void *vaddr; - iova_t iova; - u64 size; -}; - -struct vfio_pci_device; - -struct vfio_pci_driver_ops { - const char *name; - - /** - * @probe() - Check if the driver supports the given device. - * - * Return: 0 on success, non-0 on failure. - */ - int (*probe)(struct vfio_pci_device *device); - - /** - * @init() - Initialize the driver for @device. - * - * Must be called after device->driver.region has been initialized. - */ - void (*init)(struct vfio_pci_device *device); - - /** - * remove() - Deinitialize the driver for @device. - */ - void (*remove)(struct vfio_pci_device *device); - - /** - * memcpy_start() - Kick off @count repeated memcpy operations from - * [@src, @src + @size) to [@dst, @dst + @size). - * - * Guarantees: - * - The device will attempt DMA reads on [src, src + size). - * - The device will attempt DMA writes on [dst, dst + size). - * - The device will not generate any interrupts. - * - * memcpy_start() returns immediately, it does not wait for the - * copies to complete. - */ - void (*memcpy_start)(struct vfio_pci_device *device, - iova_t src, iova_t dst, u64 size, u64 count); - - /** - * memcpy_wait() - Wait until the memcpy operations started by - * memcpy_start() have finished. - * - * Guarantees: - * - All in-flight DMAs initiated by memcpy_start() are fully complete - * before memcpy_wait() returns. - * - * Returns non-0 if the driver detects that an error occurred during the - * memcpy, 0 otherwise. 
- */ - int (*memcpy_wait)(struct vfio_pci_device *device); - - /** - * send_msi() - Make the device send the MSI device->driver.msi. - * - * Guarantees: - * - The device will send the MSI once. - */ - void (*send_msi)(struct vfio_pci_device *device); -}; - -struct vfio_pci_driver { - const struct vfio_pci_driver_ops *ops; - bool initialized; - bool memcpy_in_progress; - - /* Region to be used by the driver (e.g. for in-memory descriptors) */ - struct vfio_dma_region region; - - /* The maximum size that can be passed to memcpy_start(). */ - u64 max_memcpy_size; - - /* The maximum count that can be passed to memcpy_start(). */ - u64 max_memcpy_count; - - /* The MSI vector the device will signal in ops->send_msi(). */ - int msi; -}; - -struct vfio_pci_device { - int fd; - - const struct vfio_iommu_mode *iommu_mode; - int group_fd; - int container_fd; - - int iommufd; - u32 ioas_id; - - struct vfio_device_info info; - struct vfio_region_info config_space; - struct vfio_pci_bar bars[PCI_STD_NUM_BARS]; - - struct vfio_irq_info msi_info; - struct vfio_irq_info msix_info; - - struct list_head dma_regions; - - /* eventfds for MSI and MSI-x interrupts */ - int msi_eventfds[PCI_MSIX_FLAGS_QSIZE + 1]; - - struct vfio_pci_driver driver; -}; - -struct iova_allocator { - struct iommu_iova_range *ranges; - u32 nranges; - u32 range_idx; - u64 range_offset; -}; - -/* - * Return the BDF string of the device that the test should use. - * - * If a BDF string is provided by the user on the command line (as the last - * element of argv[]), then this function will return that and decrement argc - * by 1. - * - * Otherwise this function will attempt to use the environment variable - * $VFIO_SELFTESTS_BDF. - * - * If BDF cannot be determined then the test will exit with KSFT_SKIP. 
- */ -const char *vfio_selftests_get_bdf(int *argc, char *argv[]); -const char *vfio_pci_get_cdev_path(const char *bdf); - -extern const char *default_iommu_mode; - -struct vfio_pci_device *vfio_pci_device_init(const char *bdf, const char *iommu_mode); -void vfio_pci_device_cleanup(struct vfio_pci_device *device); -void vfio_pci_device_reset(struct vfio_pci_device *device); - -struct iommu_iova_range *vfio_pci_iova_ranges(struct vfio_pci_device *device, - u32 *nranges); - -struct iova_allocator *iova_allocator_init(struct vfio_pci_device *device); -void iova_allocator_cleanup(struct iova_allocator *allocator); -iova_t iova_allocator_alloc(struct iova_allocator *allocator, size_t size); - -int __vfio_pci_dma_map(struct vfio_pci_device *device, - struct vfio_dma_region *region); -int __vfio_pci_dma_unmap(struct vfio_pci_device *device, - struct vfio_dma_region *region, - u64 *unmapped); -int __vfio_pci_dma_unmap_all(struct vfio_pci_device *device, u64 *unmapped); - -static inline void vfio_pci_dma_map(struct vfio_pci_device *device, - struct vfio_dma_region *region) -{ - VFIO_ASSERT_EQ(__vfio_pci_dma_map(device, region), 0); -} - -static inline void vfio_pci_dma_unmap(struct vfio_pci_device *device, - struct vfio_dma_region *region) -{ - VFIO_ASSERT_EQ(__vfio_pci_dma_unmap(device, region, NULL), 0); -} - -static inline void vfio_pci_dma_unmap_all(struct vfio_pci_device *device) -{ - VFIO_ASSERT_EQ(__vfio_pci_dma_unmap_all(device, NULL), 0); -} - -void vfio_pci_config_access(struct vfio_pci_device *device, bool write, - size_t config, size_t size, void *data); - -#define vfio_pci_config_read(_device, _offset, _type) ({ \ - _type __data; \ - vfio_pci_config_access((_device), false, _offset, sizeof(__data), &__data); \ - __data; \ -}) - -#define vfio_pci_config_readb(_d, _o) vfio_pci_config_read(_d, _o, u8) -#define vfio_pci_config_readw(_d, _o) vfio_pci_config_read(_d, _o, u16) -#define vfio_pci_config_readl(_d, _o) vfio_pci_config_read(_d, _o, u32) - -#define 
vfio_pci_config_write(_device, _offset, _value, _type) do { \ - _type __data = (_value); \ - vfio_pci_config_access((_device), true, _offset, sizeof(_type), &__data); \ -} while (0) - -#define vfio_pci_config_writeb(_d, _o, _v) vfio_pci_config_write(_d, _o, _v, u8) -#define vfio_pci_config_writew(_d, _o, _v) vfio_pci_config_write(_d, _o, _v, u16) -#define vfio_pci_config_writel(_d, _o, _v) vfio_pci_config_write(_d, _o, _v, u32) - -void vfio_pci_irq_enable(struct vfio_pci_device *device, u32 index, - u32 vector, int count); -void vfio_pci_irq_disable(struct vfio_pci_device *device, u32 index); -void vfio_pci_irq_trigger(struct vfio_pci_device *device, u32 index, u32 vector); - -static inline void fcntl_set_nonblock(int fd) -{ - int r; - - r = fcntl(fd, F_GETFL, 0); - VFIO_ASSERT_NE(r, -1, "F_GETFL failed for fd %d\n", fd); - - r = fcntl(fd, F_SETFL, r | O_NONBLOCK); - VFIO_ASSERT_NE(r, -1, "F_SETFL O_NONBLOCK failed for fd %d\n", fd); -} - -static inline void vfio_pci_msi_enable(struct vfio_pci_device *device, - u32 vector, int count) -{ - vfio_pci_irq_enable(device, VFIO_PCI_MSI_IRQ_INDEX, vector, count); -} - -static inline void vfio_pci_msi_disable(struct vfio_pci_device *device) -{ - vfio_pci_irq_disable(device, VFIO_PCI_MSI_IRQ_INDEX); -} - -static inline void vfio_pci_msix_enable(struct vfio_pci_device *device, - u32 vector, int count) -{ - vfio_pci_irq_enable(device, VFIO_PCI_MSIX_IRQ_INDEX, vector, count); -} - -static inline void vfio_pci_msix_disable(struct vfio_pci_device *device) -{ - vfio_pci_irq_disable(device, VFIO_PCI_MSIX_IRQ_INDEX); -} - -iova_t __to_iova(struct vfio_pci_device *device, void *vaddr); -iova_t to_iova(struct vfio_pci_device *device, void *vaddr); - -static inline bool vfio_pci_device_match(struct vfio_pci_device *device, - u16 vendor_id, u16 device_id) -{ - return (vendor_id == vfio_pci_config_readw(device, PCI_VENDOR_ID)) && - (device_id == vfio_pci_config_readw(device, PCI_DEVICE_ID)); -} - -void vfio_pci_driver_probe(struct 
vfio_pci_device *device); -void vfio_pci_driver_init(struct vfio_pci_device *device); -void vfio_pci_driver_remove(struct vfio_pci_device *device); -int vfio_pci_driver_memcpy(struct vfio_pci_device *device, - iova_t src, iova_t dst, u64 size); -void vfio_pci_driver_memcpy_start(struct vfio_pci_device *device, - iova_t src, iova_t dst, u64 size, - u64 count); -int vfio_pci_driver_memcpy_wait(struct vfio_pci_device *device); -void vfio_pci_driver_send_msi(struct vfio_pci_device *device); - -#endif /* SELFTESTS_VFIO_LIB_INCLUDE_VFIO_UTIL_H */ diff --git a/tools/testing/selftests/vfio/lib/iommu.c b/tools/testing/selftests/vfio/lib/iommu.c new file mode 100644 index 000000000000..8079d43523f3 --- /dev/null +++ b/tools/testing/selftests/vfio/lib/iommu.c @@ -0,0 +1,465 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <dirent.h> +#include <fcntl.h> +#include <libgen.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <sys/eventfd.h> +#include <sys/ioctl.h> +#include <sys/mman.h> + +#include <uapi/linux/types.h> +#include <linux/limits.h> +#include <linux/mman.h> +#include <linux/types.h> +#include <linux/vfio.h> +#include <linux/iommufd.h> + +#include "../../../kselftest.h" +#include <libvfio.h> + +const char *default_iommu_mode = "iommufd"; + +/* Reminder: Keep in sync with FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(). 
*/ +static const struct iommu_mode iommu_modes[] = { + { + .name = "vfio_type1_iommu", + .container_path = "/dev/vfio/vfio", + .iommu_type = VFIO_TYPE1_IOMMU, + }, + { + .name = "vfio_type1v2_iommu", + .container_path = "/dev/vfio/vfio", + .iommu_type = VFIO_TYPE1v2_IOMMU, + }, + { + .name = "iommufd_compat_type1", + .container_path = "/dev/iommu", + .iommu_type = VFIO_TYPE1_IOMMU, + }, + { + .name = "iommufd_compat_type1v2", + .container_path = "/dev/iommu", + .iommu_type = VFIO_TYPE1v2_IOMMU, + }, + { + .name = "iommufd", + }, +}; + +static const struct iommu_mode *lookup_iommu_mode(const char *iommu_mode) +{ + int i; + + if (!iommu_mode) + iommu_mode = default_iommu_mode; + + for (i = 0; i < ARRAY_SIZE(iommu_modes); i++) { + if (strcmp(iommu_mode, iommu_modes[i].name)) + continue; + + return &iommu_modes[i]; + } + + VFIO_FAIL("Unrecognized IOMMU mode: %s\n", iommu_mode); +} + +int __iommu_hva2iova(struct iommu *iommu, void *vaddr, iova_t *iova) +{ + struct dma_region *region; + + list_for_each_entry(region, &iommu->dma_regions, link) { + if (vaddr < region->vaddr) + continue; + + if (vaddr >= region->vaddr + region->size) + continue; + + if (iova) + *iova = region->iova + (vaddr - region->vaddr); + + return 0; + } + + return -ENOENT; +} + +iova_t iommu_hva2iova(struct iommu *iommu, void *vaddr) +{ + iova_t iova; + int ret; + + ret = __iommu_hva2iova(iommu, vaddr, &iova); + VFIO_ASSERT_EQ(ret, 0, "%p is not mapped into the iommu\n", vaddr); + + return iova; +} + +static int vfio_iommu_map(struct iommu *iommu, struct dma_region *region) +{ + struct vfio_iommu_type1_dma_map args = { + .argsz = sizeof(args), + .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE, + .vaddr = (u64)region->vaddr, + .iova = region->iova, + .size = region->size, + }; + + if (ioctl(iommu->container_fd, VFIO_IOMMU_MAP_DMA, &args)) + return -errno; + + return 0; +} + +static int iommufd_map(struct iommu *iommu, struct dma_region *region) +{ + struct iommu_ioas_map args = { + .size = 
sizeof(args), + .flags = IOMMU_IOAS_MAP_READABLE | + IOMMU_IOAS_MAP_WRITEABLE | + IOMMU_IOAS_MAP_FIXED_IOVA, + .user_va = (u64)region->vaddr, + .iova = region->iova, + .length = region->size, + .ioas_id = iommu->ioas_id, + }; + + if (ioctl(iommu->iommufd, IOMMU_IOAS_MAP, &args)) + return -errno; + + return 0; +} + +int __iommu_map(struct iommu *iommu, struct dma_region *region) +{ + int ret; + + if (iommu->iommufd) + ret = iommufd_map(iommu, region); + else + ret = vfio_iommu_map(iommu, region); + + if (ret) + return ret; + + list_add(®ion->link, &iommu->dma_regions); + + return 0; +} + +static int __vfio_iommu_unmap(int fd, u64 iova, u64 size, u32 flags, u64 *unmapped) +{ + struct vfio_iommu_type1_dma_unmap args = { + .argsz = sizeof(args), + .iova = iova, + .size = size, + .flags = flags, + }; + + if (ioctl(fd, VFIO_IOMMU_UNMAP_DMA, &args)) + return -errno; + + if (unmapped) + *unmapped = args.size; + + return 0; +} + +static int vfio_iommu_unmap(struct iommu *iommu, struct dma_region *region, + u64 *unmapped) +{ + return __vfio_iommu_unmap(iommu->container_fd, region->iova, + region->size, 0, unmapped); +} + +static int __iommufd_unmap(int fd, u64 iova, u64 length, u32 ioas_id, u64 *unmapped) +{ + struct iommu_ioas_unmap args = { + .size = sizeof(args), + .iova = iova, + .length = length, + .ioas_id = ioas_id, + }; + + if (ioctl(fd, IOMMU_IOAS_UNMAP, &args)) + return -errno; + + if (unmapped) + *unmapped = args.length; + + return 0; +} + +static int iommufd_unmap(struct iommu *iommu, struct dma_region *region, + u64 *unmapped) +{ + return __iommufd_unmap(iommu->iommufd, region->iova, region->size, + iommu->ioas_id, unmapped); +} + +int __iommu_unmap(struct iommu *iommu, struct dma_region *region, u64 *unmapped) +{ + int ret; + + if (iommu->iommufd) + ret = iommufd_unmap(iommu, region, unmapped); + else + ret = vfio_iommu_unmap(iommu, region, unmapped); + + if (ret) + return ret; + + list_del_init(®ion->link); + + return 0; +} + +int __iommu_unmap_all(struct 
iommu *iommu, u64 *unmapped) +{ + int ret; + struct dma_region *curr, *next; + + if (iommu->iommufd) + ret = __iommufd_unmap(iommu->iommufd, 0, UINT64_MAX, + iommu->ioas_id, unmapped); + else + ret = __vfio_iommu_unmap(iommu->container_fd, 0, 0, + VFIO_DMA_UNMAP_FLAG_ALL, unmapped); + + if (ret) + return ret; + + list_for_each_entry_safe(curr, next, &iommu->dma_regions, link) + list_del_init(&curr->link); + + return 0; +} + +static struct vfio_info_cap_header *next_cap_hdr(void *buf, u32 bufsz, + u32 *cap_offset) +{ + struct vfio_info_cap_header *hdr; + + if (!*cap_offset) + return NULL; + + VFIO_ASSERT_LT(*cap_offset, bufsz); + VFIO_ASSERT_GE(bufsz - *cap_offset, sizeof(*hdr)); + + hdr = (struct vfio_info_cap_header *)((u8 *)buf + *cap_offset); + *cap_offset = hdr->next; + + return hdr; +} + +static struct vfio_info_cap_header *vfio_iommu_info_cap_hdr(struct vfio_iommu_type1_info *info, + u16 cap_id) +{ + struct vfio_info_cap_header *hdr; + u32 cap_offset = info->cap_offset; + u32 max_depth; + u32 depth = 0; + + if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) + return NULL; + + if (cap_offset) + VFIO_ASSERT_GE(cap_offset, sizeof(*info)); + + max_depth = (info->argsz - sizeof(*info)) / sizeof(*hdr); + + while ((hdr = next_cap_hdr(info, info->argsz, &cap_offset))) { + depth++; + VFIO_ASSERT_LE(depth, max_depth, "Capability chain contains a cycle\n"); + + if (hdr->id == cap_id) + return hdr; + } + + return NULL; +} + +/* Return buffer including capability chain, if present. 
Free with free() */ +static struct vfio_iommu_type1_info *vfio_iommu_get_info(int container_fd) +{ + struct vfio_iommu_type1_info *info; + + info = malloc(sizeof(*info)); + VFIO_ASSERT_NOT_NULL(info); + + *info = (struct vfio_iommu_type1_info) { + .argsz = sizeof(*info), + }; + + ioctl_assert(container_fd, VFIO_IOMMU_GET_INFO, info); + VFIO_ASSERT_GE(info->argsz, sizeof(*info)); + + info = realloc(info, info->argsz); + VFIO_ASSERT_NOT_NULL(info); + + ioctl_assert(container_fd, VFIO_IOMMU_GET_INFO, info); + VFIO_ASSERT_GE(info->argsz, sizeof(*info)); + + return info; +} + +/* + * Return iova ranges for the device's container. Normalize vfio_iommu_type1 to + * report iommufd's iommu_iova_range. Free with free(). + */ +static struct iommu_iova_range *vfio_iommu_iova_ranges(struct iommu *iommu, + u32 *nranges) +{ + struct vfio_iommu_type1_info_cap_iova_range *cap_range; + struct vfio_iommu_type1_info *info; + struct vfio_info_cap_header *hdr; + struct iommu_iova_range *ranges = NULL; + + info = vfio_iommu_get_info(iommu->container_fd); + hdr = vfio_iommu_info_cap_hdr(info, VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE); + VFIO_ASSERT_NOT_NULL(hdr); + + cap_range = container_of(hdr, struct vfio_iommu_type1_info_cap_iova_range, header); + VFIO_ASSERT_GT(cap_range->nr_iovas, 0); + + ranges = calloc(cap_range->nr_iovas, sizeof(*ranges)); + VFIO_ASSERT_NOT_NULL(ranges); + + for (u32 i = 0; i < cap_range->nr_iovas; i++) { + ranges[i] = (struct iommu_iova_range){ + .start = cap_range->iova_ranges[i].start, + .last = cap_range->iova_ranges[i].end, + }; + } + + *nranges = cap_range->nr_iovas; + + free(info); + return ranges; +} + +/* Return iova ranges of the device's IOAS. 
Free with free() */ +static struct iommu_iova_range *iommufd_iova_ranges(struct iommu *iommu, + u32 *nranges) +{ + struct iommu_iova_range *ranges; + int ret; + + struct iommu_ioas_iova_ranges query = { + .size = sizeof(query), + .ioas_id = iommu->ioas_id, + }; + + ret = ioctl(iommu->iommufd, IOMMU_IOAS_IOVA_RANGES, &query); + VFIO_ASSERT_EQ(ret, -1); + VFIO_ASSERT_EQ(errno, EMSGSIZE); + VFIO_ASSERT_GT(query.num_iovas, 0); + + ranges = calloc(query.num_iovas, sizeof(*ranges)); + VFIO_ASSERT_NOT_NULL(ranges); + + query.allowed_iovas = (uintptr_t)ranges; + + ioctl_assert(iommu->iommufd, IOMMU_IOAS_IOVA_RANGES, &query); + *nranges = query.num_iovas; + + return ranges; +} + +static int iova_range_comp(const void *a, const void *b) +{ + const struct iommu_iova_range *ra = a, *rb = b; + + if (ra->start < rb->start) + return -1; + + if (ra->start > rb->start) + return 1; + + return 0; +} + +/* Return sorted IOVA ranges of the device. Free with free(). */ +struct iommu_iova_range *iommu_iova_ranges(struct iommu *iommu, u32 *nranges) +{ + struct iommu_iova_range *ranges; + + if (iommu->iommufd) + ranges = iommufd_iova_ranges(iommu, nranges); + else + ranges = vfio_iommu_iova_ranges(iommu, nranges); + + if (!ranges) + return NULL; + + VFIO_ASSERT_GT(*nranges, 0); + + /* Sort and check that ranges are sane and non-overlapping */ + qsort(ranges, *nranges, sizeof(*ranges), iova_range_comp); + VFIO_ASSERT_LT(ranges[0].start, ranges[0].last); + + for (u32 i = 1; i < *nranges; i++) { + VFIO_ASSERT_LT(ranges[i].start, ranges[i].last); + VFIO_ASSERT_LT(ranges[i - 1].last, ranges[i].start); + } + + return ranges; +} + +static u32 iommufd_ioas_alloc(int iommufd) +{ + struct iommu_ioas_alloc args = { + .size = sizeof(args), + }; + + ioctl_assert(iommufd, IOMMU_IOAS_ALLOC, &args); + return args.out_ioas_id; +} + +struct iommu *iommu_init(const char *iommu_mode) +{ + const char *container_path; + struct iommu *iommu; + int version; + + iommu = calloc(1, sizeof(*iommu)); + 
VFIO_ASSERT_NOT_NULL(iommu); + + INIT_LIST_HEAD(&iommu->dma_regions); + + iommu->mode = lookup_iommu_mode(iommu_mode); + + container_path = iommu->mode->container_path; + if (container_path) { + iommu->container_fd = open(container_path, O_RDWR); + VFIO_ASSERT_GE(iommu->container_fd, 0, "open(%s) failed\n", container_path); + + version = ioctl(iommu->container_fd, VFIO_GET_API_VERSION); + VFIO_ASSERT_EQ(version, VFIO_API_VERSION, "Unsupported version: %d\n", version); + } else { + /* + * Require device->iommufd to be >0 so that a simple non-0 check can be + * used to check if iommufd is enabled. In practice open() will never + * return 0 unless stdin is closed. + */ + iommu->iommufd = open("/dev/iommu", O_RDWR); + VFIO_ASSERT_GT(iommu->iommufd, 0); + + iommu->ioas_id = iommufd_ioas_alloc(iommu->iommufd); + } + + return iommu; +} + +void iommu_cleanup(struct iommu *iommu) +{ + if (iommu->iommufd) + VFIO_ASSERT_EQ(close(iommu->iommufd), 0); + else + VFIO_ASSERT_EQ(close(iommu->container_fd), 0); + + free(iommu); +} diff --git a/tools/testing/selftests/vfio/lib/iova_allocator.c b/tools/testing/selftests/vfio/lib/iova_allocator.c new file mode 100644 index 000000000000..a12b0a51e9e6 --- /dev/null +++ b/tools/testing/selftests/vfio/lib/iova_allocator.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <dirent.h> +#include <fcntl.h> +#include <libgen.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <sys/eventfd.h> +#include <sys/ioctl.h> +#include <sys/mman.h> + +#include <uapi/linux/types.h> +#include <linux/iommufd.h> +#include <linux/limits.h> +#include <linux/mman.h> +#include <linux/overflow.h> +#include <linux/types.h> +#include <linux/vfio.h> + +#include <libvfio.h> + +struct iova_allocator *iova_allocator_init(struct iommu *iommu) +{ + struct iova_allocator *allocator; + struct iommu_iova_range *ranges; + u32 nranges; + + ranges = iommu_iova_ranges(iommu, &nranges); + 
VFIO_ASSERT_NOT_NULL(ranges); + + allocator = malloc(sizeof(*allocator)); + VFIO_ASSERT_NOT_NULL(allocator); + + *allocator = (struct iova_allocator){ + .ranges = ranges, + .nranges = nranges, + .range_idx = 0, + .range_offset = 0, + }; + + return allocator; +} + +void iova_allocator_cleanup(struct iova_allocator *allocator) +{ + free(allocator->ranges); + free(allocator); +} + +iova_t iova_allocator_alloc(struct iova_allocator *allocator, size_t size) +{ + VFIO_ASSERT_GT(size, 0, "Invalid size arg, zero\n"); + VFIO_ASSERT_EQ(size & (size - 1), 0, "Invalid size arg, non-power-of-2\n"); + + for (;;) { + struct iommu_iova_range *range; + iova_t iova, last; + + VFIO_ASSERT_LT(allocator->range_idx, allocator->nranges, + "IOVA allocator out of space\n"); + + range = &allocator->ranges[allocator->range_idx]; + iova = range->start + allocator->range_offset; + + /* Check for sufficient space at the current offset */ + if (check_add_overflow(iova, size - 1, &last) || + last > range->last) + goto next_range; + + /* Align iova to size */ + iova = last & ~(size - 1); + + /* Check for sufficient space at the aligned iova */ + if (check_add_overflow(iova, size - 1, &last) || + last > range->last) + goto next_range; + + if (last == range->last) { + allocator->range_idx++; + allocator->range_offset = 0; + } else { + allocator->range_offset = last - range->start + 1; + } + + return iova; + +next_range: + allocator->range_idx++; + allocator->range_offset = 0; + } +} + diff --git a/tools/testing/selftests/vfio/lib/libvfio.c b/tools/testing/selftests/vfio/lib/libvfio.c new file mode 100644 index 000000000000..a23a3cc5be69 --- /dev/null +++ b/tools/testing/selftests/vfio/lib/libvfio.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <stdio.h> +#include <stdlib.h> + +#include "../../../kselftest.h" +#include <libvfio.h> + +static bool is_bdf(const char *str) +{ + unsigned int s, b, d, f; + int length, count; + + count = sscanf(str, "%4x:%2x:%2x.%2x%n", &s, &b, &d, 
&f, &length); + return count == 4 && length == strlen(str); +} + +static char **get_bdfs_cmdline(int *argc, char *argv[], int *nr_bdfs) +{ + int i; + + for (i = *argc - 1; i > 0 && is_bdf(argv[i]); i--) + continue; + + i++; + *nr_bdfs = *argc - i; + *argc -= *nr_bdfs; + + return *nr_bdfs ? &argv[i] : NULL; +} + +static char *get_bdf_env(void) +{ + char *bdf; + + bdf = getenv("VFIO_SELFTESTS_BDF"); + if (!bdf) + return NULL; + + VFIO_ASSERT_TRUE(is_bdf(bdf), "Invalid BDF: %s\n", bdf); + return bdf; +} + +char **vfio_selftests_get_bdfs(int *argc, char *argv[], int *nr_bdfs) +{ + static char *env_bdf; + char **bdfs; + + bdfs = get_bdfs_cmdline(argc, argv, nr_bdfs); + if (bdfs) + return bdfs; + + env_bdf = get_bdf_env(); + if (env_bdf) { + *nr_bdfs = 1; + return &env_bdf; + } + + fprintf(stderr, "Unable to determine which device(s) to use, skipping test.\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "To pass the device address via environment variable:\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " export VFIO_SELFTESTS_BDF=\"segment:bus:device.function\"\n"); + fprintf(stderr, " %s [options]\n", argv[0]); + fprintf(stderr, "\n"); + fprintf(stderr, "To pass the device address(es) via argv:\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " %s [options] segment:bus:device.function ...\n", argv[0]); + fprintf(stderr, "\n"); + exit(KSFT_SKIP); +} + +const char *vfio_selftests_get_bdf(int *argc, char *argv[]) +{ + int nr_bdfs; + + return vfio_selftests_get_bdfs(argc, argv, &nr_bdfs)[0]; +} diff --git a/tools/testing/selftests/vfio/lib/libvfio.mk b/tools/testing/selftests/vfio/lib/libvfio.mk index 5d11c3a89a28..9f47bceed16f 100644 --- a/tools/testing/selftests/vfio/lib/libvfio.mk +++ b/tools/testing/selftests/vfio/lib/libvfio.mk @@ -1,24 +1,29 @@ include $(top_srcdir)/scripts/subarch.include ARCH ?= $(SUBARCH) -VFIO_DIR := $(selfdir)/vfio +LIBVFIO_SRCDIR := $(selfdir)/vfio/lib -LIBVFIO_C := lib/vfio_pci_device.c -LIBVFIO_C += lib/vfio_pci_driver.c +LIBVFIO_C := iommu.c 
+LIBVFIO_C += iova_allocator.c +LIBVFIO_C += libvfio.c +LIBVFIO_C += vfio_pci_device.c +LIBVFIO_C += vfio_pci_driver.c ifeq ($(ARCH:x86_64=x86),x86) -LIBVFIO_C += lib/drivers/ioat/ioat.c -LIBVFIO_C += lib/drivers/dsa/dsa.c +LIBVFIO_C += drivers/ioat/ioat.c +LIBVFIO_C += drivers/dsa/dsa.c endif -LIBVFIO_O := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBVFIO_C)) +LIBVFIO_OUTPUT := $(OUTPUT)/libvfio + +LIBVFIO_O := $(patsubst %.c, $(LIBVFIO_OUTPUT)/%.o, $(LIBVFIO_C)) LIBVFIO_O_DIRS := $(shell dirname $(LIBVFIO_O) | uniq) $(shell mkdir -p $(LIBVFIO_O_DIRS)) -CFLAGS += -I$(VFIO_DIR)/lib/include +CFLAGS += -I$(LIBVFIO_SRCDIR)/include -$(LIBVFIO_O): $(OUTPUT)/%.o : $(VFIO_DIR)/%.c +$(LIBVFIO_O): $(LIBVFIO_OUTPUT)/%.o : $(LIBVFIO_SRCDIR)/%.c $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@ -EXTRA_CLEAN += $(LIBVFIO_O) +EXTRA_CLEAN += $(LIBVFIO_OUTPUT) diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c index b479a359da12..13fdb4b0b10f 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -20,286 +20,10 @@ #include <linux/vfio.h> #include "../../../kselftest.h" -#include <vfio_util.h> +#include <libvfio.h> #define PCI_SYSFS_PATH "/sys/bus/pci/devices" -#define ioctl_assert(_fd, _op, _arg) do { \ - void *__arg = (_arg); \ - int __ret = ioctl((_fd), (_op), (__arg)); \ - VFIO_ASSERT_EQ(__ret, 0, "ioctl(%s, %s, %s) returned %d\n", #_fd, #_op, #_arg, __ret); \ -} while (0) - -static struct vfio_info_cap_header *next_cap_hdr(void *buf, u32 bufsz, - u32 *cap_offset) -{ - struct vfio_info_cap_header *hdr; - - if (!*cap_offset) - return NULL; - - VFIO_ASSERT_LT(*cap_offset, bufsz); - VFIO_ASSERT_GE(bufsz - *cap_offset, sizeof(*hdr)); - - hdr = (struct vfio_info_cap_header *)((u8 *)buf + *cap_offset); - *cap_offset = hdr->next; - - return hdr; -} - -static struct vfio_info_cap_header *vfio_iommu_info_cap_hdr(struct vfio_iommu_type1_info *info, - u16 
cap_id) -{ - struct vfio_info_cap_header *hdr; - u32 cap_offset = info->cap_offset; - u32 max_depth; - u32 depth = 0; - - if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) - return NULL; - - if (cap_offset) - VFIO_ASSERT_GE(cap_offset, sizeof(*info)); - - max_depth = (info->argsz - sizeof(*info)) / sizeof(*hdr); - - while ((hdr = next_cap_hdr(info, info->argsz, &cap_offset))) { - depth++; - VFIO_ASSERT_LE(depth, max_depth, "Capability chain contains a cycle\n"); - - if (hdr->id == cap_id) - return hdr; - } - - return NULL; -} - -/* Return buffer including capability chain, if present. Free with free() */ -static struct vfio_iommu_type1_info *vfio_iommu_get_info(struct vfio_pci_device *device) -{ - struct vfio_iommu_type1_info *info; - - info = malloc(sizeof(*info)); - VFIO_ASSERT_NOT_NULL(info); - - *info = (struct vfio_iommu_type1_info) { - .argsz = sizeof(*info), - }; - - ioctl_assert(device->container_fd, VFIO_IOMMU_GET_INFO, info); - VFIO_ASSERT_GE(info->argsz, sizeof(*info)); - - info = realloc(info, info->argsz); - VFIO_ASSERT_NOT_NULL(info); - - ioctl_assert(device->container_fd, VFIO_IOMMU_GET_INFO, info); - VFIO_ASSERT_GE(info->argsz, sizeof(*info)); - - return info; -} - -/* - * Return iova ranges for the device's container. Normalize vfio_iommu_type1 to - * report iommufd's iommu_iova_range. Free with free(). 
- */ -static struct iommu_iova_range *vfio_iommu_iova_ranges(struct vfio_pci_device *device, - u32 *nranges) -{ - struct vfio_iommu_type1_info_cap_iova_range *cap_range; - struct vfio_iommu_type1_info *info; - struct vfio_info_cap_header *hdr; - struct iommu_iova_range *ranges = NULL; - - info = vfio_iommu_get_info(device); - hdr = vfio_iommu_info_cap_hdr(info, VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE); - VFIO_ASSERT_NOT_NULL(hdr); - - cap_range = container_of(hdr, struct vfio_iommu_type1_info_cap_iova_range, header); - VFIO_ASSERT_GT(cap_range->nr_iovas, 0); - - ranges = calloc(cap_range->nr_iovas, sizeof(*ranges)); - VFIO_ASSERT_NOT_NULL(ranges); - - for (u32 i = 0; i < cap_range->nr_iovas; i++) { - ranges[i] = (struct iommu_iova_range){ - .start = cap_range->iova_ranges[i].start, - .last = cap_range->iova_ranges[i].end, - }; - } - - *nranges = cap_range->nr_iovas; - - free(info); - return ranges; -} - -/* Return iova ranges of the device's IOAS. Free with free() */ -static struct iommu_iova_range *iommufd_iova_ranges(struct vfio_pci_device *device, - u32 *nranges) -{ - struct iommu_iova_range *ranges; - int ret; - - struct iommu_ioas_iova_ranges query = { - .size = sizeof(query), - .ioas_id = device->ioas_id, - }; - - ret = ioctl(device->iommufd, IOMMU_IOAS_IOVA_RANGES, &query); - VFIO_ASSERT_EQ(ret, -1); - VFIO_ASSERT_EQ(errno, EMSGSIZE); - VFIO_ASSERT_GT(query.num_iovas, 0); - - ranges = calloc(query.num_iovas, sizeof(*ranges)); - VFIO_ASSERT_NOT_NULL(ranges); - - query.allowed_iovas = (uintptr_t)ranges; - - ioctl_assert(device->iommufd, IOMMU_IOAS_IOVA_RANGES, &query); - *nranges = query.num_iovas; - - return ranges; -} - -static int iova_range_comp(const void *a, const void *b) -{ - const struct iommu_iova_range *ra = a, *rb = b; - - if (ra->start < rb->start) - return -1; - - if (ra->start > rb->start) - return 1; - - return 0; -} - -/* Return sorted IOVA ranges of the device. Free with free(). 
*/ -struct iommu_iova_range *vfio_pci_iova_ranges(struct vfio_pci_device *device, - u32 *nranges) -{ - struct iommu_iova_range *ranges; - - if (device->iommufd) - ranges = iommufd_iova_ranges(device, nranges); - else - ranges = vfio_iommu_iova_ranges(device, nranges); - - if (!ranges) - return NULL; - - VFIO_ASSERT_GT(*nranges, 0); - - /* Sort and check that ranges are sane and non-overlapping */ - qsort(ranges, *nranges, sizeof(*ranges), iova_range_comp); - VFIO_ASSERT_LT(ranges[0].start, ranges[0].last); - - for (u32 i = 1; i < *nranges; i++) { - VFIO_ASSERT_LT(ranges[i].start, ranges[i].last); - VFIO_ASSERT_LT(ranges[i - 1].last, ranges[i].start); - } - - return ranges; -} - -struct iova_allocator *iova_allocator_init(struct vfio_pci_device *device) -{ - struct iova_allocator *allocator; - struct iommu_iova_range *ranges; - u32 nranges; - - ranges = vfio_pci_iova_ranges(device, &nranges); - VFIO_ASSERT_NOT_NULL(ranges); - - allocator = malloc(sizeof(*allocator)); - VFIO_ASSERT_NOT_NULL(allocator); - - *allocator = (struct iova_allocator){ - .ranges = ranges, - .nranges = nranges, - .range_idx = 0, - .range_offset = 0, - }; - - return allocator; -} - -void iova_allocator_cleanup(struct iova_allocator *allocator) -{ - free(allocator->ranges); - free(allocator); -} - -iova_t iova_allocator_alloc(struct iova_allocator *allocator, size_t size) -{ - VFIO_ASSERT_GT(size, 0, "Invalid size arg, zero\n"); - VFIO_ASSERT_EQ(size & (size - 1), 0, "Invalid size arg, non-power-of-2\n"); - - for (;;) { - struct iommu_iova_range *range; - iova_t iova, last; - - VFIO_ASSERT_LT(allocator->range_idx, allocator->nranges, - "IOVA allocator out of space\n"); - - range = &allocator->ranges[allocator->range_idx]; - iova = range->start + allocator->range_offset; - - /* Check for sufficient space at the current offset */ - if (check_add_overflow(iova, size - 1, &last) || - last > range->last) - goto next_range; - - /* Align iova to size */ - iova = last & ~(size - 1); - - /* Check for 
sufficient space at the aligned iova */ - if (check_add_overflow(iova, size - 1, &last) || - last > range->last) - goto next_range; - - if (last == range->last) { - allocator->range_idx++; - allocator->range_offset = 0; - } else { - allocator->range_offset = last - range->start + 1; - } - - return iova; - -next_range: - allocator->range_idx++; - allocator->range_offset = 0; - } -} - -iova_t __to_iova(struct vfio_pci_device *device, void *vaddr) -{ - struct vfio_dma_region *region; - - list_for_each_entry(region, &device->dma_regions, link) { - if (vaddr < region->vaddr) - continue; - - if (vaddr >= region->vaddr + region->size) - continue; - - return region->iova + (vaddr - region->vaddr); - } - - return INVALID_IOVA; -} - -iova_t to_iova(struct vfio_pci_device *device, void *vaddr) -{ - iova_t iova; - - iova = __to_iova(device, vaddr); - VFIO_ASSERT_NE(iova, INVALID_IOVA, "%p is not mapped into device.\n", vaddr); - - return iova; -} - static void vfio_pci_irq_set(struct vfio_pci_device *device, u32 index, u32 vector, u32 count, int *fds) { @@ -386,141 +110,6 @@ static void vfio_pci_irq_get(struct vfio_pci_device *device, u32 index, ioctl_assert(device->fd, VFIO_DEVICE_GET_IRQ_INFO, irq_info); } -static int vfio_iommu_dma_map(struct vfio_pci_device *device, - struct vfio_dma_region *region) -{ - struct vfio_iommu_type1_dma_map args = { - .argsz = sizeof(args), - .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE, - .vaddr = (u64)region->vaddr, - .iova = region->iova, - .size = region->size, - }; - - if (ioctl(device->container_fd, VFIO_IOMMU_MAP_DMA, &args)) - return -errno; - - return 0; -} - -static int iommufd_dma_map(struct vfio_pci_device *device, - struct vfio_dma_region *region) -{ - struct iommu_ioas_map args = { - .size = sizeof(args), - .flags = IOMMU_IOAS_MAP_READABLE | - IOMMU_IOAS_MAP_WRITEABLE | - IOMMU_IOAS_MAP_FIXED_IOVA, - .user_va = (u64)region->vaddr, - .iova = region->iova, - .length = region->size, - .ioas_id = device->ioas_id, - }; - - 
if (ioctl(device->iommufd, IOMMU_IOAS_MAP, &args)) - return -errno; - - return 0; -} - -int __vfio_pci_dma_map(struct vfio_pci_device *device, - struct vfio_dma_region *region) -{ - int ret; - - if (device->iommufd) - ret = iommufd_dma_map(device, region); - else - ret = vfio_iommu_dma_map(device, region); - - if (ret) - return ret; - - list_add(®ion->link, &device->dma_regions); - - return 0; -} - -static int vfio_iommu_dma_unmap(int fd, u64 iova, u64 size, u32 flags, - u64 *unmapped) -{ - struct vfio_iommu_type1_dma_unmap args = { - .argsz = sizeof(args), - .iova = iova, - .size = size, - .flags = flags, - }; - - if (ioctl(fd, VFIO_IOMMU_UNMAP_DMA, &args)) - return -errno; - - if (unmapped) - *unmapped = args.size; - - return 0; -} - -static int iommufd_dma_unmap(int fd, u64 iova, u64 length, u32 ioas_id, - u64 *unmapped) -{ - struct iommu_ioas_unmap args = { - .size = sizeof(args), - .iova = iova, - .length = length, - .ioas_id = ioas_id, - }; - - if (ioctl(fd, IOMMU_IOAS_UNMAP, &args)) - return -errno; - - if (unmapped) - *unmapped = args.length; - - return 0; -} - -int __vfio_pci_dma_unmap(struct vfio_pci_device *device, - struct vfio_dma_region *region, u64 *unmapped) -{ - int ret; - - if (device->iommufd) - ret = iommufd_dma_unmap(device->iommufd, region->iova, - region->size, device->ioas_id, - unmapped); - else - ret = vfio_iommu_dma_unmap(device->container_fd, region->iova, - region->size, 0, unmapped); - - if (ret) - return ret; - - list_del_init(®ion->link); - - return 0; -} - -int __vfio_pci_dma_unmap_all(struct vfio_pci_device *device, u64 *unmapped) -{ - int ret; - struct vfio_dma_region *curr, *next; - - if (device->iommufd) - ret = iommufd_dma_unmap(device->iommufd, 0, UINT64_MAX, - device->ioas_id, unmapped); - else - ret = vfio_iommu_dma_unmap(device->container_fd, 0, 0, - VFIO_DMA_UNMAP_FLAG_ALL, unmapped); - - if (ret) - return ret; - - list_for_each_entry_safe(curr, next, &device->dma_regions, link) - list_del_init(&curr->link); - - return 0; 
-} - static void vfio_pci_region_get(struct vfio_pci_device *device, int index, struct vfio_region_info *info) { @@ -627,28 +216,26 @@ static void vfio_pci_group_setup(struct vfio_pci_device *device, const char *bdf ioctl_assert(device->group_fd, VFIO_GROUP_GET_STATUS, &group_status); VFIO_ASSERT_TRUE(group_status.flags & VFIO_GROUP_FLAGS_VIABLE); - ioctl_assert(device->group_fd, VFIO_GROUP_SET_CONTAINER, &device->container_fd); + ioctl_assert(device->group_fd, VFIO_GROUP_SET_CONTAINER, &device->iommu->container_fd); } static void vfio_pci_container_setup(struct vfio_pci_device *device, const char *bdf) { - unsigned long iommu_type = device->iommu_mode->iommu_type; - const char *path = device->iommu_mode->container_path; - int version; + struct iommu *iommu = device->iommu; + unsigned long iommu_type = iommu->mode->iommu_type; int ret; - device->container_fd = open(path, O_RDWR); - VFIO_ASSERT_GE(device->container_fd, 0, "open(%s) failed\n", path); - - version = ioctl(device->container_fd, VFIO_GET_API_VERSION); - VFIO_ASSERT_EQ(version, VFIO_API_VERSION, "Unsupported version: %d\n", version); - vfio_pci_group_setup(device, bdf); - ret = ioctl(device->container_fd, VFIO_CHECK_EXTENSION, iommu_type); + ret = ioctl(iommu->container_fd, VFIO_CHECK_EXTENSION, iommu_type); VFIO_ASSERT_GT(ret, 0, "VFIO IOMMU type %lu not supported\n", iommu_type); - ioctl_assert(device->container_fd, VFIO_SET_IOMMU, (void *)iommu_type); + /* + * Allow multiple threads to race to set the IOMMU type on the + * container. The first will succeed and the rest should fail + * because the IOMMU type is already set. + */ + (void)ioctl(iommu->container_fd, VFIO_SET_IOMMU, (void *)iommu_type); device->fd = ioctl(device->group_fd, VFIO_GROUP_GET_DEVICE_FD, bdf); VFIO_ASSERT_GE(device->fd, 0); @@ -712,52 +299,6 @@ const char *vfio_pci_get_cdev_path(const char *bdf) return cdev_path; } -/* Reminder: Keep in sync with FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(). 
*/ -static const struct vfio_iommu_mode iommu_modes[] = { - { - .name = "vfio_type1_iommu", - .container_path = "/dev/vfio/vfio", - .iommu_type = VFIO_TYPE1_IOMMU, - }, - { - .name = "vfio_type1v2_iommu", - .container_path = "/dev/vfio/vfio", - .iommu_type = VFIO_TYPE1v2_IOMMU, - }, - { - .name = "iommufd_compat_type1", - .container_path = "/dev/iommu", - .iommu_type = VFIO_TYPE1_IOMMU, - }, - { - .name = "iommufd_compat_type1v2", - .container_path = "/dev/iommu", - .iommu_type = VFIO_TYPE1v2_IOMMU, - }, - { - .name = "iommufd", - }, -}; - -const char *default_iommu_mode = "iommufd"; - -static const struct vfio_iommu_mode *lookup_iommu_mode(const char *iommu_mode) -{ - int i; - - if (!iommu_mode) - iommu_mode = default_iommu_mode; - - for (i = 0; i < ARRAY_SIZE(iommu_modes); i++) { - if (strcmp(iommu_mode, iommu_modes[i].name)) - continue; - - return &iommu_modes[i]; - } - - VFIO_FAIL("Unrecognized IOMMU mode: %s\n", iommu_mode); -} - static void vfio_device_bind_iommufd(int device_fd, int iommufd) { struct vfio_device_bind_iommufd args = { @@ -768,16 +309,6 @@ static void vfio_device_bind_iommufd(int device_fd, int iommufd) ioctl_assert(device_fd, VFIO_DEVICE_BIND_IOMMUFD, &args); } -static u32 iommufd_ioas_alloc(int iommufd) -{ - struct iommu_ioas_alloc args = { - .size = sizeof(args), - }; - - ioctl_assert(iommufd, IOMMU_IOAS_ALLOC, &args); - return args.out_ioas_id; -} - static void vfio_device_attach_iommufd_pt(int device_fd, u32 pt_id) { struct vfio_device_attach_iommufd_pt args = { @@ -796,31 +327,22 @@ static void vfio_pci_iommufd_setup(struct vfio_pci_device *device, const char *b VFIO_ASSERT_GE(device->fd, 0); free((void *)cdev_path); - /* - * Require device->iommufd to be >0 so that a simple non-0 check can be - * used to check if iommufd is enabled. In practice open() will never - * return 0 unless stdin is closed. 
- */ - device->iommufd = open("/dev/iommu", O_RDWR); - VFIO_ASSERT_GT(device->iommufd, 0); - - vfio_device_bind_iommufd(device->fd, device->iommufd); - device->ioas_id = iommufd_ioas_alloc(device->iommufd); - vfio_device_attach_iommufd_pt(device->fd, device->ioas_id); + vfio_device_bind_iommufd(device->fd, device->iommu->iommufd); + vfio_device_attach_iommufd_pt(device->fd, device->iommu->ioas_id); } -struct vfio_pci_device *vfio_pci_device_init(const char *bdf, const char *iommu_mode) +struct vfio_pci_device *vfio_pci_device_init(const char *bdf, struct iommu *iommu) { struct vfio_pci_device *device; device = calloc(1, sizeof(*device)); VFIO_ASSERT_NOT_NULL(device); - INIT_LIST_HEAD(&device->dma_regions); - - device->iommu_mode = lookup_iommu_mode(iommu_mode); + VFIO_ASSERT_NOT_NULL(iommu); + device->iommu = iommu; + device->bdf = bdf; - if (device->iommu_mode->container_path) + if (iommu->mode->container_path) vfio_pci_container_setup(device, bdf); else vfio_pci_iommufd_setup(device, bdf); @@ -849,48 +371,8 @@ void vfio_pci_device_cleanup(struct vfio_pci_device *device) VFIO_ASSERT_EQ(close(device->msi_eventfds[i]), 0); } - if (device->iommufd) { - VFIO_ASSERT_EQ(close(device->iommufd), 0); - } else { + if (device->group_fd) VFIO_ASSERT_EQ(close(device->group_fd), 0); - VFIO_ASSERT_EQ(close(device->container_fd), 0); - } free(device); } - -static bool is_bdf(const char *str) -{ - unsigned int s, b, d, f; - int length, count; - - count = sscanf(str, "%4x:%2x:%2x.%2x%n", &s, &b, &d, &f, &length); - return count == 4 && length == strlen(str); -} - -const char *vfio_selftests_get_bdf(int *argc, char *argv[]) -{ - char *bdf; - - if (*argc > 1 && is_bdf(argv[*argc - 1])) - return argv[--(*argc)]; - - bdf = getenv("VFIO_SELFTESTS_BDF"); - if (bdf) { - VFIO_ASSERT_TRUE(is_bdf(bdf), "Invalid BDF: %s\n", bdf); - return bdf; - } - - fprintf(stderr, "Unable to determine which device to use, skipping test.\n"); - fprintf(stderr, "\n"); - fprintf(stderr, "To pass the device 
address via environment variable:\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " export VFIO_SELFTESTS_BDF=segment:bus:device.function\n"); - fprintf(stderr, " %s [options]\n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "To pass the device address via argv:\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " %s [options] segment:bus:device.function\n", argv[0]); - fprintf(stderr, "\n"); - exit(KSFT_SKIP); -} diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_driver.c b/tools/testing/selftests/vfio/lib/vfio_pci_driver.c index e5e8723ecb41..ca0e25efbfa1 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_driver.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_driver.c @@ -1,8 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only -#include <stdio.h> - #include "../../../kselftest.h" -#include <vfio_util.h> +#include <libvfio.h> #ifdef __x86_64__ extern struct vfio_pci_driver_ops dsa_ops; @@ -29,7 +27,6 @@ void vfio_pci_driver_probe(struct vfio_pci_device *device) if (ops->probe(device)) continue; - printf("Driver found: %s\n", ops->name); device->driver.ops = ops; } } @@ -58,17 +55,6 @@ void vfio_pci_driver_init(struct vfio_pci_device *device) driver->ops->init(device); driver->initialized = true; - - printf("%s: region: vaddr %p, iova 0x%lx, size 0x%lx\n", - driver->ops->name, - driver->region.vaddr, - driver->region.iova, - driver->region.size); - - printf("%s: max_memcpy_size 0x%lx, max_memcpy_count 0x%lx\n", - driver->ops->name, - driver->max_memcpy_size, - driver->max_memcpy_count); } void vfio_pci_driver_remove(struct vfio_pci_device *device) diff --git a/tools/testing/selftests/vfio/run.sh b/tools/testing/selftests/vfio/run.sh deleted file mode 100755 index 0476b6d7adc3..000000000000 --- a/tools/testing/selftests/vfio/run.sh +++ /dev/null @@ -1,109 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-or-later - -# Global variables initialized in main() and then used during cleanup() when -# the script exits. 
-declare DEVICE_BDF -declare NEW_DRIVER -declare OLD_DRIVER -declare OLD_NUMVFS -declare DRIVER_OVERRIDE - -function write_to() { - # Unfortunately set -x does not show redirects so use echo to manually - # tell the user what commands are being run. - echo "+ echo \"${2}\" > ${1}" - echo "${2}" > ${1} -} - -function bind() { - write_to /sys/bus/pci/drivers/${2}/bind ${1} -} - -function unbind() { - write_to /sys/bus/pci/drivers/${2}/unbind ${1} -} - -function set_sriov_numvfs() { - write_to /sys/bus/pci/devices/${1}/sriov_numvfs ${2} -} - -function set_driver_override() { - write_to /sys/bus/pci/devices/${1}/driver_override ${2} -} - -function clear_driver_override() { - set_driver_override ${1} "" -} - -function cleanup() { - if [ "${NEW_DRIVER}" ]; then unbind ${DEVICE_BDF} ${NEW_DRIVER} ; fi - if [ "${DRIVER_OVERRIDE}" ]; then clear_driver_override ${DEVICE_BDF} ; fi - if [ "${OLD_DRIVER}" ]; then bind ${DEVICE_BDF} ${OLD_DRIVER} ; fi - if [ "${OLD_NUMVFS}" ]; then set_sriov_numvfs ${DEVICE_BDF} ${OLD_NUMVFS} ; fi -} - -function usage() { - echo "usage: $0 [-d segment:bus:device.function] [-s] [-h] [cmd ...]" >&2 - echo >&2 - echo " -d: The BDF of the device to use for the test (required)" >&2 - echo " -h: Show this help message" >&2 - echo " -s: Drop into a shell rather than running a command" >&2 - echo >&2 - echo " cmd: The command to run and arguments to pass to it." >&2 - echo " Required when not using -s. The SBDF will be " >&2 - echo " appended to the argument list." >&2 - exit 1 -} - -function main() { - local shell - - while getopts "d:hs" opt; do - case $opt in - d) DEVICE_BDF="$OPTARG" ;; - s) shell=true ;; - *) usage ;; - esac - done - - # Shift past all optional arguments. - shift $((OPTIND - 1)) - - # Check that the user passed in the command to run. - [ ! "${shell}" ] && [ $# = 0 ] && usage - - # Check that the user passed in a BDF. 
- [ "${DEVICE_BDF}" ] || usage - - trap cleanup EXIT - set -e - - test -d /sys/bus/pci/devices/${DEVICE_BDF} - - if [ -f /sys/bus/pci/devices/${DEVICE_BDF}/sriov_numvfs ]; then - OLD_NUMVFS=$(cat /sys/bus/pci/devices/${DEVICE_BDF}/sriov_numvfs) - set_sriov_numvfs ${DEVICE_BDF} 0 - fi - - if [ -L /sys/bus/pci/devices/${DEVICE_BDF}/driver ]; then - OLD_DRIVER=$(basename $(readlink -m /sys/bus/pci/devices/${DEVICE_BDF}/driver)) - unbind ${DEVICE_BDF} ${OLD_DRIVER} - fi - - set_driver_override ${DEVICE_BDF} vfio-pci - DRIVER_OVERRIDE=true - - bind ${DEVICE_BDF} vfio-pci - NEW_DRIVER=vfio-pci - - echo - if [ "${shell}" ]; then - echo "Dropping into ${SHELL} with VFIO_SELFTESTS_BDF=${DEVICE_BDF}" - VFIO_SELFTESTS_BDF=${DEVICE_BDF} ${SHELL} - else - "$@" ${DEVICE_BDF} - fi - echo -} - -main "$@" diff --git a/tools/testing/selftests/vfio/scripts/cleanup.sh b/tools/testing/selftests/vfio/scripts/cleanup.sh new file mode 100755 index 000000000000..69c922d8aafb --- /dev/null +++ b/tools/testing/selftests/vfio/scripts/cleanup.sh @@ -0,0 +1,41 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +source $(dirname -- "${BASH_SOURCE[0]}")/lib.sh + +function cleanup_devices() { + local device_bdf + local device_dir + + for device_bdf in "$@"; do + device_dir=${DEVICES_DIR}/${device_bdf} + + if [ -f ${device_dir}/vfio-pci ]; then + unbind ${device_bdf} vfio-pci + fi + + if [ -f ${device_dir}/driver_override ]; then + clear_driver_override ${device_bdf} + fi + + if [ -f ${device_dir}/driver ]; then + bind ${device_bdf} $(cat ${device_dir}/driver) + fi + + if [ -f ${device_dir}/sriov_numvfs ]; then + set_sriov_numvfs ${device_bdf} $(cat ${device_dir}/sriov_numvfs) + fi + + rm -rf ${device_dir} + done +} + +function main() { + if [ $# = 0 ]; then + cleanup_devices $(ls ${DEVICES_DIR}) + rmdir ${DEVICES_DIR} + else + cleanup_devices "$@" + fi +} + +main "$@" diff --git a/tools/testing/selftests/vfio/scripts/lib.sh b/tools/testing/selftests/vfio/scripts/lib.sh new file mode 100755 index 
000000000000..9f05f29c7b86 --- /dev/null +++ b/tools/testing/selftests/vfio/scripts/lib.sh @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +readonly DEVICES_DIR="${TMPDIR:-/tmp}/vfio-selftests-devices" + +function write_to() { + # Unfortunately set -x does not show redirects so use echo to manually + # tell the user what commands are being run. + echo "+ echo \"${2}\" > ${1}" + echo "${2}" > ${1} +} + +function get_driver() { + if [ -L /sys/bus/pci/devices/${1}/driver ]; then + basename $(readlink -m /sys/bus/pci/devices/${1}/driver) + fi +} + +function bind() { + write_to /sys/bus/pci/drivers/${2}/bind ${1} +} + +function unbind() { + write_to /sys/bus/pci/drivers/${2}/unbind ${1} +} + +function set_sriov_numvfs() { + write_to /sys/bus/pci/devices/${1}/sriov_numvfs ${2} +} + +function get_sriov_numvfs() { + if [ -f /sys/bus/pci/devices/${1}/sriov_numvfs ]; then + cat /sys/bus/pci/devices/${1}/sriov_numvfs + fi +} + +function set_driver_override() { + write_to /sys/bus/pci/devices/${1}/driver_override ${2} +} + +function clear_driver_override() { + set_driver_override ${1} "" +} diff --git a/tools/testing/selftests/vfio/scripts/run.sh b/tools/testing/selftests/vfio/scripts/run.sh new file mode 100755 index 000000000000..91fd38f9f6f6 --- /dev/null +++ b/tools/testing/selftests/vfio/scripts/run.sh @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +source $(dirname -- "${BASH_SOURCE[0]}")/lib.sh + +function main() { + local device_bdfs=$(ls ${DEVICES_DIR}) + + if [ -z "${device_bdfs}" ]; then + echo "No devices found, skipping." 
+ exit 4 + fi + + "$@" ${device_bdfs} +} + +main "$@" diff --git a/tools/testing/selftests/vfio/scripts/setup.sh b/tools/testing/selftests/vfio/scripts/setup.sh new file mode 100755 index 000000000000..49a499e51cbe --- /dev/null +++ b/tools/testing/selftests/vfio/scripts/setup.sh @@ -0,0 +1,48 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +set -e + +source $(dirname -- "${BASH_SOURCE[0]}")/lib.sh + +function main() { + local device_bdf + local device_dir + local numvfs + local driver + + if [ $# = 0 ]; then + echo "usage: $0 segment:bus:device.function ..." >&2 + exit 1 + fi + + for device_bdf in "$@"; do + test -d /sys/bus/pci/devices/${device_bdf} + + device_dir=${DEVICES_DIR}/${device_bdf} + if [ -d "${device_dir}" ]; then + echo "${device_bdf} has already been set up, exiting." + exit 0 + fi + + mkdir -p ${device_dir} + + numvfs=$(get_sriov_numvfs ${device_bdf}) + if [ "${numvfs}" ]; then + set_sriov_numvfs ${device_bdf} 0 + echo ${numvfs} > ${device_dir}/sriov_numvfs + fi + + driver=$(get_driver ${device_bdf}) + if [ "${driver}" ]; then + unbind ${device_bdf} ${driver} + echo ${driver} > ${device_dir}/driver + fi + + set_driver_override ${device_bdf} vfio-pci + touch ${device_dir}/driver_override + + bind ${device_bdf} vfio-pci + touch ${device_dir}/vfio-pci + done +} + +main "$@" diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c index 102603d4407d..5397822c3dd4 100644 --- a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c +++ b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c @@ -10,7 +10,7 @@ #include <linux/sizes.h> #include <linux/vfio.h> -#include <vfio_util.h> +#include <libvfio.h> #include "../kselftest_harness.h" @@ -94,6 +94,7 @@ static int iommu_mapping_get(const char *bdf, u64 iova, } FIXTURE(vfio_dma_mapping_test) { + struct iommu *iommu; struct vfio_pci_device *device; struct iova_allocator *iova_allocator; }; @@ -119,21 +120,23 @@ 
FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(anonymous_hugetlb_1gb, SZ_1G, MAP_HUGETLB | FIXTURE_SETUP(vfio_dma_mapping_test) { - self->device = vfio_pci_device_init(device_bdf, variant->iommu_mode); - self->iova_allocator = iova_allocator_init(self->device); + self->iommu = iommu_init(variant->iommu_mode); + self->device = vfio_pci_device_init(device_bdf, self->iommu); + self->iova_allocator = iova_allocator_init(self->iommu); } FIXTURE_TEARDOWN(vfio_dma_mapping_test) { iova_allocator_cleanup(self->iova_allocator); vfio_pci_device_cleanup(self->device); + iommu_cleanup(self->iommu); } TEST_F(vfio_dma_mapping_test, dma_map_unmap) { const u64 size = variant->size ?: getpagesize(); const int flags = variant->mmap_flags; - struct vfio_dma_region region; + struct dma_region region; struct iommu_mapping mapping; u64 mapping_size = size; u64 unmapped; @@ -150,7 +153,7 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap) region.iova = iova_allocator_alloc(self->iova_allocator, size); region.size = size; - vfio_pci_dma_map(self->device, ®ion); + iommu_map(self->iommu, ®ion); printf("Mapped HVA %p (size 0x%lx) at IOVA 0x%lx\n", region.vaddr, size, region.iova); ASSERT_EQ(region.iova, to_iova(self->device, region.vaddr)); @@ -192,19 +195,20 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap) } unmap: - rc = __vfio_pci_dma_unmap(self->device, ®ion, &unmapped); + rc = __iommu_unmap(self->iommu, ®ion, &unmapped); ASSERT_EQ(rc, 0); ASSERT_EQ(unmapped, region.size); printf("Unmapped IOVA 0x%lx\n", region.iova); - ASSERT_EQ(INVALID_IOVA, __to_iova(self->device, region.vaddr)); + ASSERT_NE(0, __to_iova(self->device, region.vaddr, NULL)); ASSERT_NE(0, iommu_mapping_get(device_bdf, region.iova, &mapping)); ASSERT_TRUE(!munmap(region.vaddr, size)); } FIXTURE(vfio_dma_map_limit_test) { + struct iommu *iommu; struct vfio_pci_device *device; - struct vfio_dma_region region; + struct dma_region region; size_t mmap_size; }; @@ -223,7 +227,7 @@ FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(); 
FIXTURE_SETUP(vfio_dma_map_limit_test) { - struct vfio_dma_region *region = &self->region; + struct dma_region *region = &self->region; struct iommu_iova_range *ranges; u64 region_size = getpagesize(); iova_t last_iova; @@ -235,12 +239,13 @@ FIXTURE_SETUP(vfio_dma_map_limit_test) */ self->mmap_size = 2 * region_size; - self->device = vfio_pci_device_init(device_bdf, variant->iommu_mode); + self->iommu = iommu_init(variant->iommu_mode); + self->device = vfio_pci_device_init(device_bdf, self->iommu); region->vaddr = mmap(NULL, self->mmap_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); ASSERT_NE(region->vaddr, MAP_FAILED); - ranges = vfio_pci_iova_ranges(self->device, &nranges); + ranges = iommu_iova_ranges(self->iommu, &nranges); VFIO_ASSERT_NOT_NULL(ranges); last_iova = ranges[nranges - 1].last; free(ranges); @@ -253,49 +258,50 @@ FIXTURE_SETUP(vfio_dma_map_limit_test) FIXTURE_TEARDOWN(vfio_dma_map_limit_test) { vfio_pci_device_cleanup(self->device); + iommu_cleanup(self->iommu); ASSERT_EQ(munmap(self->region.vaddr, self->mmap_size), 0); } TEST_F(vfio_dma_map_limit_test, unmap_range) { - struct vfio_dma_region *region = &self->region; + struct dma_region *region = &self->region; u64 unmapped; int rc; - vfio_pci_dma_map(self->device, region); + iommu_map(self->iommu, region); ASSERT_EQ(region->iova, to_iova(self->device, region->vaddr)); - rc = __vfio_pci_dma_unmap(self->device, region, &unmapped); + rc = __iommu_unmap(self->iommu, region, &unmapped); ASSERT_EQ(rc, 0); ASSERT_EQ(unmapped, region->size); } TEST_F(vfio_dma_map_limit_test, unmap_all) { - struct vfio_dma_region *region = &self->region; + struct dma_region *region = &self->region; u64 unmapped; int rc; - vfio_pci_dma_map(self->device, region); + iommu_map(self->iommu, region); ASSERT_EQ(region->iova, to_iova(self->device, region->vaddr)); - rc = __vfio_pci_dma_unmap_all(self->device, &unmapped); + rc = __iommu_unmap_all(self->iommu, &unmapped); ASSERT_EQ(rc, 0); ASSERT_EQ(unmapped, 
region->size); } TEST_F(vfio_dma_map_limit_test, overflow) { - struct vfio_dma_region *region = &self->region; + struct dma_region *region = &self->region; int rc; region->iova = ~(iova_t)0 & ~(region->size - 1); region->size = self->mmap_size; - rc = __vfio_pci_dma_map(self->device, region); + rc = __iommu_map(self->iommu, region); ASSERT_EQ(rc, -EOVERFLOW); - rc = __vfio_pci_dma_unmap(self->device, region, NULL); + rc = __iommu_unmap(self->iommu, region, NULL); ASSERT_EQ(rc, -EOVERFLOW); } diff --git a/tools/testing/selftests/vfio/vfio_iommufd_setup_test.c b/tools/testing/selftests/vfio/vfio_iommufd_setup_test.c index 3655106b912d..caf1c6291f3d 100644 --- a/tools/testing/selftests/vfio/vfio_iommufd_setup_test.c +++ b/tools/testing/selftests/vfio/vfio_iommufd_setup_test.c @@ -10,7 +10,7 @@ #include <sys/ioctl.h> #include <unistd.h> -#include <vfio_util.h> +#include <libvfio.h> #include "../kselftest_harness.h" static const char iommu_dev_path[] = "/dev/iommu"; diff --git a/tools/testing/selftests/vfio/vfio_pci_device_init_perf_test.c b/tools/testing/selftests/vfio/vfio_pci_device_init_perf_test.c new file mode 100644 index 000000000000..33b0c31fe2ed --- /dev/null +++ b/tools/testing/selftests/vfio/vfio_pci_device_init_perf_test.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <pthread.h> +#include <sys/ioctl.h> +#include <sys/mman.h> + +#include <linux/sizes.h> +#include <linux/time64.h> +#include <linux/vfio.h> + +#include <libvfio.h> + +#include "../kselftest_harness.h" + +static char **device_bdfs; +static int nr_devices; + +struct thread_args { + struct iommu *iommu; + int device_index; + struct timespec start; + struct timespec end; + pthread_barrier_t *barrier; +}; + +FIXTURE(vfio_pci_device_init_perf_test) { + pthread_t *threads; + pthread_barrier_t barrier; + struct thread_args *thread_args; + struct iommu *iommu; +}; + +FIXTURE_VARIANT(vfio_pci_device_init_perf_test) { + const char *iommu_mode; +}; + +#define 
FIXTURE_VARIANT_ADD_IOMMU_MODE(_iommu_mode) \ +FIXTURE_VARIANT_ADD(vfio_pci_device_init_perf_test, _iommu_mode) { \ + .iommu_mode = #_iommu_mode, \ +} + +FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(); + +FIXTURE_SETUP(vfio_pci_device_init_perf_test) +{ + int i; + + self->iommu = iommu_init(variant->iommu_mode); + self->threads = calloc(nr_devices, sizeof(self->threads[0])); + self->thread_args = calloc(nr_devices, sizeof(self->thread_args[0])); + + pthread_barrier_init(&self->barrier, NULL, nr_devices); + + for (i = 0; i < nr_devices; i++) { + self->thread_args[i].iommu = self->iommu; + self->thread_args[i].barrier = &self->barrier; + self->thread_args[i].device_index = i; + } +} + +FIXTURE_TEARDOWN(vfio_pci_device_init_perf_test) +{ + iommu_cleanup(self->iommu); + free(self->threads); + free(self->thread_args); +} + +static s64 to_ns(struct timespec ts) +{ + return (s64)ts.tv_nsec + NSEC_PER_SEC * (s64)ts.tv_sec; +} + +static struct timespec to_timespec(s64 ns) +{ + struct timespec ts = { + .tv_nsec = ns % NSEC_PER_SEC, + .tv_sec = ns / NSEC_PER_SEC, + }; + + return ts; +} + +static struct timespec timespec_sub(struct timespec a, struct timespec b) +{ + return to_timespec(to_ns(a) - to_ns(b)); +} + +static struct timespec timespec_min(struct timespec a, struct timespec b) +{ + return to_ns(a) < to_ns(b) ? a : b; +} + +static struct timespec timespec_max(struct timespec a, struct timespec b) +{ + return to_ns(a) > to_ns(b) ? 
a : b; +} + +static void *thread_main(void *__args) +{ + struct thread_args *args = __args; + struct vfio_pci_device *device; + + pthread_barrier_wait(args->barrier); + + clock_gettime(CLOCK_MONOTONIC, &args->start); + device = vfio_pci_device_init(device_bdfs[args->device_index], args->iommu); + clock_gettime(CLOCK_MONOTONIC, &args->end); + + pthread_barrier_wait(args->barrier); + + vfio_pci_device_cleanup(device); + return NULL; +} + +TEST_F(vfio_pci_device_init_perf_test, init) +{ + struct timespec start = to_timespec(INT64_MAX), end = {}; + struct timespec min = to_timespec(INT64_MAX); + struct timespec max = {}; + struct timespec avg = {}; + struct timespec wall_time; + s64 thread_ns = 0; + int i; + + for (i = 0; i < nr_devices; i++) { + pthread_create(&self->threads[i], NULL, thread_main, + &self->thread_args[i]); + } + + for (i = 0; i < nr_devices; i++) { + struct thread_args *args = &self->thread_args[i]; + struct timespec init_time; + + pthread_join(self->threads[i], NULL); + + start = timespec_min(start, args->start); + end = timespec_max(end, args->end); + + init_time = timespec_sub(args->end, args->start); + min = timespec_min(min, init_time); + max = timespec_max(max, init_time); + thread_ns += to_ns(init_time); + } + + avg = to_timespec(thread_ns / nr_devices); + wall_time = timespec_sub(end, start); + + printf("Wall time: %lu.%09lus\n", + wall_time.tv_sec, wall_time.tv_nsec); + printf("Min init time (per device): %lu.%09lus\n", + min.tv_sec, min.tv_nsec); + printf("Max init time (per device): %lu.%09lus\n", + max.tv_sec, max.tv_nsec); + printf("Avg init time (per device): %lu.%09lus\n", + avg.tv_sec, avg.tv_nsec); +} + +int main(int argc, char *argv[]) +{ + int i; + + device_bdfs = vfio_selftests_get_bdfs(&argc, argv, &nr_devices); + + printf("Testing parallel initialization of %d devices:\n", nr_devices); + for (i = 0; i < nr_devices; i++) + printf(" %s\n", device_bdfs[i]); + + return test_harness_run(argc, argv); +} diff --git 
a/tools/testing/selftests/vfio/vfio_pci_device_test.c b/tools/testing/selftests/vfio/vfio_pci_device_test.c index 7a270698e4d2..ecbb669b3765 100644 --- a/tools/testing/selftests/vfio/vfio_pci_device_test.c +++ b/tools/testing/selftests/vfio/vfio_pci_device_test.c @@ -10,7 +10,7 @@ #include <linux/sizes.h> #include <linux/vfio.h> -#include <vfio_util.h> +#include <libvfio.h> #include "../kselftest_harness.h" @@ -23,17 +23,20 @@ static const char *device_bdf; #define MAX_TEST_MSI 16U FIXTURE(vfio_pci_device_test) { + struct iommu *iommu; struct vfio_pci_device *device; }; FIXTURE_SETUP(vfio_pci_device_test) { - self->device = vfio_pci_device_init(device_bdf, default_iommu_mode); + self->iommu = iommu_init(default_iommu_mode); + self->device = vfio_pci_device_init(device_bdf, self->iommu); } FIXTURE_TEARDOWN(vfio_pci_device_test) { vfio_pci_device_cleanup(self->device); + iommu_cleanup(self->iommu); } #define read_pci_id_from_sysfs(_file) ({ \ @@ -99,6 +102,7 @@ TEST_F(vfio_pci_device_test, validate_bars) } FIXTURE(vfio_pci_irq_test) { + struct iommu *iommu; struct vfio_pci_device *device; }; @@ -116,12 +120,14 @@ FIXTURE_VARIANT_ADD(vfio_pci_irq_test, msix) { FIXTURE_SETUP(vfio_pci_irq_test) { - self->device = vfio_pci_device_init(device_bdf, default_iommu_mode); + self->iommu = iommu_init(default_iommu_mode); + self->device = vfio_pci_device_init(device_bdf, self->iommu); } FIXTURE_TEARDOWN(vfio_pci_irq_test) { vfio_pci_device_cleanup(self->device); + iommu_cleanup(self->iommu); } TEST_F(vfio_pci_irq_test, enable_trigger_disable) diff --git a/tools/testing/selftests/vfio/vfio_pci_driver_test.c b/tools/testing/selftests/vfio/vfio_pci_driver_test.c index f69eec8b928d..f0ca8310d6a8 100644 --- a/tools/testing/selftests/vfio/vfio_pci_driver_test.c +++ b/tools/testing/selftests/vfio/vfio_pci_driver_test.c @@ -5,7 +5,7 @@ #include <linux/sizes.h> #include <linux/vfio.h> -#include <vfio_util.h> +#include <libvfio.h> #include "../kselftest_harness.h" @@ -18,9 +18,9 @@ static 
const char *device_bdf; ASSERT_EQ(EAGAIN, errno); \ } while (0) -static void region_setup(struct vfio_pci_device *device, +static void region_setup(struct iommu *iommu, struct iova_allocator *iova_allocator, - struct vfio_dma_region *region, u64 size) + struct dma_region *region, u64 size) { const int flags = MAP_SHARED | MAP_ANONYMOUS; const int prot = PROT_READ | PROT_WRITE; @@ -33,20 +33,20 @@ static void region_setup(struct vfio_pci_device *device, region->iova = iova_allocator_alloc(iova_allocator, size); region->size = size; - vfio_pci_dma_map(device, region); + iommu_map(iommu, region); } -static void region_teardown(struct vfio_pci_device *device, - struct vfio_dma_region *region) +static void region_teardown(struct iommu *iommu, struct dma_region *region) { - vfio_pci_dma_unmap(device, region); + iommu_unmap(iommu, region); VFIO_ASSERT_EQ(munmap(region->vaddr, region->size), 0); } FIXTURE(vfio_pci_driver_test) { + struct iommu *iommu; struct vfio_pci_device *device; struct iova_allocator *iova_allocator; - struct vfio_dma_region memcpy_region; + struct dma_region memcpy_region; void *vaddr; int msi_fd; @@ -73,13 +73,14 @@ FIXTURE_SETUP(vfio_pci_driver_test) { struct vfio_pci_driver *driver; - self->device = vfio_pci_device_init(device_bdf, variant->iommu_mode); - self->iova_allocator = iova_allocator_init(self->device); + self->iommu = iommu_init(variant->iommu_mode); + self->device = vfio_pci_device_init(device_bdf, self->iommu); + self->iova_allocator = iova_allocator_init(self->iommu); driver = &self->device->driver; - region_setup(self->device, self->iova_allocator, &self->memcpy_region, SZ_1G); - region_setup(self->device, self->iova_allocator, &driver->region, SZ_2M); + region_setup(self->iommu, self->iova_allocator, &self->memcpy_region, SZ_1G); + region_setup(self->iommu, self->iova_allocator, &driver->region, SZ_2M); /* Any IOVA that doesn't overlap memcpy_region and driver->region. 
*/ self->unmapped_iova = iova_allocator_alloc(self->iova_allocator, SZ_1G); @@ -108,11 +109,12 @@ FIXTURE_TEARDOWN(vfio_pci_driver_test) vfio_pci_driver_remove(self->device); - region_teardown(self->device, &self->memcpy_region); - region_teardown(self->device, &driver->region); + region_teardown(self->iommu, &self->memcpy_region); + region_teardown(self->iommu, &driver->region); iova_allocator_cleanup(self->iova_allocator); vfio_pci_device_cleanup(self->device); + iommu_cleanup(self->iommu); } TEST_F(vfio_pci_driver_test, init_remove) @@ -231,18 +233,31 @@ TEST_F_TIMEOUT(vfio_pci_driver_test, memcpy_storm, 60) ASSERT_NO_MSI(self->msi_fd); } -int main(int argc, char *argv[]) +static bool device_has_selftests_driver(const char *bdf) { struct vfio_pci_device *device; + struct iommu *iommu; + bool has_driver; + + iommu = iommu_init(default_iommu_mode); + device = vfio_pci_device_init(device_bdf, iommu); + + has_driver = !!device->driver.ops; + + vfio_pci_device_cleanup(device); + iommu_cleanup(iommu); + return has_driver; +} + +int main(int argc, char *argv[]) +{ device_bdf = vfio_selftests_get_bdf(&argc, argv); - device = vfio_pci_device_init(device_bdf, default_iommu_mode); - if (!device->driver.ops) { + if (!device_has_selftests_driver(device_bdf)) { fprintf(stderr, "No driver found for device %s\n", device_bdf); return KSFT_SKIP; } - vfio_pci_device_cleanup(device); return test_harness_run(argc, argv); } diff --git a/tools/testing/selftests/vsock/vmtest.sh b/tools/testing/selftests/vsock/vmtest.sh index 8ceeb8a7894f..c7b270dd77a9 100755 --- a/tools/testing/selftests/vsock/vmtest.sh +++ b/tools/testing/selftests/vsock/vmtest.sh @@ -7,6 +7,8 @@ # * virtme-ng # * busybox-static (used by virtme-ng) # * qemu (used by virtme-ng) +# +# shellcheck disable=SC2317,SC2119 readonly SCRIPT_DIR="$(cd -P -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)" readonly KERNEL_CHECKOUT=$(realpath "${SCRIPT_DIR}"/../../../../) @@ -22,8 +24,9 @@ readonly SSH_HOST_PORT=2222 readonly 
VSOCK_CID=1234 readonly WAIT_PERIOD=3 readonly WAIT_PERIOD_MAX=60 -readonly WAIT_TOTAL=$(( WAIT_PERIOD * WAIT_PERIOD_MAX )) -readonly QEMU_PIDFILE=$(mktemp /tmp/qemu_vsock_vmtest_XXXX.pid) +readonly WAIT_QEMU=5 +readonly PIDFILE_TEMPLATE=/tmp/vsock_vmtest_XXXX.pid +declare -A PIDFILES # virtme-ng offers a netdev for ssh when using "--ssh", but we also need a # control port forwarded for vsock_test. Because virtme-ng doesn't support @@ -33,12 +36,6 @@ readonly QEMU_PIDFILE=$(mktemp /tmp/qemu_vsock_vmtest_XXXX.pid) # add the kernel cmdline options that virtme-init uses to setup the interface. readonly QEMU_TEST_PORT_FWD="hostfwd=tcp::${TEST_HOST_PORT}-:${TEST_GUEST_PORT}" readonly QEMU_SSH_PORT_FWD="hostfwd=tcp::${SSH_HOST_PORT}-:${SSH_GUEST_PORT}" -readonly QEMU_OPTS="\ - -netdev user,id=n0,${QEMU_TEST_PORT_FWD},${QEMU_SSH_PORT_FWD} \ - -device virtio-net-pci,netdev=n0 \ - -device vhost-vsock-pci,guest-cid=${VSOCK_CID} \ - --pidfile ${QEMU_PIDFILE} \ -" readonly KERNEL_CMDLINE="\ virtme.dhcp net.ifnames=0 biosdevname=0 \ virtme.ssh virtme_ssh_channel=tcp virtme_ssh_user=$USER \ @@ -51,6 +48,8 @@ readonly TEST_DESCS=( "Run vsock_test using the loopback transport in the VM." ) +readonly USE_SHARED_VM=(vm_server_host_client vm_client_host_server vm_loopback) + VERBOSE=0 usage() { @@ -84,21 +83,33 @@ die() { exit "${KSFT_FAIL}" } +check_result() { + local rc arg + + rc=$1 + arg=$2 + + cnt_total=$(( cnt_total + 1 )) + + if [[ ${rc} -eq ${KSFT_PASS} ]]; then + cnt_pass=$(( cnt_pass + 1 )) + echo "ok ${cnt_total} ${arg}" + elif [[ ${rc} -eq ${KSFT_SKIP} ]]; then + cnt_skip=$(( cnt_skip + 1 )) + echo "ok ${cnt_total} ${arg} # SKIP" + elif [[ ${rc} -eq ${KSFT_FAIL} ]]; then + cnt_fail=$(( cnt_fail + 1 )) + echo "not ok ${cnt_total} ${arg} # exit=${rc}" + fi +} + vm_ssh() { ssh -q -o UserKnownHostsFile=/dev/null -p ${SSH_HOST_PORT} localhost "$@" return $? 
} cleanup() { - if [[ -s "${QEMU_PIDFILE}" ]]; then - pkill -SIGTERM -F "${QEMU_PIDFILE}" > /dev/null 2>&1 - fi - - # If failure occurred during or before qemu start up, then we need - # to clean this up ourselves. - if [[ -e "${QEMU_PIDFILE}" ]]; then - rm "${QEMU_PIDFILE}" - fi + terminate_pidfiles "${!PIDFILES[@]}" } check_args() { @@ -147,7 +158,7 @@ check_vng() { local version local ok - tested_versions=("1.33" "1.36") + tested_versions=("1.33" "1.36" "1.37") version="$(vng --version)" ok=0 @@ -188,10 +199,37 @@ handle_build() { popd &>/dev/null } +create_pidfile() { + local pidfile + + pidfile=$(mktemp "${PIDFILE_TEMPLATE}") + PIDFILES["${pidfile}"]=1 + + echo "${pidfile}" +} + +terminate_pidfiles() { + local pidfile + + for pidfile in "$@"; do + if [[ -s "${pidfile}" ]]; then + pkill -SIGTERM -F "${pidfile}" > /dev/null 2>&1 + fi + + if [[ -e "${pidfile}" ]]; then + rm -f "${pidfile}" + fi + + unset "PIDFILES[${pidfile}]" + done +} + vm_start() { + local pidfile=$1 local logfile=/dev/null local verbose_opt="" local kernel_opt="" + local qemu_opts="" local qemu qemu=$(command -v "${QEMU}") @@ -201,6 +239,13 @@ vm_start() { logfile=/dev/stdout fi + qemu_opts="\ + -netdev user,id=n0,${QEMU_TEST_PORT_FWD},${QEMU_SSH_PORT_FWD} \ + -device virtio-net-pci,netdev=n0 \ + -device vhost-vsock-pci,guest-cid=${VSOCK_CID} \ + --pidfile ${pidfile} + " + if [[ "${BUILD}" -eq 1 ]]; then kernel_opt="${KERNEL_CHECKOUT}" fi @@ -209,16 +254,14 @@ vm_start() { --run \ ${kernel_opt} \ ${verbose_opt} \ - --qemu-opts="${QEMU_OPTS}" \ + --qemu-opts="${qemu_opts}" \ --qemu="${qemu}" \ --user root \ --append "${KERNEL_CMDLINE}" \ --rw &> ${logfile} & - if ! timeout ${WAIT_TOTAL} \ - bash -c 'while [[ ! -s '"${QEMU_PIDFILE}"' ]]; do sleep 1; done; exit 0'; then - die "failed to boot VM" - fi + timeout "${WAIT_QEMU}" \ + bash -c 'while [[ ! 
-s '"${pidfile}"' ]]; do sleep 1; done; exit 0' } vm_wait_for_ssh() { @@ -251,9 +294,11 @@ wait_for_listener() # for tcp protocol additionally check the socket state [ "${protocol}" = "tcp" ] && pattern="${pattern}0A" + for i in $(seq "${max_intervals}"); do - if awk '{print $2" "$4}' /proc/net/"${protocol}"* | \ - grep -q "${pattern}"; then + if awk -v pattern="${pattern}" \ + 'BEGIN {rc=1} $2" "$4 ~ pattern {rc=0} END {exit rc}' \ + /proc/net/"${protocol}"*; then break fi sleep "${interval}" @@ -270,113 +315,196 @@ EOF } host_wait_for_listener() { - wait_for_listener "${TEST_HOST_PORT_LISTENER}" "${WAIT_PERIOD}" "${WAIT_PERIOD_MAX}" + local port=$1 + + wait_for_listener "${port}" "${WAIT_PERIOD}" "${WAIT_PERIOD_MAX}" } -__log_stdin() { - cat | awk '{ printf "%s:\t%s\n","'"${prefix}"'", $0 }' +vm_vsock_test() { + local host=$1 + local cid=$2 + local port=$3 + local rc + + # log output and use pipefail to respect vsock_test errors + set -o pipefail + if [[ "${host}" != server ]]; then + vm_ssh -- "${VSOCK_TEST}" \ + --mode=client \ + --control-host="${host}" \ + --peer-cid="${cid}" \ + --control-port="${port}" \ + 2>&1 | log_guest + rc=$? + else + vm_ssh -- "${VSOCK_TEST}" \ + --mode=server \ + --peer-cid="${cid}" \ + --control-port="${port}" \ + 2>&1 | log_guest & + rc=$? + + if [[ $rc -ne 0 ]]; then + set +o pipefail + return $rc + fi + + vm_wait_for_listener "${port}" + rc=$? + fi + set +o pipefail + + return $rc } -__log_args() { - echo "$*" | awk '{ printf "%s:\t%s\n","'"${prefix}"'", $0 }' +host_vsock_test() { + local host=$1 + local cid=$2 + local port=$3 + local rc + + # log output and use pipefail to respect vsock_test errors + set -o pipefail + if [[ "${host}" != server ]]; then + ${VSOCK_TEST} \ + --mode=client \ + --peer-cid="${cid}" \ + --control-host="${host}" \ + --control-port="${port}" 2>&1 | log_host + rc=$? + else + ${VSOCK_TEST} \ + --mode=server \ + --peer-cid="${cid}" \ + --control-port="${port}" 2>&1 | log_host & + rc=$? 
+ + if [[ $rc -ne 0 ]]; then + set +o pipefail + return $rc + fi + + host_wait_for_listener "${port}" + rc=$? + fi + set +o pipefail + + return $rc } log() { - local prefix="$1" + local redirect + local prefix - shift - local redirect= if [[ ${VERBOSE} -eq 0 ]]; then redirect=/dev/null else redirect=/dev/stdout fi + prefix="${LOG_PREFIX:-}" + if [[ "$#" -eq 0 ]]; then - __log_stdin | tee -a "${LOG}" > ${redirect} + if [[ -n "${prefix}" ]]; then + awk -v prefix="${prefix}" '{printf "%s: %s\n", prefix, $0}' + else + cat + fi else - __log_args "$@" | tee -a "${LOG}" > ${redirect} - fi -} - -log_setup() { - log "setup" "$@" + if [[ -n "${prefix}" ]]; then + echo "${prefix}: " "$@" + else + echo "$@" + fi + fi | tee -a "${LOG}" > "${redirect}" } log_host() { - local testname=$1 - - shift - log "test:${testname}:host" "$@" + LOG_PREFIX=host log "$@" } log_guest() { - local testname=$1 - - shift - log "test:${testname}:guest" "$@" + LOG_PREFIX=guest log "$@" } test_vm_server_host_client() { - local testname="${FUNCNAME[0]#test_}" + if ! vm_vsock_test "server" 2 "${TEST_GUEST_PORT}"; then + return "${KSFT_FAIL}" + fi - vm_ssh -- "${VSOCK_TEST}" \ - --mode=server \ - --control-port="${TEST_GUEST_PORT}" \ - --peer-cid=2 \ - 2>&1 | log_guest "${testname}" & + if ! host_vsock_test "127.0.0.1" "${VSOCK_CID}" "${TEST_HOST_PORT}"; then + return "${KSFT_FAIL}" + fi - vm_wait_for_listener "${TEST_GUEST_PORT}" + return "${KSFT_PASS}" +} - ${VSOCK_TEST} \ - --mode=client \ - --control-host=127.0.0.1 \ - --peer-cid="${VSOCK_CID}" \ - --control-port="${TEST_HOST_PORT}" 2>&1 | log_host "${testname}" +test_vm_client_host_server() { + if ! host_vsock_test "server" "${VSOCK_CID}" "${TEST_HOST_PORT_LISTENER}"; then + return "${KSFT_FAIL}" + fi - return $? + if ! 
vm_vsock_test "10.0.2.2" 2 "${TEST_HOST_PORT_LISTENER}"; then + return "${KSFT_FAIL}" + fi + + return "${KSFT_PASS}" } -test_vm_client_host_server() { - local testname="${FUNCNAME[0]#test_}" +test_vm_loopback() { + local port=60000 # non-forwarded local port - ${VSOCK_TEST} \ - --mode "server" \ - --control-port "${TEST_HOST_PORT_LISTENER}" \ - --peer-cid "${VSOCK_CID}" 2>&1 | log_host "${testname}" & + vm_ssh -- modprobe vsock_loopback &> /dev/null || : - host_wait_for_listener + if ! vm_vsock_test "server" 1 "${port}"; then + return "${KSFT_FAIL}" + fi - vm_ssh -- "${VSOCK_TEST}" \ - --mode=client \ - --control-host=10.0.2.2 \ - --peer-cid=2 \ - --control-port="${TEST_HOST_PORT_LISTENER}" 2>&1 | log_guest "${testname}" + if ! vm_vsock_test "127.0.0.1" 1 "${port}"; then + return "${KSFT_FAIL}" + fi - return $? + return "${KSFT_PASS}" } -test_vm_loopback() { - local testname="${FUNCNAME[0]#test_}" - local port=60000 # non-forwarded local port +shared_vm_test() { + local tname + + tname="${1}" + + for testname in "${USE_SHARED_VM[@]}"; do + if [[ "${tname}" == "${testname}" ]]; then + return 0 + fi + done - vm_ssh -- "${VSOCK_TEST}" \ - --mode=server \ - --control-port="${port}" \ - --peer-cid=1 2>&1 | log_guest "${testname}" & + return 1 +} - vm_wait_for_listener "${port}" +shared_vm_tests_requested() { + for arg in "$@"; do + if shared_vm_test "${arg}"; then + return 0 + fi + done - vm_ssh -- "${VSOCK_TEST}" \ - --mode=client \ - --control-host="127.0.0.1" \ - --control-port="${port}" \ - --peer-cid=1 2>&1 | log_guest "${testname}" + return 1 +} - return $? +run_shared_vm_tests() { + local arg + + for arg in "$@"; do + if ! shared_vm_test "${arg}"; then + continue + fi + + run_shared_vm_test "${arg}" + check_result "$?" 
"${arg}" + done } -run_test() { +run_shared_vm_test() { local host_oops_cnt_before local host_warn_cnt_before local vm_oops_cnt_before @@ -399,31 +527,32 @@ run_test() { host_oops_cnt_after=$(dmesg | grep -i 'Oops' | wc -l) if [[ ${host_oops_cnt_after} -gt ${host_oops_cnt_before} ]]; then - echo "FAIL: kernel oops detected on host" | log_host "${name}" + echo "FAIL: kernel oops detected on host" | log_host rc=$KSFT_FAIL fi host_warn_cnt_after=$(dmesg --level=warn | grep -c -i 'vsock') if [[ ${host_warn_cnt_after} -gt ${host_warn_cnt_before} ]]; then - echo "FAIL: kernel warning detected on host" | log_host "${name}" + echo "FAIL: kernel warning detected on host" | log_host rc=$KSFT_FAIL fi vm_oops_cnt_after=$(vm_ssh -- dmesg | grep -i 'Oops' | wc -l) if [[ ${vm_oops_cnt_after} -gt ${vm_oops_cnt_before} ]]; then - echo "FAIL: kernel oops detected on vm" | log_host "${name}" + echo "FAIL: kernel oops detected on vm" | log_host rc=$KSFT_FAIL fi vm_warn_cnt_after=$(vm_ssh -- dmesg --level=warn | grep -c -i 'vsock') if [[ ${vm_warn_cnt_after} -gt ${vm_warn_cnt_before} ]]; then - echo "FAIL: kernel warning detected on vm" | log_host "${name}" + echo "FAIL: kernel warning detected on vm" | log_host rc=$KSFT_FAIL fi return "${rc}" } +BUILD=0 QEMU="qemu-system-$(uname -m)" while getopts :hvsq:b o @@ -452,30 +581,21 @@ handle_build echo "1..${#ARGS[@]}" -log_setup "Booting up VM" -vm_start -vm_wait_for_ssh -log_setup "VM booted up" - cnt_pass=0 cnt_fail=0 cnt_skip=0 cnt_total=0 -for arg in "${ARGS[@]}"; do - run_test "${arg}" - rc=$? 
- if [[ ${rc} -eq $KSFT_PASS ]]; then - cnt_pass=$(( cnt_pass + 1 )) - echo "ok ${cnt_total} ${arg}" - elif [[ ${rc} -eq $KSFT_SKIP ]]; then - cnt_skip=$(( cnt_skip + 1 )) - echo "ok ${cnt_total} ${arg} # SKIP" - elif [[ ${rc} -eq $KSFT_FAIL ]]; then - cnt_fail=$(( cnt_fail + 1 )) - echo "not ok ${cnt_total} ${arg} # exit=$rc" - fi - cnt_total=$(( cnt_total + 1 )) -done + +if shared_vm_tests_requested "${ARGS[@]}"; then + log_host "Booting up VM" + pidfile="$(create_pidfile)" + vm_start "${pidfile}" + vm_wait_for_ssh + log_host "VM booted up" + + run_shared_vm_tests "${ARGS[@]}" + terminate_pidfiles "${pidfile}" +fi echo "SUMMARY: PASS=${cnt_pass} SKIP=${cnt_skip} FAIL=${cnt_fail}" echo "Log: ${LOG}" diff --git a/tools/testing/selftests/x86/test_vsyscall.c b/tools/testing/selftests/x86/test_vsyscall.c index 05e1e6774fba..918eaec8bfbe 100644 --- a/tools/testing/selftests/x86/test_vsyscall.c +++ b/tools/testing/selftests/x86/test_vsyscall.c @@ -308,12 +308,13 @@ static void test_getcpu(int cpu) #ifdef __x86_64__ static jmp_buf jmpbuf; -static volatile unsigned long segv_err; +static volatile unsigned long segv_err, segv_trapno; static void sigsegv(int sig, siginfo_t *info, void *ctx_void) { ucontext_t *ctx = (ucontext_t *)ctx_void; + segv_trapno = ctx->uc_mcontext.gregs[REG_TRAPNO]; segv_err = ctx->uc_mcontext.gregs[REG_ERR]; siglongjmp(jmpbuf, 1); } @@ -336,7 +337,8 @@ static void test_vsys_r(void) else if (can_read) ksft_test_result_pass("We have read access\n"); else - ksft_test_result_pass("We do not have read access: #PF(0x%lx)\n", segv_err); + ksft_test_result_pass("We do not have read access (trap=%ld, error=0x%lx)\n", + segv_trapno, segv_err); } static void test_vsys_x(void) @@ -347,7 +349,7 @@ static void test_vsys_x(void) return; } - ksft_print_msg("Make sure that vsyscalls really page fault\n"); + ksft_print_msg("Make sure that vsyscalls really cause a fault\n"); bool can_exec; if (sigsetjmp(jmpbuf, 1) == 0) { @@ -358,13 +360,14 @@ static void 
test_vsys_x(void) } if (can_exec) - ksft_test_result_fail("Executing the vsyscall did not page fault\n"); - else if (segv_err & (1 << 4)) /* INSTR */ - ksft_test_result_pass("Executing the vsyscall page failed: #PF(0x%lx)\n", - segv_err); + ksft_test_result_fail("Executing the vsyscall did not fault\n"); + /* #GP or #PF (with X86_PF_INSTR) */ + else if ((segv_trapno == 13) || ((segv_trapno == 14) && (segv_err & (1 << 4)))) + ksft_test_result_pass("Executing the vsyscall page failed (trap=%ld, error=0x%lx)\n", + segv_trapno, segv_err); else - ksft_test_result_fail("Execution failed with the wrong error: #PF(0x%lx)\n", - segv_err); + ksft_test_result_fail("Execution failed with the wrong error (trap=%ld, error=0x%lx)\n", + segv_trapno, segv_err); } /* diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c index 656e1c75b711..93d21bc7e112 100644 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/vma.c @@ -48,6 +48,8 @@ static struct anon_vma dummy_anon_vma; #define ASSERT_EQ(_val1, _val2) ASSERT_TRUE((_val1) == (_val2)) #define ASSERT_NE(_val1, _val2) ASSERT_TRUE((_val1) != (_val2)) +#define IS_SET(_val, _flags) ((_val & _flags) == _flags) + static struct task_struct __current; struct task_struct *get_current(void) @@ -67,18 +69,18 @@ static struct vm_area_struct *alloc_vma(struct mm_struct *mm, pgoff_t pgoff, vm_flags_t vm_flags) { - struct vm_area_struct *ret = vm_area_alloc(mm); + struct vm_area_struct *vma = vm_area_alloc(mm); - if (ret == NULL) + if (vma == NULL) return NULL; - ret->vm_start = start; - ret->vm_end = end; - ret->vm_pgoff = pgoff; - ret->__vm_flags = vm_flags; - vma_assert_detached(ret); + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; + vm_flags_reset(vma, vm_flags); + vma_assert_detached(vma); - return ret; + return vma; } /* Helper function to allocate a VMA and link it to the tree. 
*/ @@ -339,6 +341,7 @@ static bool test_simple_modify(void) struct mm_struct mm = {}; struct vm_area_struct *init_vma = alloc_vma(&mm, 0, 0x3000, 0, vm_flags); VMA_ITERATOR(vmi, &mm, 0x1000); + vm_flags_t flags = VM_READ | VM_MAYREAD; ASSERT_FALSE(attach_vma(&mm, init_vma)); @@ -347,7 +350,7 @@ static bool test_simple_modify(void) * performs the merge/split only. */ vma = vma_modify_flags(&vmi, init_vma, init_vma, - 0x1000, 0x2000, VM_READ | VM_MAYREAD); + 0x1000, 0x2000, &flags); ASSERT_NE(vma, NULL); /* We modify the provided VMA, and on split allocate new VMAs. */ ASSERT_EQ(vma, init_vma); @@ -441,7 +444,7 @@ static bool test_simple_shrink(void) return true; } -static bool test_merge_new(void) +static bool __test_merge_new(bool is_sticky, bool a_is_sticky, bool b_is_sticky, bool c_is_sticky) { vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; struct mm_struct mm = {}; @@ -469,23 +472,32 @@ static bool test_merge_new(void) struct vm_area_struct *vma, *vma_a, *vma_b, *vma_c, *vma_d; bool merged; + if (is_sticky) + vm_flags |= VM_STICKY; + /* * 0123456789abc * AA B CC */ vma_a = alloc_and_link_vma(&mm, 0, 0x2000, 0, vm_flags); ASSERT_NE(vma_a, NULL); + if (a_is_sticky) + vm_flags_set(vma_a, VM_STICKY); /* We give each VMA a single avc so we can test anon_vma duplication. 
*/ INIT_LIST_HEAD(&vma_a->anon_vma_chain); list_add(&dummy_anon_vma_chain_a.same_vma, &vma_a->anon_vma_chain); vma_b = alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, vm_flags); ASSERT_NE(vma_b, NULL); + if (b_is_sticky) + vm_flags_set(vma_b, VM_STICKY); INIT_LIST_HEAD(&vma_b->anon_vma_chain); list_add(&dummy_anon_vma_chain_b.same_vma, &vma_b->anon_vma_chain); vma_c = alloc_and_link_vma(&mm, 0xb000, 0xc000, 0xb, vm_flags); ASSERT_NE(vma_c, NULL); + if (c_is_sticky) + vm_flags_set(vma_c, VM_STICKY); INIT_LIST_HEAD(&vma_c->anon_vma_chain); list_add(&dummy_anon_vma_chain_c.same_vma, &vma_c->anon_vma_chain); @@ -520,6 +532,8 @@ static bool test_merge_new(void) ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma)); ASSERT_EQ(mm.map_count, 3); + if (is_sticky || a_is_sticky || b_is_sticky) + ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY)); /* * Merge to PREVIOUS VMA. @@ -537,6 +551,8 @@ static bool test_merge_new(void) ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma)); ASSERT_EQ(mm.map_count, 3); + if (is_sticky || a_is_sticky) + ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY)); /* * Merge to NEXT VMA. @@ -556,6 +572,8 @@ static bool test_merge_new(void) ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma)); ASSERT_EQ(mm.map_count, 3); + if (is_sticky) /* D uses is_sticky. */ + ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY)); /* * Merge BOTH sides. @@ -574,6 +592,8 @@ static bool test_merge_new(void) ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma)); ASSERT_EQ(mm.map_count, 2); + if (is_sticky || a_is_sticky) + ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY)); /* * Merge to NEXT VMA. @@ -592,6 +612,8 @@ static bool test_merge_new(void) ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma)); ASSERT_EQ(mm.map_count, 2); + if (is_sticky || c_is_sticky) + ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY)); /* * Merge BOTH sides. 
@@ -609,6 +631,8 @@ static bool test_merge_new(void) ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma)); ASSERT_EQ(mm.map_count, 1); + if (is_sticky || a_is_sticky || c_is_sticky) + ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY)); /* * Final state. @@ -637,6 +661,20 @@ static bool test_merge_new(void) return true; } +static bool test_merge_new(void) +{ + int i, j, k, l; + + /* Generate every possible permutation of sticky flags. */ + for (i = 0; i < 2; i++) + for (j = 0; j < 2; j++) + for (k = 0; k < 2; k++) + for (l = 0; l < 2; l++) + ASSERT_TRUE(__test_merge_new(i, j, k, l)); + + return true; +} + static bool test_vma_merge_special_flags(void) { vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; @@ -676,7 +714,7 @@ static bool test_vma_merge_special_flags(void) for (i = 0; i < ARRAY_SIZE(special_flags); i++) { vm_flags_t special_flag = special_flags[i]; - vma_left->__vm_flags = vm_flags | special_flag; + vm_flags_reset(vma_left, vm_flags | special_flag); vmg.vm_flags = vm_flags | special_flag; vma = merge_new(&vmg); ASSERT_EQ(vma, NULL); @@ -698,7 +736,7 @@ static bool test_vma_merge_special_flags(void) for (i = 0; i < ARRAY_SIZE(special_flags); i++) { vm_flags_t special_flag = special_flags[i]; - vma_left->__vm_flags = vm_flags | special_flag; + vm_flags_reset(vma_left, vm_flags | special_flag); vmg.vm_flags = vm_flags | special_flag; vma = merge_existing(&vmg); ASSERT_EQ(vma, NULL); @@ -973,9 +1011,11 @@ static bool test_vma_merge_new_with_close(void) return true; } -static bool test_merge_existing(void) +static bool __test_merge_existing(bool prev_is_sticky, bool middle_is_sticky, bool next_is_sticky) { vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + vm_flags_t prev_flags = vm_flags; + vm_flags_t next_flags = vm_flags; struct mm_struct mm = {}; VMA_ITERATOR(vmi, &mm, 0); struct vm_area_struct *vma, *vma_prev, *vma_next; @@ -988,6 +1028,13 @@ static bool test_merge_existing(void) }; struct 
anon_vma_chain avc = {}; + if (prev_is_sticky) + prev_flags |= VM_STICKY; + if (middle_is_sticky) + vm_flags |= VM_STICKY; + if (next_is_sticky) + next_flags |= VM_STICKY; + /* * Merge right case - partial span. * @@ -1000,7 +1047,7 @@ static bool test_merge_existing(void) */ vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, vm_flags); vma->vm_ops = &vm_ops; /* This should have no impact. */ - vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, vm_flags); + vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, next_flags); vma_next->vm_ops = &vm_ops; /* This should have no impact. */ vmg_set_range_anon_vma(&vmg, 0x3000, 0x6000, 3, vm_flags, &dummy_anon_vma); vmg.middle = vma; @@ -1018,6 +1065,8 @@ static bool test_merge_existing(void) ASSERT_TRUE(vma_write_started(vma)); ASSERT_TRUE(vma_write_started(vma_next)); ASSERT_EQ(mm.map_count, 2); + if (middle_is_sticky || next_is_sticky) + ASSERT_TRUE(IS_SET(vma_next->vm_flags, VM_STICKY)); /* Clear down and reset. */ ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); @@ -1033,7 +1082,7 @@ static bool test_merge_existing(void) * NNNNNNN */ vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, vm_flags); - vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, vm_flags); + vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, next_flags); vma_next->vm_ops = &vm_ops; /* This should have no impact. */ vmg_set_range_anon_vma(&vmg, 0x2000, 0x6000, 2, vm_flags, &dummy_anon_vma); vmg.middle = vma; @@ -1046,6 +1095,8 @@ static bool test_merge_existing(void) ASSERT_EQ(vma_next->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma_next)); ASSERT_EQ(mm.map_count, 1); + if (middle_is_sticky || next_is_sticky) + ASSERT_TRUE(IS_SET(vma_next->vm_flags, VM_STICKY)); /* Clear down and reset. We should have deleted vma. 
*/ ASSERT_EQ(cleanup_mm(&mm, &vmi), 1); @@ -1060,7 +1111,7 @@ static bool test_merge_existing(void) * 0123456789 * PPPPPPV */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, prev_flags); vma_prev->vm_ops = &vm_ops; /* This should have no impact. */ vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vm_flags); vma->vm_ops = &vm_ops; /* This should have no impact. */ @@ -1080,6 +1131,8 @@ static bool test_merge_existing(void) ASSERT_TRUE(vma_write_started(vma_prev)); ASSERT_TRUE(vma_write_started(vma)); ASSERT_EQ(mm.map_count, 2); + if (prev_is_sticky || middle_is_sticky) + ASSERT_TRUE(IS_SET(vma_prev->vm_flags, VM_STICKY)); /* Clear down and reset. */ ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); @@ -1094,7 +1147,7 @@ static bool test_merge_existing(void) * 0123456789 * PPPPPPP */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, prev_flags); vma_prev->vm_ops = &vm_ops; /* This should have no impact. */ vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vm_flags); vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, vm_flags, &dummy_anon_vma); @@ -1109,6 +1162,8 @@ static bool test_merge_existing(void) ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma_prev)); ASSERT_EQ(mm.map_count, 1); + if (prev_is_sticky || middle_is_sticky) + ASSERT_TRUE(IS_SET(vma_prev->vm_flags, VM_STICKY)); /* Clear down and reset. We should have deleted vma. */ ASSERT_EQ(cleanup_mm(&mm, &vmi), 1); @@ -1123,10 +1178,10 @@ static bool test_merge_existing(void) * 0123456789 * PPPPPPPPPP */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, prev_flags); vma_prev->vm_ops = &vm_ops; /* This should have no impact. 
*/ vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vm_flags); - vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, vm_flags); + vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, next_flags); vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, vm_flags, &dummy_anon_vma); vmg.prev = vma_prev; vmg.middle = vma; @@ -1139,6 +1194,8 @@ static bool test_merge_existing(void) ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma_prev)); ASSERT_EQ(mm.map_count, 1); + if (prev_is_sticky || middle_is_sticky || next_is_sticky) + ASSERT_TRUE(IS_SET(vma_prev->vm_flags, VM_STICKY)); /* Clear down and reset. We should have deleted prev and next. */ ASSERT_EQ(cleanup_mm(&mm, &vmi), 1); @@ -1158,9 +1215,9 @@ static bool test_merge_existing(void) * PPPVVVVVNNN */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, prev_flags); vma = alloc_and_link_vma(&mm, 0x3000, 0x8000, 3, vm_flags); - vma_next = alloc_and_link_vma(&mm, 0x8000, 0xa000, 8, vm_flags); + vma_next = alloc_and_link_vma(&mm, 0x8000, 0xa000, 8, next_flags); vmg_set_range(&vmg, 0x4000, 0x5000, 4, vm_flags); vmg.prev = vma; @@ -1203,6 +1260,19 @@ static bool test_merge_existing(void) return true; } +static bool test_merge_existing(void) +{ + int i, j, k; + + /* Generate every possible permutation of sticky flags. */ + for (i = 0; i < 2; i++) + for (j = 0; j < 2; j++) + for (k = 0; k < 2; k++) + ASSERT_TRUE(__test_merge_existing(i, j, k)); + + return true; +} + static bool test_anon_vma_non_mergeable(void) { vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index dc976a285ad2..9f0a9f5ed0fe 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -46,42 +46,272 @@ extern unsigned long dac_mmap_min_addr; #define MMF_HAS_MDWE 28 +/* + * vm_flags in vm_area_struct, see mm_types.h. 
+ * When changing, update also include/trace/events/mmflags.h + */ + #define VM_NONE 0x00000000 -#define VM_READ 0x00000001 -#define VM_WRITE 0x00000002 -#define VM_EXEC 0x00000004 -#define VM_SHARED 0x00000008 -#define VM_MAYREAD 0x00000010 -#define VM_MAYWRITE 0x00000020 -#define VM_MAYEXEC 0x00000040 -#define VM_GROWSDOWN 0x00000100 -#define VM_PFNMAP 0x00000400 -#define VM_LOCKED 0x00002000 -#define VM_IO 0x00004000 -#define VM_SEQ_READ 0x00008000 /* App will access data sequentially */ -#define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */ -#define VM_DONTEXPAND 0x00040000 -#define VM_LOCKONFAULT 0x00080000 -#define VM_ACCOUNT 0x00100000 -#define VM_NORESERVE 0x00200000 -#define VM_MIXEDMAP 0x10000000 -#define VM_STACK VM_GROWSDOWN -#define VM_SHADOW_STACK VM_NONE -#define VM_SOFTDIRTY 0 -#define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ -#define VM_GROWSUP VM_NONE -#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) -#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) +/** + * typedef vma_flag_t - specifies an individual VMA flag by bit number. + * + * This value is made type safe by sparse to avoid passing invalid flag values + * around. + */ +typedef int __bitwise vma_flag_t; +#define DECLARE_VMA_BIT(name, bitnum) \ + VMA_ ## name ## _BIT = ((__force vma_flag_t)bitnum) +#define DECLARE_VMA_BIT_ALIAS(name, aliased) \ + VMA_ ## name ## _BIT = VMA_ ## aliased ## _BIT +enum { + DECLARE_VMA_BIT(READ, 0), + DECLARE_VMA_BIT(WRITE, 1), + DECLARE_VMA_BIT(EXEC, 2), + DECLARE_VMA_BIT(SHARED, 3), + /* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */ + DECLARE_VMA_BIT(MAYREAD, 4), /* limits for mprotect() etc. 
*/ + DECLARE_VMA_BIT(MAYWRITE, 5), + DECLARE_VMA_BIT(MAYEXEC, 6), + DECLARE_VMA_BIT(MAYSHARE, 7), + DECLARE_VMA_BIT(GROWSDOWN, 8), /* general info on the segment */ +#ifdef CONFIG_MMU + DECLARE_VMA_BIT(UFFD_MISSING, 9),/* missing pages tracking */ +#else + /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */ + DECLARE_VMA_BIT(MAYOVERLAY, 9), +#endif /* CONFIG_MMU */ + /* Page-ranges managed without "struct page", just pure PFN */ + DECLARE_VMA_BIT(PFNMAP, 10), + DECLARE_VMA_BIT(MAYBE_GUARD, 11), + DECLARE_VMA_BIT(UFFD_WP, 12), /* wrprotect pages tracking */ + DECLARE_VMA_BIT(LOCKED, 13), + DECLARE_VMA_BIT(IO, 14), /* Memory mapped I/O or similar */ + DECLARE_VMA_BIT(SEQ_READ, 15), /* App will access data sequentially */ + DECLARE_VMA_BIT(RAND_READ, 16), /* App will not benefit from clustered reads */ + DECLARE_VMA_BIT(DONTCOPY, 17), /* Do not copy this vma on fork */ + DECLARE_VMA_BIT(DONTEXPAND, 18),/* Cannot expand with mremap() */ + DECLARE_VMA_BIT(LOCKONFAULT, 19),/* Lock pages covered when faulted in */ + DECLARE_VMA_BIT(ACCOUNT, 20), /* Is a VM accounted object */ + DECLARE_VMA_BIT(NORESERVE, 21), /* should the VM suppress accounting */ + DECLARE_VMA_BIT(HUGETLB, 22), /* Huge TLB Page VM */ + DECLARE_VMA_BIT(SYNC, 23), /* Synchronous page faults */ + DECLARE_VMA_BIT(ARCH_1, 24), /* Architecture-specific flag */ + DECLARE_VMA_BIT(WIPEONFORK, 25),/* Wipe VMA contents in child. */ + DECLARE_VMA_BIT(DONTDUMP, 26), /* Do not include in the core dump */ + DECLARE_VMA_BIT(SOFTDIRTY, 27), /* NOT soft dirty clean area */ + DECLARE_VMA_BIT(MIXEDMAP, 28), /* Can contain struct page and pure PFN pages */ + DECLARE_VMA_BIT(HUGEPAGE, 29), /* MADV_HUGEPAGE marked this vma */ + DECLARE_VMA_BIT(NOHUGEPAGE, 30),/* MADV_NOHUGEPAGE marked this vma */ + DECLARE_VMA_BIT(MERGEABLE, 31), /* KSM may merge identical pages */ + /* These bits are reused, we define specific uses below. 
*/ + DECLARE_VMA_BIT(HIGH_ARCH_0, 32), + DECLARE_VMA_BIT(HIGH_ARCH_1, 33), + DECLARE_VMA_BIT(HIGH_ARCH_2, 34), + DECLARE_VMA_BIT(HIGH_ARCH_3, 35), + DECLARE_VMA_BIT(HIGH_ARCH_4, 36), + DECLARE_VMA_BIT(HIGH_ARCH_5, 37), + DECLARE_VMA_BIT(HIGH_ARCH_6, 38), + /* + * This flag is used to connect VFIO to arch specific KVM code. It + * indicates that the memory under this VMA is safe for use with any + * non-cachable memory type inside KVM. Some VFIO devices, on some + * platforms, are thought to be unsafe and can cause machine crashes + * if KVM does not lock down the memory type. + */ + DECLARE_VMA_BIT(ALLOW_ANY_UNCACHED, 39), +#ifdef CONFIG_PPC32 + DECLARE_VMA_BIT_ALIAS(DROPPABLE, ARCH_1), +#else + DECLARE_VMA_BIT(DROPPABLE, 40), +#endif + DECLARE_VMA_BIT(UFFD_MINOR, 41), + DECLARE_VMA_BIT(SEALED, 42), + /* Flags that reuse flags above. */ + DECLARE_VMA_BIT_ALIAS(PKEY_BIT0, HIGH_ARCH_0), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT1, HIGH_ARCH_1), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT2, HIGH_ARCH_2), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT3, HIGH_ARCH_3), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT4, HIGH_ARCH_4), +#if defined(CONFIG_X86_USER_SHADOW_STACK) + /* + * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of + * support core mm. + * + * These VMAs will get a single end guard page. This helps userspace + * protect itself from attacks. A single page is enough for current + * shadow stack archs (x86). See the comments near alloc_shstk() in + * arch/x86/kernel/shstk.c for more details on the guard size. + */ + DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_5), +#elif defined(CONFIG_ARM64_GCS) + /* + * arm64's Guarded Control Stack implements similar functionality and + * has similar constraints to shadow stacks. 
+ */ + DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_6), +#endif + DECLARE_VMA_BIT_ALIAS(SAO, ARCH_1), /* Strong Access Ordering (powerpc) */ + DECLARE_VMA_BIT_ALIAS(GROWSUP, ARCH_1), /* parisc */ + DECLARE_VMA_BIT_ALIAS(SPARC_ADI, ARCH_1), /* sparc64 */ + DECLARE_VMA_BIT_ALIAS(ARM64_BTI, ARCH_1), /* arm64 */ + DECLARE_VMA_BIT_ALIAS(ARCH_CLEAR, ARCH_1), /* sparc64, arm64 */ + DECLARE_VMA_BIT_ALIAS(MAPPED_COPY, ARCH_1), /* !CONFIG_MMU */ + DECLARE_VMA_BIT_ALIAS(MTE, HIGH_ARCH_4), /* arm64 */ + DECLARE_VMA_BIT_ALIAS(MTE_ALLOWED, HIGH_ARCH_5),/* arm64 */ #ifdef CONFIG_STACK_GROWSUP -#define VM_STACK VM_GROWSUP -#define VM_STACK_EARLY VM_GROWSDOWN + DECLARE_VMA_BIT_ALIAS(STACK, GROWSUP), + DECLARE_VMA_BIT_ALIAS(STACK_EARLY, GROWSDOWN), +#else + DECLARE_VMA_BIT_ALIAS(STACK, GROWSDOWN), +#endif +}; + +#define INIT_VM_FLAG(name) BIT((__force int) VMA_ ## name ## _BIT) +#define VM_READ INIT_VM_FLAG(READ) +#define VM_WRITE INIT_VM_FLAG(WRITE) +#define VM_EXEC INIT_VM_FLAG(EXEC) +#define VM_SHARED INIT_VM_FLAG(SHARED) +#define VM_MAYREAD INIT_VM_FLAG(MAYREAD) +#define VM_MAYWRITE INIT_VM_FLAG(MAYWRITE) +#define VM_MAYEXEC INIT_VM_FLAG(MAYEXEC) +#define VM_MAYSHARE INIT_VM_FLAG(MAYSHARE) +#define VM_GROWSDOWN INIT_VM_FLAG(GROWSDOWN) +#ifdef CONFIG_MMU +#define VM_UFFD_MISSING INIT_VM_FLAG(UFFD_MISSING) +#else +#define VM_UFFD_MISSING VM_NONE +#define VM_MAYOVERLAY INIT_VM_FLAG(MAYOVERLAY) +#endif +#define VM_PFNMAP INIT_VM_FLAG(PFNMAP) +#define VM_MAYBE_GUARD INIT_VM_FLAG(MAYBE_GUARD) +#define VM_UFFD_WP INIT_VM_FLAG(UFFD_WP) +#define VM_LOCKED INIT_VM_FLAG(LOCKED) +#define VM_IO INIT_VM_FLAG(IO) +#define VM_SEQ_READ INIT_VM_FLAG(SEQ_READ) +#define VM_RAND_READ INIT_VM_FLAG(RAND_READ) +#define VM_DONTCOPY INIT_VM_FLAG(DONTCOPY) +#define VM_DONTEXPAND INIT_VM_FLAG(DONTEXPAND) +#define VM_LOCKONFAULT INIT_VM_FLAG(LOCKONFAULT) +#define VM_ACCOUNT INIT_VM_FLAG(ACCOUNT) +#define VM_NORESERVE INIT_VM_FLAG(NORESERVE) +#define VM_HUGETLB INIT_VM_FLAG(HUGETLB) +#define VM_SYNC 
INIT_VM_FLAG(SYNC) +#define VM_ARCH_1 INIT_VM_FLAG(ARCH_1) +#define VM_WIPEONFORK INIT_VM_FLAG(WIPEONFORK) +#define VM_DONTDUMP INIT_VM_FLAG(DONTDUMP) +#ifdef CONFIG_MEM_SOFT_DIRTY +#define VM_SOFTDIRTY INIT_VM_FLAG(SOFTDIRTY) +#else +#define VM_SOFTDIRTY VM_NONE +#endif +#define VM_MIXEDMAP INIT_VM_FLAG(MIXEDMAP) +#define VM_HUGEPAGE INIT_VM_FLAG(HUGEPAGE) +#define VM_NOHUGEPAGE INIT_VM_FLAG(NOHUGEPAGE) +#define VM_MERGEABLE INIT_VM_FLAG(MERGEABLE) +#define VM_STACK INIT_VM_FLAG(STACK) +#ifdef CONFIG_STACK_GROWS_UP +#define VM_STACK_EARLY INIT_VM_FLAG(STACK_EARLY) +#else +#define VM_STACK_EARLY VM_NONE +#endif +#ifdef CONFIG_ARCH_HAS_PKEYS +#define VM_PKEY_SHIFT ((__force int)VMA_HIGH_ARCH_0_BIT) +/* Despite the naming, these are FLAGS not bits. */ +#define VM_PKEY_BIT0 INIT_VM_FLAG(PKEY_BIT0) +#define VM_PKEY_BIT1 INIT_VM_FLAG(PKEY_BIT1) +#define VM_PKEY_BIT2 INIT_VM_FLAG(PKEY_BIT2) +#if CONFIG_ARCH_PKEY_BITS > 3 +#define VM_PKEY_BIT3 INIT_VM_FLAG(PKEY_BIT3) +#else +#define VM_PKEY_BIT3 VM_NONE +#endif /* CONFIG_ARCH_PKEY_BITS > 3 */ +#if CONFIG_ARCH_PKEY_BITS > 4 +#define VM_PKEY_BIT4 INIT_VM_FLAG(PKEY_BIT4) +#else +#define VM_PKEY_BIT4 VM_NONE +#endif /* CONFIG_ARCH_PKEY_BITS > 4 */ +#endif /* CONFIG_ARCH_HAS_PKEYS */ +#if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_ARM64_GCS) +#define VM_SHADOW_STACK INIT_VM_FLAG(SHADOW_STACK) +#else +#define VM_SHADOW_STACK VM_NONE +#endif +#if defined(CONFIG_PPC64) +#define VM_SAO INIT_VM_FLAG(SAO) +#elif defined(CONFIG_PARISC) +#define VM_GROWSUP INIT_VM_FLAG(GROWSUP) +#elif defined(CONFIG_SPARC64) +#define VM_SPARC_ADI INIT_VM_FLAG(SPARC_ADI) +#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR) +#elif defined(CONFIG_ARM64) +#define VM_ARM64_BTI INIT_VM_FLAG(ARM64_BTI) +#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR) +#elif !defined(CONFIG_MMU) +#define VM_MAPPED_COPY INIT_VM_FLAG(MAPPED_COPY) +#endif +#ifndef VM_GROWSUP +#define VM_GROWSUP VM_NONE +#endif +#ifdef CONFIG_ARM64_MTE +#define VM_MTE INIT_VM_FLAG(MTE) 
+#define VM_MTE_ALLOWED INIT_VM_FLAG(MTE_ALLOWED) +#else +#define VM_MTE VM_NONE +#define VM_MTE_ALLOWED VM_NONE +#endif +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR +#define VM_UFFD_MINOR INIT_VM_FLAG(UFFD_MINOR) #else -#define VM_STACK VM_GROWSDOWN -#define VM_STACK_EARLY 0 +#define VM_UFFD_MINOR VM_NONE +#endif +#ifdef CONFIG_64BIT +#define VM_ALLOW_ANY_UNCACHED INIT_VM_FLAG(ALLOW_ANY_UNCACHED) +#define VM_SEALED INIT_VM_FLAG(SEALED) +#else +#define VM_ALLOW_ANY_UNCACHED VM_NONE +#define VM_SEALED VM_NONE +#endif +#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32) +#define VM_DROPPABLE INIT_VM_FLAG(DROPPABLE) +#else +#define VM_DROPPABLE VM_NONE +#endif + +/* Bits set in the VMA until the stack is in its final location */ +#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) + +#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) + +/* Common data flag combinations */ +#define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#define VM_DATA_FLAGS_NON_EXEC (VM_READ | VM_WRITE | VM_MAYREAD | \ + VM_MAYWRITE | VM_MAYEXEC) +#define VM_DATA_FLAGS_EXEC (VM_READ | VM_WRITE | VM_EXEC | \ + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) + +#ifndef VM_DATA_DEFAULT_FLAGS /* arch can override this */ +#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_EXEC +#endif + +#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ +#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS #endif +#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) + +#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) + +/* VMA basic access permission flags */ +#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) + +/* + * Special vmas that are non-mergable, non-mlock()able. 
+ */ +#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) + #define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE) #define TASK_SIZE_LOW DEFAULT_MAP_WINDOW #define TASK_SIZE_MAX DEFAULT_MAP_WINDOW @@ -96,25 +326,58 @@ extern unsigned long dac_mmap_min_addr; #define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC - -#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) - -#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS -#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) -#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) - #define RLIMIT_STACK 3 /* max stack size */ #define RLIMIT_MEMLOCK 8 /* max locked-in-memory address space */ #define CAP_IPC_LOCK 14 -#ifdef CONFIG_64BIT -#define VM_SEALED_BIT 42 -#define VM_SEALED BIT(VM_SEALED_BIT) -#else -#define VM_SEALED VM_NONE -#endif +/* + * Flags which should be 'sticky' on merge - that is, flags which, when one VMA + * possesses it but the other does not, the merged VMA should nonetheless have + * applied to it: + * + * VM_SOFTDIRTY - if a VMA is marked soft-dirty, that is has not had its + * references cleared via /proc/$pid/clear_refs, any merged VMA + * should be considered soft-dirty also as it operates at a VMA + * granularity. + */ +#define VM_STICKY (VM_SOFTDIRTY | VM_MAYBE_GUARD) + +/* + * VMA flags we ignore for the purposes of merge, i.e. one VMA possessing one + * of these flags and the other not does not preclude a merge. + * + * VM_STICKY - When merging VMAs, VMA flags must match, unless they are + * 'sticky'. If any sticky flags exist in either VMA, we simply + * set all of them on the merged VMA. + */ +#define VM_IGNORE_MERGE VM_STICKY + +/* + * Flags which should result in page tables being copied on fork. 
These are + * flags which indicate that the VMA maps page tables which cannot be + * reconsistuted upon page fault, so necessitate page table copying upon + * + * VM_PFNMAP / VM_MIXEDMAP - These contain kernel-mapped data which cannot be + * reasonably reconstructed on page fault. + * + * VM_UFFD_WP - Encodes metadata about an installed uffd + * write protect handler, which cannot be + * reconstructed on page fault. + * + * We always copy pgtables when dst_vma has uffd-wp + * enabled even if it's file-backed + * (e.g. shmem). Because when uffd-wp is enabled, + * pgtable contains uffd-wp protection information, + * that's something we can't retrieve from page cache, + * and skip copying will lose those info. + * + * VM_MAYBE_GUARD - Could contain page guard region markers which + * by design are a property of the page tables + * only and thus cannot be reconstructed on page + * fault. + */ +#define VM_COPY_ON_FORK (VM_PFNMAP | VM_MIXEDMAP | VM_UFFD_WP | VM_MAYBE_GUARD) #define FIRST_USER_ADDRESS 0UL #define USER_PGTABLES_CEILING 0UL @@ -163,6 +426,8 @@ typedef __bitwise unsigned int vm_fault_t; #define ASSERT_EXCLUSIVE_WRITER(x) +#define pgtable_supports_soft_dirty() 1 + /** * swap - swap values of @a and @b * @a: first value @@ -259,6 +524,15 @@ typedef struct { __private DECLARE_BITMAP(__mm_flags, NUM_MM_FLAG_BITS); } mm_flags_t; +/* + * Opaque type representing current VMA (vm_area_struct) flag state. Must be + * accessed via vma_flags_xxx() helper functions. + */ +#define NUM_VMA_FLAG_BITS BITS_PER_LONG +typedef struct { + DECLARE_BITMAP(__vma_flags, NUM_VMA_FLAG_BITS); +} __private vma_flags_t; + struct mm_struct { struct maple_tree mm_mt; int map_count; /* number of VMAs */ @@ -275,6 +549,57 @@ struct mm_struct { struct vm_area_struct; + +/* What action should be taken after an .mmap_prepare call is complete? */ +enum mmap_action_type { + MMAP_NOTHING, /* Mapping is complete, no further action. */ + MMAP_REMAP_PFN, /* Remap PFN range. 
*/ + MMAP_IO_REMAP_PFN, /* I/O remap PFN range. */ +}; + +/* + * Describes an action an mmap_prepare hook can instruct to be taken to complete + * the mapping of a VMA. Specified in vm_area_desc. + */ +struct mmap_action { + union { + /* Remap range. */ + struct { + unsigned long start; + unsigned long start_pfn; + unsigned long size; + pgprot_t pgprot; + } remap; + }; + enum mmap_action_type type; + + /* + * If specified, this hook is invoked after the selected action has been + * successfully completed. Note that the VMA write lock still held. + * + * The absolute minimum ought to be done here. + * + * Returns 0 on success, or an error code. + */ + int (*success_hook)(const struct vm_area_struct *vma); + + /* + * If specified, this hook is invoked when an error occurred when + * attempting the selection action. + * + * The hook can return an error code in order to filter the error, but + * it is not valid to clear the error here. + */ + int (*error_hook)(int err); + + /* + * This should be set in rare instances where the operation required + * that the rmap should not be able to access the VMA until + * completely set up. + */ + bool hide_from_rmap_until_complete :1; +}; + /* * Describes a VMA that is about to be mmap()'ed. Drivers may choose to * manipulate mutable fields which will cause those fields to be updated in the @@ -292,12 +617,18 @@ struct vm_area_desc { /* Mutable fields. Populated with initial state. */ pgoff_t pgoff; struct file *vm_file; - vm_flags_t vm_flags; + union { + vm_flags_t vm_flags; + vma_flags_t vma_flags; + }; pgprot_t page_prot; /* Write-only fields. */ const struct vm_operations_struct *vm_ops; void *private_data; + + /* Take further action? 
*/ + struct mmap_action action; }; struct file_operations { @@ -335,7 +666,7 @@ struct vm_area_struct { */ union { const vm_flags_t vm_flags; - vm_flags_t __private __vm_flags; + vma_flags_t flags; }; #ifdef CONFIG_PER_VMA_LOCK @@ -794,8 +1125,7 @@ static inline void update_hiwater_vm(struct mm_struct *mm) static inline void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *vma, unsigned long start_addr, - unsigned long end_addr, unsigned long tree_end, - bool mm_wr_locked) + unsigned long end_addr, unsigned long tree_end) { } @@ -844,6 +1174,14 @@ static inline void vma_start_write(struct vm_area_struct *vma) vma->vm_lock_seq++; } +static inline __must_check +int vma_start_write_killable(struct vm_area_struct *vma) +{ + /* Used to indicate to tests that a write operation has begun. */ + vma->vm_lock_seq++; + return 0; +} + static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, @@ -1042,26 +1380,6 @@ static inline bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, return true; } -static inline void vm_flags_init(struct vm_area_struct *vma, - vm_flags_t flags) -{ - vma->__vm_flags = flags; -} - -static inline void vm_flags_set(struct vm_area_struct *vma, - vm_flags_t flags) -{ - vma_start_write(vma); - vma->__vm_flags |= flags; -} - -static inline void vm_flags_clear(struct vm_area_struct *vma, - vm_flags_t flags) -{ - vma_start_write(vma); - vma->__vm_flags &= ~flags; -} - static inline int shmem_zero_setup(struct vm_area_struct *vma) { return 0; @@ -1218,13 +1536,118 @@ static inline void userfaultfd_unmap_complete(struct mm_struct *mm, { } -# define ACCESS_PRIVATE(p, member) ((p)->member) +#define ACCESS_PRIVATE(p, member) ((p)->member) + +#define bitmap_size(nbits) (ALIGN(nbits, BITS_PER_LONG) / BITS_PER_BYTE) + +static __always_inline void bitmap_zero(unsigned long *dst, unsigned int nbits) +{ + unsigned int len = bitmap_size(nbits); + + if (small_const_nbits(nbits)) + 
*dst = 0; + else + memset(dst, 0, len); +} static inline bool mm_flags_test(int flag, const struct mm_struct *mm) { return test_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); } +/* Clears all bits in the VMA flags bitmap, non-atomically. */ +static inline void vma_flags_clear_all(vma_flags_t *flags) +{ + bitmap_zero(ACCESS_PRIVATE(flags, __vma_flags), NUM_VMA_FLAG_BITS); +} + +/* + * Copy value to the first system word of VMA flags, non-atomically. + * + * IMPORTANT: This does not overwrite bytes past the first system word. The + * caller must account for this. + */ +static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long value) +{ + *ACCESS_PRIVATE(flags, __vma_flags) = value; +} + +/* + * Copy value to the first system word of VMA flags ONCE, non-atomically. + * + * IMPORTANT: This does not overwrite bytes past the first system word. The + * caller must account for this. + */ +static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + + WRITE_ONCE(*bitmap, value); +} + +/* Update the first system word of VMA flags setting bits, non-atomically. */ +static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + + *bitmap |= value; +} + +/* Update the first system word of VMA flags clearing bits, non-atomically. 
*/ +static inline void vma_flags_clear_word(vma_flags_t *flags, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + + *bitmap &= ~value; +} + + +/* Use when VMA is not part of the VMA tree and needs no locking */ +static inline void vm_flags_init(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_flags_clear_all(&vma->flags); + vma_flags_overwrite_word(&vma->flags, flags); +} + +/* + * Use when VMA is part of the VMA tree and modifications need coordination + * Note: vm_flags_reset and vm_flags_reset_once do not lock the vma and + * it should be locked explicitly beforehand. + */ +static inline void vm_flags_reset(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_assert_write_locked(vma); + vm_flags_init(vma, flags); +} + +static inline void vm_flags_reset_once(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_assert_write_locked(vma); + /* + * The user should only be interested in avoiding reordering of + * assignment to the first word. + */ + vma_flags_clear_all(&vma->flags); + vma_flags_overwrite_word_once(&vma->flags, flags); +} + +static inline void vm_flags_set(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_start_write(vma); + vma_flags_set_word(&vma->flags, flags); +} + +static inline void vm_flags_clear(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_start_write(vma); + vma_flags_clear_word(&vma->flags, flags); +} + /* * Denies creating a writable executable mapping or gaining executable permissions. 
* @@ -1326,12 +1749,23 @@ static inline void free_anon_vma_name(struct vm_area_struct *vma) static inline void set_vma_from_desc(struct vm_area_struct *vma, struct vm_area_desc *desc); -static inline int __compat_vma_mmap_prepare(const struct file_operations *f_op, +static inline void mmap_action_prepare(struct mmap_action *action, + struct vm_area_desc *desc) +{ +} + +static inline int mmap_action_complete(struct mmap_action *action, + struct vm_area_struct *vma) +{ + return 0; +} + +static inline int __compat_vma_mmap(const struct file_operations *f_op, struct file *file, struct vm_area_struct *vma) { struct vm_area_desc desc = { .mm = vma->vm_mm, - .file = vma->vm_file, + .file = file, .start = vma->vm_start, .end = vma->vm_end, @@ -1339,21 +1773,24 @@ static inline int __compat_vma_mmap_prepare(const struct file_operations *f_op, .vm_file = vma->vm_file, .vm_flags = vma->vm_flags, .page_prot = vma->vm_page_prot, + + .action.type = MMAP_NOTHING, /* Default */ }; int err; err = f_op->mmap_prepare(&desc); if (err) return err; - set_vma_from_desc(vma, &desc); - return 0; + mmap_action_prepare(&desc.action, &desc); + set_vma_from_desc(vma, &desc); + return mmap_action_complete(&desc.action, vma); } -static inline int compat_vma_mmap_prepare(struct file *file, +static inline int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) { - return __compat_vma_mmap_prepare(file->f_op, file, vma); + return __compat_vma_mmap(file->f_op, file, vma); } /* Did the driver provide valid mmap hook configuration? 
*/ @@ -1374,7 +1811,7 @@ static inline bool can_mmap_file(struct file *file) static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma) { if (file->f_op->mmap_prepare) - return compat_vma_mmap_prepare(file, vma); + return compat_vma_mmap(file, vma); return file->f_op->mmap(file, vma); } @@ -1407,4 +1844,20 @@ static inline vm_flags_t ksm_vma_flags(const struct mm_struct *mm, return vm_flags; } +static inline void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn) +{ +} + +static inline int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t pgprot) +{ + return 0; +} + +static inline int do_munmap(struct mm_struct *, unsigned long, size_t, + struct list_head *uf) +{ + return 0; +} + #endif /* __MM_VMA_INTERNAL_H */ diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index d4517386e551..9e1250790f33 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -2015,6 +2015,11 @@ static void test_stream_transport_change_client(const struct test_opts *opts) exit(EXIT_FAILURE); } + /* Although setting SO_LINGER does not affect the original test + * for null-ptr-deref, it may trigger a lockdep warning. + */ + enable_so_linger(s, 1); + ret = connect(s, (struct sockaddr *)&sa, sizeof(sa)); /* The connect can fail due to signals coming from the thread, * or because the receiver connection queue is full. 
@@ -2352,7 +2357,7 @@ static struct test_case test_cases[] = { .run_server = test_stream_nolinger_server, }, { - .name = "SOCK_STREAM transport change null-ptr-deref", + .name = "SOCK_STREAM transport change null-ptr-deref, lockdep warn", .run_client = test_stream_transport_change_client, .run_server = test_stream_transport_change_server, }, diff --git a/tools/thermal/thermal-engine/thermal-engine.c b/tools/thermal/thermal-engine/thermal-engine.c index 0764dc754771..66b0ba1fcd23 100644 --- a/tools/thermal/thermal-engine/thermal-engine.c +++ b/tools/thermal/thermal-engine/thermal-engine.c @@ -374,7 +374,7 @@ int main(int argc, char *argv[]) } if (options.daemonize && daemon(0, 0)) { - ERROR("Failed to daemonize: %p\n"); + ERROR("Failed to daemonize: %m\n"); return THERMAL_ENGINE_DAEMON_ERROR; } diff --git a/tools/tracing/rtla/Makefile.rtla b/tools/tracing/rtla/Makefile.rtla index 08c1b40883d3..1743d91829d4 100644 --- a/tools/tracing/rtla/Makefile.rtla +++ b/tools/tracing/rtla/Makefile.rtla @@ -18,7 +18,7 @@ export CC AR STRIP PKG_CONFIG LD_SO_CONF_PATH LDCONFIG FOPTS := -flto=auto -ffat-lto-objects -fexceptions -fstack-protector-strong \ -fasynchronous-unwind-tables -fstack-clash-protection WOPTS := -O -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 \ - -Wp,-D_GLIBCXX_ASSERTIONS -Wno-maybe-uninitialized + -Wp,-D_GLIBCXX_ASSERTIONS ifeq ($(CC),clang) FOPTS := $(filter-out -flto=auto -ffat-lto-objects, $(FOPTS)) diff --git a/tools/tracing/rtla/src/common.c b/tools/tracing/rtla/src/common.c index 2e6e3dac1897..b197037fc58b 100644 --- a/tools/tracing/rtla/src/common.c +++ b/tools/tracing/rtla/src/common.c @@ -268,6 +268,10 @@ int top_main_loop(struct osnoise_tool *tool) tool->ops->print_stats(tool); if (osnoise_trace_is_off(tool, record)) { + if (stop_tracing) + /* stop tracing requested, do not perform actions */ + return 0; + actions_perform(¶ms->threshold_actions); if (!params->threshold_actions.continue_flag) @@ -315,20 +319,22 @@ int hist_main_loop(struct 
osnoise_tool *tool) } if (osnoise_trace_is_off(tool, tool->record)) { + if (stop_tracing) + /* stop tracing requested, do not perform actions */ + break; + actions_perform(¶ms->threshold_actions); - if (!params->threshold_actions.continue_flag) { + if (!params->threshold_actions.continue_flag) /* continue flag not set, break */ break; - /* continue action reached, re-enable tracing */ - if (tool->record) - trace_instance_start(&tool->record->trace); - if (tool->aa) - trace_instance_start(&tool->aa->trace); - trace_instance_start(&tool->trace); - } - break; + /* continue action reached, re-enable tracing */ + if (tool->record) + trace_instance_start(&tool->record->trace); + if (tool->aa) + trace_instance_start(&tool->aa->trace); + trace_instance_start(&tool->trace); } /* is there still any user-threads ? */ diff --git a/tools/tracing/rtla/src/common.h b/tools/tracing/rtla/src/common.h index 355f113a14a3..9ec2b7632c37 100644 --- a/tools/tracing/rtla/src/common.h +++ b/tools/tracing/rtla/src/common.h @@ -107,6 +107,10 @@ struct common_params { struct timerlat_u_params user; }; +#define for_each_monitored_cpu(cpu, nr_cpus, common) \ + for (cpu = 0; cpu < nr_cpus; cpu++) \ + if (!(common)->cpus || CPU_ISSET(cpu, &(common)->monitored_cpus)) + struct tool_ops; /* diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c index dffb6d0a98d7..ff8c231e47c4 100644 --- a/tools/tracing/rtla/src/osnoise_hist.c +++ b/tools/tracing/rtla/src/osnoise_hist.c @@ -247,9 +247,7 @@ static void osnoise_hist_header(struct osnoise_tool *tool) if (!params->common.hist.no_index) trace_seq_printf(s, "Index"); - for (cpu = 0; cpu < data->nr_cpus; cpu++) { - if (params->common.cpus && !CPU_ISSET(cpu, ¶ms->common.monitored_cpus)) - continue; + for_each_monitored_cpu(cpu, data->nr_cpus, ¶ms->common) { if (!data->hist[cpu].count) continue; @@ -278,9 +276,7 @@ osnoise_print_summary(struct osnoise_params *params, if (!params->common.hist.no_index) 
trace_seq_printf(trace->seq, "count:"); - for (cpu = 0; cpu < data->nr_cpus; cpu++) { - if (params->common.cpus && !CPU_ISSET(cpu, ¶ms->common.monitored_cpus)) - continue; + for_each_monitored_cpu(cpu, data->nr_cpus, ¶ms->common) { if (!data->hist[cpu].count) continue; @@ -292,9 +288,7 @@ osnoise_print_summary(struct osnoise_params *params, if (!params->common.hist.no_index) trace_seq_printf(trace->seq, "min: "); - for (cpu = 0; cpu < data->nr_cpus; cpu++) { - if (params->common.cpus && !CPU_ISSET(cpu, ¶ms->common.monitored_cpus)) - continue; + for_each_monitored_cpu(cpu, data->nr_cpus, ¶ms->common) { if (!data->hist[cpu].count) continue; @@ -307,9 +301,7 @@ osnoise_print_summary(struct osnoise_params *params, if (!params->common.hist.no_index) trace_seq_printf(trace->seq, "avg: "); - for (cpu = 0; cpu < data->nr_cpus; cpu++) { - if (params->common.cpus && !CPU_ISSET(cpu, ¶ms->common.monitored_cpus)) - continue; + for_each_monitored_cpu(cpu, data->nr_cpus, ¶ms->common) { if (!data->hist[cpu].count) continue; @@ -325,9 +317,7 @@ osnoise_print_summary(struct osnoise_params *params, if (!params->common.hist.no_index) trace_seq_printf(trace->seq, "max: "); - for (cpu = 0; cpu < data->nr_cpus; cpu++) { - if (params->common.cpus && !CPU_ISSET(cpu, ¶ms->common.monitored_cpus)) - continue; + for_each_monitored_cpu(cpu, data->nr_cpus, ¶ms->common) { if (!data->hist[cpu].count) continue; @@ -362,9 +352,7 @@ osnoise_print_stats(struct osnoise_tool *tool) trace_seq_printf(trace->seq, "%-6d", bucket * data->bucket_size); - for (cpu = 0; cpu < data->nr_cpus; cpu++) { - if (params->common.cpus && !CPU_ISSET(cpu, ¶ms->common.monitored_cpus)) - continue; + for_each_monitored_cpu(cpu, data->nr_cpus, ¶ms->common) { if (!data->hist[cpu].count) continue; @@ -400,9 +388,7 @@ osnoise_print_stats(struct osnoise_tool *tool) if (!params->common.hist.no_index) trace_seq_printf(trace->seq, "over: "); - for (cpu = 0; cpu < data->nr_cpus; cpu++) { - if (params->common.cpus && !CPU_ISSET(cpu, 
¶ms->common.monitored_cpus)) - continue; + for_each_monitored_cpu(cpu, data->nr_cpus, ¶ms->common) { if (!data->hist[cpu].count) continue; @@ -421,16 +407,16 @@ osnoise_print_stats(struct osnoise_tool *tool) /* * osnoise_hist_usage - prints osnoise hist usage message */ -static void osnoise_hist_usage(char *usage) +static void osnoise_hist_usage(void) { int i; static const char * const msg[] = { "", " usage: rtla osnoise hist [-h] [-D] [-d s] [-a us] [-p us] [-r us] [-s us] [-S us] \\", - " [-T us] [-t[file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] \\", + " [-T us] [-t [file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] \\", " [-c cpu-list] [-H cpu-list] [-P priority] [-b N] [-E N] [--no-header] [--no-summary] \\", - " [--no-index] [--with-zeros] [-C[=cgroup_name]] [--warm-up]", + " [--no-index] [--with-zeros] [-C [cgroup_name]] [--warm-up]", "", " -h/--help: print this menu", " -a/--auto: set automatic trace mode, stopping the session if argument in us sample is hit", @@ -441,10 +427,10 @@ static void osnoise_hist_usage(char *usage) " -T/--threshold us: the minimum delta to be considered a noise", " -c/--cpus cpu-list: list of cpus to run osnoise threads", " -H/--house-keeping cpus: run rtla control threads only on the given cpus", - " -C/--cgroup[=cgroup_name]: set cgroup, if no cgroup_name is passed, the rtla's cgroup will be inherited", + " -C/--cgroup [cgroup_name]: set cgroup, if no cgroup_name is passed, the rtla's cgroup will be inherited", " -d/--duration time[s|m|h|d]: duration of the session", " -D/--debug: print debug info", - " -t/--trace[file]: save the stopped trace to [file|osnoise_trace.txt]", + " -t/--trace [file]: save the stopped trace to [file|osnoise_trace.txt]", " -e/--event <sys:event>: enable the <sys:event> in the trace instance, multiple -e are allowed", " --filter <filter>: enable a trace event filter to the previous -e event", " --trigger <trigger>: enable a trace event trigger to the previous -e 
event", @@ -467,18 +453,12 @@ static void osnoise_hist_usage(char *usage) NULL, }; - if (usage) - fprintf(stderr, "%s\n", usage); - fprintf(stderr, "rtla osnoise hist: a per-cpu histogram of the OS noise (version %s)\n", VERSION); for (i = 0; msg[i]; i++) fprintf(stderr, "%s\n", msg[i]); - if (usage) - exit(EXIT_FAILURE); - exit(EXIT_SUCCESS); } @@ -538,11 +518,8 @@ static struct common_params {0, 0, 0, 0} }; - /* getopt_long stores the option index here. */ - int option_index = 0; - c = getopt_long(argc, argv, "a:c:C::b:d:e:E:DhH:p:P:r:s:S:t::T:01234:5:6:7:", - long_options, &option_index); + long_options, NULL); /* detect the end of the options. */ if (c == -1) @@ -557,30 +534,25 @@ static struct common_params params->threshold = 1; /* set trace */ - trace_output = "osnoise_trace.txt"; + if (!trace_output) + trace_output = "osnoise_trace.txt"; break; case 'b': params->common.hist.bucket_size = get_llong_from_str(optarg); if (params->common.hist.bucket_size == 0 || params->common.hist.bucket_size >= 1000000) - osnoise_hist_usage("Bucket size needs to be > 0 and <= 1000000\n"); + fatal("Bucket size needs to be > 0 and <= 1000000"); break; case 'c': retval = parse_cpu_set(optarg, ¶ms->common.monitored_cpus); if (retval) - osnoise_hist_usage("\nInvalid -c cpu list\n"); + fatal("Invalid -c cpu list"); params->common.cpus = optarg; break; case 'C': params->common.cgroup = 1; - if (!optarg) { - /* will inherit this cgroup */ - params->common.cgroup_name = NULL; - } else if (*optarg == '=') { - /* skip the = */ - params->common.cgroup_name = ++optarg; - } + params->common.cgroup_name = parse_optional_arg(argc, argv); break; case 'D': config_debug = 1; @@ -588,14 +560,12 @@ static struct common_params case 'd': params->common.duration = parse_seconds_duration(optarg); if (!params->common.duration) - osnoise_hist_usage("Invalid -D duration\n"); + fatal("Invalid -D duration"); break; case 'e': tevent = trace_event_alloc(optarg); - if (!tevent) { - err_msg("Error alloc trace 
event"); - exit(EXIT_FAILURE); - } + if (!tevent) + fatal("Error alloc trace event"); if (params->common.events) tevent->next = params->common.events; @@ -606,35 +576,33 @@ static struct common_params params->common.hist.entries = get_llong_from_str(optarg); if (params->common.hist.entries < 10 || params->common.hist.entries > 9999999) - osnoise_hist_usage("Entries must be > 10 and < 9999999\n"); + fatal("Entries must be > 10 and < 9999999"); break; case 'h': case '?': - osnoise_hist_usage(NULL); + osnoise_hist_usage(); break; case 'H': params->common.hk_cpus = 1; retval = parse_cpu_set(optarg, ¶ms->common.hk_cpu_set); - if (retval) { - err_msg("Error parsing house keeping CPUs\n"); - exit(EXIT_FAILURE); - } + if (retval) + fatal("Error parsing house keeping CPUs"); break; case 'p': params->period = get_llong_from_str(optarg); if (params->period > 10000000) - osnoise_hist_usage("Period longer than 10 s\n"); + fatal("Period longer than 10 s"); break; case 'P': retval = parse_prio(optarg, ¶ms->common.sched_param); if (retval == -1) - osnoise_hist_usage("Invalid -P priority"); + fatal("Invalid -P priority"); params->common.set_sched = 1; break; case 'r': params->runtime = get_llong_from_str(optarg); if (params->runtime < 100) - osnoise_hist_usage("Runtime shorter than 100 us\n"); + fatal("Runtime shorter than 100 us"); break; case 's': params->common.stop_us = get_llong_from_str(optarg); @@ -646,14 +614,8 @@ static struct common_params params->threshold = get_llong_from_str(optarg); break; case 't': - if (optarg) { - if (optarg[0] == '=') - trace_output = &optarg[1]; - else - trace_output = &optarg[0]; - } else if (optind < argc && argv[optind][0] != '0') - trace_output = argv[optind]; - else + trace_output = parse_optional_arg(argc, argv); + if (!trace_output) trace_output = "osnoise_trace.txt"; break; case '0': /* no header */ @@ -671,23 +633,19 @@ static struct common_params case '4': /* trigger */ if (params->common.events) { retval = 
trace_event_add_trigger(params->common.events, optarg); - if (retval) { - err_msg("Error adding trigger %s\n", optarg); - exit(EXIT_FAILURE); - } + if (retval) + fatal("Error adding trigger %s", optarg); } else { - osnoise_hist_usage("--trigger requires a previous -e\n"); + fatal("--trigger requires a previous -e"); } break; case '5': /* filter */ if (params->common.events) { retval = trace_event_add_filter(params->common.events, optarg); - if (retval) { - err_msg("Error adding filter %s\n", optarg); - exit(EXIT_FAILURE); - } + if (retval) + fatal("Error adding filter %s", optarg); } else { - osnoise_hist_usage("--filter requires a previous -e\n"); + fatal("--filter requires a previous -e"); } break; case '6': @@ -699,34 +657,28 @@ static struct common_params case '8': retval = actions_parse(¶ms->common.threshold_actions, optarg, "osnoise_trace.txt"); - if (retval) { - err_msg("Invalid action %s\n", optarg); - exit(EXIT_FAILURE); - } + if (retval) + fatal("Invalid action %s", optarg); break; case '9': retval = actions_parse(¶ms->common.end_actions, optarg, "osnoise_trace.txt"); - if (retval) { - err_msg("Invalid action %s\n", optarg); - exit(EXIT_FAILURE); - } + if (retval) + fatal("Invalid action %s", optarg); break; default: - osnoise_hist_usage("Invalid option"); + fatal("Invalid option"); } } if (trace_output) actions_add_trace_output(¶ms->common.threshold_actions, trace_output); - if (geteuid()) { - err_msg("rtla needs root permission\n"); - exit(EXIT_FAILURE); - } + if (geteuid()) + fatal("rtla needs root permission"); if (params->common.hist.no_index && !params->common.hist.with_zeros) - osnoise_hist_usage("no-index set and with-zeros not set - it does not make sense"); + fatal("no-index set and with-zeros not set - it does not make sense"); return ¶ms->common; } diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c index 95418f7ecc96..04c699bdd736 100644 --- a/tools/tracing/rtla/src/osnoise_top.c +++ 
b/tools/tracing/rtla/src/osnoise_top.c @@ -243,9 +243,7 @@ osnoise_print_stats(struct osnoise_tool *top) osnoise_top_header(top); - for (i = 0; i < nr_cpus; i++) { - if (params->common.cpus && !CPU_ISSET(i, ¶ms->common.monitored_cpus)) - continue; + for_each_monitored_cpu(i, nr_cpus, ¶ms->common) { osnoise_top_print(top, i); } @@ -257,14 +255,14 @@ osnoise_print_stats(struct osnoise_tool *top) /* * osnoise_top_usage - prints osnoise top usage message */ -static void osnoise_top_usage(struct osnoise_params *params, char *usage) +static void osnoise_top_usage(struct osnoise_params *params) { int i; static const char * const msg[] = { " [-h] [-q] [-D] [-d s] [-a us] [-p us] [-r us] [-s us] [-S us] \\", - " [-T us] [-t[file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] \\", - " [-c cpu-list] [-H cpu-list] [-P priority] [-C[=cgroup_name]] [--warm-up s]", + " [-T us] [-t [file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] \\", + " [-c cpu-list] [-H cpu-list] [-P priority] [-C [cgroup_name]] [--warm-up s]", "", " -h/--help: print this menu", " -a/--auto: set automatic trace mode, stopping the session if argument in us sample is hit", @@ -275,10 +273,10 @@ static void osnoise_top_usage(struct osnoise_params *params, char *usage) " -T/--threshold us: the minimum delta to be considered a noise", " -c/--cpus cpu-list: list of cpus to run osnoise threads", " -H/--house-keeping cpus: run rtla control threads only on the given cpus", - " -C/--cgroup[=cgroup_name]: set cgroup, if no cgroup_name is passed, the rtla's cgroup will be inherited", + " -C/--cgroup [cgroup_name]: set cgroup, if no cgroup_name is passed, the rtla's cgroup will be inherited", " -d/--duration time[s|m|h|d]: duration of the session", " -D/--debug: print debug info", - " -t/--trace[file]: save the stopped trace to [file|osnoise_trace.txt]", + " -t/--trace [file]: save the stopped trace to [file|osnoise_trace.txt]", " -e/--event <sys:event>: enable the <sys:event> in the trace 
instance, multiple -e are allowed", " --filter <filter>: enable a trace event filter to the previous -e event", " --trigger <trigger>: enable a trace event trigger to the previous -e event", @@ -296,9 +294,6 @@ static void osnoise_top_usage(struct osnoise_params *params, char *usage) NULL, }; - if (usage) - fprintf(stderr, "%s\n", usage); - if (params->mode == MODE_OSNOISE) { fprintf(stderr, "rtla osnoise top: a per-cpu summary of the OS noise (version %s)\n", @@ -318,9 +313,6 @@ static void osnoise_top_usage(struct osnoise_params *params, char *usage) for (i = 0; msg[i]; i++) fprintf(stderr, "%s\n", msg[i]); - if (usage) - exit(EXIT_FAILURE); - exit(EXIT_SUCCESS); } @@ -378,11 +370,8 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv) {0, 0, 0, 0} }; - /* getopt_long stores the option index here. */ - int option_index = 0; - c = getopt_long(argc, argv, "a:c:C::d:De:hH:p:P:qr:s:S:t::T:0:1:2:3:", - long_options, &option_index); + long_options, NULL); /* Detect the end of the options. 
*/ if (c == -1) @@ -397,24 +386,19 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv) params->threshold = 1; /* set trace */ - trace_output = "osnoise_trace.txt"; + if (!trace_output) + trace_output = "osnoise_trace.txt"; break; case 'c': retval = parse_cpu_set(optarg, ¶ms->common.monitored_cpus); if (retval) - osnoise_top_usage(params, "\nInvalid -c cpu list\n"); + fatal("Invalid -c cpu list"); params->common.cpus = optarg; break; case 'C': params->common.cgroup = 1; - if (!optarg) { - /* will inherit this cgroup */ - params->common.cgroup_name = NULL; - } else if (*optarg == '=') { - /* skip the = */ - params->common.cgroup_name = ++optarg; - } + params->common.cgroup_name = parse_optional_arg(argc, argv); break; case 'D': config_debug = 1; @@ -422,14 +406,12 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv) case 'd': params->common.duration = parse_seconds_duration(optarg); if (!params->common.duration) - osnoise_top_usage(params, "Invalid -d duration\n"); + fatal("Invalid -d duration"); break; case 'e': tevent = trace_event_alloc(optarg); - if (!tevent) { - err_msg("Error alloc trace event"); - exit(EXIT_FAILURE); - } + if (!tevent) + fatal("Error alloc trace event"); if (params->common.events) tevent->next = params->common.events; @@ -438,25 +420,23 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv) break; case 'h': case '?': - osnoise_top_usage(params, NULL); + osnoise_top_usage(params); break; case 'H': params->common.hk_cpus = 1; retval = parse_cpu_set(optarg, ¶ms->common.hk_cpu_set); - if (retval) { - err_msg("Error parsing house keeping CPUs\n"); - exit(EXIT_FAILURE); - } + if (retval) + fatal("Error parsing house keeping CPUs"); break; case 'p': params->period = get_llong_from_str(optarg); if (params->period > 10000000) - osnoise_top_usage(params, "Period longer than 10 s\n"); + fatal("Period longer than 10 s"); break; case 'P': retval = parse_prio(optarg, ¶ms->common.sched_param); if (retval 
== -1) - osnoise_top_usage(params, "Invalid -P priority"); + fatal("Invalid -P priority"); params->common.set_sched = 1; break; case 'q': @@ -465,7 +445,7 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv) case 'r': params->runtime = get_llong_from_str(optarg); if (params->runtime < 100) - osnoise_top_usage(params, "Runtime shorter than 100 us\n"); + fatal("Runtime shorter than 100 us"); break; case 's': params->common.stop_us = get_llong_from_str(optarg); @@ -474,14 +454,8 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv) params->common.stop_total_us = get_llong_from_str(optarg); break; case 't': - if (optarg) { - if (optarg[0] == '=') - trace_output = &optarg[1]; - else - trace_output = &optarg[0]; - } else if (optind < argc && argv[optind][0] != '-') - trace_output = argv[optind]; - else + trace_output = parse_optional_arg(argc, argv); + if (!trace_output) trace_output = "osnoise_trace.txt"; break; case 'T': @@ -490,23 +464,19 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv) case '0': /* trigger */ if (params->common.events) { retval = trace_event_add_trigger(params->common.events, optarg); - if (retval) { - err_msg("Error adding trigger %s\n", optarg); - exit(EXIT_FAILURE); - } + if (retval) + fatal("Error adding trigger %s", optarg); } else { - osnoise_top_usage(params, "--trigger requires a previous -e\n"); + fatal("--trigger requires a previous -e"); } break; case '1': /* filter */ if (params->common.events) { retval = trace_event_add_filter(params->common.events, optarg); - if (retval) { - err_msg("Error adding filter %s\n", optarg); - exit(EXIT_FAILURE); - } + if (retval) + fatal("Error adding filter %s", optarg); } else { - osnoise_top_usage(params, "--filter requires a previous -e\n"); + fatal("--filter requires a previous -e"); } break; case '2': @@ -518,31 +488,25 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv) case '4': retval = 
actions_parse(¶ms->common.threshold_actions, optarg, "osnoise_trace.txt"); - if (retval) { - err_msg("Invalid action %s\n", optarg); - exit(EXIT_FAILURE); - } + if (retval) + fatal("Invalid action %s", optarg); break; case '5': retval = actions_parse(¶ms->common.end_actions, optarg, "osnoise_trace.txt"); - if (retval) { - err_msg("Invalid action %s\n", optarg); - exit(EXIT_FAILURE); - } + if (retval) + fatal("Invalid action %s", optarg); break; default: - osnoise_top_usage(params, "Invalid option"); + fatal("Invalid option"); } } if (trace_output) actions_add_trace_output(¶ms->common.threshold_actions, trace_output); - if (geteuid()) { - err_msg("osnoise needs root permission\n"); - exit(EXIT_FAILURE); - } + if (geteuid()) + fatal("osnoise needs root permission"); return ¶ms->common; } diff --git a/tools/tracing/rtla/src/timerlat.bpf.c b/tools/tracing/rtla/src/timerlat.bpf.c index 084cd10c21fc..e2265b5d6491 100644 --- a/tools/tracing/rtla/src/timerlat.bpf.c +++ b/tools/tracing/rtla/src/timerlat.bpf.c @@ -148,6 +148,9 @@ int handle_timerlat_sample(struct trace_event_raw_timerlat_sample *tp_args) } else { update_main_hist(&hist_user, bucket); update_summary(&summary_user, latency, bucket); + + if (thread_threshold != 0 && latency_us >= thread_threshold) + set_stop_tracing(); } return 0; diff --git a/tools/tracing/rtla/src/timerlat.c b/tools/tracing/rtla/src/timerlat.c index b69212874127..df4f9bfe3433 100644 --- a/tools/tracing/rtla/src/timerlat.c +++ b/tools/tracing/rtla/src/timerlat.c @@ -126,9 +126,7 @@ int timerlat_enable(struct osnoise_tool *tool) nr_cpus = sysconf(_SC_NPROCESSORS_CONF); - for (i = 0; i < nr_cpus; i++) { - if (params->common.cpus && !CPU_ISSET(i, ¶ms->common.monitored_cpus)) - continue; + for_each_monitored_cpu(i, nr_cpus, ¶ms->common) { if (save_cpu_idle_disable_state(i) < 0) { err_msg("Could not save cpu idle state.\n"); return -1; @@ -215,16 +213,14 @@ void timerlat_analyze(struct osnoise_tool *tool, bool stopped) void timerlat_free(struct 
osnoise_tool *tool) { struct timerlat_params *params = to_timerlat_params(tool->params); - int nr_cpus, i; + int nr_cpus = sysconf(_SC_NPROCESSORS_CONF); + int i; timerlat_aa_destroy(); if (dma_latency_fd >= 0) close(dma_latency_fd); if (params->deepest_idle_state >= -1) { - for (i = 0; i < nr_cpus; i++) { - if (params->common.cpus && - !CPU_ISSET(i, ¶ms->common.monitored_cpus)) - continue; + for_each_monitored_cpu(i, nr_cpus, ¶ms->common) { restore_cpu_idle_disable_state(i); } } diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c index 606c1688057b..1fb471a787b7 100644 --- a/tools/tracing/rtla/src/timerlat_hist.c +++ b/tools/tracing/rtla/src/timerlat_hist.c @@ -305,9 +305,7 @@ static void timerlat_hist_header(struct osnoise_tool *tool) if (!params->common.hist.no_index) trace_seq_printf(s, "Index"); - for (cpu = 0; cpu < data->nr_cpus; cpu++) { - if (params->common.cpus && !CPU_ISSET(cpu, ¶ms->common.monitored_cpus)) - continue; + for_each_monitored_cpu(cpu, data->nr_cpus, ¶ms->common) { if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count) continue; @@ -359,9 +357,7 @@ timerlat_print_summary(struct timerlat_params *params, if (!params->common.hist.no_index) trace_seq_printf(trace->seq, "count:"); - for (cpu = 0; cpu < data->nr_cpus; cpu++) { - if (params->common.cpus && !CPU_ISSET(cpu, ¶ms->common.monitored_cpus)) - continue; + for_each_monitored_cpu(cpu, data->nr_cpus, ¶ms->common) { if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count) continue; @@ -383,9 +379,7 @@ timerlat_print_summary(struct timerlat_params *params, if (!params->common.hist.no_index) trace_seq_printf(trace->seq, "min: "); - for (cpu = 0; cpu < data->nr_cpus; cpu++) { - if (params->common.cpus && !CPU_ISSET(cpu, ¶ms->common.monitored_cpus)) - continue; + for_each_monitored_cpu(cpu, data->nr_cpus, ¶ms->common) { if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count) continue; @@ -413,9 +407,7 @@ 
timerlat_print_summary(struct timerlat_params *params, if (!params->common.hist.no_index) trace_seq_printf(trace->seq, "avg: "); - for (cpu = 0; cpu < data->nr_cpus; cpu++) { - if (params->common.cpus && !CPU_ISSET(cpu, ¶ms->common.monitored_cpus)) - continue; + for_each_monitored_cpu(cpu, data->nr_cpus, ¶ms->common) { if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count) continue; @@ -443,9 +435,7 @@ timerlat_print_summary(struct timerlat_params *params, if (!params->common.hist.no_index) trace_seq_printf(trace->seq, "max: "); - for (cpu = 0; cpu < data->nr_cpus; cpu++) { - if (params->common.cpus && !CPU_ISSET(cpu, ¶ms->common.monitored_cpus)) - continue; + for_each_monitored_cpu(cpu, data->nr_cpus, ¶ms->common) { if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count) continue; @@ -490,9 +480,7 @@ timerlat_print_stats_all(struct timerlat_params *params, sum.min_thread = ~0; sum.min_user = ~0; - for (cpu = 0; cpu < data->nr_cpus; cpu++) { - if (params->common.cpus && !CPU_ISSET(cpu, ¶ms->common.monitored_cpus)) - continue; + for_each_monitored_cpu(cpu, data->nr_cpus, ¶ms->common) { if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count) continue; @@ -639,9 +627,7 @@ timerlat_print_stats(struct osnoise_tool *tool) trace_seq_printf(trace->seq, "%-6d", bucket * data->bucket_size); - for (cpu = 0; cpu < data->nr_cpus; cpu++) { - if (params->common.cpus && !CPU_ISSET(cpu, ¶ms->common.monitored_cpus)) - continue; + for_each_monitored_cpu(cpu, data->nr_cpus, ¶ms->common) { if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count) continue; @@ -679,9 +665,7 @@ timerlat_print_stats(struct osnoise_tool *tool) if (!params->common.hist.no_index) trace_seq_printf(trace->seq, "over: "); - for (cpu = 0; cpu < data->nr_cpus; cpu++) { - if (params->common.cpus && !CPU_ISSET(cpu, ¶ms->common.monitored_cpus)) - continue; + for_each_monitored_cpu(cpu, data->nr_cpus, ¶ms->common) { if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count) 
continue; @@ -710,16 +694,16 @@ timerlat_print_stats(struct osnoise_tool *tool) /* * timerlat_hist_usage - prints timerlat top usage message */ -static void timerlat_hist_usage(char *usage) +static void timerlat_hist_usage(void) { int i; char *msg[] = { "", " usage: [rtla] timerlat hist [-h] [-q] [-d s] [-D] [-n] [-a us] [-p us] [-i us] [-T us] [-s us] \\", - " [-t[file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] [-c cpu-list] [-H cpu-list]\\", + " [-t [file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] [-c cpu-list] [-H cpu-list]\\", " [-P priority] [-E N] [-b N] [--no-irq] [--no-thread] [--no-header] [--no-summary] \\", - " [--no-index] [--with-zeros] [--dma-latency us] [-C[=cgroup_name]] [--no-aa] [--dump-task] [-u|-k]", + " [--no-index] [--with-zeros] [--dma-latency us] [-C [cgroup_name]] [--no-aa] [--dump-task] [-u|-k]", " [--warm-up s] [--deepest-idle-state n]", "", " -h/--help: print this menu", @@ -730,11 +714,11 @@ static void timerlat_hist_usage(char *usage) " -s/--stack us: save the stack trace at the IRQ if a thread latency is higher than the argument in us", " -c/--cpus cpus: run the tracer only on the given cpus", " -H/--house-keeping cpus: run rtla control threads only on the given cpus", - " -C/--cgroup[=cgroup_name]: set cgroup, if no cgroup_name is passed, the rtla's cgroup will be inherited", + " -C/--cgroup [cgroup_name]: set cgroup, if no cgroup_name is passed, the rtla's cgroup will be inherited", " -d/--duration time[m|h|d]: duration of the session in seconds", " --dump-tasks: prints the task running on all CPUs if stop conditions are met (depends on !--no-aa)", " -D/--debug: print debug info", - " -t/--trace[file]: save the stopped trace to [file|timerlat_trace.txt]", + " -t/--trace [file]: save the stopped trace to [file|timerlat_trace.txt]", " -e/--event <sys:event>: enable the <sys:event> in the trace instance, multiple -e are allowed", " --filter <filter>: enable a trace event filter to the previous -e 
event", " --trigger <trigger>: enable a trace event trigger to the previous -e event", @@ -766,18 +750,12 @@ static void timerlat_hist_usage(char *usage) NULL, }; - if (usage) - fprintf(stderr, "%s\n", usage); - fprintf(stderr, "rtla timerlat hist: a per-cpu histogram of the timer latency (version %s)\n", VERSION); for (i = 0; msg[i]; i++) fprintf(stderr, "%s\n", msg[i]); - if (usage) - exit(EXIT_FAILURE); - exit(EXIT_SUCCESS); } @@ -856,11 +834,8 @@ static struct common_params {0, 0, 0, 0} }; - /* getopt_long stores the option index here. */ - int option_index = 0; - c = getopt_long(argc, argv, "a:c:C::b:d:e:E:DhH:i:knp:P:s:t::T:uU0123456:7:8:9\1\2:\3:", - long_options, &option_index); + long_options, NULL); /* detect the end of the options. */ if (c == -1) @@ -878,30 +853,25 @@ static struct common_params params->print_stack = auto_thresh; /* set trace */ - trace_output = "timerlat_trace.txt"; + if (!trace_output) + trace_output = "timerlat_trace.txt"; break; case 'c': retval = parse_cpu_set(optarg, ¶ms->common.monitored_cpus); if (retval) - timerlat_hist_usage("\nInvalid -c cpu list\n"); + fatal("Invalid -c cpu list"); params->common.cpus = optarg; break; case 'C': params->common.cgroup = 1; - if (!optarg) { - /* will inherit this cgroup */ - params->common.cgroup_name = NULL; - } else if (*optarg == '=') { - /* skip the = */ - params->common.cgroup_name = ++optarg; - } + params->common.cgroup_name = parse_optional_arg(argc, argv); break; case 'b': params->common.hist.bucket_size = get_llong_from_str(optarg); if (params->common.hist.bucket_size == 0 || params->common.hist.bucket_size >= 1000000) - timerlat_hist_usage("Bucket size needs to be > 0 and <= 1000000\n"); + fatal("Bucket size needs to be > 0 and <= 1000000"); break; case 'D': config_debug = 1; @@ -909,14 +879,12 @@ static struct common_params case 'd': params->common.duration = parse_seconds_duration(optarg); if (!params->common.duration) - timerlat_hist_usage("Invalid -D duration\n"); + fatal("Invalid 
-D duration"); break; case 'e': tevent = trace_event_alloc(optarg); - if (!tevent) { - err_msg("Error alloc trace event"); - exit(EXIT_FAILURE); - } + if (!tevent) + fatal("Error alloc trace event"); if (params->common.events) tevent->next = params->common.events; @@ -927,19 +895,17 @@ static struct common_params params->common.hist.entries = get_llong_from_str(optarg); if (params->common.hist.entries < 10 || params->common.hist.entries > 9999999) - timerlat_hist_usage("Entries must be > 10 and < 9999999\n"); + fatal("Entries must be > 10 and < 9999999"); break; case 'h': case '?': - timerlat_hist_usage(NULL); + timerlat_hist_usage(); break; case 'H': params->common.hk_cpus = 1; retval = parse_cpu_set(optarg, ¶ms->common.hk_cpu_set); - if (retval) { - err_msg("Error parsing house keeping CPUs\n"); - exit(EXIT_FAILURE); - } + if (retval) + fatal("Error parsing house keeping CPUs"); break; case 'i': params->common.stop_us = get_llong_from_str(optarg); @@ -953,12 +919,12 @@ static struct common_params case 'p': params->timerlat_period_us = get_llong_from_str(optarg); if (params->timerlat_period_us > 1000000) - timerlat_hist_usage("Period longer than 1 s\n"); + fatal("Period longer than 1 s"); break; case 'P': retval = parse_prio(optarg, ¶ms->common.sched_param); if (retval == -1) - timerlat_hist_usage("Invalid -P priority"); + fatal("Invalid -P priority"); params->common.set_sched = 1; break; case 's': @@ -968,14 +934,8 @@ static struct common_params params->common.stop_total_us = get_llong_from_str(optarg); break; case 't': - if (optarg) { - if (optarg[0] == '=') - trace_output = &optarg[1]; - else - trace_output = &optarg[0]; - } else if (optind < argc && argv[optind][0] != '-') - trace_output = argv[optind]; - else + trace_output = parse_optional_arg(argc, argv); + if (!trace_output) trace_output = "timerlat_trace.txt"; break; case 'u': @@ -1005,31 +965,25 @@ static struct common_params case '6': /* trigger */ if (params->common.events) { retval = 
trace_event_add_trigger(params->common.events, optarg); - if (retval) { - err_msg("Error adding trigger %s\n", optarg); - exit(EXIT_FAILURE); - } + if (retval) + fatal("Error adding trigger %s", optarg); } else { - timerlat_hist_usage("--trigger requires a previous -e\n"); + fatal("--trigger requires a previous -e"); } break; case '7': /* filter */ if (params->common.events) { retval = trace_event_add_filter(params->common.events, optarg); - if (retval) { - err_msg("Error adding filter %s\n", optarg); - exit(EXIT_FAILURE); - } + if (retval) + fatal("Error adding filter %s", optarg); } else { - timerlat_hist_usage("--filter requires a previous -e\n"); + fatal("--filter requires a previous -e"); } break; case '8': params->dma_latency = get_llong_from_str(optarg); - if (params->dma_latency < 0 || params->dma_latency > 10000) { - err_msg("--dma-latency needs to be >= 0 and < 10000"); - exit(EXIT_FAILURE); - } + if (params->dma_latency < 0 || params->dma_latency > 10000) + fatal("--dma-latency needs to be >= 0 and < 10000"); break; case '9': params->no_aa = 1; @@ -1049,37 +1003,31 @@ static struct common_params case '\5': retval = actions_parse(¶ms->common.threshold_actions, optarg, "timerlat_trace.txt"); - if (retval) { - err_msg("Invalid action %s\n", optarg); - exit(EXIT_FAILURE); - } + if (retval) + fatal("Invalid action %s", optarg); break; case '\6': retval = actions_parse(¶ms->common.end_actions, optarg, "timerlat_trace.txt"); - if (retval) { - err_msg("Invalid action %s\n", optarg); - exit(EXIT_FAILURE); - } + if (retval) + fatal("Invalid action %s", optarg); break; default: - timerlat_hist_usage("Invalid option"); + fatal("Invalid option"); } } if (trace_output) actions_add_trace_output(¶ms->common.threshold_actions, trace_output); - if (geteuid()) { - err_msg("rtla needs root permission\n"); - exit(EXIT_FAILURE); - } + if (geteuid()) + fatal("rtla needs root permission"); if (params->common.hist.no_irq && params->common.hist.no_thread) - 
timerlat_hist_usage("no-irq and no-thread set, there is nothing to do here"); + fatal("no-irq and no-thread set, there is nothing to do here"); if (params->common.hist.no_index && !params->common.hist.with_zeros) - timerlat_hist_usage("no-index set with with-zeros is not set - it does not make sense"); + fatal("no-index set with with-zeros is not set - it does not make sense"); /* * Auto analysis only happens if stop tracing, thus: @@ -1088,7 +1036,7 @@ static struct common_params params->no_aa = 1; if (params->common.kernel_workload && params->common.user_workload) - timerlat_hist_usage("--kernel-threads and --user-threads are mutually exclusive!"); + fatal("--kernel-threads and --user-threads are mutually exclusive!"); /* * If auto-analysis or trace output is enabled, switch from BPF mode to diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c index fc479a0dcb59..29c2c1f717ed 100644 --- a/tools/tracing/rtla/src/timerlat_top.c +++ b/tools/tracing/rtla/src/timerlat_top.c @@ -459,9 +459,7 @@ timerlat_print_stats(struct osnoise_tool *top) timerlat_top_header(params, top); - for (i = 0; i < nr_cpus; i++) { - if (params->common.cpus && !CPU_ISSET(i, ¶ms->common.monitored_cpus)) - continue; + for_each_monitored_cpu(i, nr_cpus, ¶ms->common) { timerlat_top_print(top, i); timerlat_top_update_sum(top, i, &summary); } @@ -476,15 +474,15 @@ timerlat_print_stats(struct osnoise_tool *top) /* * timerlat_top_usage - prints timerlat top usage message */ -static void timerlat_top_usage(char *usage) +static void timerlat_top_usage(void) { int i; static const char *const msg[] = { "", " usage: rtla timerlat [top] [-h] [-q] [-a us] [-d s] [-D] [-n] [-p us] [-i us] [-T us] [-s us] \\", - " [[-t[file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] [-c cpu-list] [-H cpu-list]\\", - " [-P priority] [--dma-latency us] [--aa-only us] [-C[=cgroup_name]] [-u|-k] [--warm-up s] [--deepest-idle-state n]", + " [[-t [file]] [-e sys[:event]] 
[--filter <filter>] [--trigger <trigger>] [-c cpu-list] [-H cpu-list]\\", + " [-P priority] [--dma-latency us] [--aa-only us] [-C [cgroup_name]] [-u|-k] [--warm-up s] [--deepest-idle-state n]", "", " -h/--help: print this menu", " -a/--auto: set automatic trace mode, stopping the session if argument in us latency is hit", @@ -495,11 +493,11 @@ static void timerlat_top_usage(char *usage) " -s/--stack us: save the stack trace at the IRQ if a thread latency is higher than the argument in us", " -c/--cpus cpus: run the tracer only on the given cpus", " -H/--house-keeping cpus: run rtla control threads only on the given cpus", - " -C/--cgroup[=cgroup_name]: set cgroup, if no cgroup_name is passed, the rtla's cgroup will be inherited", + " -C/--cgroup [cgroup_name]: set cgroup, if no cgroup_name is passed, the rtla's cgroup will be inherited", " -d/--duration time[s|m|h|d]: duration of the session", " -D/--debug: print debug info", " --dump-tasks: prints the task running on all CPUs if stop conditions are met (depends on !--no-aa)", - " -t/--trace[file]: save the stopped trace to [file|timerlat_trace.txt]", + " -t/--trace [file]: save the stopped trace to [file|timerlat_trace.txt]", " -e/--event <sys:event>: enable the <sys:event> in the trace instance, multiple -e are allowed", " --filter <command>: enable a trace event filter to the previous -e event", " --trigger <command>: enable a trace event trigger to the previous -e event", @@ -524,18 +522,12 @@ static void timerlat_top_usage(char *usage) NULL, }; - if (usage) - fprintf(stderr, "%s\n", usage); - fprintf(stderr, "rtla timerlat top: a per-cpu summary of the timer latency (version %s)\n", VERSION); for (i = 0; msg[i]; i++) fprintf(stderr, "%s\n", msg[i]); - if (usage) - exit(EXIT_FAILURE); - exit(EXIT_SUCCESS); } @@ -606,11 +598,8 @@ static struct common_params {0, 0, 0, 0} }; - /* getopt_long stores the option index here. 
*/ - int option_index = 0; - c = getopt_long(argc, argv, "a:c:C::d:De:hH:i:knp:P:qs:t::T:uU0:1:2:345:6:7:", - long_options, &option_index); + long_options, NULL); /* detect the end of the options. */ if (c == -1) @@ -628,7 +617,8 @@ static struct common_params params->print_stack = auto_thresh; /* set trace */ - trace_output = "timerlat_trace.txt"; + if (!trace_output) + trace_output = "timerlat_trace.txt"; break; case '5': @@ -648,18 +638,12 @@ static struct common_params case 'c': retval = parse_cpu_set(optarg, ¶ms->common.monitored_cpus); if (retval) - timerlat_top_usage("\nInvalid -c cpu list\n"); + fatal("Invalid -c cpu list"); params->common.cpus = optarg; break; case 'C': params->common.cgroup = 1; - if (!optarg) { - /* will inherit this cgroup */ - params->common.cgroup_name = NULL; - } else if (*optarg == '=') { - /* skip the = */ - params->common.cgroup_name = ++optarg; - } + params->common.cgroup_name = optarg; break; case 'D': config_debug = 1; @@ -667,14 +651,12 @@ static struct common_params case 'd': params->common.duration = parse_seconds_duration(optarg); if (!params->common.duration) - timerlat_top_usage("Invalid -d duration\n"); + fatal("Invalid -d duration"); break; case 'e': tevent = trace_event_alloc(optarg); - if (!tevent) { - err_msg("Error alloc trace event"); - exit(EXIT_FAILURE); - } + if (!tevent) + fatal("Error alloc trace event"); if (params->common.events) tevent->next = params->common.events; @@ -682,15 +664,13 @@ static struct common_params break; case 'h': case '?': - timerlat_top_usage(NULL); + timerlat_top_usage(); break; case 'H': params->common.hk_cpus = 1; retval = parse_cpu_set(optarg, ¶ms->common.hk_cpu_set); - if (retval) { - err_msg("Error parsing house keeping CPUs\n"); - exit(EXIT_FAILURE); - } + if (retval) + fatal("Error parsing house keeping CPUs"); break; case 'i': params->common.stop_us = get_llong_from_str(optarg); @@ -704,12 +684,12 @@ static struct common_params case 'p': params->timerlat_period_us = 
get_llong_from_str(optarg); if (params->timerlat_period_us > 1000000) - timerlat_top_usage("Period longer than 1 s\n"); + fatal("Period longer than 1 s"); break; case 'P': retval = parse_prio(optarg, ¶ms->common.sched_param); if (retval == -1) - timerlat_top_usage("Invalid -P priority"); + fatal("Invalid -P priority"); params->common.set_sched = 1; break; case 'q': @@ -722,14 +702,8 @@ static struct common_params params->common.stop_total_us = get_llong_from_str(optarg); break; case 't': - if (optarg) { - if (optarg[0] == '=') - trace_output = &optarg[1]; - else - trace_output = &optarg[0]; - } else if (optind < argc && argv[optind][0] != '-') - trace_output = argv[optind]; - else + trace_output = parse_optional_arg(argc, argv); + if (!trace_output) trace_output = "timerlat_trace.txt"; break; case 'u': @@ -741,31 +715,25 @@ static struct common_params case '0': /* trigger */ if (params->common.events) { retval = trace_event_add_trigger(params->common.events, optarg); - if (retval) { - err_msg("Error adding trigger %s\n", optarg); - exit(EXIT_FAILURE); - } + if (retval) + fatal("Error adding trigger %s", optarg); } else { - timerlat_top_usage("--trigger requires a previous -e\n"); + fatal("--trigger requires a previous -e"); } break; case '1': /* filter */ if (params->common.events) { retval = trace_event_add_filter(params->common.events, optarg); - if (retval) { - err_msg("Error adding filter %s\n", optarg); - exit(EXIT_FAILURE); - } + if (retval) + fatal("Error adding filter %s", optarg); } else { - timerlat_top_usage("--filter requires a previous -e\n"); + fatal("--filter requires a previous -e"); } break; case '2': /* dma-latency */ params->dma_latency = get_llong_from_str(optarg); - if (params->dma_latency < 0 || params->dma_latency > 10000) { - err_msg("--dma-latency needs to be >= 0 and < 10000"); - exit(EXIT_FAILURE); - } + if (params->dma_latency < 0 || params->dma_latency > 10000) + fatal("--dma-latency needs to be >= 0 and < 10000"); break; case '3': /* 
no-aa */ params->no_aa = 1; @@ -785,31 +753,25 @@ static struct common_params case '9': retval = actions_parse(¶ms->common.threshold_actions, optarg, "timerlat_trace.txt"); - if (retval) { - err_msg("Invalid action %s\n", optarg); - exit(EXIT_FAILURE); - } + if (retval) + fatal("Invalid action %s", optarg); break; case '\1': retval = actions_parse(¶ms->common.end_actions, optarg, "timerlat_trace.txt"); - if (retval) { - err_msg("Invalid action %s\n", optarg); - exit(EXIT_FAILURE); - } + if (retval) + fatal("Invalid action %s", optarg); break; default: - timerlat_top_usage("Invalid option"); + fatal("Invalid option"); } } if (trace_output) actions_add_trace_output(¶ms->common.threshold_actions, trace_output); - if (geteuid()) { - err_msg("rtla needs root permission\n"); - exit(EXIT_FAILURE); - } + if (geteuid()) + fatal("rtla needs root permission"); /* * Auto analysis only happens if stop tracing, thus: @@ -818,10 +780,10 @@ static struct common_params params->no_aa = 1; if (params->no_aa && params->common.aa_only) - timerlat_top_usage("--no-aa and --aa-only are mutually exclusive!"); + fatal("--no-aa and --aa-only are mutually exclusive!"); if (params->common.kernel_workload && params->common.user_workload) - timerlat_top_usage("--kernel-threads and --user-threads are mutually exclusive!"); + fatal("--kernel-threads and --user-threads are mutually exclusive!"); /* * If auto-analysis or trace output is enabled, switch from BPF mode to @@ -916,7 +878,7 @@ timerlat_top_bpf_main_loop(struct osnoise_tool *tool) if (!params->common.quiet) timerlat_print_stats(tool); - if (wait_retval == 1) { + if (wait_retval != 0) { /* Stopping requested by tracer */ actions_perform(¶ms->common.threshold_actions); diff --git a/tools/tracing/rtla/src/timerlat_u.c b/tools/tracing/rtla/src/timerlat_u.c index 01dbf9a6b5a5..ce68e39d25fd 100644 --- a/tools/tracing/rtla/src/timerlat_u.c +++ b/tools/tracing/rtla/src/timerlat_u.c @@ -51,10 +51,8 @@ static int timerlat_u_main(int cpu, struct 
timerlat_u_params *params) if (!params->sched_param) { retval = sched_setscheduler(0, SCHED_FIFO, &sp); - if (retval < 0) { - err_msg("Error setting timerlat u default priority: %s\n", strerror(errno)); - exit(1); - } + if (retval < 0) + fatal("Error setting timerlat u default priority: %s", strerror(errno)); } else { retval = __set_sched_attr(getpid(), params->sched_param); if (retval) { @@ -78,10 +76,8 @@ static int timerlat_u_main(int cpu, struct timerlat_u_params *params) snprintf(buffer, sizeof(buffer), "osnoise/per_cpu/cpu%d/timerlat_fd", cpu); timerlat_fd = tracefs_instance_file_open(NULL, buffer, O_RDONLY); - if (timerlat_fd < 0) { - err_msg("Error opening %s:%s\n", buffer, strerror(errno)); - exit(1); - } + if (timerlat_fd < 0) + fatal("Error opening %s:%s", buffer, strerror(errno)); debug_msg("User-space timerlat pid %d on cpu %d\n", gettid(), cpu); diff --git a/tools/tracing/rtla/src/utils.c b/tools/tracing/rtla/src/utils.c index d6ab15dcb490..9cf5a0098e9a 100644 --- a/tools/tracing/rtla/src/utils.c +++ b/tools/tracing/rtla/src/utils.c @@ -57,6 +57,21 @@ void debug_msg(const char *fmt, ...) } /* + * fatal - print an error message and EOL to stderr and exit with ERROR + */ +void fatal(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + va_end(ap); + fprintf(stderr, "\n"); + + exit(ERROR); +} + +/* * get_llong_from_str - get a long long int from a string */ long long get_llong_from_str(char *start) @@ -959,3 +974,29 @@ int auto_house_keeping(cpu_set_t *monitored_cpus) return 1; } + +/** + * parse_optional_arg - Parse optional argument value + * + * Parse optional argument value, which can be in the form of: + * -sarg, -s/--long=arg, -s/--long arg + * + * Returns arg value if found, NULL otherwise. 
+ */ +char *parse_optional_arg(int argc, char **argv) +{ + if (optarg) { + if (optarg[0] == '=') { + /* skip the = */ + return &optarg[1]; + } else { + return optarg; + } + /* parse argument of form -s [arg] and --long [arg]*/ + } else if (optind < argc && argv[optind][0] != '-') { + /* consume optind */ + return argv[optind++]; + } else { + return NULL; + } +} diff --git a/tools/tracing/rtla/src/utils.h b/tools/tracing/rtla/src/utils.h index a2a6f89f342d..091df4ba4587 100644 --- a/tools/tracing/rtla/src/utils.h +++ b/tools/tracing/rtla/src/utils.h @@ -19,11 +19,13 @@ extern int config_debug; void debug_msg(const char *fmt, ...); void err_msg(const char *fmt, ...); +void fatal(const char *fmt, ...); long parse_seconds_duration(char *val); void get_duration(time_t start_time, char *output, int output_size); int parse_cpu_list(char *cpu_list, char **monitored_cpus); +char *parse_optional_arg(int argc, char **argv); long long get_llong_from_str(char *start); static inline void diff --git a/tools/tracing/rtla/tests/osnoise.t b/tools/tracing/rtla/tests/osnoise.t index e3c89d45a6bb..396334608920 100644 --- a/tools/tracing/rtla/tests/osnoise.t +++ b/tools/tracing/rtla/tests/osnoise.t @@ -37,11 +37,11 @@ check "multiple actions" \ check "hist stop at failed action" \ "osnoise hist -S 2 --on-threshold shell,command='echo -n 1; false' --on-threshold shell,command='echo -n 2'" 2 "^1# RTLA osnoise histogram$" check "top stop at failed action" \ - "timerlat top -T 2 --on-threshold shell,command='echo -n abc; false' --on-threshold shell,command='echo -n defgh'" 2 "^abc" "defgh" + "osnoise top -S 2 --on-threshold shell,command='echo -n abc; false' --on-threshold shell,command='echo -n defgh'" 2 "^abc" "defgh" check "hist with continue" \ - "osnoise hist -S 2 -d 1s --on-threshold shell,command='echo TestOutput' --on-threshold continue" 0 "^TestOutput$" + "osnoise hist -S 2 -d 5s --on-threshold shell,command='echo TestOutput' --on-threshold continue" 0 "^TestOutput$" check "top 
with continue" \ - "osnoise top -q -S 2 -d 1s --on-threshold shell,command='echo TestOutput' --on-threshold continue" 0 "^TestOutput$" + "osnoise top -q -S 2 -d 5s --on-threshold shell,command='echo TestOutput' --on-threshold continue" 0 "^TestOutput$" check "hist with trace output at end" \ "osnoise hist -d 1s --on-end trace" 0 "^ Saving trace to osnoise_trace.txt$" check "top with trace output at end" \ diff --git a/tools/tracing/rtla/tests/timerlat.t b/tools/tracing/rtla/tests/timerlat.t index b5d1e7260a9b..bbaa1897d8a8 100644 --- a/tools/tracing/rtla/tests/timerlat.t +++ b/tools/tracing/rtla/tests/timerlat.t @@ -58,11 +58,11 @@ check "multiple actions" \ check "hist stop at failed action" \ "timerlat hist -T 2 --on-threshold shell,command='echo -n 1; false' --on-threshold shell,command='echo -n 2'" 2 "^1# RTLA timerlat histogram$" check "top stop at failed action" \ - "timerlat top -T 2 --on-threshold shell,command='echo -n 1; false' --on-threshold shell,command='echo -n 2'" 2 "^1ALL" + "timerlat top -T 2 --on-threshold shell,command='echo -n abc; false' --on-threshold shell,command='echo -n defgh'" 2 "^abc" "defgh" check "hist with continue" \ - "timerlat hist -T 2 -d 1s --on-threshold shell,command='echo TestOutput' --on-threshold continue" 0 "^TestOutput$" + "timerlat hist -T 2 -d 5s --on-threshold shell,command='echo TestOutput' --on-threshold continue" 0 "^TestOutput$" check "top with continue" \ - "timerlat top -q -T 2 -d 1s --on-threshold shell,command='echo TestOutput' --on-threshold continue" 0 "^TestOutput$" + "timerlat top -q -T 2 -d 5s --on-threshold shell,command='echo TestOutput' --on-threshold continue" 0 "^TestOutput$" check "hist with trace output at end" \ "timerlat hist -d 1s --on-end trace" 0 "^ Saving trace to timerlat_trace.txt$" check "top with trace output at end" \ |
